# Belegungsquote der Frauenhäuser in Deutschland

## Cleaning up original data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Path of the scraped JSON input that the loading step reads
# (file name suggests a 2023-01-03 data snapshot — TODO confirm).
fpath = "data/scraped/230103_datenstand.json"

In [None]:
# Read the scraped JSON; with orient="index" the top-level keys become the row index.
raw = pd.read_json(fpath, orient="index")

# Keep the former index as an explicit shelter_id column, then renumber the rows.
raw["shelter_id"] = raw.index
raw = raw.reset_index(drop=True)

# Wherever a record carries a "geography" value, let it overwrite "geometry".
has_geography = raw["geography"].notnull()
raw.loc[has_geography, "geometry"] = raw.loc[has_geography, "geography"]

In [None]:
# Load the lookup table that is later merged onto the time series
# through its "freePlaces" column.
keys = pd.read_csv("data/helpers/free_places_key.csv")

In [None]:
# Build one long time-series table: one row per reading, tagged with the id,
# name and coordinates of the shelter it belongs to.
#
# The per-shelter frames are collected in a list and concatenated once at the
# end. Calling pd.concat inside the loop copies every row gathered so far on
# each iteration, which makes the loop quadratic in the number of readings.
frames = []

for _, row in raw.iterrows():
    temp = pd.DataFrame(row["data"])
    temp["shelter_id"] = row["shelter_id"]
    temp["shelter_name"] = row["title"]
    # "coordinates" is read as [longitude, latitude].
    temp["latitude"] = row["geometry"]["coordinates"][1]
    temp["longitude"] = row["geometry"]["coordinates"][0]
    frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Parse the timestamp column day-first and derive a calendar-date column from it.
df["timestamp"] = pd.to_datetime(df["timestamp"], dayfirst=True)
df["date"] = df["timestamp"].dt.date

KeyboardInterrupt: 

In [None]:
# add text keys
# Blank readings become NaN so the column can be cast to float for the join.
df.loc[df.freePlaces == "", "freePlaces"] = np.nan
df.freePlaces = df.freePlaces.astype(float)
# Left join: keep every reading, including those whose freePlaces value has no
# entry in the key table. An inner join would drop such rows before the "k.A."
# fill below could label them.
df = pd.merge(df, keys, on="freePlaces", how="left")

# fill nas: readings without a description are marked "k.A."
df.loc[df.description.isnull(), "description"] = "k.A."

# Collapse the detailed descriptions into a handful of coarse categories.
desc_short = {
    "Aufnahme möglich für Frauen mit 4 oder mehr Kindern": "Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen mit 3 Kindern": "Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen mit 2 Kindern": "Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen ohne Kinder, Aufnahme möglich für Frauen mit 1 Kind": "Aufnahme möglich für Frauen mit bis zu einem Kind",
    "Aufnahme möglich für Frauen ohne Kinder": "Platz für Frauen ohne Kinder",
    "Keine Aufnahme möglich": "Keine Aufnahme möglich",
    "Aufnahme möglich ohne detaillierte Angabe": "Aufnahme möglich ohne detaillierte Angabe",
    "k.A.": "keine Angabe",
}

# Descriptions that are not a key of the mapping end up as NaN in desc_short.
df["desc_short"] = df.description.map(desc_short)

In [None]:
# Attach the columns of the geocoded shelter table (per the original note this
# adds the bundesland). Its own coordinate columns are removed first, so only
# the coordinates already present in df remain after the merge.
geocoded = pd.read_csv("data/helpers/shelters_geocoded.csv")
geocoded = geocoded.drop(columns=["latitude", "longitude"])
df = df.merge(geocoded, on=["shelter_name", "shelter_id"])

In [None]:
# Preview the first rows of the merged table.
df.head()

## Monthly shelter-level summary

In [None]:
# One status per shelter and day: the most frequently recorded desc_short.
#
# Step 1 counts how often each status was recorded per shelter and day.
# Step 2 orders the rows so that, within each shelter/day, the status with the
# highest count comes first, and "first" then picks it. The count "n" must be
# part of the sort key: sorting by shelter and date alone never looks at the
# counts, so the picked status would not depend on frequency at all.
# Ties between equally frequent statuses are not broken explicitly.
df_shelter_date = (
    df.groupby(["shelter_name", "shelter_id", "bundesland", "bez", "gen", "date", "desc_short", "latitude", "longitude"])
    .agg(n=("desc_short", "count"))
    .reset_index()
    .sort_values(by=["shelter_id", "date", "n"], ascending=False)
    .groupby(["shelter_name", "shelter_id", "bundesland", "bez", "gen", "latitude", "longitude", "date"])
    .agg(status=("desc_short", "first"))
    .reset_index()
)

In [None]:
# Convert the date column to datetimes and derive a monthly period from it.
df_shelter_date["date"] = pd.to_datetime(df_shelter_date["date"])
df_shelter_date["monthyear"] = df_shelter_date["date"].dt.to_period(freq="M")
df_shelter_date.head()

In [None]:
# Count, per shelter and month, how many daily rows carry each status.
df_shelter_monthyear = (
    df_shelter_date
    .groupby(["shelter_name", "shelter_id", "bundesland", "bez", "gen", "monthyear", "status"])
    .agg(n=("status", "count"))
    .reset_index()
)
df_shelter_monthyear.head()

In [None]:
# Reshape to wide format: one row per shelter and month, one column per status.
# Status/month combinations that never occurred are filled with 0.
df_shelter_monthly = (
    df_shelter_monthyear
    .pivot(
        index=["shelter_name", "shelter_id", "bundesland", "bez", "gen", "monthyear"],
        columns="status",
        values="n",
    )
    .reset_index()
    .replace(np.nan, 0)
)
df_shelter_monthly.head()

In [None]:
# Row total: sum of every column from position 6 onwards, i.e. everything to
# the right of the six identifying columns restored by reset_index.
df_shelter_monthly["n"] = df_shelter_monthly.iloc[:, 6:].sum(axis="columns")

In [None]:
# Turn the status counts (position 6 up to, but excluding, the trailing "n"
# column) into shares of the row total.
df_shelter_monthly.iloc[:, 6:-1] = df_shelter_monthly.iloc[:, 6:-1].div(df_shelter_monthly["n"], axis=0)

In [None]:
# Remove the helper total and write the monthly per-shelter shares to disk.
df_shelter_monthly = df_shelter_monthly.drop(columns="n")
df_shelter_monthly.to_csv("./data/cleaned/belegungsquote_nach_year_month_shelter.csv", index=False)

## Overall shelter-level summary

In [None]:
# Count, per shelter over the whole period, how many daily rows carry each status.
df_shelter = (
    df_shelter_date
    .groupby(["shelter_name", "shelter_id", "bundesland", "bez", "gen", "status", "latitude", "longitude"])
    .agg(n=("status", "count"))
    .reset_index()
)
df_shelter.head()

In [None]:
# Reshape to wide format: one row per shelter, one column per status.
# Statuses a shelter never had are filled with 0.
df_shelter = (
    df_shelter
    .pivot(
        index=["shelter_name", "shelter_id", "bundesland", "bez", "gen", "latitude", "longitude"],
        columns="status",
        values="n",
    )
    .reset_index()
    .replace(np.nan, 0)
)

# Row total over everything to the right of the seven identifying columns.
df_shelter["n"] = df_shelter.iloc[:, 7:].sum(axis="columns")

# Express each status count as a share of the row total
# (the trailing "n" column itself is excluded from the slice).
df_shelter.iloc[:, 7:-1] = df_shelter.iloc[:, 7:-1].div(df_shelter["n"], axis=0)

# The helper total is not part of the exported table.
df_shelter = df_shelter.drop(columns="n")
df_shelter.to_csv("./data/cleaned/belegungsquote_nach_shelter.csv", index=False)

## Get all shelters with 50% or more keine Angabe

In [None]:
# IDs of the shelters whose "keine Angabe" share is at least 50 %.
no_data = df_shelter.loc[df_shelter["keine Angabe"].ge(0.50), "shelter_id"].values

## Gesamtübersicht nach Monat

In [None]:
# Count the daily statuses per month, leaving out every shelter listed in no_data.
monthly = (
    df_shelter_date.loc[~df_shelter_date["shelter_id"].isin(no_data)]
    .groupby(["monthyear", "status"])
    .agg(n=("status", "count"))
    .reset_index()
)

# Wide format: one row per month, one column per status, gaps filled with 0.
monthly = (
    monthly
    .pivot(index=["monthyear"], columns="status", values="n")
    .reset_index()
    .replace(np.nan, 0)
)

# Row total over the status columns, then each status as a share of that total.
monthly["n"] = monthly.iloc[:, 1:].sum(axis="columns")
monthly.iloc[:, 1:-1] = monthly.iloc[:, 1:-1].div(monthly["n"], axis=0)

# The helper total is only needed for the division.
monthly = monthly.drop(columns="n")

monthly.head()

In [None]:
# Write the monthly overview to disk without the row index.
monthly.to_csv("./data/cleaned/monthly_overview.csv", index=False)

## Gesamtübersicht nach Bundesland

In [None]:
# Mean share of each status per bundesland, restricted to shelters that are
# not listed in no_data.
#
# numeric_only=True keeps the mean to numeric columns. df_shelter also carries
# identifying columns such as shelter_name (presumably text); since pandas 2.0
# GroupBy.mean raises a TypeError on non-numeric columns instead of silently
# dropping them.
nach_bundesland = (
    df_shelter.loc[~df_shelter.shelter_id.isin(no_data)]
    .groupby(["bundesland"])
    .mean(numeric_only=True)
    .reset_index()
)

# Averaged ids and coordinates are not status shares, so they are removed.
# errors="ignore" covers a non-numeric shelter_id, which numeric_only=True
# has already excluded.
nach_bundesland = nach_bundesland.drop(
    ["shelter_id", "latitude", "longitude"], axis=1, errors="ignore"
)

nach_bundesland

In [None]:
# Write the per-bundesland overview to disk without the row index.
nach_bundesland.to_csv("./data/cleaned/bundesland_overview.csv", index=False)

## Weihnachtsbeispiel – 25.12.2022 um 8 Uhr

In [None]:
# Export the status of every shelter for the reading stamped 25 Dec 2022, 08:01:00.
# The timestamp is an ISO year-month-day literal, so parsing it does not
# depend on day-first heuristics.
snapshot_time = pd.Timestamp("2022-12-25 08:01:00")
weihnacht = df.loc[df.timestamp == snapshot_time]  # exact-match filter on the reading time
weihnacht = weihnacht[["shelter_name", "shelter_id", "latitude", "longitude", "timestamp", "description", "desc_short", "gen", "bez", "bundesland"]]
weihnacht.to_csv("./data/cleaned/belegungsstatus_25-12-2022_8-01.csv", index=False)