# Belegungsquote der Frauenhäuser in Deutschland

## Cleaning up original data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path of the scraped JSON snapshot that the loading cell reads with pd.read_json
fpath = "data/scraped/230103_datenstand.json"

In [3]:
# Read the snapshot (top-level JSON keys become the index), keep those keys as
# a "shelter_id" column and replace the index with a plain 0..n-1 range.
raw = pd.read_json(fpath, orient="index")
raw = raw.assign(shelter_id=raw.index).reset_index(drop=True)

# Wherever a "geography" entry is present, it overwrites the "geometry" entry.
has_geography = raw["geography"].notnull()
raw.loc[has_geography, "geometry"] = raw.loc[has_geography, "geography"]

In [4]:
# Lookup table that is merged onto the time series via its "freePlaces" column
# (presumably it supplies the "description" text for each code - TODO confirm).
keys = pd.read_csv("data/helpers/free_places_key.csv")

In [5]:
# Build one long time-series frame: every shelter's "data" records become rows,
# tagged with that shelter's id, title and coordinates.
# The per-shelter frames are collected in a list and concatenated once; calling
# pd.concat on the growing frame inside the loop re-copies all earlier rows on
# every iteration.
shelter_frames = []

for _, row in raw.iterrows():
    temp = pd.DataFrame(row["data"])
    temp["shelter_id"] = row["shelter_id"]
    temp["shelter_name"] = row["title"]
    # "coordinates" is read as [longitude, latitude]
    temp["latitude"] = row["geometry"]["coordinates"][1]
    temp["longitude"] = row["geometry"]["coordinates"][0]
    shelter_frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame when
# `raw` has no rows (this matches what the incremental version produced).
df = pd.concat(shelter_frames) if shelter_frames else pd.DataFrame()

In [6]:
# Parse the timestamp column (day-before-month interpretation for ambiguous
# strings) and derive a calendar-date column from it.
parsed_timestamps = pd.to_datetime(df["timestamp"], dayfirst=True)
df["timestamp"] = parsed_timestamps
df["date"] = df["timestamp"].dt.date

In [7]:
# Replace empty-string codes with NaN so that freePlaces can be cast to float.
blank_code = df["freePlaces"] == ""
df.loc[blank_code, "freePlaces"] = np.nan
df["freePlaces"] = df["freePlaces"].astype(float)

# Attach the columns of the key table to each observation.
# NOTE(review): this is the default inner join, so rows whose freePlaces value
# has no counterpart in `keys` are dropped here - confirm that is intended.
df = df.merge(keys, on="freePlaces")

# Observations without a description get the placeholder "k.A.".
missing_description = df["description"].isnull()
df.loc[missing_description, "description"] = "k.A."

# Collapse the detailed descriptions into a smaller set of status labels.
# Descriptions that are not listed here map to NaN in "status".
desc_short = {
    "Aufnahme möglich für Frauen mit 4 oder mehr Kindern": "Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen mit 3 Kindern": "Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen mit 2 Kindern": "Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen ohne Kinder, Aufnahme möglich für Frauen mit 1 Kind": "Aufnahme möglich für Frauen mit bis zu einem Kind",
    "Aufnahme möglich für Frauen ohne Kinder": "Platz für Frauen ohne Kinder",
    "Keine Aufnahme möglich": "Keine Aufnahme möglich",
    "Aufnahme möglich ohne detaillierte Angabe": "Aufnahme möglich ohne detaillierte Angabe",
    "k.A.": "keine Angabe",
}

df["status"] = df["description"].map(desc_short)

In [8]:
# Attach the columns of the geocoded helper table to each observation.
# Its own latitude/longitude are removed first; the coordinates already on
# `df` are the ones that stay.
geocoded = pd.read_csv("data/helpers/shelters_geocoded.csv")
geocoded = geocoded.drop(columns=["latitude", "longitude"])
df = df.merge(geocoded, on=["shelter_name", "shelter_id"])

In [9]:
# Monthly period (e.g. 2021-12) of each observation, used as the grouping key
# for all monthly summaries.
df["monthyear"] = df["timestamp"].dt.to_period(freq="M")

In [10]:
# Number of distinct scrape timestamps per month; this is the denominator used
# when status counts are turned into shares.
counts = (
    df.loc[:, ["monthyear", "timestamp"]]
    .drop_duplicates()
    .groupby(["monthyear"])
    .count()
    .rename(columns={"timestamp": "timestamp_count"})
    .reset_index()
)

## Monthly shelter-level summary

In [11]:
# Count, per shelter and month, how many observations fall into each status.
shelter_month_status_keys = [
    "monthyear",
    "shelter_name",
    "shelter_id",
    "bundesland",
    "bez",
    "gen",
    "status",
    "latitude",
    "longitude",
]
df_monthyear = (
    df.groupby(shelter_month_status_keys)
    .agg(status_count=("status", "count"))
    .reset_index()
)
df_monthyear.head()

Unnamed: 0,monthyear,shelter_name,shelter_id,bundesland,bez,gen,status,latitude,longitude,status_count
0,2021-12,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,keine Angabe,53.552828,9.99664,34
1,2021-12,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Keine Aufnahme möglich,51.325188,12.373901,34
2,2021-12,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,Aufnahme möglich für Frauen mit bis zu einem Kind,50.933467,6.998638,4
3,2021-12,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,Keine Aufnahme möglich,50.933467,6.998638,30
4,2021-12,2. Autonomes Frauenhaus Berlin,2274,Berlin,Kreisfreie Stadt,Berlin,keine Angabe,52.516227,13.377745,34


In [12]:
# Reshape to one row per shelter and month with one column per status;
# statuses that never occurred for a shelter-month are set to 0.
df_monthyear_wide = (
    df_monthyear
    .pivot(
        index=["shelter_name", "shelter_id", "bundesland", "bez", "gen", "monthyear"],
        columns="status",
        values="status_count",
    )
    .reset_index()
    .replace(np.nan, 0)
)
df_monthyear_wide.head()

status,shelter_name,shelter_id,bundesland,bez,gen,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2021-12,0.0,0.0,0.0,0.0,0.0,34.0
1,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-01,0.0,0.0,0.0,0.0,0.0,92.0
2,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-02,0.0,0.0,0.0,0.0,0.0,80.0
3,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-03,0.0,0.0,0.0,0.0,0.0,91.0
4,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-04,0.0,0.0,0.0,0.0,0.0,85.0


In [13]:
# Attach each month's number of scrape timestamps (timestamp_count); it is the
# denominator for converting the status counts into shares.
# The merge uses the columns both frames have in common.
df_monthyear_wide = df_monthyear_wide.merge(counts, how="outer")
df_monthyear_wide.head()

Unnamed: 0,shelter_name,shelter_id,bundesland,bez,gen,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe,timestamp_count
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2021-12,0.0,0.0,0.0,0.0,0.0,34.0,34
1,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,2021-12,0.0,0.0,0.0,34.0,0.0,0.0,34
2,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2021-12,4.0,0.0,0.0,30.0,0.0,0.0,34
3,2. Autonomes Frauenhaus Berlin,2274,Berlin,Kreisfreie Stadt,Berlin,2021-12,0.0,0.0,0.0,0.0,0.0,34.0,34
4,2. Autonomes Frauenhaus Köln,2253,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2021-12,0.0,0.0,0.0,33.0,1.0,0.0,34


In [14]:
# Convert the status counts into shares of the month's scrape timestamps.
# Positions 6 up to (not including) the last column are the status columns;
# the last column is timestamp_count itself.
status_counts = df_monthyear_wide.iloc[:, 6:-1]
df_monthyear_wide.iloc[:, 6:-1] = status_counts.div(df_monthyear_wide["timestamp_count"], axis=0)
df_monthyear_wide.head()

Unnamed: 0,shelter_name,shelter_id,bundesland,bez,gen,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe,timestamp_count
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2021-12,0.0,0.0,0.0,0.0,0.0,1.0,34
1,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,2021-12,0.0,0.0,0.0,1.0,0.0,0.0,34
2,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2021-12,0.117647,0.0,0.0,0.882353,0.0,0.0,34
3,2. Autonomes Frauenhaus Berlin,2274,Berlin,Kreisfreie Stadt,Berlin,2021-12,0.0,0.0,0.0,0.0,0.0,1.0,34
4,2. Autonomes Frauenhaus Köln,2253,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2021-12,0.0,0.0,0.0,0.970588,0.029412,0.0,34


In [15]:
# The denominator column has served its purpose; remove it and export the
# per-shelter monthly shares without the index column.
df_monthyear_wide = df_monthyear_wide.drop(columns=["timestamp_count"])
df_monthyear_wide.to_csv("./data/cleaned/belegungsquote_nach_year_month_shelter.csv", index=False)

## Overall shelter-level summary

In [16]:
# Count, per shelter over the whole observation period, how many observations
# fall into each status.
shelter_status_keys = [
    "shelter_name",
    "shelter_id",
    "bundesland",
    "bez",
    "gen",
    "status",
    "latitude",
    "longitude",
]
df_shelter = (
    df.groupby(shelter_status_keys)
    .agg(n=("status", "count"))
    .reset_index()
)
df_shelter.head()

Unnamed: 0,shelter_name,shelter_id,bundesland,bez,gen,status,latitude,longitude,n
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,Aufnahme möglich ohne detaillierte Angabe,53.552828,9.99664,18
1,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,keine Angabe,53.552828,9.99664,1094
2,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Aufnahme möglich für Frauen mit bis zu einem Kind,51.325188,12.373901,151
3,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Keine Aufnahme möglich,51.325188,12.373901,270
4,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Platz für Frauen ohne Kinder,51.325188,12.373901,691


In [17]:
# Reshape to one row per shelter with one column per status; statuses that
# never occurred for a shelter are set to 0.
df_shelter_wide = (
    df_shelter
    .pivot(
        index=["shelter_name", "shelter_id", "bundesland", "bez", "gen", "latitude", "longitude"],
        columns="status",
        values="n",
    )
    .reset_index()
    .replace(np.nan, 0)
)

# Denominator: number of distinct scrape timestamps summed over all months.
n = counts["timestamp_count"].sum()

# Everything after the seven identifying columns is a status count; turn
# those counts into shares of all scrape timestamps.
df_shelter_wide.iloc[:, 7:] = df_shelter_wide.iloc[:, 7:] / n

df_shelter_wide.to_csv("./data/cleaned/belegungsquote_nach_shelter.csv", index=False)

## Get all shelters with 20% or more "keine Angabe"

In [18]:
# Ids of shelters whose overall "keine Angabe" share is at least 20%; the
# aggregate overviews exclude these shelters.
mostly_unreported = df_shelter_wide["keine Angabe"] >= 0.20
no_data = df_shelter_wide.loc[mostly_unreported, "shelter_id"].values

## Gesamtübersicht nach Monat

In [19]:
# Average the per-shelter monthly status shares across shelters for each
# month, leaving out the shelters listed in `no_data`.
# numeric_only=True restricts the mean to numeric columns: the frame still
# carries text columns (shelter_name, bundesland, bez, gen), which pandas >= 2.0
# refuses to average (TypeError) instead of silently dropping them.
monthly = (
    df_monthyear_wide.loc[~df_monthyear_wide.shelter_id.isin(no_data)]
    .groupby(["monthyear"])
    .mean(numeric_only=True)
    .reset_index()
)
# shelter_id is numeric and therefore gets averaged too; its mean is meaningless.
monthly = monthly.drop(["shelter_id"], axis=1)
monthly.head()

Unnamed: 0,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,2021-12,0.015803,0.103161,0.083114,0.765876,0.016242,0.015803
1,2022-01,0.017103,0.121986,0.091026,0.725376,0.024418,0.008191
2,2022-02,0.019988,0.103066,0.073585,0.766745,0.027889,0.003302
3,2022-03,0.018868,0.078323,0.071325,0.786025,0.033641,0.004665
4,2022-04,0.020742,0.069194,0.052969,0.811151,0.032283,0.010817


In [20]:
# Export the monthly overview without the index column
monthly.to_csv("./data/cleaned/monthly_overview.csv", index=False)

## Gesamtübersicht nach Bundesland

In [21]:
# Average the per-shelter monthly status shares for each Bundesland, leaving
# out the shelters listed in `no_data`.
# numeric_only=True restricts the mean to numeric columns: the frame still
# carries text columns (shelter_name, bez, gen) and the period column
# monthyear, which pandas >= 2.0 refuses to average (TypeError) instead of
# silently dropping them.
bundesland = (
    df_monthyear_wide.loc[~df_monthyear_wide.shelter_id.isin(no_data)]
    .groupby(["bundesland"])
    .mean(numeric_only=True)
    .reset_index()
)

# shelter_id is numeric and therefore gets averaged too; its mean is meaningless.
bundesland = bundesland.drop(["shelter_id"], axis=1)
bundesland.head()

Unnamed: 0,bundesland,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,Baden-Württemberg,0.016353,0.083043,0.024369,0.827222,0.006661,0.026315
1,Bayern,0.01107,0.09858,0.06377,0.784203,0.014086,0.021044
2,Berlin,0.005322,0.009954,0.0,0.936291,0.048434,0.0
3,Brandenburg,0.088074,0.166747,0.029309,0.662962,0.0039,0.049008
4,Bremen,0.0,0.030494,0.0,0.870629,0.0,0.0


In [22]:
# Export the per-Bundesland overview without the index column
bundesland.to_csv("./data/cleaned/bundesland_overview.csv", index=False)

## Nach Bundesland und Monat

In [23]:
# Average the per-shelter status shares for each Bundesland and month, leaving
# out the shelters listed in `no_data`.
# numeric_only=True restricts the mean to numeric columns: the frame still
# carries text columns (shelter_name, bez, gen), which pandas >= 2.0 refuses to
# average (TypeError) instead of silently dropping them.
bundesland_monthyear = (
    df_monthyear_wide.loc[~df_monthyear_wide.shelter_id.isin(no_data)]
    .groupby(["bundesland", "monthyear"])
    .mean(numeric_only=True)
    .reset_index()
)

# shelter_id is numeric and therefore gets averaged too; its mean is meaningless.
bundesland_monthyear = bundesland_monthyear.drop(["shelter_id"], axis=1)
bundesland_monthyear.head()

Unnamed: 0,bundesland,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,Baden-Württemberg,2021-12,0.006394,0.164962,0.016624,0.769821,0.001279,0.040921
1,Baden-Württemberg,2022-01,0.017958,0.213138,0.037335,0.713611,0.000945,0.0
2,Baden-Württemberg,2022-02,0.053409,0.157386,0.009659,0.745455,0.017614,0.0
3,Baden-Württemberg,2022-03,0.015984,0.052448,0.015485,0.843656,0.007493,0.04046
4,Baden-Württemberg,2022-04,0.016578,0.08877,0.002139,0.806952,0.013904,0.060963


In [24]:
# Share of "Keine Aufnahme möglich" laid out with one row per month and one
# column per Bundesland (displayed only, not stored).
monthly_no_places = bundesland_monthyear.loc[:, ["bundesland", "monthyear", "Keine Aufnahme möglich"]]
monthly_no_places.pivot(
    index="monthyear",
    columns="bundesland",
    values="Keine Aufnahme möglich",
).reset_index()

bundesland,monthyear,Baden-Württemberg,Bayern,Berlin,Brandenburg,Bremen,Hessen,Mecklenburg-Vorpommern,Niedersachsen,Nordrhein-Westfalen,Rheinland-Pfalz,Saarland,Sachsen,Sachsen-Anhalt,Schleswig-Holstein,Thüringen
0,2021-12,0.769821,0.727273,1.0,0.632353,,0.95362,0.642157,0.524064,0.861765,0.960784,0.666667,0.222222,0.4,0.792017,0.47549
1,2022-01,0.713611,0.684783,0.876812,0.415761,,0.824074,0.621377,0.699605,0.813792,0.723785,0.811594,0.243961,0.4,0.889752,0.429348
2,2022-02,0.745455,0.7375,0.9875,0.590625,,0.894907,0.63125,0.531818,0.874798,0.90625,0.666667,0.111111,0.3375,0.839286,0.635417
3,2022-03,0.843656,0.708791,0.945055,0.75,,0.889703,0.787546,0.491508,0.875222,0.91453,0.710623,0.212454,0.305495,0.830455,0.761905
4,2022-04,0.806952,0.665241,1.0,0.505882,,0.951634,0.796078,0.505882,0.935863,0.922222,0.815686,0.237908,0.642353,0.943891,0.501961
5,2022-05,0.75942,0.797884,0.848148,0.683333,,0.931276,0.596296,0.567593,0.912903,0.937654,0.488889,0.333333,0.635556,0.942735,0.5
6,2022-06,0.831863,0.846524,0.909804,0.567647,,0.977778,0.75098,0.668778,0.959962,0.969281,0.760784,0.333333,0.785882,0.978281,0.5
7,2022-07,0.738225,0.912008,0.666667,0.63587,,0.974638,0.76087,0.773411,0.921985,0.949879,0.880435,0.333333,0.886957,0.954013,0.586957
8,2022-08,0.777015,0.913658,0.915751,0.75,,0.969068,0.785714,0.765004,0.945409,0.962149,0.897436,0.29304,0.953846,0.983939,0.97619
9,2022-09,0.841292,0.82611,1.0,0.75,0.460674,0.970037,0.940075,0.711322,0.93929,0.963795,0.925094,0.571785,0.858427,0.966292,0.835206


In [25]:
# Export the Bundesland-by-month overview without the index column
bundesland_monthyear.to_csv("./data/cleaned/monthly_nach_bundesland.csv", index=False)

## Weihnachtsbeispiel – 25.12.2022 um 8:01 Uhr

In [26]:
# Status of every shelter at the scrape taken on 25 December 2022 at 08:01.
# The timestamp is written in ISO order (year-month-day) so it parses
# unambiguously; the previous literal "2022-25-12" was in year-day-month order
# and only resolved to the right date through the dayfirst fallback heuristic.
christmas_scrape = pd.Timestamp("2022-12-25 08:01:00")

weihnacht = df.loc[
    df["timestamp"] == christmas_scrape,
    ["shelter_name", "shelter_id", "latitude", "longitude", "timestamp",
     "description", "status", "gen", "bez", "bundesland"],
]
weihnacht.to_csv("./data/cleaned/belegungsstatus_25-12-2022_8-01.csv", index=False)