# Belegungsquote der Frauenhäuser in Deutschland

## Load source data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path of the JSON file that is loaded into `raw` in the next cell.
fpath = "frauenhaus_suche.json"

In [3]:
# Read the JSON export (orient="index": the top-level keys become the row index),
# copy that index into a regular "shelter_id" column and switch to a plain 0..n-1 index.
raw = (
    pd.read_json(fpath, orient="index")
    .assign(shelter_id=lambda frame: frame.index)
    .reset_index(drop=True)
)

# Wherever a "geography" value is present, it overrides the "geometry" column.
has_geography = raw["geography"].notna()
raw.loc[has_geography, "geometry"] = raw.loc[has_geography, "geography"]

In [4]:
# Lookup table for the freePlaces codes; it is merged onto the time series on "freePlaces"
# further down and supplies the "description" column used there.
keys = pd.read_csv("data/helpers/free_places_key.csv")

In [5]:
# Flatten every shelter's nested "data" records into one long time-series frame.
# The per-shelter frames are collected in a list and concatenated once: calling
# pd.concat inside the loop copies the ever-growing frame on each iteration.
shelter_frames = []

for i, row in raw.iterrows():
    coordinates = row["geometry"]["coordinates"]
    shelter_frames.append(
        pd.DataFrame(row["data"]).assign(
            shelter_id=row["shelter_id"],
            shelter_name=row["title"],
            latitude=coordinates[1],   # element 1 is used as latitude,
            longitude=coordinates[0],  # element 0 as longitude
        )
    )

# pd.concat raises on an empty list; fall back to an empty frame in that case.
df = pd.concat(shelter_frames) if shelter_frames else pd.DataFrame()

In [6]:
# Parse the timestamp column (interpreted day-first) and derive a plain calendar date from it.
df = df.assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"], dayfirst=True),
    date=lambda frame: frame["timestamp"].dt.date,
)

In [7]:
# Attach the textual description for each freePlaces code.
# Empty strings are turned into NaN first so the column can be cast to float before merging on it.
df.loc[df.freePlaces == "", "freePlaces"] = np.nan
df.freePlaces = df.freePlaces.astype(float)

# Left join: every observation is kept, even if its code has no row in `keys`.
# With the default inner join such observations would be dropped silently, the fill below
# could never label them, and they would be missing from all later counts.
df = pd.merge(df, keys, on="freePlaces", how="left")

# Observations without a description are labelled "k.A." (mapped to "keine Angabe" below).
df.loc[df.description.isnull(), "description"] = "k.A."

# Collapse the detailed descriptions into the simplified status categories used for reporting.
desc_short = {
    "Aufnahme möglich für Frauen mit 4 oder mehr Kindern":"Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen mit 3 Kindern":"Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen mit 2 Kindern":"Aufnahme möglich für Frauen mit mehreren Kindern",
    "Aufnahme möglich für Frauen ohne Kinder, Aufnahme möglich für Frauen mit 1 Kind":"Aufnahme möglich für Frauen mit bis zu einem Kind",
    "Aufnahme möglich für Frauen ohne Kinder":"Platz für Frauen ohne Kinder",
    "Keine Aufnahme möglich":"Keine Aufnahme möglich",
    "Aufnahme möglich ohne detaillierte Angabe":"Aufnahme möglich ohne detaillierte Angabe",
    "k.A.":"keine Angabe"
}

# NOTE: a description that is not a key of desc_short maps to NaN here.
df["status"] = df.description.map(desc_short)

In [8]:
# Join the geocoded helper table onto the time series by shelter name and id.
# Its own latitude/longitude columns are removed first, so the coordinates already in df are kept.
geocoded = pd.read_csv("data/helpers/shelters_geocoded.csv").drop(columns=["latitude", "longitude"])
df = df.merge(geocoded, on=["shelter_name", "shelter_id"])

In [9]:
# Shelter metadata, read without its 'title' column.
metadata = pd.read_csv("data/helpers/shelters_metadata.csv").drop(columns=["title"])

In [10]:
# Monthly period of each observation's timestamp; used as the time grouping key below.
df = df.assign(monthyear=df["timestamp"].dt.to_period("M"))

In [11]:
# Keep only observations from 2022-01 through 2023-01 (both bounds inclusive).
df = df[df["monthyear"].between("2022-01", "2023-01")]

In [12]:
# Number of distinct timestamps per month; serves as the denominator for the monthly shares.
counts = (
    df[["monthyear", "timestamp"]]
    .drop_duplicates()
    .groupby("monthyear")["timestamp"]
    .count()
    .reset_index(name="timestamp_count")
)

In [13]:
# Sum of the monthly distinct-timestamp counts over the whole window.
n = counts["timestamp_count"].sum()

In [14]:
# Per shelter: number of observations divided by the total number of distinct timestamps (n).
# Shelters below 0.8 on that ratio are collected in `incomplete`.
data_completeness = (
    df.groupby("shelter_id")["timestamp"]
    .count()
    .reset_index()
    .assign(pct_data_availability=lambda frame: frame["timestamp"] / n)
)
incomplete = data_completeness.loc[
    data_completeness["pct_data_availability"].lt(0.8), "shelter_id"
].values

In [15]:
# Drop every shelter that was flagged as incomplete (below 80% data availability).
df = df[~df["shelter_id"].isin(incomplete)]

## Shelter-level data by month

In [16]:
# Count, per shelter and month, how many observations fall into each status.
df_monthyear = (
    df.groupby(
        [
            "monthyear",
            "shelter_name",
            "shelter_id",
            "bundesland",
            "bez",
            "gen",
            "status",
            "latitude",
            "longitude",
        ]
    )
    .agg(status_count=("status", "count"))
    .reset_index()
)
df_monthyear.head()

Unnamed: 0,monthyear,shelter_name,shelter_id,bundesland,bez,gen,status,latitude,longitude,status_count
0,2022-01,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,keine Angabe,53.552828,9.99664,92
1,2022-01,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Keine Aufnahme möglich,51.325188,12.373901,91
2,2022-01,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Platz für Frauen ohne Kinder,51.325188,12.373901,1
3,2022-01,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,Aufnahme möglich für Frauen mit bis zu einem Kind,50.933467,6.998638,6
4,2022-01,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,Aufnahme möglich für Frauen mit mehreren Kindern,50.933467,6.998638,2


In [17]:
# Reshape to one row per shelter and month with one column per status;
# status/month combinations that never occurred become 0.
df_monthyear_wide = (
    df_monthyear
    .pivot(
        index=["shelter_name", "shelter_id", "bundesland", "bez", "gen", "monthyear"],
        columns="status",
        values="status_count",
    )
    .reset_index()
    .replace(np.nan, 0)
)
df_monthyear_wide.head()

status,shelter_name,shelter_id,bundesland,bez,gen,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-01,0.0,0.0,0.0,0.0,0.0,92.0
1,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-02,0.0,0.0,0.0,0.0,0.0,80.0
2,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-03,0.0,0.0,0.0,0.0,0.0,91.0
3,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-04,0.0,0.0,0.0,0.0,0.0,85.0
4,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-05,0.0,0.0,0.0,0.0,0.0,90.0


In [18]:
# Attach each month's distinct-timestamp count as the denominator for the shares.
# "monthyear" is the only column the two frames have in common, so it is named explicitly.
df_monthyear_wide = df_monthyear_wide.merge(counts, on="monthyear", how="outer")
df_monthyear_wide.head()

Unnamed: 0,shelter_name,shelter_id,bundesland,bez,gen,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe,timestamp_count
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-01,0.0,0.0,0.0,0.0,0.0,92.0,92
1,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,2022-01,0.0,0.0,0.0,91.0,1.0,0.0,92
2,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2022-01,6.0,2.0,0.0,84.0,0.0,0.0,92
3,2. Autonomes Frauenhaus Berlin,2274,Berlin,Kreisfreie Stadt,Berlin,2022-01,0.0,0.0,0.0,0.0,0.0,92.0,92
4,2. Autonomes Frauenhaus Köln,2253,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2022-01,0.0,0.0,0.0,88.0,4.0,0.0,92


In [19]:
# Turn the status counts into shares of the month's timestamps.
# Columns 6 up to (not including) the last one are the status columns: they sit after the six
# identifier columns and before timestamp_count.
# NOTE: running this cell twice divides the values a second time.
df_monthyear_wide.iloc[:, 6:-1] = df_monthyear_wide.iloc[:, 6:-1].div(
    df_monthyear_wide["timestamp_count"], axis=0
)
df_monthyear_wide.head()

Unnamed: 0,shelter_name,shelter_id,bundesland,bez,gen,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe,timestamp_count
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,2022-01,0.0,0.0,0.0,0.0,0.0,1.0,92
1,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,2022-01,0.0,0.0,0.0,0.98913,0.01087,0.0,92
2,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2022-01,0.065217,0.021739,0.0,0.913043,0.0,0.0,92
3,2. Autonomes Frauenhaus Berlin,2274,Berlin,Kreisfreie Stadt,Berlin,2022-01,0.0,0.0,0.0,0.0,0.0,1.0,92
4,2. Autonomes Frauenhaus Köln,2253,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,2022-01,0.0,0.0,0.0,0.956522,0.043478,0.0,92


In [20]:
# Add the metadata columns; the default inner join keeps only shelters present in `metadata`.
df_monthyear_wide = df_monthyear_wide.merge(metadata, on="shelter_id")

In [21]:
# The denominator column is no longer needed; drop it and write the shelter-by-month shares to CSV.
df_monthyear_wide = df_monthyear_wide.drop(columns=["timestamp_count"])
df_monthyear_wide.to_csv(
    "./data/cleaned/belegungsquote_nach_year_month_shelter.csv",
    index=False,
)

## Overall shelter-level summary

In [22]:
# Count, per shelter over the whole window, how many observations fall into each status.
df_shelter = (
    df.groupby(
        [
            "shelter_name",
            "shelter_id",
            "bundesland",
            "bez",
            "gen",
            "status",
            "latitude",
            "longitude",
        ]
    )
    .agg(n=("status", "count"))
    .reset_index()
)
df_shelter.head()

Unnamed: 0,shelter_name,shelter_id,bundesland,bez,gen,status,latitude,longitude,n
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,Aufnahme möglich ohne detaillierte Angabe,53.552828,9.99664,18
1,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,keine Angabe,53.552828,9.99664,1144
2,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Aufnahme möglich für Frauen mit bis zu einem Kind,51.325188,12.373901,194
3,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Keine Aufnahme möglich,51.325188,12.373901,277
4,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,Platz für Frauen ohne Kinder,51.325188,12.373901,691


In [23]:
# Reshape to one row per shelter with one column per status; statuses that never occurred become 0.
df_shelter_wide = (
    df_shelter
    .pivot(
        index=["shelter_name", "shelter_id", "bundesland", "bez", "gen", "latitude", "longitude"],
        columns="status",
        values="n",
    )
    .reset_index()
    .replace(np.nan, 0)
)

In [24]:
# Total number of distinct timestamps in the window (denominator for the shares).
n = counts["timestamp_count"].sum()

# Convert the status counts to shares of n.
# The status columns are selected by name (the status labels are exactly the values of
# desc_short) instead of "the last six columns": if one status category never occurs,
# a positional slice would silently divide longitude or other identifier columns by n.
status_labels = set(desc_short.values())
status_columns = [col for col in df_shelter_wide.columns if col in status_labels]
df_shelter_wide[status_columns] = df_shelter_wide[status_columns] / n

df_shelter_wide.head()

status,shelter_name,shelter_id,bundesland,bez,gen,latitude,longitude,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,1. &amp 3. Hamburger Frauenhaus,2382,Hamburg,Kreisfreie Stadt,Hamburg,53.552828,9.99664,0.0,0.0,0.015491,0.0,0.0,0.984509
1,1. Autonomes Frauenhaus,2027,Sachsen,Kreisfreie Stadt,Leipzig,51.325188,12.373901,0.166954,0.0,0.0,0.238382,0.594664,0.0
2,1. Frauenhaus Köln,2140,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,50.933467,6.998638,0.005164,0.003442,0.000861,0.987952,0.002582,0.0
3,2. Autonomes Frauenhaus Berlin,2274,Berlin,Kreisfreie Stadt,Berlin,52.516227,13.377745,0.0,0.0,0.0,0.0,0.0,1.0
4,2. Autonomes Frauenhaus Köln,2253,Nordrhein-Westfalen,Kreisfreie Stadt,Köln,50.921396,6.995416,0.005164,0.01463,0.0,0.975904,0.004303,0.0


In [25]:
# Add the metadata columns; the default inner join keeps only shelters present in `metadata`.
df_shelter_wide = df_shelter_wide.merge(metadata, on="shelter_id")

In [26]:
# Write the per-shelter status shares (including the merged metadata columns) to CSV,
# without the row index.
df_shelter_wide.to_csv("./data/cleaned/belegungsquote_nach_shelter.csv", index=False)

## Get all shelters with 20% or more keine Angabe

In [27]:
# Ids of shelters whose "keine Angabe" share over the whole window is at least 20%.
no_data = df_shelter_wide.loc[
    df_shelter_wide["keine Angabe"].ge(0.20),
    "shelter_id",
].values

## Filter monthly data by good data quality and type: Frauenhaus

In [28]:
# Exclude the shelters listed in `no_data` from the monthly table.
df_monthyear_wide = df_monthyear_wide[~df_monthyear_wide["shelter_id"].isin(no_data)]

In [29]:
# Restrict the monthly table to rows whose einrichtungsart is exactly "Frauenhaus".
df_monthyear_wide = df_monthyear_wide[df_monthyear_wide["einrichtungsart"].eq("Frauenhaus")]

## Bundesweit monthly overview

In [30]:
# Nationwide monthly overview: mean of each status share across the remaining shelters, per month.
# numeric_only=True limits the mean to numeric columns. Since pandas 2.0 the default is
# numeric_only=False, which raises a TypeError on the text columns (shelter_name, bundesland, ...).
monthly = (
    df_monthyear_wide
    .groupby(["monthyear"])
    .mean(numeric_only=True)
    .reset_index()
)
# The averaged shelter_id and the remaining non-status columns carry no meaning; remove them.
monthly = monthly.drop(["shelter_id","Gehbehinderung","Hörbehinderung/Taubheit","Sehbehinderung/Blindheit","Suchtmittelabhängigkeit"], axis=1)
monthly.head()

Unnamed: 0,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,2022-01,0.016135,0.109318,0.08639,0.74332,0.026721,0.007077
1,2022-02,0.02107,0.089046,0.069008,0.787371,0.028093,0.003608
2,2022-03,0.018259,0.066667,0.067287,0.809355,0.0306,0.005072
3,2022-04,0.022383,0.056109,0.047059,0.833786,0.028959,0.011704
4,2022-05,0.014986,0.06245,0.060342,0.839658,0.016695,0.005869


In [31]:
# Write the nationwide monthly overview to CSV, without the row index.
monthly.to_csv("./data/cleaned/monthly_overview.csv", index=False)

## Bundesland-level summary

In [32]:
# Bundesland overview: mean of each status share across all shelter-month rows of a Bundesland.
# numeric_only=True limits the mean to numeric columns. Since pandas 2.0 the default is
# numeric_only=False, which raises a TypeError on the text columns (shelter_name, bez, gen, ...).
bundesland = (
    df_monthyear_wide
    .groupby(["bundesland"])
    .mean(numeric_only=True)
    .reset_index()
)

# The averaged shelter_id and the remaining non-status columns carry no meaning; remove them.
bundesland = bundesland.drop(["shelter_id","Gehbehinderung","Hörbehinderung/Taubheit","Sehbehinderung/Blindheit","Suchtmittelabhängigkeit"], axis=1)
bundesland.head()

Unnamed: 0,bundesland,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,Baden-Württemberg,0.016656,0.077388,0.01664,0.850489,0.006142,0.024643
1,Bayern,0.013288,0.062522,0.073452,0.827366,0.018319,0.004727
2,Brandenburg,0.03747,0.080694,0.016444,0.865071,0.000321,0.0
3,Hessen,0.008426,0.027754,0.019129,0.938517,0.005958,0.0
4,Mecklenburg-Vorpommern,0.01208,0.038527,0.118091,0.765041,0.025538,0.039384


In [33]:
# Write the Bundesland overview to CSV, without the row index.
bundesland.to_csv("./data/cleaned/bundesland_overview.csv", index=False)

## Bundesland and month-level summary

In [34]:
# Bundesland-by-month overview: mean of each status share across the shelters of a Bundesland,
# separately for every month.
# numeric_only=True limits the mean to numeric columns. Since pandas 2.0 the default is
# numeric_only=False, which raises a TypeError on the text columns (shelter_name, bez, gen, ...).
bundesland_monthyear = (
    df_monthyear_wide
    .groupby(["bundesland", "monthyear"])
    .mean(numeric_only=True)
    .reset_index()
)

# The averaged shelter_id and the remaining non-status columns carry no meaning; remove them.
bundesland_monthyear = bundesland_monthyear.drop(["shelter_id","Gehbehinderung","Hörbehinderung/Taubheit","Sehbehinderung/Blindheit","Suchtmittelabhängigkeit"], axis=1)
bundesland_monthyear.head()

Unnamed: 0,bundesland,monthyear,Aufnahme möglich für Frauen mit bis zu einem Kind,Aufnahme möglich für Frauen mit mehreren Kindern,Aufnahme möglich ohne detaillierte Angabe,Keine Aufnahme möglich,Platz für Frauen ohne Kinder,keine Angabe
0,Baden-Württemberg,2022-01,0.007437,0.225973,0.017162,0.749428,0.0,0.0
1,Baden-Württemberg,2022-02,0.061842,0.182237,0.0,0.735526,0.020395,0.0
2,Baden-Württemberg,2022-03,0.017582,0.055495,0.017033,0.83022,0.008242,0.044505
3,Baden-Württemberg,2022-04,0.018235,0.095882,0.002353,0.809412,0.007059,0.067059
4,Baden-Württemberg,2022-05,0.018889,0.054444,0.043889,0.825556,0.010556,0.046667


In [35]:
# Share of "Keine Aufnahme möglich" with months as rows and Bundesländer as columns.
# The pivoted table is only displayed, not stored.
monthly_no_places = bundesland_monthyear.loc[:, ["bundesland", "monthyear", "Keine Aufnahme möglich"]]
monthly_no_places.pivot(
    index="monthyear",
    columns="bundesland",
    values="Keine Aufnahme möglich",
).reset_index()

bundesland,monthyear,Baden-Württemberg,Bayern,Brandenburg,Hessen,Mecklenburg-Vorpommern,Niedersachsen,Nordrhein-Westfalen,Rheinland-Pfalz,Saarland,Sachsen,Sachsen-Anhalt,Schleswig-Holstein,Thüringen
0,2022-01,0.749428,0.684783,0.5,0.824074,0.545652,0.769565,0.813792,0.723785,0.811594,0.23913,0.333333,0.909699,0.515217
1,2022-02,0.735526,0.790278,0.7875,0.894907,0.5875,0.585,0.874798,0.90625,0.666667,0.0,0.28125,0.860577,0.7625
2,2022-03,0.83022,0.755189,1.0,0.889703,0.745055,0.540659,0.875222,0.91453,0.710623,0.182418,0.254579,0.892646,0.914286
3,2022-04,0.809412,0.701961,0.67451,0.951634,0.755294,0.556471,0.935863,0.922222,0.815686,0.228235,0.535294,0.943891,0.602353
4,2022-05,0.825556,0.809877,0.911111,0.931276,0.515556,0.681111,0.912903,0.937654,0.488889,0.4,0.52963,0.942735,0.6
5,2022-06,0.888824,0.892157,0.756863,0.977778,0.701176,0.722353,0.959962,0.969281,0.760784,0.4,0.654902,0.978281,0.6
6,2022-07,0.784783,0.916667,0.847826,0.974638,0.713043,0.826087,0.921985,0.949879,0.880435,0.4,0.73913,0.954013,0.704348
7,2022-08,0.835714,0.924298,1.0,0.969068,0.749451,0.923077,0.945409,0.962149,0.897436,0.327473,0.901099,0.983939,0.973626
8,2022-09,0.848876,0.852684,1.0,0.970037,0.92809,0.902247,0.93929,0.963795,0.925094,0.829213,0.882022,0.966292,0.802247
9,2022-10,0.917204,0.799881,1.0,0.941856,0.965591,0.817204,0.898717,0.870968,0.913978,0.935484,0.949821,0.956989,0.8


In [36]:
# Write the Bundesland-by-month overview to CSV, without the row index.
bundesland_monthyear.to_csv("./data/cleaned/monthly_nach_bundesland.csv", index=False)

## Weihnachtsbeispiel – 25.12.2022 um 8 Uhr

In [37]:
# Snapshot of all shelters at exactly 2022-12-25 08:01:00, reduced to the columns needed for export.
# The timestamp is written in ISO order (year-month-day) so it is unambiguous; a
# year-day-month string such as "2022-25-12" only resolves through dayfirst parsing heuristics.
weihnacht = df.loc[
    df.timestamp == pd.Timestamp("2022-12-25 08:01:00"),
    ["shelter_name","shelter_id","latitude","longitude","timestamp","description","status","gen","bez","bundesland"],
]

In [38]:
# Add the metadata columns; the default inner join keeps only shelters present in `metadata`.
weihnacht = weihnacht.merge(metadata, on="shelter_id")

In [39]:
# Write the 25 Dec 2022, 08:01 snapshot to CSV, without the row index.
weihnacht.to_csv("./data/cleaned/belegungsstatus_25-12-2022_8-01.csv", index=False)