In [1]:
from glob import glob

import pandas as pd
import numpy as np

import qgrid
# NOTE(review): presumably installs/registers qgrid's notebook extension so the
# grid shown later in this notebook can render - confirm that the installed
# qgrid version still provides nbinstall().
qgrid.nbinstall()

In [2]:
def get_frames():
    """Yield one cleaned DataFrame per CSV file matching ``csvs/*.csv``.

    Each file is read with ``pd.read_csv`` and then normalised:

    * ``ags`` becomes a zero-padded, 5-character string; missing entries
      stay ``None``.
    * ``value`` is coerced to numeric; anything unparseable becomes NaN.
    * every column whose name contains ``'Unnamed'`` is dropped (these are
      the header-less index columns pandas names ``Unnamed: N``).

    Files are visited in sorted path order so the result is reproducible.

    Yields:
        pandas.DataFrame: the cleaned frame for one CSV file.
    """
    # glob() makes no ordering promise; sorting keeps the row order of the
    # concatenated result stable between runs and machines.
    for filename in sorted(glob('csvs/*.csv')):
        frame = pd.read_csv(filename)
        # ags may be parsed as a number (leading zero lost, possibly float
        # when cells are missing), so rebuild the 5-character code.
        frame['ags'] = frame['ags'].apply(
            lambda x: None if pd.isnull(x) else str(int(x)).zfill(5)
        )
        frame['value'] = pd.to_numeric(frame['value'], errors='coerce')
        # Keyword form: passing the axis positionally is rejected by pandas >= 2.0.
        unnamed = [c for c in frame.columns if 'Unnamed' in c]
        frame = frame.drop(columns=unnamed)
        yield frame

# Stack every per-file frame into one long table. concat keeps each file's own
# row labels (ignore_index is not set), so index values can repeat across files.
df = pd.concat(get_frames())
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,ags,key,state,type,value
0,8111,costs_nursing_class_2,Baden-Württemberg,costs,75.29
1,8111,costs_nursing_class_1,Baden-Württemberg,costs,58.54
2,8111,food,Baden-Württemberg,costs,24.31
3,8111,costs_nursing_class_3,Baden-Württemberg,costs,95.98
4,8115,costs_nursing_class_2,Baden-Württemberg,costs,72.46


In [3]:
# Write the combined table to states.csv, leaving out the row index.
# NOTE: 'ags' holds zero-padded strings; when reading this file back, pass
# dtype={'ags': str} or pandas will parse the codes as integers and drop the
# leading zeros.
df.to_csv('states.csv', index=False)

In [3]:
# Number of rows per ags code, ordered from fewest to most.
counts = (
    df.groupby('ags')
      .size()
      .sort_values()
)
# How many distinct ags codes are present.
counts.shape[0]

402

In [5]:
# Sanity check: every ags code must have exactly 22 rows.
assert (counts == 22).all()

In [6]:
# Rows per state; the data set is expected to cover exactly 16 states.
counts = (
    df.groupby('state')
      .size()
)
assert counts.shape[0] == 16

In [7]:
# Rows without a value: the NRW nulls come from empty cells,
# the others are hidden for privacy reasons.
df.loc[df['value'].isna()]

Unnamed: 0,ags,key,state,type,value
1824,09161,state_certified,Bayern,personal,
1825,09162,state_certified,Bayern,personal,
1826,09163,state_certified,Bayern,personal,
1827,09171,state_certified,Bayern,personal,
1828,09172,state_certified,Bayern,personal,
1829,09173,state_certified,Bayern,personal,
1830,09174,state_certified,Bayern,personal,
1831,09175,state_certified,Bayern,personal,
1832,09176,state_certified,Bayern,personal,
1833,09177,state_certified,Bayern,personal,


In [8]:
# Mean of 'value' per state, restricted to rows whose key is '4-bed'.
(
    df.loc[df['key'].eq('4-bed')]
      .groupby('state')['value']
      .mean()
)

state
Baden-Württemberg         0.431818
Bayern                    0.572917
Berlin                    7.000000
Brandenburg               0.000000
Bremen                    0.000000
Hamburg                   6.000000
Hessen                    2.153846
Mecklenburg-Vorpommern    6.500000
Niedersachsen             1.130435
Nordrhein-Westfalen       0.132075
Rheinland-Pfalz           0.388889
Saarland                  0.666667
Sachsen                   0.000000
Sachsen-Anhalt            0.785714
Schleswig-Holstein        1.933333
Thüringen                 0.521739
Name: value, dtype: float64

In [9]:
# Show the complete combined table in a qgrid grid.
qgrid.show_grid(df)