In [1]:
from glob import glob

import pandas as pd
import numpy as np

import qgrid
# Calls qgrid's nbinstall() once, before the grid is shown in the last cell.
# NOTE(review): presumably this installs qgrid's notebook-extension assets, and
# newer qgrid releases appear to have dropped nbinstall() -- confirm against the
# installed qgrid version.
qgrid.nbinstall()

In [2]:
def get_frames():
    """Yield one cleaned DataFrame per CSV file matching ``csvs/*.csv``.

    For every file the following clean-up is applied:

    * ``ags`` becomes a zero-padded 5-character string; missing values
      become ``None``.
    * ``value`` is coerced to a numeric dtype; entries that cannot be
      parsed become NaN.
    * Every column whose name contains ``'Unnamed'`` is dropped.

    Files are visited in sorted filename order, because ``glob`` itself
    returns matches in arbitrary order and the row order of the
    concatenated result would otherwise differ between machines.

    Yields:
        pandas.DataFrame: the cleaned contents of one CSV file.
    """
    for filename in sorted(glob('csvs/*.csv')):
        df = pd.read_csv(filename)
        # read_csv parses the codes as numbers, which strips leading zeros;
        # restore them by padding back to five characters.
        df['ags'] = df['ags'].apply(lambda x: None if pd.isnull(x) else str(int(x)).zfill(5))
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        # `axis` must be passed by keyword: pandas 2.0 no longer accepts it
        # positionally.
        df = df.drop([c for c in df.columns if 'Unnamed' in c], axis=1)
        yield df

# Collect every per-file frame, then stack them into one long table.
# Each frame keeps its own row labels, so labels repeat across files.
df = pd.concat(list(get_frames()))
df.head()

Unnamed: 0,ags,key,state,type,value
0,8111,food,Baden-Württemberg,costs,24.31
1,8111,costs_nursing_class_3,Baden-Württemberg,costs,95.98
2,8111,costs_nursing_class_1,Baden-Württemberg,costs,58.54
3,8111,costs_nursing_class_2,Baden-Württemberg,costs,75.29
4,8115,food,Baden-Württemberg,costs,23.55


In [3]:
# Persist the combined table next to the notebook; row labels are not written.
df.to_csv(path_or_buf='states.csv', index=False)

In [4]:
# Sanity check: every `ags` value must appear in exactly 18 rows.
counts = df.groupby('ags').size()
assert (counts == 18).all()

In [5]:
# Sanity check: the combined data must contain exactly 16 distinct states.
counts = df.groupby('state').size()
assert counts.shape[0] == 16

In [6]:
# Rows whose `value` is missing.
# The NRW nulls come from empty cells;
# the others are hidden for privacy reasons.
df[pd.isnull(df['value'])]

Unnamed: 0,ags,key,state,type,value
115,13003,recipients_nursing_class_unknown,Mecklenburg-Vorpommern,recipients,
119,13004,recipients_nursing_class_unknown,Mecklenburg-Vorpommern,recipients,
123,13071,recipients_nursing_class_unknown,Mecklenburg-Vorpommern,recipients,
116,5554,costs_nursing_class_1,Nordrhein-Westfalen,costs,
117,5554,costs_nursing_class_2,Nordrhein-Westfalen,costs,
118,5554,costs_nursing_class_3,Nordrhein-Westfalen,costs,
119,5554,food,Nordrhein-Westfalen,costs,
210,15081,recipients_nursing_class_3,Sachsen-Anhalt,recipients,
211,15081,recipients_nursing_class_unknown,Sachsen-Anhalt,recipients,
230,15086,recipients_nursing_class_3,Sachsen-Anhalt,recipients,


In [7]:
# Mean `value` of the '4-bed' rows, one figure per state.
(
    df[df['key'].eq('4-bed')]
    .groupby('state')['value']
    .mean()
)

state
Baden-Württemberg         0.431818
Bayern                    0.572917
Berlin                    7.000000
Brandenburg               0.000000
Bremen                    0.000000
Hamburg                   6.000000
Hessen                    2.153846
Mecklenburg-Vorpommern    6.500000
Niedersachsen             1.130435
Nordrhein-Westfalen       0.132075
Rheinland-Pfalz           0.388889
Saarland                  0.666667
Sachsen                   0.000000
Sachsen-Anhalt            0.785714
Schleswig-Holstein        1.933333
Thüringen                 0.521739
Name: value, dtype: float64

In [8]:
# Hand the full combined frame to qgrid's show_grid so the whole table can be
# browsed, rather than only the head() preview shown earlier.
qgrid.show_grid(df)