In [1]:
import re
import json

import pandas as pd

In [2]:
# Excel workbook that the sheet-loading cells below read from.
PATH = 'raw/Pflegestatistik_2013_NRW.xls'
# Matches cells of the form "K: <name>" (space after the colon optional);
# group 1 captures everything after the prefix on that line.
KREIS_RE = re.compile('^K: ?(.*)', flags=re.MULTILINE)
# Accumulator for the tuples yielded by get_values across all sheets.
data = list()

In [3]:
# Build a lookup from each feature's 'GEN' property to its 'RS' property.
# The file is opened in a `with` block so the handle is closed as soon as it
# has been parsed, and it is decoded as UTF-8 explicitly instead of relying on
# the platform default encoding (which could corrupt non-ASCII names).
with open('../geodata/landkreise.geojson', encoding='utf-8') as geojson_file:
    counties = json.load(geojson_file)
counties = [feature['properties'] for feature in counties['features']]
county_mapping = {props['GEN']: props['RS'] for props in counties}

In [4]:
def get_ags_for_name(name):
    """Look up the code stored in ``county_mapping`` for a county label.

    Only the text before the first comma is used, with surrounding
    whitespace removed. When that text is not a key of ``county_mapping``
    (or maps to ``None``), it is printed so the miss is visible in the cell
    output, and ``None`` is returned.
    """
    county = name.partition(',')[0].strip()
    code = county_mapping.get(county)
    if code is not None:
        return code
    # Surface unmapped names instead of failing silently.
    print(county)
    return None

In [5]:
def _parse_value(raw):
    """Turn a raw 'value' cell into a number, or None for the '.' placeholder."""
    if not isinstance(raw, str):
        # Already numeric (or NaN): pass through unchanged.
        return raw
    text = raw.strip()
    if text == '.':
        return None
    if text == '-':
        # A lone dash stands for zero. Only the exact placeholder is rewritten,
        # so a leading minus sign in a real number is no longer turned into '0'.
        return 0.0
    return float(text)


def get_values(df, mapping, prefix='personal', indicator='key', skip_on=None):
    """Yield ``(ags, prefix, name, value)`` tuples for the mapped rows of ``df``.

    Rows are scanned in order. A row whose ``indicator`` cell matches
    ``KREIS_RE`` starts a new section: the captured name is resolved with
    ``get_ags_for_name`` and the result becomes the current ``ags``. Rows are
    ignored while no ``ags`` is set.

    Parameters
    ----------
    df : DataFrame with at least the columns 'key', 'value' and ``indicator``.
    mapping : dict of ``substring -> name``; a row is emitted under ``name``
        when ``substring`` occurs in the row's 'key' cell.
    prefix : value placed in the second position of every yielded tuple.
    indicator : column inspected for section headers.
    skip_on : optional collection of 'key' values; hitting one clears the
        current ``ags`` so following rows are ignored until the next header.

    Yields
    ------
    tuple ``(ags, prefix, name, value)``; only the first hit for each
    ``(ags, name)`` pair is emitted. ``value`` is ``None`` for a '.' cell and
    ``0.0`` for a '-' cell.

    Raises
    ------
    ValueError if a string 'value' cell of a matched row is not a number.
    """
    ags = None
    seen = set()
    for _, row in df.iterrows():
        # str() makes the regex safe on NaN/numeric cells ('nan' never matches).
        header = KREIS_RE.match(str(row[indicator]))
        if header is not None:
            ags = get_ags_for_name(header.group(1).strip())
        if ags is None:
            continue
        key = row['key']
        if pd.isnull(key):
            continue
        if skip_on is not None and key in skip_on:
            ags = None
            continue
        if not isinstance(key, str):
            # Substring matching below needs text; a numeric label cannot match.
            continue
        for needle, name in mapping.items():
            if needle not in key:
                continue
            val = _parse_value(row['value'])
            cache_key = '%s-%s' % (ags, name)
            if cache_key not in seen:
                yield ags, prefix, name, val
                seen.add(cache_key)


In [6]:
# The sheet is selected positionally: the `sheetname` keyword used before was
# renamed to `sheet_name` and later removed from pandas, whereas the second
# positional argument is accepted by both old and new releases.
df = pd.read_excel(PATH, 'Vergütung', skiprows=14)

# Give the two columns that get_values reads the names it expects.
df = df.rename(columns={
    'Unnamed: 0': 'key',
    'Insgesamt                               ': 'value'
})

# Substring of the row label -> name under which the value is stored.
cost_mapping = {
    'Pflegeklasse 1': 'costs_nursing_class_1',
    'Pflegeklasse 2': 'costs_nursing_class_2',
    'Pflegeklasse 3': 'costs_nursing_class_3',
    'Verpflegung': 'food',
}

# A row whose key equals the skip_on entry makes get_values ignore everything
# up to the next county header.
data.extend(list(get_values(df, cost_mapping, prefix='costs', skip_on=['Vergütung für Kurzzeitpflege  '])))
df.head()

Unnamed: 0,key,value,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Vergütung für vollstationäre,,,,,,,,,,
1,Dauerpflege,,,,,,,,,,
2,Pflegesatz,,,,,,,,,,
3,Pflegeklasse 1,46.67,,,,,,,,,
4,Pflegeklasse 2,66.07,,,,,,,,,


In [7]:
# The sheet is selected positionally: the `sheetname` keyword used before was
# renamed to `sheet_name` and later removed from pandas, whereas the second
# positional argument is accepted by both old and new releases.
df = pd.read_excel(PATH, 'verfügbare Plätze Betten', skiprows=14)

# Give the two columns that get_values reads the names it expects.
df = df.rename(columns={
    'Unnamed: 1': 'value',
    '                                Anzahl                                                                                              ': 'key'
})

# Substring of the row label -> name under which the value is stored.
bed_mapping = {
  '1-Bett-Zimmern': '1-bed',
  '2-Bett-Zimmern': '2-bed',
  '3-Bett-Zimmern': '3-bed',
  '4 und mehr-Bett-Zimmern': '4-bed'
}

# NOTE(review): skip_on is the same label used for the 'Vergütung' sheet;
# confirm that it actually occurs in this sheet or drop it.
data.extend(list(get_values(df, bed_mapping, prefix='beds', skip_on=['Vergütung für Kurzzeitpflege  '])))
df.head()

Unnamed: 0,key,value,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Verfügbare Plätze insgesamt,181670.0,,,,,,,,,
1,vollstationäre Pflege,,,,,,,,,,
2,zusammen,174567.0,,,,,,,,,
3,in 1-Bett-Zimmern,119520.0,,,,,,,,,
4,in 2-Bett-Zimmern,54917.0,,,,,,,,,


In [8]:
# The sheet is selected positionally: the `sheetname` keyword used before was
# renamed to `sheet_name` and later removed from pandas, whereas the second
# positional argument is accepted by both old and new releases.
df = pd.read_excel(PATH, 'Personal', skiprows=10)

# Give the two columns that get_values reads the names it expects.
df = df.rename(columns={
    'Unnamed: 0': 'key',
    'Insgesamt': 'value',
})

# Substring of the row label -> name under which the value is stored.
personal_mapping = {
    'Vollzeitbeschäftigt': 'fully_employed',
    'Teilzeitbeschäftigt': 'part-time',
    'Pflege und Betreuung': 'nursing_and_care',
    'soziale Betreuung': 'social_care',
    'zusätzliche Betreuung (§ 87b SGB XI)': 'other_care',
    'Personal insgesamt': 'total_personnel',
    'Auszubildende': 'trainee',
    'staatlich anerkannte': 'state_certified',
    'Gesundheits- und Krankenpfleger': 'nurse',
    'Gesundheits- und Kinderkrankenpfleger': 'nurse_children',
}

data.extend(list(get_values(df, personal_mapping, prefix='personal')))
df.head()

Unnamed: 0,key,value,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,Personal insgesamt,158336.0,,,
1,,,,,
2,"Auszubildende/-r, (Um-)Schüler/-in",11994.0,,,
3,Überwiegender Tätigkeitsbereich im Pflegeheim,,,,
4,Pflege und Betreuung,102141.0,,,


In [9]:
# The sheet is selected positionally: the `sheetname` keyword used before was
# renamed to `sheet_name` and later removed from pandas, whereas the second
# positional argument is accepted by both old and new releases.
df = pd.read_excel(PATH, 'Pflegebedürftige', skiprows=10)

# Give the two columns that get_values reads the names it expects.
df = df.rename(columns={
    'Unnamed: 0': 'key',
    'Insgesamt': 'value'
})

# Substring of the row label -> name under which the value is stored.
# The trailing space in the first three keys is significant: get_values does a
# substring test, and without it 'Pflegestufe I' would also hit the II/III rows.
recipient_mapping = {
    'Pflegestufe I ': 'recipients_nursing_class_1',
    'Pflegestufe II ': 'recipients_nursing_class_2',
    'Pflegestufe III ': 'recipients_nursing_class_3',
    'Pflegestufe zugeordnet': 'recipients_nursing_class_unknown',
}

data.extend(list(get_values(df, recipient_mapping, prefix='recipients')))
df.head()

Unnamed: 0,key,value,Unnamed: 2
0,Insgesamt,169978.0,160324.0
1,Pflegestufe I,65101.0,60934.0
2,Pflegestufe II,67282.0,63156.0
3,Pflegestufe III,35363.0,34170.0
4,Bisher noch keiner,,


In [10]:
# Turn the collected tuples into a frame: positional columns 0..3 get their
# names, and a constant 'state' column is appended before writing the CSV.
df = (
    pd.DataFrame(data)
    .rename(columns=dict(enumerate(['ags', 'type', 'key', 'value'])))
    .assign(state='Nordrhein-Westfalen')
)
df.to_csv('csvs/nordrhein-westfalen.csv', index=False)
df.head()

Unnamed: 0,ags,type,key,value,state
0,5111,costs,costs_nursing_class_1,47.11,Nordrhein-Westfalen
1,5111,costs,costs_nursing_class_2,67.22,Nordrhein-Westfalen
2,5111,costs,costs_nursing_class_3,88.11,Nordrhein-Westfalen
3,5111,costs,food,30.53,Nordrhein-Westfalen
4,5112,costs,costs_nursing_class_1,47.69,Nordrhein-Westfalen


In [11]:
# Spot check: all 'personal' rows recorded for the code '05111'.
df.loc[df['type'].eq('personal') & df['ags'].eq('05111')]

Unnamed: 0,ags,type,key,value,state
424,5111,personal,total_personnel,4025.0,Nordrhein-Westfalen
425,5111,personal,trainee,316.0,Nordrhein-Westfalen
426,5111,personal,nursing_and_care,2689.0,Nordrhein-Westfalen
427,5111,personal,social_care,195.0,Nordrhein-Westfalen
428,5111,personal,other_care,143.0,Nordrhein-Westfalen
429,5111,personal,state_certified,,Nordrhein-Westfalen
430,5111,personal,nurse,,Nordrhein-Westfalen
431,5111,personal,nurse_children,,Nordrhein-Westfalen
432,5111,personal,fully_employed,747.0,Nordrhein-Westfalen
433,5111,personal,part-time,548.0,Nordrhein-Westfalen
