In [1]:
import re
import pandas as pd

In [2]:
# Matches text that starts with five digits and captures those digits; the
# captured group is what the extraction helpers emit as the `ags` value.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
AGS_RE = re.compile(r'^(\d{5}).*', re.M)
# Accumulates the (ags, type, key, value) tuples yielded by the extraction cells.
data = []
# Workbook that every read_excel call in this notebook loads from.
PATH = 'raw/Pflegestatistik_2013_Sachsen.xls'

In [3]:
# Load the remuneration sheet, skipping its first 8 rows.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated and later removed from pandas.
df = pd.read_excel(PATH, sheet_name='tab_s61_Vergütung', skiprows=8)

# The first column holds the row labels that get_column_values searches.
df = df.rename(columns={'Unnamed: 0': 'key'})

# Row label (substring of df['key']) -> key name written to the output records.
cost_mapping = {
    'Pflegeklasse 1': 'costs_nursing_class_1',
    'Pflegeklasse 2': 'costs_nursing_class_2',
    'Pflegeklasse 3': 'costs_nursing_class_3',
    'Verpflegung': 'food',
}

def get_column_values(df, mapping, prefix):
    """Yield one record per (code column, mapped label) pair of a wide sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'key'`` column of row labels. Only columns whose
        header starts with five digits (per ``AGS_RE``) are read.
    mapping : dict
        Maps the label text to look for in ``df['key']`` to the key name
        placed in the emitted record.
    prefix : str
        Passed through unchanged as the second element of every record.

    Yields
    ------
    tuple
        ``(ags, prefix, name, value)`` where ``ags`` is the five-digit code
        from the column header and ``value`` comes from the first row whose
        label contains the searched text.

    Raises
    ------
    IndexError
        When a code column is reached and a label from ``mapping`` matches
        no row.
    """
    # The matching rows depend only on the label, so select them once here
    # instead of once per column. Labels are matched literally (regex=False)
    # so characters such as "(" or "." are not read as regex syntax, and
    # na=False makes missing labels count as non-matches.
    label_rows = [
        (name, df[df['key'].str.contains(label, regex=False, na=False)])
        for label, name in mapping.items()
    ]

    for column in df.columns:
        # str(): a non-string header would make the regex match raise.
        found = AGS_RE.match(str(column))
        if found is None:
            continue
        ags = found.group(1)

        for name, rows in label_rows:
            yield ags, prefix, name, rows.iloc[0][column]


# Materialise the generator and append its records to the shared `data` list,
# then preview the sheet that was just processed.
data += list(get_column_values(df, cost_mapping, 'costs'))
df.head()

Unnamed: 0,key,Unnamed: 1,145 Nuts2-Region Chemnitz,14511 Kreisfrei Chemnitz,14521 Erzgebirgskreis,14522 LK Mittelsachsen,14523 Vogtlandkreis,14524 LK Zwickau,146 Nuts2-Region Dresden,14612 Kreisfrei Dresden,14625 LK Bautzen,14626 LK Görlitz,14627 LK Meißen,14628 LK Sächsische Schweiz- Osterzgebirge,147 Nuts2-Region Leipzig,14713 Kreisfrei Leipzig,14729 LK Leipzig,14730 LK Nordsachsen
0,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,
3,Vergütung für vollstationäre,,,,,,,,,,,,,,,,,
4,Dauerpflege,,,,,,,,,,,,,,,,,


In [4]:
# Load the available-places sheet, skipping its first 7 rows.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated and later removed from pandas.
df = pd.read_excel(PATH, sheet_name='tab_s22_Verfügbare Plätze', skiprows=7)

# The first column holds the row labels that get_column_values searches.
df = df.rename(columns={'Unnamed: 0': 'key'})

# Row label (substring of df['key']) -> key name written to the output records.
bed_mapping = {
    '1-Bett-Zimmern': '1-bed',
    '2-Bett-Zimmern': '2-bed',
    '3-Bett-Zimmern': '3-bed',
    '4 und mehr-Bett-Zimmern': '4-bed',
}

# Note: `data` is a notebook-level list, so re-running this cell appends the
# same records a second time.
data.extend(list(get_column_values(df, bed_mapping, 'beds')))
df.head()

Unnamed: 0,key,Unnamed: 1,145 Nuts2-Region Chemnitz,14511 Kreisfrei Chemnitz,14521 Erzgebirgskreis,14522 LK Mittelsachsen,14523 Vogtlandkreis,14524 LK Zwickau,146 Nuts2-Region Dresden,14612 Kreisfrei Dresden,14625 LK Bautzen,14626 LK Görlitz,14627 LK Meißen,14628 LK Sächsische Schweiz- Osterzgebirge,147 Nuts2-Region Leipzig,14713 Kreisfrei Leipzig,14729 LK Leipzig,14730 LK Nordsachsen
0,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,
2,...,,,,,,,,,,,,,,,,,
3,Anzahl verfügbare Plätze insgesamt,51741.0,19945.0,3362.0,4188.0,3997.0,3401.0,4997.0,19744.0,5942.0,3672.0,3865.0,2878.0,3387.0,12052.0,6351.0,3169.0,2532.0
4,,,,,,,,,,,,,,,,,,


In [5]:
def get_values(df, mapping, prefix='personal', indicator='key', skip_on=None):
    """Yield records from a long sheet where code rows introduce a section.

    The frame is walked top to bottom. A row whose ``indicator`` cell starts
    with five digits (per ``AGS_RE``) sets the current code; following rows
    whose ``'key'`` label contains one of the ``mapping`` labels are emitted
    under that code. Only the first hit per (code, name) pair is emitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'key'`` column (row labels), a ``'value'`` column and
        the column named by ``indicator``.
    mapping : dict
        Maps label text to look for inside ``row['key']`` to the key name
        placed in the emitted record.
    prefix : str
        Passed through unchanged as the second element of every record.
    indicator : str
        Name of the column inspected for the five-digit code.
    skip_on : container or None
        When a row's label is in this container the current code is dropped,
        so rows are ignored until the next code row.

    Yields
    ------
    tuple
        ``(ags, prefix, name, value)``. String values are converted with
        ``float`` after replacing ``'-'`` by ``'0'`` and stripping
        whitespace; other values are passed through as-is.
    """
    ags = None
    seen = set()
    for _, row in df.iterrows():
        # str() lets numeric cells (e.g. 14511.0) match too; a missing cell
        # becomes 'nan', which never matches the pattern.
        found = AGS_RE.match(str(row[indicator]))
        if found is not None:
            # Keep only the captured five-digit code so these records carry
            # the same `ags` format as the ones from get_column_values
            # (not the whole cell text with the spaces removed).
            ags = found.group(1)
        if ags is None:
            continue

        label = row['key']
        if pd.isnull(label):
            continue
        if skip_on is not None and label in skip_on:
            ags = None
            continue
        if not isinstance(label, str):
            # A non-string label cannot contain any mapping text; skipping it
            # avoids a TypeError from the `in` test below.
            continue

        for needle, name in mapping.items():
            if needle not in label:
                continue
            if (ags, name) in seen:
                # First occurrence per code wins; later repeats are ignored.
                continue
            raw = row['value']
            if isinstance(raw, str):
                val = float(raw.replace('-', '0').strip())
            else:
                val = raw
            seen.add((ags, name))
            yield ags, prefix, name, val

                
# Load the personnel sheet, skipping its first 11 rows.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated and later removed from pandas.
df = pd.read_excel(PATH, sheet_name='tab_s32_Personal', skiprows=11)

# get_values expects 'key' and 'value' columns. The header names below include
# their trailing whitespace and must match exactly for the rename to apply.
df = df.rename(columns={
    '14 Sachsen                   ': 'key',
    'Insgesamt                               ': 'value'
})

# Row label (substring of row['key']) -> key name written to the output records.
personal_mapping = {
    'Vollzeitbeschäftigt': 'fully_employed',
    'Teilzeitbeschäftigt': 'part-time',
    'Pflege und Betreuung': 'nursing_and_care',
    'soziale Betreuung': 'social_care',
    'zusätzliche Betreuung (§ 87b SGB XI)': 'other_care',
    'Personal insgesamt': 'total_personnel'
}

# Note: `data` is a notebook-level list, so re-running this cell appends the
# same records a second time.
data.extend(list(get_values(df, personal_mapping)))
df.head()

Unnamed: 0,key,value,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,,,,,,
1,Personal insgesamt ..............................,34997.0,25446.0,5581.0,2209.0,765.0,996.0
2,,,,,,,
3,"Auszubildende/-r, (Um-)Schüler/-in ............",2123.0,1848.0,122.0,58.0,67.0,28.0
4,,,,,,,


In [6]:
# Row label (substring of row['key']) -> key name written to the output records.
# The trailing spaces are significant: matching is by substring, and they keep
# each label from matching inside a longer one.
recipient_mapping = {
    'Pflegestufe I ': 'recipients_nursing_class_1',
    'Pflegestufe II ': 'recipients_nursing_class_2',
    'Pflegestufe III ': 'recipients_nursing_class_3',
    'Pflegestufe zugeordnet': 'recipients_nursing_class_unknown',
}

# Load the care-recipients sheet, skipping its first 12 rows.
# `sheet_name` is the supported keyword; the older `sheetname` spelling was
# deprecated and later removed from pandas.
df = pd.read_excel(PATH, sheet_name='tab_s42_Pflegebedürftige', skiprows=12)

# After skiprows the header row consists of data cells, which is why one of
# the columns to rename is identified by the number 46509.
df = df.rename(columns={
    'Sachsen': 'indicator',
    'Insgesamt': 'key',
    46509: 'value'
})

# Note: `data` is a notebook-level list, so re-running this cell appends the
# same records a second time.
data.extend(list(get_values(df, recipient_mapping, indicator='indicator', prefix='recipients')))
df.head()

Unnamed: 0,indicator,key,50534,value
0,14.0,Pflegestufe I,18000.0,15999.0
1,,Pflegestufe II,22301.0,20674.0
2,,Pflegestufe III,9966.0,9646.0
3,,Bisher noch keiner,,
4,,Pflegestufe zugeordnet,267.0,190.0


In [7]:
# Turn the collected tuples into a frame: positional columns 0..3 receive their
# names, a constant `state` column is appended, and the result is written to
# CSV without the index.
df = (
    pd.DataFrame(data)
    .rename(columns=dict(enumerate(['ags', 'type', 'key', 'value'])))
    .assign(state='Sachsen')
)
df.to_csv('csvs/sachsen.csv', index=False)
df.head()

Unnamed: 0,ags,type,key,value,state
0,14511,costs,costs_nursing_class_3,65.28,Sachsen
1,14511,costs,costs_nursing_class_2,49.03,Sachsen
2,14511,costs,food,15.24,Sachsen
3,14511,costs,costs_nursing_class_1,37.42,Sachsen
4,14521,costs,costs_nursing_class_3,65.36,Sachsen
