## Process the raw data

Read the raw per-district CSV files, consolidate them into a single DataFrame, and save the result as `../data/processed/namedata.csv`.

In [8]:
import os

import pandas as pd

In [9]:
def process_data(data_path='../data/', years=None, bezirke=None):
    """Consolidate the raw per-district name CSVs into one processed CSV.

    For every (year, bezirk) pair the ``;``-separated file
    ``<data_path>/raw/<year>-<bezirk>.csv`` is read and rows containing
    missing values are dropped. Files that carry a ``position`` column are
    collapsed to one row per (``vorname``, ``geschlecht``) by summing
    ``anzahl``. The columns ``bezirk`` (title-cased, ``oe`` replaced by
    ``ö``), ``year`` and ``frequency`` (``anzahl`` divided by the summed
    ``anzahl`` of that one file) are then added.

    All frames are concatenated, rows whose ``vorname`` equals ``'noch'`` are
    removed, and the result is written comma-separated, without the index, to
    ``<data_path>/processed/namedata.csv``. The ``processed`` directory is
    created if it does not exist yet.

    Parameters
    ----------
    data_path : str, optional
        Root data directory containing the ``raw`` sub-directory.
        Defaults to ``'../data/'``.
    years : iterable of int, optional
        Years to process. Defaults to 2012 through 2017.
    bezirke : iterable of str, optional
        District slugs as used in the raw file names. Defaults to the twelve
        districts listed below.

    Returns
    -------
    None
        The function is called for its side effect of writing the CSV.

    Raises
    ------
    ValueError
        If ``years`` or ``bezirke`` is empty, because ``pd.concat`` is then
        given nothing to concatenate.
    """
    if bezirke is None:
        bezirke = [
            'charlottenburg-wilmersdorf',
            'friedrichshain-kreuzberg',
            'lichtenberg',
            'marzahn-hellersdorf',
            'mitte',
            'neukoelln',
            'pankow',
            'reinickendorf',
            'spandau',
            'steglitz-zehlendorf',
            'tempelhof-schoeneberg',
            'treptow-koepenick',
        ]
    if years is None:
        years = [2012, 2013, 2014, 2015, 2016, 2017]

    raw_dir = os.path.join(data_path, 'raw')
    processed_dir = os.path.join(data_path, 'processed')

    frames = []
    for year in years:
        for bezirk in bezirke:
            raw_file = os.path.join(raw_dir, '{}-{}.csv'.format(year, bezirk))
            names = pd.read_csv(raw_file, sep=';').dropna()

            if 'position' in names.columns:
                # Files with a "position" column (the 2017 data) list a name
                # once per position; sum the counts so each
                # (vorname, geschlecht) pair appears exactly once.
                names = (
                    names
                    .groupby(['vorname', 'geschlecht'])['anzahl']
                    .sum()
                    .reset_index()
                )

            # The label is constant per file, so build it once instead of
            # transforming the column row by row.
            bezirk_label = bezirk.title().replace('oe', 'ö')
            names = names.assign(bezirk=bezirk_label, year=year)
            names = names.assign(
                frequency=names['anzahl'] / names['anzahl'].sum()
            )
            frames.append(names)

    combined = pd.concat(frames)
    # 'noch' rows are possibly an error in the raw data.
    # NOTE(review): frequencies were computed before this filter, so the
    # dropped rows still count towards each file's denominator -- confirm
    # that this is intended.
    combined = combined[combined['vorname'] != 'noch']

    # to_csv does not create missing directories.
    os.makedirs(processed_dir, exist_ok=True)
    combined.to_csv(
        os.path.join(processed_dir, 'namedata.csv'), sep=',', index=False
    )

# Regenerate the processed CSV; process_data() only writes the file and returns None.
process_data()

In [10]:
def load_data(data_path='../data/'):
    """Load the processed name data and return it as a DataFrame.

    Parameters
    ----------
    data_path : str, optional
        Root data directory; the comma-separated file
        ``processed/namedata.csv`` below it is read.
        Defaults to ``'../data/'``.

    Returns
    -------
    pandas.DataFrame
        The contents of ``processed/namedata.csv``.
    """
    processed_file = os.path.join(data_path, 'processed', 'namedata.csv')
    return pd.read_csv(processed_file, sep=',')

# Load the processed name data and preview the first rows.
df = load_data()
df.head()    

Unnamed: 0,anzahl,bezirk,frequency,geschlecht,vorname,year
0,122,Charlottenburg-Wilmersdorf,0.013676,w,Marie,2012
1,105,Charlottenburg-Wilmersdorf,0.01177,w,Sophie,2012
2,78,Charlottenburg-Wilmersdorf,0.008743,w,Charlotte,2012
3,69,Charlottenburg-Wilmersdorf,0.007735,w,Maria,2012
4,66,Charlottenburg-Wilmersdorf,0.007398,m,Paul,2012


## Demography data

In [11]:
def load_demography(data_path='../data/'):
    """Load the demography table from the raw data directory.

    Reads ``raw/EWR_Ortsteile_2016-06-30.csv`` below ``data_path`` as a
    ``;``-separated, Latin-1 encoded CSV, drops the columns ``Bezirk``,
    ``Ortsteil`` and ``Ortst-Name``, and renames ``Bez-Name`` to ``bezirk``.

    Parameters
    ----------
    data_path : str, optional
        Root data directory containing the ``raw`` sub-directory.
        Defaults to ``'../data/'``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of the file, with ``Bez-Name`` renamed to
        ``bezirk``.
    """
    raw_file = os.path.join(data_path, 'raw', 'EWR_Ortsteile_2016-06-30.csv')
    raw_table = pd.read_csv(raw_file, sep=';', encoding='latin-1')
    return (
        raw_table
        .drop(columns=['Bezirk', 'Ortsteil', 'Ortst-Name'])
        .rename(columns={'Bez-Name': 'bezirk'})
    )

# Load the demography table and preview the first rows.
demography = load_demography()
demography.head()

Unnamed: 0,bezirk,Geschl,Staatsangeh,Altersgr,Häufigkeit
0,Mitte,1,A,00_05,375
1,Mitte,1,A,05_10,335
2,Mitte,1,A,10_15,249
3,Mitte,1,A,15_20,291
4,Mitte,1,A,20_25,933
