In [1]:
import pandas as pd

In [2]:
def flattenColumn(input, column):
    """Expand a list-valued column so that every element gets its own row.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame whose ``column`` holds iterables; each cell is passed to ``list``.
        (The parameter name shadows the builtin but is kept for callers.)
    column : str
        Name of the column to flatten.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``input`` joined (inner join on the original
        index) with one row per element of ``column``. Rows whose cell is
        empty therefore drop out of the result.
    """
    # Pair each original index label with every element of that row's iterable.
    # ``Series.items`` replaces ``iteritems``, which was removed in pandas 2.0.
    pairs = [
        (label, element)
        for label, elements in input[column].apply(list).items()
        for element in elements
    ]
    column_flat = pd.DataFrame(pairs, columns=['I', column]).set_index('I')
    # ``drop(columns=...)`` replaces the positional axis argument, which
    # pandas 2.0 no longer accepts.
    return input.drop(columns=column).merge(column_flat, left_index=True, right_index=True)

### Build rent prices data set [2009-2017]

In [3]:
# Raw rent listings plus the district lookup file.
rent_prices = pd.read_json("./data/renting_prices.json")
district_lookup_raw = pd.read_json("./data/berlin_plz.json", orient="records")

# After transposing, the old index becomes the 'district' column and column 0
# becomes 'post_code' (presumably a list of postal codes per district — confirm
# against the JSON); flattening then yields one row per (district, post_code).
district_lookup = district_lookup_raw.T.reset_index()
district_lookup = district_lookup.rename(columns={'index': 'district', 0: 'post_code'})
plz_to_district = flattenColumn(district_lookup, 'post_code')

In [4]:
# Attach the district to every listing via its postal code, then discard the
# two join keys.
rent_with_district = rent_prices.merge(plz_to_district, left_on="zip", right_on="post_code")
rent_with_district = rent_with_district.drop(columns=['zip', 'post_code'])

# Average the remaining columns per district and year.
rent_prices = rent_with_district.groupby(['district', 'year'], as_index=False).mean()

In [5]:
# Persist the per-district, per-year averages as CSV without the index column.
rent_prices.to_csv('./data/rent_prices_until_15.csv', index=False, encoding="utf-8")

In [6]:
# NOTE(review): this loads the tab-separated 'rent_prices_until_17.csv', not the
# 'rent_prices_until_15.csv' written by the previous cell — presumably an
# externally extended file; confirm its provenance.
rent_prices = pd.read_csv("./data/rent_prices_until_17.csv",sep="\t")

In [7]:
# Peek at the first two rows to sanity-check the loaded columns.
rent_prices.head(2)

Unnamed: 0,district,year,rent_cold
0,Charlottenburg-Wilmersdorf,2009,6.994545
1,Charlottenburg-Wilmersdorf,2010,7.370606


### Build real estate transactions dataset

In [8]:
# Load the residential real-estate transactions (buyer, seller, flat count,
# purchase price, year).
imm_trans = pd.read_csv("./data/wohnimmobilien_transaktion.csv")

In [9]:
# Give deals with a missing buyer or seller an explicit placeholder name.
for party_column in ('Käufer', 'Verkäufer'):
    imm_trans[party_column] = imm_trans[party_column].fillna("unknown")

In [10]:
# Distinct transaction years present in the data.
imm_trans['year'].unique()

array([2017, 2016, 2015, 2014, 2013, 2011])

In [11]:
# Order the transactions chronologically (oldest year first).
imm_trans = imm_trans.sort_values("year", ascending=True)

In [12]:
# Distinct buyer names and distinct seller names, each in order of first appearance.
names = imm_trans['Käufer'].unique().tolist()
add = imm_trans['Verkäufer'].unique().tolist()

In [13]:
# Append every seller that is not already listed as a buyer.
for seller_name in add:
    if seller_name not in names:
        names.append(seller_name)

In [14]:
# Number of distinct names appearing as buyer or seller.
len(names)

97

In [15]:
# Column labels as they appear in the CSV (note the double space before "ca.").
PRICE_COL = 'Kaufpreis in Mio. €  ca.'
FLATS_COL = 'Anzahl der Wohnungen  ca.'

# Purchase volume (million EUR) and flat count, summed per buyer ...
mill_buy = imm_trans[['Käufer', PRICE_COL]].groupby(['Käufer'], as_index=False).sum()
n_flat_buy = imm_trans[['Käufer', FLATS_COL]].groupby(['Käufer'], as_index=False).sum()
# ... and the same two totals per seller.
mill_sell = imm_trans[['Verkäufer', PRICE_COL]].groupby(['Verkäufer'], as_index=False).sum()
n_flat_sell = imm_trans[['Verkäufer', FLATS_COL]].groupby(['Verkäufer'], as_index=False).sum()

In [16]:
# Sanity check: buyer-side and seller-side totals should agree.
total_price_bought = mill_buy['Kaufpreis in Mio. €  ca.'].sum()
total_flats_bought = n_flat_buy['Anzahl der Wohnungen  ca.'].sum()
total_price_sold = mill_sell['Kaufpreis in Mio. €  ca.'].sum()
total_flats_sold = n_flat_sell['Anzahl der Wohnungen  ca.'].sum()
print(total_price_bought, total_flats_bought, total_price_sold, total_flats_sold)

15813.35 252747.0 15813.349999999999 252747.0


In [17]:
# Preview the transactions after sorting by year.
imm_trans.head()

Unnamed: 0,Käufer,Verkäufer,Anzahl der Wohnungen ca.,Kaufpreis in Mio. € ca.,year
86,Caleus Capital Investors Gmbh / epsi,Zentral Boden immobilien aG,600.0,38.025,2011
72,GSW Immobilien AG,Cerberus Whitehall-Fonds (Goldman Sachs),49684.0,467.4,2011
73,FFire Immobilienverwaltung,Private Equity Fonds,1087.0,27.0,2011
74,degewo / Gesobau,Corpus Sireo,4739.0,,2011
75,GSW Immobilien AG,GAGFAH S.A.,4832.0,330.0,2011


In [18]:
# Keep only deals with a known flat count and relabel 2011 as 2012
# (presumably so the years form a gap-free sequence for the year-to-year
# chaining further down — confirm).
dfs = (
    imm_trans
    .dropna(subset=['Anzahl der Wohnungen  ca.'])
    .assign(year=lambda frame: frame['year'].replace(2011, 2012))
)

In [19]:
# Ascending list of the distinct (relabelled) transaction years.
years = sorted(dfs['year'].unique())

In [21]:
# Preview after dropping rows without a flat count and relabelling 2011 as 2012.
dfs.head()

Unnamed: 0,Käufer,Verkäufer,Anzahl der Wohnungen ca.,Kaufpreis in Mio. € ca.,year
86,Caleus Capital Investors Gmbh / epsi,Zentral Boden immobilien aG,600.0,38.025,2012
72,GSW Immobilien AG,Cerberus Whitehall-Fonds (Goldman Sachs),49684.0,467.4,2012
73,FFire Immobilienverwaltung,Private Equity Fonds,1087.0,27.0,2012
74,degewo / Gesobau,Corpus Sireo,4739.0,,2012
75,GSW Immobilien AG,GAGFAH S.A.,4832.0,330.0,2012


In [18]:
# Build ownership chains: each row follows one synthetic unit (~100 flats).
# Column <year> holds the seller and column <year + 1> the buyer; consecutive
# years are stitched together by matching the k-th unit a company holds.
all_year = []
alldf = None  # reset explicitly so a re-run never picks up a stale frame
for y in years:
    df = dfs[dfs.year == y].copy()
    # One synthetic unit stands for roughly 100 flats.
    df['flat_int'] = (df['Anzahl der Wohnungen  ca.'] / 100).round().astype(int)
    records = [
        {y: seller, y + 1: buyer}
        for seller, buyer, n_units in zip(df['Verkäufer'], df['Käufer'], df['flat_int'])
        for _ in range(n_units)
    ]
    # Naming the columns explicitly keeps the frame well-formed even when every
    # deal of the year rounds to zero units (otherwise the groupby below would
    # raise a KeyError on the missing column).
    df2 = pd.DataFrame(records, columns=[y, y + 1])

    # Running per-company counters (1, 2, 3, ...): the k-th unit a company
    # buys (i_<y+1>) and the k-th unit it sells (i_<y>).
    df2[f'i_{y+1}'] = df2.groupby(y + 1).cumcount() + 1
    df2[f'i_{y}'] = df2.groupby(y).cumcount() + 1
    all_year.append(df2)
    if alldf is None:
        alldf = df2
    else:
        # Link the k-th unit held by a company at year y (from earlier deals)
        # to the k-th unit that company sells in year y.
        alldf = alldf.merge(df2, on=[y, f'i_{y}'], how="outer")
        # Units that were not sold this year stay with their current owner.
        w = alldf[y+1].isna()
        alldf.loc[w, y+1] = alldf.loc[w, y]
        # Recount holdings per owner for the next iteration's merge.
        alldf[f'i_{y+1}'] = alldf.groupby(y + 1).cumcount() + 1

In [19]:
# Keep only the owner-per-year columns. Units that first appear in a later
# year have NaN in the earlier columns; fill those from the next known owner.
# (`bfill` replaces `fillna(method='backfill')`, which is deprecated in
# recent pandas releases.)
test = alldf[years].bfill(axis=1)
# For every name: the largest number of units it holds in any single year.
sr = test.apply(lambda owners: owners.value_counts()).max(axis=1)
# Names that never reach 20 units in any year are lumped together as 'other'.
sr = sr[sr < 20]
rename = {name: 'other' for name in sr.index}

In [20]:
# Apply the mapping: every name listed in `rename` becomes 'other'.
test = test.replace(rename)

In [21]:
# Export the full table to 'test.csv' in the working directory, without the index.
test.to_csv('test.csv', index=False, encoding='utf-8')

In [22]:
# Rows whose owner is 'other' in every single year.
w = test[years].eq('other').all(axis=1)

In [23]:
# Export only the rows that involve at least one named (non-'other') owner.
test.loc[~w].to_csv('test2.csv', index=False, encoding='utf-8')

In [24]:
# Show only the first rows instead of dumping the whole frame into the output.
test.head(10)

Unnamed: 0,2012,2013,2014,2015,2016,2017
0,other,other,other,other,other,other
1,other,other,other,other,other,other
2,other,other,other,other,other,other
3,other,other,other,other,other,other
4,other,other,other,other,other,other
5,other,other,other,other,other,other
6,Cerberus Whitehall-Fonds (Goldman Sachs),GSW Immobilien AG,GSW Immobilien AG,Deutsche Wohnen AG,ADO Group Ltd,ADO Group Ltd
7,Cerberus Whitehall-Fonds (Goldman Sachs),GSW Immobilien AG,GSW Immobilien AG,Deutsche Wohnen AG,ADO Group Ltd,ADO Group Ltd
8,Cerberus Whitehall-Fonds (Goldman Sachs),GSW Immobilien AG,GSW Immobilien AG,Deutsche Wohnen AG,ADO Group Ltd,ADO Group Ltd
9,Cerberus Whitehall-Fonds (Goldman Sachs),GSW Immobilien AG,GSW Immobilien AG,Deutsche Wohnen AG,ADO Group Ltd,ADO Group Ltd
