# 파트 07
----
앞부분에서 전처리된 파일들을 마지막으로 하나로 합쳐서 출력한다. 이 파트에서도 자잘한 추가 전처리가 이루어진다.

In [14]:
%matplotlib inline
import re
from __future__ import division
from __future__ import print_function

In [15]:
# set the data folder and read in the exports of the previous part
import os

# Build the folder prefix with os.path.join so the separator matches the
# platform running the notebook (a literal backslash only works on Windows).
# The empty last component keeps the trailing separator, which the
# `path + filename` concatenations in this notebook rely on.
path = os.path.join('raw_data', '')
df_pop = pd.read_csv(path + '06_pop_export.csv', encoding='utf-8')
df_estate = pd.read_csv(path + '06_estate_export.csv', encoding='utf-8')

In [16]:
# derive the price per unit of area ('ppa') for every trade
df_estate['ppa'] = df_estate['price'] / df_estate['area']

In [17]:
# placeholder columns: they only need to exist so that the aggregation
# dictionary can address them by name; the values come from the aggregation
for placeholder in ('ppa_mean', 'ppa_median'):
    df_estate[placeholder] = np.nan
df_estate['trade_vol'] = 0

In [18]:
# custom aggregate functions for the real estate data
def ppa_mean_agg(series):
    """Return the mean of 'ppa' over the rows of one group.

    ``series`` is used only for its index: its labels select the matching
    rows of the global ``df_estate``, whose 'ppa' column is averaged.
    """
    group_ppa = df_estate.loc[series.index, 'ppa']
    return group_ppa.mean()

def ppa_median_agg(series):
    """Return the median of 'ppa' over the rows of one group.

    ``series`` is used only for its index: its labels select the matching
    rows of the global ``df_estate``, whose 'ppa' column is reduced.
    """
    group_ppa = df_estate.loc[series.index, 'ppa']
    return group_ppa.median()

# output column -> aggregation applied to each group of trades
estate_agg_func_dic = dict(
    built='mean',
    ppa_mean=ppa_mean_agg,
    ppa_median=ppa_median_agg,
    trade_vol='count',
)

In [20]:
# aggregate the trades per (year, area) and per (year, month, area)
df_estate_agg_year = (
    df_estate
    .groupby(by=['year', 'area_code'])
    .aggregate(estate_agg_func_dic)
)
df_estate_agg_month = (
    df_estate
    .groupby(by=['year', 'month', 'area_code'])
    .aggregate(estate_agg_func_dic)
)

In [21]:
# natural log of the aggregated median price per area, for both granularities
for agg_frame in (df_estate_agg_year, df_estate_agg_month):
    agg_frame['ppa_median_log'] = np.log(agg_frame['ppa_median'])

In [22]:
# start both year-over-year ratio columns as NaN; they are filled per year later
for change_col in ('ppa_change_pre', 'ppa_change_post'):
    df_estate_agg_year[change_col] = np.nan

In [23]:
# Fill the year-over-year ratios of the median price per area.
#
# For every year i that has data for year i + 1, the per-area ratio
#     ppa_median(i + 1) / ppa_median(i)
# is stored twice:
#   * in the rows of year i      as 'ppa_change_pre'
#   * in the rows of year i + 1  as 'ppa_change_post'
#
# The values are written with one .loc[row_mask, column] call on the frame
# itself. Assigning through `df.loc[i].column = ...` (chained assignment)
# writes to an intermediate object and is not guaranteed to reach
# df_estate_agg_year.
year_of_row = df_estate_agg_year.index.get_level_values(0)
area_of_row = df_estate_agg_year.index.get_level_values(1)

# rows: second index level (area), columns: first index level (year)
ppa_median_by_year = df_estate_agg_year['ppa_median'].unstack(level=0)

for i in ppa_median_by_year.columns[:-1]:
    if i + 1 not in ppa_median_by_year.columns:
        # no data for the following year: leave the NaN placeholders alone
        continue

    # indexed by area; NaN where an area is missing in either year
    ratio = ppa_median_by_year[i + 1] / ppa_median_by_year[i]

    in_year = year_of_row == i
    df_estate_agg_year.loc[in_year, 'ppa_change_pre'] = \
        ratio.reindex(area_of_row[in_year]).values

    in_next_year = year_of_row == i + 1
    df_estate_agg_year.loc[in_next_year, 'ppa_change_post'] = \
        ratio.reindex(area_of_row[in_next_year]).values

In [27]:
# sanity check: peek at the rows whose 'reason' is -1
df_pop[df_pop['reason'] == -1].head()

Unnamed: 0,year,month,area_code,move_in,foreign,reason,house_total,head,head_male,head_age,multi,moved_ppl,moved_male
423,2006,1,26260,1,1,-1,1,1,0,52.0,1,2,0
571,2006,1,27110,1,1,-1,2,2,0,36.0,0,2,0
962,2006,1,31140,1,0,-1,1,1,1,44.0,1,2,1
970,2006,1,31140,1,1,-1,2,0,0,-1.0,1,5,0
1323,2006,1,41310,1,1,-1,1,1,1,39.0,1,4,2


In [28]:
# keep only the rows whose 'reason' is not -1 (-1 is treated as missing)
df_pop = df_pop[df_pop['reason'] != -1]

In [29]:
# one-hot encode 'reason' into r_<value> indicator columns and swap them in
# for the original categorical column
df_r = pd.get_dummies(df_pop['reason'], prefix='r')
df_pop = pd.concat([df_pop.drop('reason', axis=1), df_r], axis=1)

In [30]:
# preview the first rows after the one-hot encoding
df_pop.head(n=5)

Unnamed: 0,year,month,area_code,move_in,foreign,house_total,head,head_male,head_age,multi,moved_ppl,moved_male,r_1,r_2,r_3,r_4,r_5,r_6,r_9
0,2006,1,11110,1,0,309,258,178,42.344961,121,588,326,1,0,0,0,0,0,0
1,2006,1,11110,1,0,32,8,6,48.5,8,50,23,0,1,0,0,0,0,0
2,2006,1,11110,1,0,134,129,87,41.945736,59,264,135,0,0,1,0,0,0,0
3,2006,1,11110,1,0,12,11,3,38.0,8,26,8,0,0,0,1,0,0,0
4,2006,1,11110,1,0,2,2,1,30.5,0,2,1,0,0,0,0,1,0,0


In [31]:
# prepare for aggregation: the columns below only need to exist so that the
# aggregation dictionary can address them by name
for placeholder in (
    'mov_vol',
    'mov_ppl_vol',
    'pop_change_ratio',
    'head_ratio',
    'head_male_ratio',
    'ppl_per_house',
    'mov_male_ratio',
    'multi_ratio',
):
    df_pop[placeholder] = np.nan

# (-1) ** (move_in + 1): +1 where move_in is 1, -1 where move_in is 0
df_pop['sign'] = (-1) ** (df_pop['move_in'] + 1)

In [32]:
# custom aggregate functions for the population data
def mov_vol_agg(series):
    """Return the sum of 'house_total' over the rows of one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    households = df_pop.loc[series.index, 'house_total']
    return households.sum()

def mov_ppl_vol_agg(series):
    """Return the sum of 'moved_ppl' over the rows of one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    moved = df_pop.loc[series.index, 'moved_ppl']
    return moved.sum()

def pop_change_ratio_agg(series):
    """Return sum('moved_ppl' * 'sign') / sum('moved_ppl') for one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['moved_ppl', 'sign']]
    signed_total = (rows['moved_ppl'] * rows['sign']).sum()
    return signed_total / rows['moved_ppl'].sum()

def head_ratio_agg(series):
    """Return sum('head') / sum('house_total') for one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['head', 'house_total']]
    return rows['head'].sum() / rows['house_total'].sum()

def head_male_ratio_agg(series):
    """Return sum('head_male') / sum('head') for one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['head_male', 'head']]
    return rows['head_male'].sum() / rows['head'].sum()

def ppl_per_house_agg(series):
    """Return sum('moved_ppl') / sum('house_total') for one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['moved_ppl', 'house_total']]
    return rows['moved_ppl'].sum() / rows['house_total'].sum()

def mov_male_ratio_agg(series):
    """Return sum('moved_male') / sum('moved_ppl') for one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['moved_male', 'moved_ppl']]
    return rows['moved_male'].sum() / rows['moved_ppl'].sum()

def head_age_agg(series):
    """Return the mean of 'head_age' weighted by 'head' for one group.

    Computed as sum('head_age' * 'head') / sum('head'). ``series`` is used
    only for its index, which selects the group's rows of the global
    ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['head_age', 'head']]
    weighted_age_total = (rows['head_age'] * rows['head']).sum()
    return weighted_age_total / rows['head'].sum()

def reason_ratio_agg(series):
    """Return sum(series * 'house_total' * 'sign') / sum('house_total').

    Unlike the other aggregators, the values of ``series`` are used as well
    as its index; the index selects the group's rows of the global
    ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['house_total', 'sign']]
    signed_households = series * rows['house_total'] * rows['sign']
    return signed_households.sum() / rows['house_total'].sum()

def multi_ratio_agg(series):
    """Return sum('multi') / sum('house_total') for one group.

    ``series`` is used only for its index, which selects the group's rows
    of the global ``df_pop``.
    """
    rows = df_pop.loc[series.index, ['multi', 'house_total']]
    return rows['multi'].sum() / rows['house_total'].sum()

# output column -> aggregation applied to each group of population rows
pop_agg_func_dic = dict(
    mov_vol=mov_vol_agg,
    mov_ppl_vol=mov_ppl_vol_agg,
    pop_change_ratio=pop_change_ratio_agg,
    head_ratio=head_ratio_agg,
    head_male_ratio=head_male_ratio_agg,
    ppl_per_house=ppl_per_house_agg,
    mov_male_ratio=mov_male_ratio_agg,
    multi_ratio=multi_ratio_agg,
    head_age=head_age_agg,
)
# every reason indicator column is reduced by the same function
pop_agg_func_dic.update(dict.fromkeys(
    ('r_1', 'r_2', 'r_3', 'r_4', 'r_5', 'r_6', 'r_9'),
    reason_ratio_agg,
))

In [34]:
# aggregate the population rows per (year, area) and per (year, month, area)
# with the custom aggregate functions
df_pop_agg_year = (
    df_pop
    .groupby(by=['year', 'area_code'])
    .aggregate(pop_agg_func_dic)
)
df_pop_agg_month = (
    df_pop
    .groupby(by=['year', 'month', 'area_code'])
    .aggregate(pop_agg_func_dic)
)

In [35]:
# place the population and real estate aggregates side by side, aligned on
# their shared group index (outer join, the pd.concat default)
df_year = pd.concat(
    (df_pop_agg_year, df_estate_agg_year),
    axis=1,
    join='outer',
)
df_month = pd.concat(
    (df_pop_agg_month, df_estate_agg_month),
    axis=1,
    join='outer',
)

In [36]:
# turn the group keys from index levels back into ordinary columns
df_year = df_year.reset_index()
df_month = df_month.reset_index()

In [37]:
# list the yearly rows that have no real estate data ('built' is null)
df_year[df_year['built'].isnull()]

Unnamed: 0,year,area_code,mov_male_ratio,r_9,pop_change_ratio,multi_ratio,r_2,r_3,head_ratio,r_4,...,mov_vol,head_male_ratio,ppl_per_house,ppa_median,built,trade_vol,ppa_mean,ppa_median_log,ppa_change_pre,ppa_change_post
184,2006,46910,0.54129,0.053191,0.03052,0.202827,0.057152,-0.02415,0.543951,-0.007532,...,12878.0,0.679515,1.366284,,,,,,,
412,2007,46910,0.530417,0.052828,-0.031313,0.197456,0.049587,-0.019446,0.528358,-0.00478,...,12342.0,0.666769,1.355858,,,,,,,
1324,2011,46910,0.560958,-0.023491,-0.056798,0.188022,-0.008639,-0.05232,0.56358,-0.016113,...,10302.0,0.67568,1.341584,,,,,,,
1780,2013,46910,0.558573,0.005786,0.039192,0.193539,0.005111,-0.032112,0.565477,-0.011765,...,10370.0,0.692019,1.345902,,,,,,,
2008,2014,46910,0.565747,0.004725,-0.010215,0.188235,-0.006461,-0.04812,0.555352,-0.014465,...,10370.0,0.695086,1.331051,,,,,,,
2259,2015,47940,0.566884,-0.005475,-0.018376,0.215439,-0.036956,-0.048453,0.693676,-0.010402,...,3653.0,0.708761,1.385437,,,,,,,


In [38]:
# keep only the rows that have real estate data ('built' is not null)
df_year = df_year[df_year['built'].notnull()]
df_month = df_month[df_month['built'].notnull()]

In [39]:
# write the combined yearly and monthly tables next to the input files
df_year.to_csv(
    path + 'pop_estate_agg_year.csv',
    encoding='utf-8',
    index=False,
)
df_month.to_csv(
    path + 'pop_estate_agg_month.csv',
    encoding='utf-8',
    index=False,
)