# 파트 05
----
인구이동 데이터에 대한 두 번째 전처리 과정을 수행한다. 그룹으로 묶을 수 있는 단위를 최대한 세밀하게 잡고 그룹별로 집계하여 데이터 크기를 줄이는 것을 목적으로 한다. 다만 추후 인구이동의 증감 등을 계산해야 할 경우의 편의를 고려하여, 전입 기준과 전출 기준으로 데이터를 중복해서 세는 방법을 선택했다. 문서의 첫 번째 파트는 모든 데이터 파일을 일괄적으로 처리하여 csv 파일로 출력하는 코드이며, 과정에 대한 자세한 설명은 하단에 기술한다.

In [1]:
%matplotlib inline
from __future__ import division
from __future__ import print_function

import glob

import pandas as pd

In [3]:
# setting file paths
# glob.glob returns its matches in arbitrary order, so the list is sorted to keep
# the processing order (and the row order of the exported file) reproducible
path = 'raw_data\\'
population_files = sorted(glob.glob(path + 'population_*.csv'))

In [4]:
# defining custom functions to use when aggregating dataframes
def age_average(series):
    """Return the mean of the non-negative values in *series*.

    Negative entries are left out of both the sum and the count.
    When no non-negative value remains, -1 is returned instead of a mean.
    """
    valid_ages = series[series >= 0]
    n_valid = valid_ages.count()
    if n_valid == 0:
        return -1
    return valid_ages.sum() / n_valid

def count_ones(series):
    """Return how many entries of *series* are equal to 1."""
    is_one = series == 1
    return series[is_one].count()

In [5]:
# this is a dictionary that will be used to aggregate the original data
# it maps each source column to the function applied to that column within a group
# as to why we count the movement of female population is explained below
agg_functions = {
    u'세대주_관계': 'sum',        # summed per group (renamed to u'세대주' afterwards)
    u'세대주_나이': age_average,  # mean of the non-negative values, -1 when there are none
    u'세대주_성별': count_ones,   # number of rows equal to 1 (renamed to u'세대주_남' afterwards)
    u'세대_관련': count_ones,     # number of rows equal to 1 (renamed to u'다인세대' afterwards)
    u'이동_총인구': 'sum',
    u'이동_남인구': 'sum',
    u'이동_여인구': 'count',      # a row count, not a population sum (renamed to u'총세대' afterwards)
}

In [6]:
# column renames applied after aggregation so that the dataframe for people moving
# in and the one for people moving out end up with identical column names
# the data has no missing values (i.e. no nan), so the *count* (not the sum) of the
# female-population column is the number of rows in a group; it is renamed to
# u'총세대' and used as the number of households moved
_shared_rename_map = {
    u'전입_년': u'전입년',
    u'전입_월': u'전입월',
    u'세대주_관계': u'세대주',
    u'이동_여인구': u'총세대',
    u'세대_관련': u'다인세대',
    u'세대주_성별': u'세대주_남',
}

# the two maps differ only in which area-code column becomes u'행정코드'
in_rename_map = dict(_shared_rename_map)
in_rename_map[u'전입코드'] = u'행정코드'

out_rename_map = dict(_shared_rename_map)
out_rename_map[u'전출코드'] = u'행정코드'

In [7]:
# an array for rearranging columns in order
# both the move-in and the move-out dataframes are indexed with this list, so they
# share one column layout before being concatenated
# the names are the post-rename ones (see in_rename_map / out_rename_map) together
# with the u'타지전입' and u'전입' flag columns that are added during processing
column_order = [
    u'전입년', u'전입월', u'행정코드', u'전입', u'타지전입', u'사유',
    u'총세대', u'세대주', u'세대주_남', u'세대주_나이',
    u'다인세대', u'이동_총인구', u'이동_남인구'
]

In [8]:
# the raw files do not fit in memory together, so they are handled one at a time
# the per-file variables are rebound on every pass, which lets the previous raw
# frame be released; only the aggregated frames are collected in df_temp
df_temp = []
for population_file in population_files:
    df_population = pd.read_csv(population_file, encoding='utf-8')

    # 1 when the move-in and move-out area codes differ, 0 when they are the same
    df_population[u'타지전입'] = (df_population[u'전입코드'] != df_population[u'전출코드']) * 1

    # aggregate once keyed on the move-in code and once keyed on the move-out code,
    # renaming straight away so both results use the shared column names
    df_in = df_population.groupby(
        [u'전입_년', u'전입_월', u'전입코드', u'타지전입', u'사유'], as_index=False
    ).agg(agg_functions).rename(columns=in_rename_map)
    df_out = df_population.groupby(
        [u'전입_년', u'전입_월', u'전출코드', u'타지전입', u'사유'], as_index=False
    ).agg(agg_functions).rename(columns=out_rename_map)

    # direction flag: 1 for the move-in frame, 0 for the move-out frame
    df_in[u'전입'] = 1
    df_out[u'전입'] = 0

    # bring both frames to the common column layout before collecting them
    df_in, df_out = df_in[column_order], df_out[column_order]
    df_temp += [df_in, df_out]

    # progress indicator
    print(population_file)

raw_data\population_change_2006.csv
raw_data\population_change_2007.csv
raw_data\population_change_2008.csv
raw_data\population_change_2009.csv
raw_data\population_change_2010.csv
raw_data\population_change_2011.csv
raw_data\population_change_2012.csv
raw_data\population_change_2013.csv
raw_data\population_change_2014.csv
raw_data\population_change_2015.csv


In [26]:
# stack every aggregated frame into one table with a fresh 0..n-1 index,
# then write it out as csv under the same directory prefix as the raw files
export_file = path + 'pop_05_export.csv'
df_population = pd.concat(df_temp, ignore_index=True)
df_population.to_csv(export_file, encoding='utf-8', index=False)

# 주석
----
그룹은 전입 년, 전입 월, 행정코드, 전입/전출, 타지전입 여부, 사유 단위로 묶는다. 이를 위해서 행정구역 내 이동인지 행정구역 외 이동인지를 알려주는 플래그 값(타지전입)을 추가한다.

각 칼럼의 데이터 종류와 표기 방식이 다르므로 칼럼마다 다른 함수를 적용해서 묶는다. 예로, 이동 총인구는 합으로 계산하지만, 세대주 나이의 경우 평균으로 계산하는 게 더 직관적이라고 볼 수 있다.

아래 과정에서 주목할 만한 것은 총세대 수를 계산하는 데 있어서 편의상 이동 여인구의 데이터 개수를 센 뒤 칼럼 이름을 바꿨다는 점이다. 이동 여인구는 이동 총인구와 이동 남인구의 차로 바로 계산할 수 있으므로, 칼럼 단위의 데이터 누락은 발생하지 않는다.

In [29]:
# read in a file
df_population = pd.read_csv(path + 'population_change_2006.csv', encoding='utf-8')
df_population.head()

Unnamed: 0,전입_년,전입_월,사유,세대주_관계,세대주_나이,세대주_성별,세대_관련,이동_총인구,이동_남인구,이동_여인구,전입코드,전출코드
0,2006,1,1,1,41,2,1,3,1,2,11110,11440
1,2006,1,1,1,34,1,2,1,1,0,11110,41285
2,2006,1,1,1,49,1,1,4,3,1,11110,11290
3,2006,1,1,0,-1,-1,2,1,0,1,11110,41480
4,2006,1,1,1,45,2,1,2,0,2,11110,11380


In [30]:
# make a new column indicating whether the movement happened between different areas (1) or within the same area (0)
df_population[u'타지전입'] = (df_population[u'전입코드'] != df_population[u'전출코드']) * 1
df_population.head()

Unnamed: 0,전입_년,전입_월,사유,세대주_관계,세대주_나이,세대주_성별,세대_관련,이동_총인구,이동_남인구,이동_여인구,전입코드,전출코드,타지전입
0,2006,1,1,1,41,2,1,3,1,2,11110,11440,1
1,2006,1,1,1,34,1,2,1,1,0,11110,41285,1
2,2006,1,1,1,49,1,1,4,3,1,11110,11290,1
3,2006,1,1,0,-1,-1,2,1,0,1,11110,41480,1
4,2006,1,1,1,45,2,1,2,0,2,11110,11380,1


In [31]:
# we create two dataframes, one for people moving into area codes and
# another for people moving out of area codes
# each data point is therefore counted twice (once per direction); this is deliberate, so that
# population changes per area code (and similar figures) can be computed later if they are needed
df_in = df_population.groupby(
    [u'전입_년', u'전입_월', u'전입코드', u'타지전입', u'사유'], as_index=False
).agg(agg_functions)
df_out = df_population.groupby(
    [u'전입_년', u'전입_월', u'전출코드', u'타지전입', u'사유'], as_index=False
).agg(agg_functions)

In [32]:
df_in.head()

Unnamed: 0,전입_년,전입_월,전입코드,타지전입,사유,세대주_관계,이동_여인구,세대주_나이,세대_관련,이동_총인구,세대주_성별,이동_남인구
0,2006,1,11110,0,1,258,309,42.344961,121,588,178,326
1,2006,1,11110,0,2,8,32,48.5,8,50,6,23
2,2006,1,11110,0,3,129,134,41.945736,59,264,87,135
3,2006,1,11110,0,4,11,12,38.0,8,26,3,8
4,2006,1,11110,0,5,2,2,30.5,0,2,1,1


In [33]:
df_out.head()

Unnamed: 0,전입_년,전입_월,전출코드,타지전입,사유,세대주_관계,이동_여인구,세대주_나이,세대_관련,이동_총인구,세대주_성별,이동_남인구
0,2006,1,11110,0,1,258,309,42.344961,121,588,178,326
1,2006,1,11110,0,2,8,32,48.5,8,50,6,23
2,2006,1,11110,0,3,129,134,41.945736,59,264,87,135
3,2006,1,11110,0,4,11,12,38.0,8,26,3,8
4,2006,1,11110,0,5,2,2,30.5,0,2,1,1


In [34]:
# add in a new column to indicate whether the data is for people moving in or out
df_in[u'전입'] = 1
df_out[u'전입'] = 0

In [35]:
# rename the columns for consistency to support concatenation later
df_in.rename(columns=in_rename_map, inplace=True)
df_out.rename(columns=out_rename_map, inplace=True)

In [36]:
df_in.head()

Unnamed: 0,전입년,전입월,행정코드,타지전입,사유,세대주,총세대,세대주_나이,다인세대,이동_총인구,세대주_남,이동_남인구,전입
0,2006,1,11110,0,1,258,309,42.344961,121,588,178,326,1
1,2006,1,11110,0,2,8,32,48.5,8,50,6,23,1
2,2006,1,11110,0,3,129,134,41.945736,59,264,87,135,1
3,2006,1,11110,0,4,11,12,38.0,8,26,3,8,1
4,2006,1,11110,0,5,2,2,30.5,0,2,1,1,1


In [37]:
df_out.head()

Unnamed: 0,전입년,전입월,행정코드,타지전입,사유,세대주,총세대,세대주_나이,다인세대,이동_총인구,세대주_남,이동_남인구,전입
0,2006,1,11110,0,1,258,309,42.344961,121,588,178,326,0
1,2006,1,11110,0,2,8,32,48.5,8,50,6,23,0
2,2006,1,11110,0,3,129,134,41.945736,59,264,87,135,0
3,2006,1,11110,0,4,11,12,38.0,8,26,3,8,0
4,2006,1,11110,0,5,2,2,30.5,0,2,1,1,0


In [38]:
# make sure columns are in the same order just in case
df_in = df_in[column_order]
df_out = df_out[column_order]
# ignore_index=True renumbers the rows 0..n-1, matching the batch export code;
# without it the index labels of df_in and df_out both start at 0 and would
# repeat in the combined frame
df_population = pd.concat([df_in, df_out], ignore_index=True)

In [39]:
df_in.head()

Unnamed: 0,전입년,전입월,행정코드,전입,타지전입,사유,총세대,세대주,세대주_남,세대주_나이,다인세대,이동_총인구,이동_남인구
0,2006,1,11110,1,0,1,309,258,178,42.344961,121,588,326
1,2006,1,11110,1,0,2,32,8,6,48.5,8,50,23
2,2006,1,11110,1,0,3,134,129,87,41.945736,59,264,135
3,2006,1,11110,1,0,4,12,11,3,38.0,8,26,8
4,2006,1,11110,1,0,5,2,2,1,30.5,0,2,1


In [40]:
df_out.head()

Unnamed: 0,전입년,전입월,행정코드,전입,타지전입,사유,총세대,세대주,세대주_남,세대주_나이,다인세대,이동_총인구,이동_남인구
0,2006,1,11110,0,0,1,309,258,178,42.344961,121,588,326
1,2006,1,11110,0,0,2,32,8,6,48.5,8,50,23
2,2006,1,11110,0,0,3,134,129,87,41.945736,59,264,135
3,2006,1,11110,0,0,4,12,11,3,38.0,8,26,8
4,2006,1,11110,0,0,5,2,2,1,30.5,0,2,1


In [41]:
df_population.head()

Unnamed: 0,전입년,전입월,행정코드,전입,타지전입,사유,총세대,세대주,세대주_남,세대주_나이,다인세대,이동_총인구,이동_남인구
0,2006,1,11110,1,0,1,309,258,178,42.344961,121,588,326
1,2006,1,11110,1,0,2,32,8,6,48.5,8,50,23
2,2006,1,11110,1,0,3,134,129,87,41.945736,59,264,135
3,2006,1,11110,1,0,4,12,11,3,38.0,8,26,8
4,2006,1,11110,1,0,5,2,2,1,30.5,0,2,1


In [42]:
print(df_in.shape)
print(df_out.shape)
print(df_population.shape)

(39447, 13)
(40002, 13)
(79449, 13)
