# Part 03
----
아래에서는 인구이동 데이터의 1차적인 전처리 과정을 행한다. 우선적으로 읍면동 부분을 제외한 행자부코드를 이용하여 하나의 지역코드로 묶는다. 이후 불필요한 칼럼을 없앤다.

2012년과 2013년의 데이터의 경우 이동 총인구가 마침표로 잘못 기입된 데이터가 있어 이를 수정하고 정수타입으로 변환한다.

또한 누락된 데이터 외의 모든 데이터는 정수 타입으로 표기될 수 있으므로 NaN 값을 -1로 수정한 뒤 정수 타입으로 변환한다.

세대주 성별의 경우 주민등록번호의 성별 코드로 구분되어 있기에, 일괄적으로 대응되는 1(남성) 또는 0(여성)으로 변환한다.

세대주 구분에서 0과 -1은 모두 비세대주로 간주하여 0으로 통일한다.

마지막으로 데이터 탐색 단계에서 이동 총인구가 -1이거나 0으로 기입된 데이터 포인트가 119개 있다는 것을 알 수 있었다. 이 경우 이동 남인구와 이동 여인구의 데이터가 모두 0으로 되어 있다는 점, 그리고 전체 데이터 크기인 약 6300만 건과 비교하여 상당히 적은 수라는 점을 감안하여 이동 총인구가 0보다 작거나 같은 데이터는 제외하기로 한다.

위의 전처리 과정을 위한 데이터 탐구 과정(이동 총인구 데이터의 타입, 세대주 성별 구분 코드, 세대주 관계 코드의 비일관성 발견 등)은 아래 코드에는 생략한다.

In [1]:
%matplotlib inline
import glob

import numpy as np
import pandas as pd

In [2]:
# directory prefix of the raw data files; the glob pattern is appended to it
# directly, so the trailing separator is required
# (note: the backslash separator is Windows-specific)
path = 'raw_data\\'

In [3]:
# column names of the raw population files, in file order
population_columns = [
    'in_state', 'in_city', 'in_nbr', 'year', 'month', 'in_date',
    'out_state', 'out_city', 'out_nbr', 'reason',
    'hh_head', 'head_age', 'head_male', 'mov_multi',
    'mov_ppl', 'mov_male', 'mov_female',
]

# we drop the following columns:
#     in_date: won't be used as it is too specific
#     in_nbr, out_nbr: same reason as above
#     mov_multi, mov_ppl: can be derived from mov_male and mov_female
dropped_columns = {'in_nbr', 'in_date', 'out_nbr', 'mov_multi', 'mov_ppl'}

# everything that is not dropped, kept in the original file order
used_columns = [name for name in population_columns
                if name not in dropped_columns]

In [4]:
# collect every raw population file; glob returns matches in arbitrary
# (filesystem-dependent) order, so sort them to process files deterministically
population_files = sorted(glob.glob(path + 'population_*.txt'))

In [5]:
# pick one of the raw files at random to explore interactively
sample_index = np.random.randint(len(population_files))
sample_file = population_files[sample_index]
df = pd.read_csv(sample_file, names=population_columns, usecols=used_columns)

## Data Scrubbing

In [6]:
df.head()

Unnamed: 0,in_state,in_city,year,month,out_state,out_city,reason,hh_head,head_age,head_male,mov_male,mov_female
0,11,110,2007,1,11,110,1.0,1,39.0,1.0,2,1
1,11,110,2007,1,11,440,1.0,1,48.0,1.0,2,1
2,11,110,2007,1,43,710,1.0,1,56.0,2.0,0,2
3,11,110,2007,1,41,360,1.0,1,40.0,2.0,0,1
4,11,110,2007,1,41,360,1.0,1,26.0,1.0,1,0


In [7]:
# checking null values
df.isnull().any()

in_state      False
in_city       False
year          False
month         False
out_state     False
out_city      False
reason         True
hh_head       False
head_age       True
head_male      True
mov_male      False
mov_female    False
dtype: bool

In [8]:
# replace all null values with -1
# (fillna is used instead of replace(np.NaN, ...): the np.NaN alias was
# removed in NumPy 2.0)
df = df.fillna(-1)

In [9]:
# with the nulls replaced, every remaining value is a whole number,
# so we convert the whole dataframe to int
df = df.astype(int)

In [10]:
# merge the state and city codes of each side into one address code
# (state * 1000 + city), then discard the separate parts
for side in ('in', 'out'):
    df[side + '_address'] = df[side + '_state'] * 1000 + df[side + '_city']
df = df.drop(['in_state', 'in_city', 'out_state', 'out_city'], axis=1)

In [11]:
df.head()

Unnamed: 0,year,month,reason,hh_head,head_age,head_male,mov_male,mov_female,in_address,out_address
0,2007,1,1,1,39,1,2,1,11110,11110
1,2007,1,1,1,48,1,2,1,11110,11440
2,2007,1,1,1,56,2,0,2,11110,43710
3,2007,1,1,1,40,2,0,1,11110,41360
4,2007,1,1,1,26,1,1,0,11110,41360


In [12]:
# anomalies below were discovered much later in the process
# and are spread out through different files
# we leave these here for the sake of completeness

In [13]:
# recode the gender digit to 1 (male) or 0 (female)
# this is due to how the gender specifying code works in the korean id system:
# codes 1, 3 and 9 are mapped to male, every other value (including the -1
# used for missing data) ends up as 0
is_male = df['head_male'].isin([1, 3, 9])
df.loc[is_male, 'head_male'] = 1
df.loc[~is_male, 'head_male'] = 0

In [14]:
# treat missing hh_head values (-1, originally null) the same as 0:
# some records simply left the field empty rather than marking it as 0
missing_head = df['hh_head'] == -1
df.loc[missing_head, 'hh_head'] = 0

In [15]:
# final check
df.sample(5)

Unnamed: 0,year,month,reason,hh_head,head_age,head_male,mov_male,mov_female,in_address,out_address
3624379,2007,5,1,1,45,1,1,2,41173,27290
6756545,2007,9,9,1,40,0,1,2,48850,26380
108900,2007,3,1,0,-1,0,1,2,11200,11200
6055263,2007,2,4,0,-1,0,2,0,46880,46880
104741,2007,6,3,1,65,1,1,1,11200,11200


## Data Mining

In [16]:
# flag whether the move crossed area codes: 1 when in_address differs from
# out_address, 0 when the movement happened within the same area
df['migrate'] = (df.in_address != df.out_address) * 1

# we derive these columns from others:
#     mov_ppl: total number of people in the record (male + female)
#     mov_multi: 1 when more than one person moved together, otherwise 0
df['mov_ppl'] = df.mov_male + df.mov_female
df['mov_multi'] = (df.mov_ppl > 1) * 1
# constant 1 per record; it is aggregated with 'count' later on, so each
# group ends up reporting how many records it contains
df['hh_total'] = 1

In [17]:
# there are rare occasions where the number of people moved is missing
# (mov_male and mov_female are both 0, so mov_ppl is not positive)
# there are exactly 116 entries of this kind, and as we have more than 63,000,000
# data points, we simply remove them
# NOTE(review): the introduction of this notebook cites 119 such rows -- confirm the count
df = df.loc[df.mov_ppl > 0]

In [18]:
df.sample(5)

Unnamed: 0,year,month,reason,hh_head,head_age,head_male,mov_male,mov_female,in_address,out_address,migrate,mov_ppl,mov_multi,hh_total
74306,2007,12,1,1,35,0,1,2,11170,41135,1,3,1,1
1670961,2007,12,1,1,43,1,1,2,26260,26260,0,3,1,1
5012307,2007,12,1,0,-1,0,0,1,42170,42150,1,1,0,1
716963,2007,12,1,0,-1,0,1,0,11440,11440,0,1,0,1
1936438,2007,1,1,1,72,0,0,1,26710,48330,1,1,0,1


In [19]:
# custom aggregation function used when grouping the dataframes
def age_average(series):
    """Return the mean of the known ages in *series*.

    Missing ages are encoded as -1, so only values greater than or equal
    to 0 take part in the average. When the series holds no known age at
    all, -1 is returned to keep the "missing" marker.
    """
    known_ages = series[series >= 0]
    n_known = known_ages.count()
    if n_known == 0:
        return -1
    return known_ages.sum() / n_known

In [20]:
# this is a dictionary that will be used to aggregate the original data
# it maps each column to the aggregation applied to it within a group
# (mov_female is not aggregated and therefore absent from the result)
agg_functions = {
    'hh_head': 'sum',  # sum of the hh_head values
    'head_age': age_average,  # mean of the known (>= 0) ages, -1 if none
    'head_male': 'sum',  # number of records flagged head_male == 1
    'mov_multi': 'sum',  # number of records with more than one person
    'mov_ppl': 'sum',  # total number of people moved
    'mov_male': 'sum',  # total number of males moved
    'hh_total': 'count',  # number of records in the group
}

In [21]:
# build two aggregated dataframes: one keyed by the area people moved into
# and one keyed by the area they moved out of
# every record is therefore counted twice (once per side), which lets us
# compute population changes per area code (and other stuff) should they be needed
shared_keys = ['migrate', 'reason']
in_keys = ['year', 'month', 'in_address'] + shared_keys
out_keys = ['year', 'month', 'out_address'] + shared_keys
df_i = df.groupby(in_keys, as_index=False).agg(agg_functions)
df_o = df.groupby(out_keys, as_index=False).agg(agg_functions)

In [22]:
# add a new column indicating the direction of the data:
# 1 for people moving into the area, 0 for people moving out of it
df_i['move_in'] = 1
df_o['move_in'] = 0

In [23]:
df_i.sample(5)

Unnamed: 0,year,month,in_address,migrate,reason,hh_head,head_age,head_male,mov_multi,mov_ppl,mov_male,hh_total,move_in
3982,2007,2,28110,1,2,4,47.75,4,12,70,25,48,1
31118,2007,10,43740,1,4,3,28.333333,3,2,14,10,11,1
31682,2007,10,46770,0,1,114,50.631579,70,64,293,143,172,1
25809,2007,8,48270,1,5,0,-1.0,0,1,2,2,1,1
28225,2007,9,45190,0,9,38,48.526316,18,17,80,34,43,1


In [24]:
df_o.sample(5)

Unnamed: 0,year,month,out_address,migrate,reason,hh_head,head_age,head_male,mov_multi,mov_ppl,mov_male,hh_total,move_in
25156,2007,8,45210,0,9,29,46.827586,16,18,78,35,39,0
7163,2007,3,26530,1,1,617,36.301459,392,221,1348,685,931,0
17425,2007,6,31200,1,3,73,40.972603,44,35,153,79,79,0
27934,2007,9,42830,0,9,11,44.636364,10,3,19,14,11,0
10060,2007,4,11320,0,2,9,42.888889,6,15,73,33,48,0


In [25]:
# both address columns are mapped to one common name, so the same mapping
# can be applied to the move-in and the move-out dataframe
io_rename_dic = {'in_address': 'area_code', 'out_address': 'area_code'}

In [26]:
# rename the address column of each dataframe to 'area_code' so that both
# share the same column names, which supports the concatenation later
df_i = df_i.rename(columns=io_rename_dic)
df_o = df_o.rename(columns=io_rename_dic)

In [27]:
# column selection and order of the final dataframes
# note that mov_female column is dropped since we can derive it from other columns
# NOTE(review): 'reason' is not listed either although the data is grouped by
# it, so rows of different reasons cannot be told apart afterwards -- confirm
# that this is intended
io_column_order = ['year', 'month', 'area_code', 'move_in', 'migrate',
                   'hh_total', 'hh_head', 'head_age', 'head_male', 'mov_multi',
                   'mov_ppl', 'mov_male']

In [28]:
# make sure columns are in the same order just in case,
# then stack the move-in rows on top of the move-out rows
# (the original row labels of both dataframes are kept, so labels repeat)
df_i = df_i[io_column_order]
df_o = df_o[io_column_order]
df = pd.concat([df_i, df_o])

In [29]:
# check the final result
df.sample(5)

Unnamed: 0,year,month,area_code,move_in,migrate,hh_total,hh_head,head_age,head_male,mov_multi,mov_ppl,mov_male
29345,2007,10,11380,1,1,813,774,42.479328,489,323,1438,724
38145,2007,12,45210,0,0,80,76,51.434211,39,35,155,65
36622,2007,12,28245,0,0,17,7,35.285714,4,7,30,13
32987,2007,11,26170,0,1,103,23,40.913043,15,22,138,63
25132,2007,8,45180,0,1,70,7,44.285714,3,6,79,33


In [30]:
# in the following section below, we combine the codes above
# and turn them into functions so that we may automate the whole process

In [31]:
# takes a population migration data and apply scrubbing
def scrub_pop(df):
    """Clean one raw population-migration dataframe.

    The input is expected to hold the columns in_state, in_city, out_state,
    out_city, head_male and hh_head (plus any other numeric columns).
    The input dataframe itself is left unmodified.

    Steps:
        1. null values are replaced with -1 and every column is cast to int
        2. state and city codes are merged into in_address / out_address
           (state * 1000 + city) and the separate parts are dropped
        3. head_male is recoded to 1 (codes 1, 3, 9) or 0 (anything else,
           including the -1 used for missing values)
        4. a missing hh_head (-1) is treated as 0

    Returns:
        A new, scrubbed dataframe.
    """
    # replace all null values with -1; fillna is used instead of
    # replace(np.NaN, ...) because the np.NaN alias was removed in NumPy 2.0
    df = df.fillna(-1)

    # we convert everything into int
    df = df.astype(int)

    # create a column with combined address codes
    df['in_address'] = df['in_state'] * 1000 + df['in_city']
    df['out_address'] = df['out_state'] * 1000 + df['out_city']
    df = df.drop(['in_state', 'in_city', 'out_state', 'out_city'], axis=1)

    # change gender codes to 1 (male) or 0 (female) accordingly
    # this is due to how gender specifying code works in korean id system
    is_male = df['head_male'].isin([1, 3, 9])
    df.loc[is_male, 'head_male'] = 1
    df.loc[~is_male, 'head_male'] = 0

    # change the missing data values -1 (originally null) to 0 for consistency
    # there are cases where values were simply missing rather than being marked as 0
    df.loc[df['hh_head'] == -1, 'hh_head'] = 0

    return df

In [32]:
# groups the migration data by year, month, area code, migrate, and reason
def aggregate_pop(df):
    """Aggregate a scrubbed migration dataframe per area code.

    The input needs the columns year, month, reason, in_address,
    out_address, mov_male and mov_female, plus the columns aggregated by
    the module-level ``agg_functions``. It is not modified: the derived
    columns are added to a copy.

    Every record is aggregated twice, once under the area it moved into
    (move_in = 1) and once under the area it moved out of (move_in = 0).
    Records with no people moved (mov_ppl <= 0) are discarded first.

    Returns:
        The move-in rows followed by the move-out rows, restricted and
        ordered according to the module-level ``io_column_order``. The row
        labels of the two parts are kept as they are, so labels repeat.

    NOTE(review): the groups are split by 'reason', but with the
    io_column_order defined in this notebook that column is not part of
    the result -- confirm that this is intended.
    """
    # assign() returns a new dataframe, so the caller's data stays untouched
    mov_ppl = df['mov_male'] + df['mov_female']
    df = df.assign(
        migrate=(df['in_address'] != df['out_address']) * 1,
        mov_ppl=mov_ppl,
        mov_multi=(mov_ppl > 1) * 1,
        # constant 1 per record, counted per group by agg_functions
        hh_total=1,
    )
    df = df.loc[df['mov_ppl'] > 0]

    # the same pipeline is applied to both directions of the move
    frames = []
    for address_column, move_in in (('in_address', 1), ('out_address', 0)):
        group_keys = ['year', 'month', address_column, 'migrate', 'reason']
        grouped = df.groupby(group_keys, as_index=False).agg(agg_functions)
        grouped['move_in'] = move_in
        grouped = grouped.rename(columns=io_rename_dic)
        frames.append(grouped[io_column_order])

    return pd.concat(frames)

In [279]:
# run the whole pipeline on every raw file and collect the aggregated
# dataframes (one per file) in a list
df = []
for population_file in population_files:
    df_temp = pd.read_csv(population_file, names=population_columns, usecols=used_columns)
    df_temp = aggregate_pop(scrub_pop(df_temp))
    df.append(df_temp)

    # print statements to check progress: the four characters right before
    # the 4-character file extension of the path
    print(population_file[-8:-4], end=' ')

2006 2007 

KeyboardInterrupt: 