# Part 01
----
아파트 매매 데이터에서 불필요한 칼럼들을 제거한 뒤 추후 인구이동 데이터와 비교분석을 위해 주소 간략화 및 행자부코드를 부여한다.

아파트 매매 데이터의 주소는 행자부코드 목록에 명기된 주소보다 더 상세히 기술되어 있어, 주소 문자열을 처음 두 단어 기준으로 간략화했다. 따라서 일반적으로 특별시·광역시와 같은 대도시는 시+구 단위로, 그 외 지역은 도+시 단위로 표기된다.

세종특별자치시의 경우 신설된 지 얼마 되지 않아 행정구역이 자주 변경되었으므로, 임의로 이전 행정구역을 사용하도록 수정한다.

마지막으로 건축년도가 null값으로 되어있는 데이터 역시 수정을 가해 null값이 없는 깨끗한 데이터셋을 만든다.

In [1]:
%matplotlib inline
import os
import re

import pandas as pd

In [2]:
# Directory holding the raw inputs. os.path.join with an empty last component
# appends the platform's separator, so this is still 'raw_data\\' on Windows
# while also resolving correctly on POSIX systems. The trailing separator is
# kept because later cells build file names with `path + '<name>'`.
path = os.path.join('raw_data', '')

# Apartment sale records (CSV, UTF-8).
df_estate = pd.read_csv(path + 'real_estate_2006_2015.csv', encoding='utf-8')

# Administrative address/code list; header=2 takes the column names from the
# third row of the sheet.
df_address_code = pd.read_excel(path + 'address_code.xlsx', header=2)

In [3]:
# have a quick glance at the last rows of the raw sales data
# (shows the original Korean column headers before they are renamed)
df_estate.tail()

Unnamed: 0,시군구,번지,본번,부번,단지명,전용면적(㎡),계약년월,계약일,거래금액(만원),층,건축년도,도로명
5863875,충청북도 충주시 호암동,547-6,547,6,호반현대,59.76,201512,21~31,10500,8,1994.0,신촌2길
5863876,충청북도 충주시 호암동,547-6,547,6,호반현대,59.76,201512,21~31,10200,10,1994.0,신촌2길
5863877,충청북도 충주시 호암동,905,905,0,호암동수채,84.93,201512,11~20,19700,12,2007.0,호암중앙1로
5863878,충청북도 충주시 호암동,221-23,221,23,호암리버빌(1단지),84.68,201512,11~20,15900,9,2002.0,원호암5길
5863879,충청북도 충주시 호암동,221-1,221,1,호암리버빌(2단지),84.68,201512,11~20,16000,10,2002.0,호암중앙2로


In [4]:
# Lookup table: Korean header in the raw CSV -> English column name.
estate_column_dic = {
    # location
    '시군구': 'address',          # city / county / district text
    '번지': 'block',              # lot number
    '본번': 'block_main',         # main lot number
    '부번': 'block_sub',          # sub lot number
    '단지명': 'apt',              # apartment complex name
    '도로명': 'street',           # street name
    # unit
    '전용면적(㎡)': 'area',       # exclusive-use area, in square metres
    '층': 'floor',                # floor
    '건축년도': 'built',          # year built
    # transaction
    '계약년월': 'year_month',     # contract year and month
    '계약일': 'day',              # contract day
    '거래금액(만원)': 'price',    # transaction price, in units of 10,000 won
}

In [5]:
# swap the Korean headers for their English names; headers that are not
# listed in estate_column_dic are left as they are
df_estate = df_estate.rename(columns=estate_column_dic)

In [6]:
# the only columns carried forward (English names, i.e. after the rename)
df_columns = [
    'address',     # address text
    'area',        # exclusive-use area
    'year_month',  # contract year and month
    'price',       # transaction price
    'built',       # year built
]

In [7]:
# Keep only the columns listed in df_columns and drop everything else.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the following cells modify df_estate itself instead of
# triggering pandas' SettingWithCopyWarning on a slice of the original frame.
df_estate = df_estate[df_columns].copy()

In [8]:
# preview the first rows of the current frame
df_estate.head()

Unnamed: 0,address,area,year_month,price,built
0,세종특별자치시 금남면 용포리,43.01,200601,4520,2000.0
1,세종특별자치시 금남면 용포리,43.01,200601,4495,2000.0
2,세종특별자치시 금남면 용포리,43.01,200601,4495,2000.0
3,세종특별자치시 금남면 용포리,43.01,200601,4520,2000.0
4,세종특별자치시 금남면 용포리,43.01,200601,4520,2000.0


In [9]:
# prices are text with ',' separators: strip the commas, then cast to int
price_text = df_estate['price'].str.replace(',', '')
df_estate['price'] = price_text.astype(int)

In [10]:
# year_month is an integer of the form YYYYMM; take it out of the frame and
# store its two parts as separate `year` and `month` columns
year_month = df_estate.pop('year_month')
df_estate['year'] = year_month // 100
df_estate['month'] = year_month % 100

In [11]:
# `built` was read as float, which hints at missing values, so list the rows
# where it is null -- fortunately there are only four of them
df_estate.loc[df_estate['built'].isnull()]

Unnamed: 0,address,area,price,built,year,month
234524,경기도 안양시 동안구 호계동,52.92,12000,,2006,6
234525,경기도 안양시 동안구 호계동,52.92,11000,,2006,6
356846,경기도 안양시 동안구 호계동,52.92,12400,,2006,9
1459878,세종특별자치시 조치원읍 교리,66.68,5500,,2008,7


In [12]:
# Rows whose construction year is missing.
missing_built = df_estate.built.isnull()

# The missing-year rows located in 호계동 can be reasonably assumed to be a
# part of the apartment complex called 태하, so they are assigned 1987.
# They are selected by address rather than by position ("every null row but
# the last"), so the patch does not depend on the order of the rows.
in_hogye = df_estate.address.str.contains('호계동', regex=False, na=False)
df_estate.loc[missing_built & in_hogye, 'built'] = 1987

# Any row that still has no construction year cannot be identified and is
# dropped. .copy() detaches the result so that later assignments write to
# df_estate itself rather than to a slice of the previous frame.
df_estate = df_estate.loc[df_estate.built.notnull()].copy()

In [13]:
# one apartment has its construction year recorded as zero; show a few of
# its sale records
built_is_zero = df_estate['built'] == 0
df_estate.loc[built_is_zero].head()

Unnamed: 0,address,area,price,built,year,month
101805,대전광역시 서구 괴정동,49.68,3500,0.0,2006,3
968679,대전광역시 서구 괴정동,49.68,3700,0.0,2007,9
1195861,대전광역시 서구 괴정동,49.68,4000,0.0,2008,2
1195862,대전광역시 서구 괴정동,49.68,4200,0.0,2008,2
1317092,대전광역시 서구 괴정동,49.68,5000,0.0,2008,4


In [14]:
# after some investigation the zero placeholder is replaced by 1976; with no
# missing values left, the column can then be stored as integers
df_estate['built'] = df_estate['built'].replace(0, 1976).astype(int)

In [15]:
# check the changes made: `built` should now display as an integer column
df_estate.head()

Unnamed: 0,address,area,price,built,year,month
0,세종특별자치시 금남면 용포리,43.01,4520,2000,2006,1
1,세종특별자치시 금남면 용포리,43.01,4495,2000,2006,1
2,세종특별자치시 금남면 용포리,43.01,4495,2000,2006,1
3,세종특별자치시 금남면 용포리,43.01,4520,2000,2006,1
4,세종특별자치시 금남면 용포리,43.01,4520,2000,2006,1


In [16]:
# from here, we try to convert address texts to area codes

In [17]:
# Captures the first whitespace-delimited word and, when present, the second.
# A raw string keeps \S and \s from being read as (invalid) string escapes.
subaddress_regex = re.compile(r'(\S+)(?:\s+(\S+))?')


def shrink_address(address):
    """Return the first one or two words of ``address`` joined by one space.

    Leading whitespace is skipped, any run of whitespace between the two
    words collapses to a single space, and everything after the second word
    (including trailing whitespace) is discarded.

    Args:
        address: Address text, e.g. ``'충청북도 충주시 호암동'``.

    Returns:
        The shortened address, e.g. ``'충청북도 충주시'``. A one-word input
        is returned as that word alone; an empty or whitespace-only input
        yields ``''``.
    """
    match = subaddress_regex.search(address)
    if match is None:
        # nothing but whitespace (or nothing at all) in the input
        return ''
    # the second group is None when the address consists of a single word
    return ' '.join(word for word in match.groups() if word)

In [18]:
# run every sale address through shrink_address so that only its leading
# one or two words remain
df_estate['address'] = df_estate['address'].map(shrink_address)

In [19]:
# preview the frame to inspect the shortened addresses
df_estate.head()

Unnamed: 0,address,area,price,built,year,month
0,세종특별자치시 금남면,43.01,4520,2000,2006,1
1,세종특별자치시 금남면,43.01,4495,2000,2006,1
2,세종특별자치시 금남면,43.01,4495,2000,2006,1
3,세종특별자치시 금남면,43.01,4520,2000,2006,1
4,세종특별자치시 금남면,43.01,4520,2000,2006,1


In [20]:
# shorten the administrative names in the code table with the same function,
# storing the result in a new `address` column
df_address_code['address'] = df_address_code['행정구역명'].map(shrink_address)

In [21]:
# integer-divide the full code by 100000, dropping its last five digits
df_address_code['area_code'] = df_address_code['행자부코드'].floordiv(100000)

In [22]:
# as we have both shortened address texts and area codes,
# we get duplicates in both columns
# when creating maps, we only take the first among the duplicates

In [23]:
# area code -> shortened address; for a repeated code the first row wins
df_code_to_address = (
    df_address_code.loc[:, ['address', 'area_code']]
    .drop_duplicates(subset='area_code', keep='first')
    .set_index('area_code')
)

In [24]:
# shortened address -> area code; for a repeated address the first row wins
df_address_to_code = (
    df_address_code.loc[:, ['address', 'area_code']]
    .drop_duplicates(subset='address', keep='first')
    .set_index('address')
)

In [25]:
# look each shortened address up in the address -> code table and store the
# result in a new `area_code` column (addresses without a match become NaN)
df_estate['area_code'] = df_estate['address'].map(df_address_to_code['area_code'])

In [26]:
# the area codes are displayed as floats rather than integers
# this probably means that some addresses were not mapped properly and
# received NaN, which forces the whole column to a float dtype
df_estate.tail()

Unnamed: 0,address,area,price,built,year,month,area_code
5863875,충청북도 충주시,59.76,10500,1994,2015,12,43130.0
5863876,충청북도 충주시,59.76,10200,1994,2015,12,43130.0
5863877,충청북도 충주시,84.93,19700,2007,2015,12,43130.0
5863878,충청북도 충주시,84.68,15900,2002,2015,12,43130.0
5863879,충청북도 충주시,84.68,16000,2002,2015,12,43130.0


In [27]:
# list the addresses that received no area code, with the number of affected
# rows per address -- fortunately, there are only three
unmapped = df_estate.loc[df_estate['area_code'].isnull()]
unmapped.groupby('address').count()

Unnamed: 0_level_0,area,price,built,year,month,area_code
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
세종특별자치시 고운동,71,71,71,71,71,0
세종특별자치시 나성동,93,93,93,93,93,0
세종특별자치시 어진동,139,139,139,139,139,0


In [28]:
# getting ready to patch things up manually
# 고운동, 나성동, 어진동 were all part of 한솔동
# moreover, 도담동, 아름동, 종촌동 were all part of 한솔동 until very recently
# so we might as well combine everything into one address
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df_estate.address`: an in-place call on a
# Series obtained from the frame is a chained operation, which leaves
# df_estate unchanged when pandas' Copy-on-Write mode is active.
df_estate['address'] = df_estate['address'].replace(
    to_replace=['세종특별자치시 고운동', '세종특별자치시 나성동', '세종특별자치시 어진동',
                '세종특별자치시 도담동', '세종특별자치시 아름동', '세종특별자치시 종촌동'],
    value='세종특별자치시 한솔동')

In [29]:
# redo the address -> code lookup now that the addresses have been patched
df_estate['area_code'] = df_estate['address'].map(df_address_to_code['area_code'])

In [30]:
# inspect the last rows again to see how area_code is displayed now
df_estate.tail()

Unnamed: 0,address,area,price,built,year,month,area_code
5863875,충청북도 충주시,59.76,10500,1994,2015,12,43130
5863876,충청북도 충주시,59.76,10200,1994,2015,12,43130
5863877,충청북도 충주시,84.93,19700,2007,2015,12,43130
5863878,충청북도 충주시,84.68,15900,2002,2015,12,43130
5863879,충청북도 충주시,84.68,16000,2002,2015,12,43130


In [31]:
# one last sanity check: is any cell in any column still null?
df_estate.isnull().any().any()

False

In [32]:
# the address text is no longer needed, so remove the column
del df_estate['address']

In [33]:
# write the cleaned sales data next to the raw inputs, without the row index
estate_csv = path + '01_estate.csv'
df_estate.to_csv(estate_csv, encoding='utf-8', index=False)

In [34]:
# save both lookup tables for later use; the index is written as well because
# it holds the lookup key of each table
for table, filename in ((df_address_to_code, '01_address_to_code.csv'),
                        (df_code_to_address, '01_code_to_address.csv')):
    table.to_csv(path + filename, index=True, encoding='utf-8')