# 1. Library Import

In [254]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register a Nanum font file with Matplotlib and make it the default font family.
# NOTE(review): the file on disk is NanumGothic.ttf but it is registered under
# the name 'NanumBarunGothic' — confirm this alias is intentional.
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # path where the .ttf file is stored
    name='NanumBarunGothic')                        # name to register this font under
fm.fontManager.ttflist.insert(0, fe)              # add the font to Matplotlib's font list
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # set default font size and family
# Sets the same font family again through plt.rc (repeats the rcParams update above).
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Display floats with two decimal places (change the format string for another precision)
pd.options.display.float_format = '{:.2f}'.format  


# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

## Optuna tuning for XGB
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import optuna

## Optuna visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)


# 2. Data Loading

In [255]:
# Load the train/test CSVs; point the two paths at wherever the data lives in your environment.
train_path, test_path = '../data/train3.csv', '../data/test3.csv'
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [256]:
# Work on copies so the frames as loaded from disk stay untouched.
train1, test1 = train.copy(), test.copy()

In [257]:
# Auxiliary tables. The president file is read as cp949; the student file uses the default encoding.
president = pd.read_csv('../data/president.csv', encoding='cp949')
student = pd.read_csv('../data/student.csv')

In [258]:
# Working copies of the auxiliary tables.
student1, president1 = student.copy(), president.copy()

In [260]:
# Remove the 'Unnamed: 0' column from both frames.
train1, test1 = (frame.drop('Unnamed: 0', axis=1) for frame in (train1, test1))

In [261]:
# Upper-case the lower-case column names so they match the rest of the schema.
lowercase_columns = ['interest_rate', 'real_gdp', 'nominal_gdp',
                     'school_district', 'redevelop', 'gu', 'dong']
uppercase_names = {col: col.upper() for col in lowercase_columns}
for frame in (train1, test1):
    frame.rename(columns=uppercase_names, inplace=True)

In [262]:
# train1.columns
# test1.columns

# 3. Merge Student / 자치구 와 연도별 평균 초등학교 학급수 

In [263]:
# Give the student table the YEAR / GU / CLASS_NUM column names used for the merge below.
student_column_names = dict(zip(['연도', '구', '학급수'], ['YEAR', 'GU', 'CLASS_NUM']))
student1.rename(columns=student_column_names, inplace=True)

In [264]:
# Show dtypes and non-null counts of the renamed student table.
student1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   YEAR       400 non-null    int64  
 1   GU         400 non-null    object 
 2   CLASS_NUM  400 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 9.5+ KB


In [265]:
# YEAR = first four characters of the contract year-month (still text at this point).
train1['YEAR'] = train1['CONTR_YEAR_MONTH'].astype(str).str.slice(stop=4)

In [266]:
# Cast both date-derived columns to int (YEAR must be numeric to join with student1).
for int_col in ('CONTR_YEAR_MONTH', 'YEAR'):
    train1[int_col] = train1[int_col].astype(int)

In [267]:
# YEAR = first four characters of the contract year-month (still text at this point).
test1['YEAR'] = test1['CONTR_YEAR_MONTH'].astype(str).str.slice(stop=4)

In [268]:
# Cast both date-derived columns to int (YEAR must be numeric to join with student1).
for int_col in ('CONTR_YEAR_MONTH', 'YEAR'):
    test1[int_col] = test1[int_col].astype(int)

In [269]:
# Latest year present in the student table.
np.max(student1['YEAR'].unique())

2023

In [270]:
# Show student1's dtypes and non-null counts again before it is merged on YEAR and GU.
student1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   YEAR       400 non-null    int64  
 1   GU         400 non-null    object 
 2   CLASS_NUM  400 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 9.5+ KB


In [271]:
# Attach CLASS_NUM from the student table to each transaction by (YEAR, GU).
# validate='many_to_one' makes pandas raise MergeError if student1 has more
# than one row for a (YEAR, GU) pair; without it such duplicates would
# silently multiply the transaction rows in the left join.
train2 = pd.merge(train1, student1, how='left', on=['YEAR', 'GU'], validate='many_to_one')
test2 = pd.merge(test1, student1, how='left', on=['YEAR', 'GU'], validate='many_to_one')

In [272]:
# Show the merged training frame's columns, dtypes and non-null counts.
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1118822 entries, 0 to 1118821
Data columns (total 53 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   ADRES                   1118822 non-null  object 
 1   LOT_NO                  1118822 non-null  object 
 2   APT_NM                  1116696 non-null  object 
 3   EXCL_AREA_SQM           1118822 non-null  float64
 4   CONTR_YEAR_MONTH        1118822 non-null  int64  
 5   CONTR_DAY               1118822 non-null  int64  
 6   FLOOR                   1118822 non-null  int64  
 7   BUILD_YEAR              1118822 non-null  int64  
 8   ADRES_DORO              1118822 non-null  object 
 9   CANCEL_REASON_DATE      5983 non-null     float64
 10  TRADE_TYPE              32371 non-null    object 
 11  AGNCY_LOCATION          29241 non-null    object 
 12  CODEAPTNM               248131 non-null   object 
 13  HSHLDR_TY               249259 non-null   object 
 14  CR

# 4. Merge President / 대통령 선거 16대~20대 서울 자치구별 지지당 및 득표율 

In [273]:
# Show dtypes and non-null counts of the presidential-election table.
president1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   선거일      125 non-null    int64 
 1   임기       125 non-null    object
 2   지역       125 non-null    object
 3   지역별_지지당  125 non-null    object
 4   득표율      125 non-null    object
dtypes: int64(1), object(4)
memory usage: 5.0+ KB


In [274]:
# Election date, district, supported side and vote share.
# Explicit .copy(): the cells below rename and add columns on this frame, and
# doing that on a bare slice of president1 raises SettingWithCopyWarning
# (currently hidden by the global warnings filter).
president2 = president1[['선거일','지역','지역별_지지당','득표율']].copy()

In [275]:
# Election date, term and district.
# Explicit .copy(): this frame is renamed in a following cell, and doing that
# on a bare slice of president1 raises SettingWithCopyWarning (currently
# hidden by the global warnings filter).
president3 = president1[['선거일','임기','지역']].copy()

In [276]:
# Rename to the CONTR / GU / TERM keys used by the merges below.
# Reassigning (instead of inplace=True) avoids mutating a slice of president1.
president3 = president3.rename(columns={'선거일':'CONTR','지역':'GU','임기':'TERM'})

In [277]:
# Rename to CONTR / GU / PM (supported side) / VOTE (vote share).
# Reassigning (instead of inplace=True) avoids mutating a slice of president1.
president2 = president2.rename(columns={'선거일':'CONTR','지역':'GU','지역별_지지당':'PM','득표율':'VOTE'})

In [278]:
# Strip the trailing '%' and convert the vote share to float.
# .assign returns a new frame, so nothing is written into a slice of president1.
president2 = president2.assign(VOTE=president2['VOTE'].str.rstrip('%').astype(float))

In [279]:
# Signed vote share: +VOTE where PM is '여당', -VOTE where PM is '야당',
# NaN for any other PM value.
# Vectorised with np.select instead of a row-wise apply, and added through
# .assign so the result is a new frame rather than a write into a slice.
president2 = president2.assign(
    VOTERATE=np.select(
        [president2['PM'] == '야당', president2['PM'] == '여당'],
        [-president2['VOTE'], president2['VOTE']],
        default=np.nan,
    )
)

In [280]:
# Keep only the merge keys and the signed vote share.
president2 = president2.loc[:, ['CONTR', 'GU', 'VOTERATE']]

In [281]:
# Attach VOTERATE to the (CONTR, TERM, GU) rows of president3.
president4 = president3.merge(president2, how='left', on=['CONTR', 'GU'])

In [282]:
# Fresh working copies for the presidential-term features.
train3, test3 = train2.copy(), test2.copy()

In [283]:
# Distinct term labels present in the election table.
list(president3['TERM'].unique())

['16대', '17대', '18대', '19대', '20대']

### 4-1. Merge 대통령 선거 임기 기간별 지지당 및 득표율 

In [284]:
# Term boundaries as YYYYMMDD integers: each value is the inclusive start of
# one term and the exclusive end of the previous one; the last value only
# closes the final term.
contract_years = sorted([20021219, 20071219, 20121219, 20170509, 20220309, 20240101])

# NOTE(review): `merged_df` and `before` are not read anywhere in the visible
# notebook; they are kept only in case a later cell still refers to them.
merged_df = pd.DataFrame()
before = contract_years[0]

# One label per consecutive pair of boundaries above, oldest term first.
term_labels = ['16대', '17대', '18대', '19대', '20대']


def _assign_term(contr, boundaries=contract_years, labels=term_labels):
    """Label each contract date with the presidential term it falls in.

    Parameters
    ----------
    contr : pandas.Series
        Contract dates. They are compared numerically against `boundaries`,
        so they are assumed to be YYYYMMDD integers (TODO confirm CONTR dtype).
    boundaries : sequence of int
        Ascending boundaries; term i covers boundaries[i] <= date < boundaries[i + 1].
    labels : sequence of str
        Term labels, one per consecutive pair of boundaries.

    Returns
    -------
    numpy.ndarray
        One label per row; rows outside every interval get the string '0'.
    """
    condlist = [(contr >= start) & (contr < end)
                for start, end in zip(boundaries[:-1], boundaries[1:])]
    # Pass the fallback explicitly as a string. np.select's implicit default is
    # the integer 0, which has no common dtype with string choices: older NumPy
    # coerced it to '0', while recent releases reject the call with a TypeError.
    return np.select(condlist, list(labels), default='0')


# Same rule for both frames, so train and test cannot drift apart.
train3['TERM'] = _assign_term(train3['CONTR'])
test3['TERM'] = _assign_term(test3['CONTR'])

In [285]:
# Attach the district's signed vote share for the term each contract falls in.
# Both sides carry a CONTR column, so pandas suffixes them as CONTR_x (the
# transaction's) and CONTR_y (the election table's).
# validate='many_to_one' makes pandas raise MergeError if president4 has more
# than one row for a (GU, TERM) pair; without it such duplicates would
# silently multiply the transaction rows in the left join.
train4 = pd.merge(train3, president4, how='left', on=['GU', 'TERM'], validate='many_to_one')
test4 = pd.merge(test3, president4, how='left', on=['GU', 'TERM'], validate='many_to_one')

In [286]:
# Drop the CONTR_y column left over from the merge.
train4, test4 = (frame.drop('CONTR_y', axis=1) for frame in (train4, test4))

In [287]:
# Rename CONTR_x back to CONTR and upper-case dongAPT_NM.
restored_names = {'CONTR_x': 'CONTR', 'dongAPT_NM': 'DONGAPT_NM'}
for frame in (train4, test4):
    frame.rename(columns=restored_names, inplace=True)

In [288]:
# YEAR and TERM were only needed as merge keys; remove them from both frames.
train4, test4 = (frame.drop(['YEAR', 'TERM'], axis='columns') for frame in (train4, test4))

# 5. Save train4 & test4

In [289]:
# Write both frames to CSV without the index column.
for frame, csv_path in ((test4, '../data/test4.csv'), (train4, '../data/train4.csv')):
    frame.to_csv(csv_path, index=False)

# 6. Rename Columns from English to Korean

In [291]:
# Rename the English column names of train4 to Korean, in place.
# The dict literal used to list 'X_CODE', 'Y_CODE' and 'USE_RQSTDT' twice;
# each key now appears exactly once (the resulting mapping is unchanged).
train4.rename(columns={'ADRES':'시군구','LOT_NO':'번지','APT_NM':'아파트명','EXCL_AREA_SQM':'전용면적',
                      'CONTR_YEAR_MONTH':'계약년월','CONTR_DAY':'계약일','FLOOR':'층','BUILD_YEAR':'건축년도',
                      'ADRES_DORO':'도로명','CANCEL_REASON_DATE':'해제사유발생일',
                      'TRADE_TYPE':'거래유형','AGNCY_LOCATION':'중개사소재지','CODEAPTNM':'k-단지분류',
                      'HSHLDR_TY':'k-세대타입(분양형태)','CRRDPR_TY':'k-복도유형',
                      'HEAT_MTHD':'k-난방방식','ALL_DONG_CO':'k-전체동수','ALL_HSHLD_CO':'k-전체세대수',
                      'CO_WO':'k-건설사','CO_EX':'k-시행사','TOTAR':'k-연면적',
                      'PRIVAREA':'k-주거전용면적','KAPTMPAREA60':'k-전용면적별세대현황60이하',
                      'KAPTMPAREA85':'k-전용면적별세대현황6085이하','KAPTMPAREA135':'k-85135이하',
                      'KAPTMPAREA136':'k-135초과', 'HSHLD_ELCTY_CNTRCT_MTH': '세대전기계약방법',
                      'BU_AR':'건축면적','CNT_PA':'주차대수',
                      'GUBUN':'기타의무임대1234',
                      'X_CODE':'좌표X','Y_CODE':'좌표Y','USE_RQSTDT':'단지신청일',
                      'GU':'구','DONG':'동','FULL_ADRES':'전체주소명',
                      'DONGAPT_NM':'동아파트명','FLOATING_POPULATION':'유동인구',
                      'SUBWAY_DIST':'아파트 지하철역 거리','1STSUBAREA':'1차역세권',
                      '2NDSUBAREA':'2차역세권',
                      'BUS_DIST':'아파트 버스정류장 거리','BRIDGE_DIST':'아파트 한강대교 거리',
                      'LEASE_RATE':'전세가율','CONTR':'전체계약일자','INTEREST_RATE':'금리',
                      'REAL_GDP':'실질gdp','NOMINAL_GDP':'명목gdp','SCHOOL_DISTRICT':'학군',
                      'REDEVELOP':'재개발개수', 'CLASS_NUM':'학급수', 'VOTERATE':'여당득표율'
                      },inplace=True)

In [292]:
# Rename the English column names of test4 to Korean, in place.
# The dict literal used to list 'X_CODE', 'Y_CODE' and 'USE_RQSTDT' twice;
# each key now appears exactly once (the resulting mapping is unchanged).
test4.rename(columns={'ADRES':'시군구','LOT_NO':'번지','APT_NM':'아파트명','EXCL_AREA_SQM':'전용면적',
                      'CONTR_YEAR_MONTH':'계약년월','CONTR_DAY':'계약일','FLOOR':'층','BUILD_YEAR':'건축년도',
                      'ADRES_DORO':'도로명','CANCEL_REASON_DATE':'해제사유발생일',
                      'TRADE_TYPE':'거래유형','AGNCY_LOCATION':'중개사소재지','CODEAPTNM':'k-단지분류',
                      'HSHLDR_TY':'k-세대타입(분양형태)','CRRDPR_TY':'k-복도유형',
                      'HEAT_MTHD':'k-난방방식','ALL_DONG_CO':'k-전체동수','ALL_HSHLD_CO':'k-전체세대수',
                      'CO_WO':'k-건설사','CO_EX':'k-시행사','TOTAR':'k-연면적',
                      'PRIVAREA':'k-주거전용면적','KAPTMPAREA60':'k-전용면적별세대현황60이하',
                      'KAPTMPAREA85':'k-전용면적별세대현황6085이하','KAPTMPAREA135':'k-85135이하',
                      'KAPTMPAREA136':'k-135초과', 'HSHLD_ELCTY_CNTRCT_MTH': '세대전기계약방법',
                      'BU_AR':'건축면적','CNT_PA':'주차대수',
                      'GUBUN':'기타의무임대1234',
                      'X_CODE':'좌표X','Y_CODE':'좌표Y','USE_RQSTDT':'단지신청일',
                      'GU':'구','DONG':'동','FULL_ADRES':'전체주소명',
                      'DONGAPT_NM':'동아파트명','FLOATING_POPULATION':'유동인구',
                      'SUBWAY_DIST':'아파트 지하철역 거리','1STSUBAREA':'1차역세권',
                      '2NDSUBAREA':'2차역세권',
                      'BUS_DIST':'아파트 버스정류장 거리','BRIDGE_DIST':'아파트 한강대교 거리',
                      'LEASE_RATE':'전세가율','CONTR':'전체계약일자','INTEREST_RATE':'금리',
                      'REAL_GDP':'실질gdp','NOMINAL_GDP':'명목gdp','SCHOOL_DISTRICT':'학군',
                      'REDEVELOP':'재개발개수', 'CLASS_NUM':'학급수', 'VOTERATE':'여당득표율'
                      },inplace=True)

# 7. Save Korean train4 & test4

In [293]:
# Write the Korean-named frames to CSV without the index column.
for frame, csv_path in ((test4, '../data/KOREAN_test4.csv'), (train4, '../data/KOREAN_train4.csv')):
    frame.to_csv(csv_path, index=False)