In [11]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
import seaborn as sns
import missingno as msno
from statistics import mode
# Mac OS를 이용하고 있기 떄문에 AppleGothic을 설정했습니다.
# 윈도우의 경우 다른 폰트를 사용해주셔야 합니다. 나눔폰트가 있는 경우 'NanumGothic' 없는 경우에는 'Malgun Gothic'을 사용해주세요.
plt.rc('font', family='Malgun Gothic')
# mpl.rc('font', family='NanumGothic')

# 별도로, 폰트를 바꿀 경우 마이너스가 표시되지 않는 경우도 있는데 이를 막아주는 코드입니다.
plt.rc('axes', unicode_minus=False)

# retina 옵션을 통해 해상도를 올립니다.
set_matplotlib_formats('retina')

In [40]:
data=pd.read_csv("../../data/201901-202003.csv",encoding='utf-8')
sub=pd.read_csv("../../data/submission.csv",encoding='utf-8')

In [41]:
data.isnull().sum()

REG_YYMM             0
CARD_SIDO_NM         0
CARD_CCG_NM      87213
STD_CLSS_NM          0
HOM_SIDO_NM          0
HOM_CCG_NM      147787
AGE                  0
SEX_CTGO_CD          0
FLC                  0
CSTMR_CNT            0
AMT                  0
CNT                  0
dtype: int64

### Null값처리

1) 결제자 거주지는 연관관계가 떨어지기 때문에, drop

2) 결제 주소지는 연관관계가 높으므로 살펴보자

In [42]:
data=data[~data['HOM_CCG_NM'].isnull()]

각 결제 주소지 시도 별로 가장 많이 발생하는 군 추출하여 null값에 해당 값으로 대체하려고 했으나

세종시에서 값이 없는 것을 확인

In [43]:
data.groupby('CARD_SIDO_NM')['CARD_CCG_NM'].agg(pd.Series.mode).to_frame()

Unnamed: 0_level_0,CARD_CCG_NM
CARD_SIDO_NM,Unnamed: 1_level_1
강원,강릉시
경기,성남시 분당구
경남,김해시
경북,경주시
광주,서구
대구,수성구
대전,유성구
부산,해운대구
서울,중구
세종,[]


In [44]:
data.loc[data['CARD_CCG_NM'].isnull(),'CARD_SIDO_NM'].value_counts()

세종    79477
Name: CARD_SIDO_NM, dtype: int64

CARD_SIDO_NM: 세종 ->  CARD_CCG_NM: 세종

In [45]:
data.loc[data['CARD_SIDO_NM']=='세종','CARD_CCG_NM'] ='세종'

In [46]:
data.isnull().sum()

REG_YYMM        0
CARD_SIDO_NM    0
CARD_CCG_NM     0
STD_CLSS_NM     0
HOM_SIDO_NM     0
HOM_CCG_NM      0
AGE             0
SEX_CTGO_CD     0
FLC             0
CSTMR_CNT       0
AMT             0
CNT             0
dtype: int64

## data cleansing

In [122]:
# Month 추출 -> 월 특성에 따른 지출 다를 수 있음
data['MM']=data['REG_YYMM'].apply(lambda x:str(x)[-2:]).astype(int)
data['YY']=data['REG_YYMM'].apply(lambda x:str(x)[:4]).astype(int)

In [123]:
corona =[202001,202002,202003]
data.loc[data['REG_YYMM'].isin(corona),'CORONA'] = 1
data.loc[~(data['REG_YYMM'].isin(corona)),'CORONA'] = 0

In [124]:
data['CORONA']=data['CORONA'].astype(int)

In [134]:
age=data['AGE']
flc=data['FLC']
data.loc[(age == '10s') & (flc==1),'Life_Style'] = 0
data.loc[(age == '20s') & (flc==1),'Life_Style'] = 1
data.loc[(age == '20s') & (flc==2),'Life_Style'] = 2
data.loc[(age == '30s') & (flc==1),'Life_Style'] = 3
data.loc[(age == '30s') & (flc==2),'Life_Style'] = 4
data.loc[(age == '30s') & (flc==3),'Life_Style'] = 5
data.loc[(age == '40s') & (flc==2),'Life_Style'] = 6
data.loc[(age == '40s') & (flc==3),'Life_Style'] = 7
data.loc[(age == '40s') & (flc==4),'Life_Style'] = 8
data.loc[(age == '50s') & (flc==2),'Life_Style'] = 9
data.loc[(age == '50s') & (flc==3),'Life_Style'] = 10
data.loc[(age == '50s') & (flc==4),'Life_Style'] = 11
data.loc[(age == '60s') & (flc==5),'Life_Style'] = 12
data.loc[(age == '70s') & (flc==5),'Life_Style'] = 13

In [135]:
data[data['Life_Style'].isnull()]

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,MM,YY,CORONA,Life_Style


In [136]:
data['Life_Style']=data['Life_Style'].astype('int32')

## Categorical data

In [137]:
data.head(1)

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,MM,YY,CORONA,Life_Style
0,201901,강원,강릉시,건강보조식품 소매업,강원,강릉시,20s,1,1,4,311200,4,1,2019,0,1


In [138]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24550005 entries, 0 to 24697791
Data columns (total 16 columns):
REG_YYMM        int64
CARD_SIDO_NM    object
CARD_CCG_NM     object
STD_CLSS_NM     object
HOM_SIDO_NM     object
HOM_CCG_NM      object
AGE             object
SEX_CTGO_CD     int64
FLC             int64
CSTMR_CNT       int64
AMT             int64
CNT             int64
MM              int32
YY              int32
CORONA          int32
Life_Style      int32
dtypes: int32(4), int64(6), object(6)
memory usage: 2.7+ GB


In [139]:
data['CARD_SIDO_NM'].nunique()

17

In [140]:
data['CARD_CCG_NM'].nunique()

227

In [141]:
label_data = data.copy()

In [142]:
from sklearn.preprocessing import LabelEncoder

label_data = data.copy()
label_encoder1 = LabelEncoder()
label_encoder2 = LabelEncoder()
label_encoder3 = LabelEncoder()
label_encoder4 = LabelEncoder()
label_encoder5 = LabelEncoder()
label_encoder6 = LabelEncoder()

label_data['CARD_CCG_NM'] = label_encoder1.fit_transform(data['CARD_CCG_NM'])
label_data['CARD_SIDO_NM'] = label_encoder2.fit_transform(data['CARD_SIDO_NM'])
label_data['HOM_CCG_NM'] = label_encoder3.fit_transform(data['HOM_CCG_NM'])
label_data['HOM_SIDO_NM'] = label_encoder4.fit_transform(data['HOM_SIDO_NM'])
label_data['HOM_SIDO_NM'] = label_encoder5.fit_transform(data['HOM_SIDO_NM'])
label_data['STD_CLSS_NM'] = label_encoder6.fit_transform(data['STD_CLSS_NM'])

In [143]:
label_data

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT,MM,YY,CORONA,Life_Style
0,201901,0,3,0,0,3,20s,1,1,4,311200,4,1,2019,0,1
1,201901,0,3,0,0,3,30s,1,2,7,1374500,8,1,2019,0,4
2,201901,0,3,0,0,3,30s,2,2,6,818700,6,1,2019,0,4
3,201901,0,3,0,0,3,40s,1,3,4,1717000,5,1,2019,0,7
4,201901,0,3,0,0,3,40s,1,4,3,1047300,3,1,2019,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24697787,202003,16,202,40,15,201,30s,1,2,3,43300,4,3,2020,1,4
24697788,202003,16,202,40,15,201,40s,1,3,3,35000,3,3,2020,1,7
24697789,202003,16,202,40,15,201,50s,1,4,4,188000,6,3,2020,1,11
24697790,202003,16,202,40,15,201,50s,2,4,4,99000,6,3,2020,1,11


In [144]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [149]:
yymm_list=[201901,201902,201903,202001,202002,202003,202004]
label_data=label_data[label_data['REG_YYMM'].isin(yymm_list)]

In [150]:
y = label_data['AMT']
X = label_data[['CARD_SIDO_NM','CARD_CCG_NM','STD_CLSS_NM','SEX_CTGO_CD','CSTMR_CNT','CNT','MM','YY','Life_Style','CORONA']]

In [151]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X,train_y)
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)
rf_val_mae = mean_absolute_error(rf_model.predict(val_X),val_y)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))



In [120]:
data.shape

(24550005, 13)