In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
print('Pandas : %s'%(pd.__version__))
print('Numpy : %s'%(np.__version__))
print('Scikit-Learn : %s'%(sklearn.__version__))
!python --version

Pandas : 1.0.1
Numpy : 1.18.1
Scikit-Learn : 0.22.1
Python 3.7.6


In [3]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [4]:
# 날짜 처리
data = pd.read_csv('201901-202003.csv')
data = data.fillna('')
data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
data = data.drop(['REG_YYMM'], axis=1)

In [5]:
# 데이터 정제
df = data.copy()
df = df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df = df.groupby(columns).sum().reset_index(drop=False)

In [6]:
# 인코딩
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) == 'object':
        encoder = LabelEncoder()
        encoder.fit(df[column])
        encoders[column] = encoder
        
df_num = df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    df_num[column] = encoder.transform(df[column])

In [7]:
#4월 데이터 들고오기
testset = pd.read_csv('202004.csv')
testset = testset.fillna('')
testset['year'] = testset['REG_YYMM'].apply(lambda x: grap_year(x))
testset['month'] = testset['REG_YYMM'].apply(lambda x: grap_month(x))
testset = testset.drop(['REG_YYMM'], axis=1)

# 데이터 정제
test_df = testset.copy()
test_df = test_df.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
test_df = test_df.groupby(columns).sum().reset_index(drop=False)

In [8]:
test_df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,478500,5
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,1585500,15
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,5099306,58
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,1884090,30
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,617000,8
...,...,...,...,...,...,...,...,...,...,...,...
57052,충북,휴양콘도 운영업,충북,30s,2,2,2020,4,3,28000,4
57053,충북,휴양콘도 운영업,충북,40s,1,3,2020,4,3,218850,3
57054,충북,휴양콘도 운영업,충북,40s,2,3,2020,4,5,193000,5
57055,충북,휴양콘도 운영업,충북,50s,1,4,2020,4,14,720500,18


In [9]:
#4월
test_num = test_df.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    test_num[column] = encoder.transform(test_df[column])
test_num.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0,0,0,1,1,1,2020,4,6,478500,5
1,0,0,0,2,1,2,2020,4,15,1585500,15
2,0,0,0,2,2,2,2020,4,55,5099306,58
3,0,0,0,3,1,3,2020,4,26,1884090,30
4,0,0,0,3,1,4,2020,4,6,617000,8


# 여기서부터 랜덤포레스트

In [344]:
#이건 모두 합치는 것
df_num = df_num.append(test_num)            #원본

df_num = df_num.reset_index()
df_num = df_num.drop('index',axis=1)


In [346]:
df_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0,0,0,1,1,1,2019,1,4,311200,4
1,0,0,0,1,1,1,2019,2,3,605000,3
2,0,0,0,1,1,1,2019,6,3,139000,3
3,0,0,0,1,1,1,2019,8,3,27500,3
4,0,0,0,1,1,1,2019,9,3,395500,3
...,...,...,...,...,...,...,...,...,...,...,...
1114446,16,40,16,2,2,2,2020,4,3,28000,4
1114447,16,40,16,3,1,3,2020,4,3,218850,3
1114448,16,40,16,3,2,3,2020,4,5,193000,5
1114449,16,40,16,4,1,4,2020,4,14,720500,18


In [10]:
#기본 데이터
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['AMT'], axis=1)
train_target = np.log1p(train_num['AMT'])

# test_features = test_num.drop(['AMT'], axis=1)
# test_target = np.log1p(test_num['AMT'])


# from sklearn.model_selection import train_test_split
# df_train_x, df_test_x, df_train_y,df_test_y = train_test_split(train_features,train_target,test_size = 0.2 , random_state = 1234)

# model = RandomForestRegressor(n_jobs=-1, random_state=0)
# model.fit(df_train_x, df_train_y)

# model.score(df_test_x,df_test_y)

model = RandomForestRegressor(n_jobs=-1, random_state=0)
model.fit(train_features, train_target)
model.score(train_features, train_target)

0.9962319803338259

In [11]:
#CSTMR_CNT 예측 모델
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['AMT','CSTMR_CNT','CNT'], axis=1)
train_target = train_num['CSTMR_CNT']

# testset_num = test_num
# test_features = testset_num.drop(['AMT','CSTMR_CNT','CNT'], axis=1)
# test_target = test_num['CSTMR_CNT']


cstmr_model = RandomForestRegressor(n_jobs=-1, random_state=0)
cstmr_model.fit(train_features, train_target)
print(cstmr_model.score(train_features, train_target))


0.9963749070409276


In [12]:
#CNT 예측 모델
train_num = df_num.sample(frac=1, random_state=0)
train_features = train_num.drop(['AMT','CSTMR_CNT','CNT'], axis=1)
train_target = train_num['CNT']

# testset_num = test_num
# test_features = testset_num.drop(['AMT','CSTMR_CNT','CNT'], axis=1)
# test_target = test_num['CNT']


cnt_model = RandomForestRegressor(n_jobs=-1, random_state=0)
cnt_model.fit(train_features, train_target)
print(cnt_model.score(train_features, train_target))


0.9953367174852283


# 여기서 부터 예측 시작

In [55]:
predict_csv = df[(df['year']==2020) & (df['month']==1)].drop('AMT',axis=1)
predict_csv['month']=4
predict_csv

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,CNT
6,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,6
18,강원,건강보조식품 소매업,강원,20s,2,1,2020,4,10,10
41,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,69,70
57,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,85,86
80,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,90,91
...,...,...,...,...,...,...,...,...,...,...
1057341,충북,휴양콘도 운영업,충북,50s,2,4,2020,4,65,97
1057356,충북,휴양콘도 운영업,충북,60s,1,5,2020,4,50,70
1057371,충북,휴양콘도 운영업,충북,60s,2,5,2020,4,40,71
1057385,충북,휴양콘도 운영업,충북,70s,1,5,2020,4,9,14


In [358]:
predict_csv = pd.read_csv('predict_table.csv',encoding='UTF-8')
predict_csv = predict_csv.drop('Unnamed: 0',axis=1)
predict_csv


Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,CNT
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,5
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,15
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,58
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,30
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,8
...,...,...,...,...,...,...,...,...,...,...
114109,충북,휴양콘도 운영업,충북,30s,2,2,2020,7,3,4
114110,충북,휴양콘도 운영업,충북,40s,1,3,2020,7,3,3
114111,충북,휴양콘도 운영업,충북,40s,2,3,2020,7,5,5
114112,충북,휴양콘도 운영업,충북,50s,1,4,2020,7,14,18


In [50]:
predict_num = predict_csv.copy()        
for column in encoders.keys():
    encoder = encoders[column]
    predict_num[column] = encoder.transform(predict_num[column])


In [51]:
predict_num = predict_num.drop(['CNT','CSTMR_CNT'],axis=1)
cnt = cnt_model.predict(predict_num)
cstmr = cstmr_model.predict(predict_num)

predict_num['CNT'] = np.round(cnt,0)

predict_num['CSTMR_CNT']=np.round(cstmr,0)
predict_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CNT,CSTMR_CNT
6,0,0,0,1,1,1,2020,4,3.0,4.0
18,0,0,0,1,2,1,2020,4,7.0,7.0
41,0,0,0,2,1,2,2020,4,39.0,36.0
57,0,0,0,2,2,2,2020,4,45.0,44.0
80,0,0,0,3,1,3,2020,4,48.0,49.0
...,...,...,...,...,...,...,...,...,...,...
1057341,16,40,16,4,2,4,2020,4,17.0,11.0
1057356,16,40,16,5,1,5,2020,4,13.0,12.0
1057371,16,40,16,5,2,5,2020,4,11.0,7.0
1057385,16,40,16,6,1,5,2020,4,12.0,9.0


In [52]:
# 예측
pred = model.predict(predict_num)
pred = np.expm1(pred)

predict_num['AMT'] = np.round(pred, 0)
predict_num['REG_YYMM'] = predict_num['year']*100 + predict_num['month']

predict_num = predict_num[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
predict_num = predict_num.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

predict_num['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(predict_num['CARD_SIDO_NM'])
predict_num['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(predict_num['STD_CLSS_NM'])


In [53]:
# 제출 파일 만들기
submission = pd.read_csv('submission (2).csv', index_col=0)
submission = submission.drop(['AMT'], axis=1)
submission = pd.merge(submission,predict_num, on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')
submission.index.name = 'id'
submission = submission.fillna(0)
submission.to_csv('submission.csv', encoding='utf-8-sig')
submission

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,2.884247e+08
1,202004,강원,골프장 운영업,4.368047e+09
2,202004,강원,과실 및 채소 소매업,1.155078e+09
3,202004,강원,관광 민예품 및 선물용품 소매업,1.842730e+07
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.000000e+00
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,0.000000e+00
1390,202007,충북,한식 음식점업,0.000000e+00
1391,202007,충북,호텔업,0.000000e+00
1392,202007,충북,화장품 및 방향제 소매업,0.000000e+00
