# 전국 업종/업태별 신용카드 사용량 예측
### 배경
### - 신용카드 사용량 분석을 통한 ‘Post COVID-19 시대’ 신용카드 사용량 예측 모델 개발
### - 지역 경제 위축 및 중소상공인 경영난 해소를 위한 대책 마련
### 대회 설명 : https://dacon.io/competitions/official/235615/overview/
### 데이터 설명 : https://dacon.io/competitions/official/235615/data/
### 
## 아이디어
### 1. 신용카드 이용 고객 수 선 예측을 통한 총 사용금액 최종 예측
### 2. 코로나 이슈로 인해 예측력이 낮은 관광업 같은 업종을 위한 코로나 현황과 코스피 지수 데이터 활용

### 

## 결과
### 코로나 현황, 코스피 지수 데이터는 변수 중요도가 매우 낮았음. 즉 이용 가치도가 낮음. -> 주석 처리함
### 신용카드 이용 건수를 선 예측하여 성능을 높일 수 있었음.
### 약 400팀 가운데 21위 마무리

### 

## 개선
### FLC(가구 생애 주기) 데이터 또한 선예측 했으면 성능을 높일 수 있었음. 


In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
# Print the versions of the core libraries used in this notebook.
print('Pandas : %s'%(pd.__version__))
print('Numpy : %s'%(np.__version__))
print('Scikit-Learn : %s'%(sklearn.__version__))
# IPython shell escape: runs `python --version` in a subprocess (notebook-only syntax).
!python --version

Pandas : 1.0.1
Numpy : 1.18.1
Scikit-Learn : 0.22.1
Python 3.7.6


## 데이터 전처리 
### 년/월 구분 열 생성
### 명목 변수 -> 수치로 인코딩

In [3]:
def grap_year(data):
    """Return the year part of a YYYYMM value as an int.

    The value is converted to text and its first four characters are parsed.
    """
    return int(str(data)[:4])

def grap_month(data):
    """Return the month part of a YYYYMM value as an int.

    The value is converted to text and everything after the fourth
    character is parsed.
    """
    return int(str(data)[4:])

In [4]:
# Date handling: load the raw transactions and split REG_YYMM into year / month.
# Missing values are replaced by empty strings right after loading.
data = pd.read_csv('201901-202003.csv').fillna('')
data['year'] = data['REG_YYMM'].apply(grap_year)
data['month'] = data['REG_YYMM'].apply(grap_month)
data = data.drop(columns=['REG_YYMM'])

In [5]:
# Data cleansing: drop the two *_CCG_NM columns, then sum the remaining
# numeric columns per combination of the grouping keys below.
df = data.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
df = df.groupby(columns).sum().reset_index()

In [6]:
# Encoding: fit one LabelEncoder per object-dtype column and keep it for reuse.
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) != 'object':
        continue
    encoders[column] = LabelEncoder().fit(df[column])

# Numeric copy of df: every encoded column is replaced by its integer codes.
df_num = df.copy()
for column, encoder in encoders.items():
    df_num[column] = encoder.transform(df[column])

In [7]:
df.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,강원,건강보조식품 소매업,강원,20s,1,1,2019,1,4,311200,4
1,강원,건강보조식품 소매업,강원,20s,1,1,2019,2,3,605000,3
2,강원,건강보조식품 소매업,강원,20s,1,1,2019,6,3,139000,3
3,강원,건강보조식품 소매업,강원,20s,1,1,2019,8,3,27500,3
4,강원,건강보조식품 소매업,강원,20s,1,1,2019,9,3,395500,3


In [238]:
# covid_df = pd.read_csv('covid.csv',encoding='utf-8')
# covid_df = covid_df.drop('Unnamed: 0',axis=1)
# covid_df.head()

Unnamed: 0,province,day_confirmed,year,month
0,강원,0,2020,1
1,경기,6,2020,1
2,경남,0,2020,1
3,경북,0,2020,1
4,광주,0,2020,1


In [239]:
# df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,강원,건강보조식품 소매업,강원,20s,1,1,2019,1,4,311200,4
1,강원,건강보조식품 소매업,강원,20s,1,1,2019,2,3,605000,3
2,강원,건강보조식품 소매업,강원,20s,1,1,2019,6,3,139000,3
3,강원,건강보조식품 소매업,강원,20s,1,1,2019,8,3,27500,3
4,강원,건강보조식품 소매업,강원,20s,1,1,2019,9,3,395500,3
...,...,...,...,...,...,...,...,...,...,...,...
1057389,충북,휴양콘도 운영업,충북,70s,2,5,2019,3,3,148000,4
1057390,충북,휴양콘도 운영업,충북,70s,2,5,2019,5,5,329800,7
1057391,충북,휴양콘도 운영업,충북,70s,2,5,2019,10,7,557800,7
1057392,충북,휴양콘도 운영업,충북,70s,2,5,2019,12,3,247800,3


In [240]:
# a=pd.merge(df,covid_df,left_on=['CARD_SIDO_NM','year','month'],right_on=['province','year','month'] ,how='left')
# a = a.fillna(0)
# a = a.drop('province',axis=1)


In [241]:
# b = pd.merge(a,covid_df, left_on = ['HOM_SIDO_NM','year','month'],right_on =['province','year','month'],how='left')
# b = b.fillna(0)
# b = b.drop('province',axis=1)
# b

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y
0,강원,건강보조식품 소매업,강원,20s,1,1,2019,1,4,311200,4,0.0,0.0
1,강원,건강보조식품 소매업,강원,20s,1,1,2019,2,3,605000,3,0.0,0.0
2,강원,건강보조식품 소매업,강원,20s,1,1,2019,6,3,139000,3,0.0,0.0
3,강원,건강보조식품 소매업,강원,20s,1,1,2019,8,3,27500,3,0.0,0.0
4,강원,건강보조식품 소매업,강원,20s,1,1,2019,9,3,395500,3,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057389,충북,휴양콘도 운영업,충북,70s,2,5,2019,3,3,148000,4,0.0,0.0
1057390,충북,휴양콘도 운영업,충북,70s,2,5,2019,5,5,329800,7,0.0,0.0
1057391,충북,휴양콘도 운영업,충북,70s,2,5,2019,10,7,557800,7,0.0,0.0
1057392,충북,휴양콘도 운영업,충북,70s,2,5,2019,12,3,247800,3,0.0,0.0


In [11]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# b_scale = b[['day_confirmed_x','day_confirmed_y']].copy()
# cols = b_scale.columns

# np_scaled = scaler.fit_transform(b_scale)
# df_scaled = pd.DataFrame(np_scaled, columns = cols)
# df_scaled.describe()

In [12]:
# b['day_confirmed_x'] = df_scaled['day_confirmed_x']
# b['day_confirmed_y'] = df_scaled['day_confirmed_y']
# b.describe()

In [252]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# kospi = pd.read_csv('kospi_stock.csv',encoding='utf-8')
# kospi = kospi.drop('Unnamed: 0',axis=1)
# kospi

# low_kospi = kospi.copy()
# year = low_kospi['year']
# month = low_kospi['month']
# low_kospi = low_kospi.drop(['시가','고가','종가','year','month'],axis=1)
# low_kospi.describe()

# cols = low_kospi.columns
# scaler.fit(low_kospi)
# np_scaled = scaler.transform(low_kospi)
# df_scaled = pd.DataFrame(np_scaled, columns = cols)
# df_scaled['year'] =year
# df_scaled['month'] = month
# df_scaled.columns = ['low_stock','volume','year','month']
# df_scaled


# c = pd.merge(b,df_scaled,on=['year','month'],how='left')
# c

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y,low_stock,volume
0,강원,건강보조식품 소매업,강원,20s,1,1,2019,1,4,311200,4,0.0,0.0,-0.000820,-0.562388
1,강원,건강보조식품 소매업,강원,20s,1,1,2019,2,3,605000,3,0.0,0.0,1.073603,-0.744921
2,강원,건강보조식품 소매업,강원,20s,1,1,2019,6,3,139000,3,0.0,0.0,0.261513,-0.253765
3,강원,건강보조식품 소매업,강원,20s,1,1,2019,8,3,27500,3,0.0,0.0,-0.545700,-0.871010
4,강원,건강보조식품 소매업,강원,20s,1,1,2019,9,3,395500,3,0.0,0.0,-0.153259,-1.033972
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057389,충북,휴양콘도 운영업,충북,70s,2,5,2019,3,3,148000,4,0.0,0.0,0.800752,-1.064838
1057390,충북,휴양콘도 운영업,충북,70s,2,5,2019,5,5,329800,7,0.0,0.0,0.185587,-0.376968
1057391,충북,휴양콘도 운영업,충북,70s,2,5,2019,10,7,557800,7,0.0,0.0,0.154147,-0.981977
1057392,충북,휴양콘도 운영업,충북,70s,2,5,2019,12,3,247800,3,0.0,0.0,0.404844,-0.410845


In [253]:
# # 인코딩
# dtypes = b.dtypes
# encoders = {}
# for column in b.columns:
#     if str(dtypes[column]) == 'object':
#         encoder = LabelEncoder()
#         encoder.fit(b[column])
#         encoders[column] = encoder
        
# b_num = b.copy()        
# for column in encoders.keys():
#     encoder = encoders[column]
#     b_num[column] = encoder.transform(b[column])
# b_num.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y
0,0,0,0,1,1,1,2019,1,4,311200,4,0.0,0.0
1,0,0,0,1,1,1,2019,2,3,605000,3,0.0,0.0
2,0,0,0,1,1,1,2019,6,3,139000,3,0.0,0.0
3,0,0,0,1,1,1,2019,8,3,27500,3,0.0,0.0
4,0,0,0,1,1,1,2019,9,3,395500,3,0.0,0.0


In [254]:
# # 인코딩
# dtypes = c.dtypes
# encoders = {}
# for column in c.columns:
#     if str(dtypes[column]) == 'object':
#         encoder = LabelEncoder()
#         encoder.fit(c[column])
#         encoders[column] = encoder
        
# c_num = c.copy()        
# for column in encoders.keys():
#     encoder = encoders[column]
#     c_num[column] = encoder.transform(c[column])
# c_num.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y,low_stock,volume
0,0,0,0,1,1,1,2019,1,4,311200,4,0.0,0.0,-0.00082,-0.562388
1,0,0,0,1,1,1,2019,2,3,605000,3,0.0,0.0,1.073603,-0.744921
2,0,0,0,1,1,1,2019,6,3,139000,3,0.0,0.0,0.261513,-0.253765
3,0,0,0,1,1,1,2019,8,3,27500,3,0.0,0.0,-0.5457,-0.87101
4,0,0,0,1,1,1,2019,9,3,395500,3,0.0,0.0,-0.153259,-1.033972


In [255]:
# d_num = c_num.drop(['day_confirmed_x','day_confirmed_y'],axis=1)

In [256]:
# Load the April data and preprocess it the same way as the training data.
testset = pd.read_csv('202004.csv').fillna('')
testset['year'] = testset['REG_YYMM'].apply(grap_year)
testset['month'] = testset['REG_YYMM'].apply(grap_month)
testset = testset.drop(columns=['REG_YYMM'])

# Data cleansing: drop the *_CCG_NM columns and sum per grouping key.
test_df = testset.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
test_df = test_df.groupby(columns).sum().reset_index()

In [9]:
test_df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,478500,5
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,1585500,15
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,5099306,58
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,1884090,30
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,617000,8
...,...,...,...,...,...,...,...,...,...,...,...
57052,충북,휴양콘도 운영업,충북,30s,2,2,2020,4,3,28000,4
57053,충북,휴양콘도 운영업,충북,40s,1,3,2020,4,3,218850,3
57054,충북,휴양콘도 운영업,충북,40s,2,3,2020,4,5,193000,5
57055,충북,휴양콘도 운영업,충북,50s,1,4,2020,4,14,720500,18


In [258]:
# covid_test_df = pd.merge(test_df,covid_df, left_on = ['CARD_SIDO_NM','year','month'],right_on =['province','year','month'],how='left')
# covid_test_df = covid_test_df.fillna(0)
# covid_test_df = covid_test_df.drop('province',axis=1)

# covid_test_df = pd.merge(covid_test_df,covid_df, left_on = ['HOM_SIDO_NM','year','month'],right_on =['province','year','month'],how='left')
# covid_test_df = covid_test_df.fillna(0)
# covid_test_df = covid_test_df.drop('province',axis=1)

# covid_test_df.head()



Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,478500,5,15,15
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,1585500,15,15,15
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,5099306,58,15,15
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,1884090,30,15,15
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,617000,8,15,15


In [259]:
# kospi_covid_test_df = pd.merge(covid_test_df,df_scaled,on=['year','month'],how='left')
# kospi_covid_test_df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y,low_stock,volume
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,478500,5,15,15,-1.883689,2.061987
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,1585500,15,15,15,-1.883689,2.061987
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,5099306,58,15,15,-1.883689,2.061987
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,1884090,30,15,15,-1.883689,2.061987
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,617000,8,15,15,-1.883689,2.061987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57052,충북,휴양콘도 운영업,충북,30s,2,2,2020,4,3,28000,4,1,1,-1.883689,2.061987
57053,충북,휴양콘도 운영업,충북,40s,1,3,2020,4,3,218850,3,1,1,-1.883689,2.061987
57054,충북,휴양콘도 운영업,충북,40s,2,3,2020,4,5,193000,5,1,1,-1.883689,2.061987
57055,충북,휴양콘도 운영업,충북,50s,1,4,2020,4,14,720500,18,1,1,-1.883689,2.061987


In [261]:
# kospi_test_df = kospi_covid_test_df.drop(['day_confirmed_x','day_confirmed_y'],axis=1)
# kospi_test_df


Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,low_stock,volume
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,478500,5,-1.883689,2.061987
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,1585500,15,-1.883689,2.061987
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,5099306,58,-1.883689,2.061987
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,1884090,30,-1.883689,2.061987
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,617000,8,-1.883689,2.061987
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57052,충북,휴양콘도 운영업,충북,30s,2,2,2020,4,3,28000,4,-1.883689,2.061987
57053,충북,휴양콘도 운영업,충북,40s,1,3,2020,4,3,218850,3,-1.883689,2.061987
57054,충북,휴양콘도 운영업,충북,40s,2,3,2020,4,5,193000,5,-1.883689,2.061987
57055,충북,휴양콘도 운영업,충북,50s,1,4,2020,4,14,720500,18,-1.883689,2.061987


In [257]:
# April: encode the string columns with the encoders already stored in `encoders`.
test_num = test_df.copy()
for column, encoder in encoders.items():
    test_num[column] = encoder.transform(test_df[column])
test_num.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0,0,0,1,1,1,2020,4,6,478500,5
1,0,0,0,2,1,2,2020,4,15,1585500,15
2,0,0,0,2,2,2,2020,4,55,5099306,58
3,0,0,0,3,1,3,2020,4,26,1884090,30
4,0,0,0,3,1,4,2020,4,6,617000,8


In [262]:
# #4월 +주식

# kospi_test_df_num  = kospi_test_df.copy()        
# for column in encoders.keys():
#     encoder = encoders[column]
#     kospi_test_df_num[column] = encoder.transform(kospi_test_df[column])
# kospi_test_df_num.head()




Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,low_stock,volume
0,0,0,0,1,1,1,2020,4,6,478500,5,-1.883689,2.061987
1,0,0,0,2,1,2,2020,4,15,1585500,15,-1.883689,2.061987
2,0,0,0,2,2,2,2020,4,55,5099306,58,-1.883689,2.061987
3,0,0,0,3,1,3,2020,4,26,1884090,30,-1.883689,2.061987
4,0,0,0,3,1,4,2020,4,6,617000,8,-1.883689,2.061987


In [263]:
# #4월 + 확진자
# covid_test_df_num = covid_test_df.copy()        
# for column in encoders.keys():
#     encoder = encoders[column]
#     covid_test_df_num[column] = encoder.transform(covid_test_df[column])
# covid_test_df_num.head()


Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y
0,0,0,0,1,1,1,2020,4,6,478500,5,15,15
1,0,0,0,2,1,2,2020,4,15,1585500,15,15,15
2,0,0,0,2,2,2,2020,4,55,5099306,58,15,15
3,0,0,0,3,1,3,2020,4,26,1884090,30,15,15
4,0,0,0,3,1,4,2020,4,6,617000,8,15,15


In [264]:
# #4월 + 주식 + 확진자
# kospi_covid_test_num  = kospi_covid_test_df.copy()        
# for column in encoders.keys():
#     encoder = encoders[column]
#     kospi_covid_test_num[column] = encoder.transform(kospi_covid_test_df[column])
# kospi_covid_test_num.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT,day_confirmed_x,day_confirmed_y,low_stock,volume
0,0,0,0,1,1,1,2020,4,6,478500,5,15,15,-1.883689,2.061987
1,0,0,0,2,1,2,2020,4,15,1585500,15,15,15,-1.883689,2.061987
2,0,0,0,2,2,2,2020,4,55,5099306,58,15,15,-1.883689,2.061987
3,0,0,0,3,1,3,2020,4,26,1884090,30,15,15,-1.883689,2.061987
4,0,0,0,3,1,4,2020,4,6,617000,8,15,15,-1.883689,2.061987


# 여기서부터 코로나, 코스피 데이터를 제외한 데이터로 모델링

In [212]:
# Encoding: refit one LabelEncoder per object-dtype column of df and rebuild df_num.
dtypes = df.dtypes
encoders = {}
for column in df.columns:
    if str(dtypes[column]) != 'object':
        continue
    encoders[column] = LabelEncoder().fit(df[column])

# Numeric copy of df: every encoded column is replaced by its integer codes.
df_num = df.copy()
for column, encoder in encoders.items():
    df_num[column] = encoder.transform(df[column])

# 여기서부터 랜덤포레스트

In [265]:
# Append the encoded April rows to the training frame and renumber the index
# from 0. pd.concat(..., ignore_index=True) does in one step what the previous
# append + reset_index + drop('index') sequence did.
df_num = pd.concat([df_num, test_num], ignore_index=True)  # original data

# The COVID / KOSPI variants below are built only in cells that are commented
# out earlier in this notebook, so referencing them here raised NameError.
# They stay disabled, consistent with those cells.
# b_num = pd.concat([b_num, covid_test_df_num], ignore_index=True)     # original + confirmed cases
# c_num = pd.concat([c_num, kospi_covid_test_num], ignore_index=True)  # original + confirmed cases + stock index
# d_num = pd.concat([d_num, kospi_test_df_num], ignore_index=True)     # original + stock index


In [214]:
df_num

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0,0,0,1,1,1,2019,1,4,311200,4
1,0,0,0,1,1,1,2019,2,3,605000,3
2,0,0,0,1,1,1,2019,6,3,139000,3
3,0,0,0,1,1,1,2019,8,3,27500,3
4,0,0,0,1,1,1,2019,9,3,395500,3
...,...,...,...,...,...,...,...,...,...,...,...
1114446,16,40,16,2,2,2,2020,4,3,28000,4
1114447,16,40,16,3,1,3,2020,4,3,218850,3
1114448,16,40,16,3,2,3,2020,4,5,193000,5
1114449,16,40,16,4,1,4,2020,4,14,720500,18


In [94]:
# from sklearn.preprocessing import StandardScaler
# AMT = df_num['AMT']
# df_num = df_num.drop('AMT',axis=1)

# scaler = StandardScaler()
# df_scale = df_num.copy()
# cols = df_scale.columns


# scaler.fit(df_scale)
# df_scaled = scaler.transform(df_scale)
# df_scaled = pd.DataFrame(df_scaled, columns = cols)
# df_scaled

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,CNT
0,-1.631199,-1.982167,-1.578247,-1.252105,-0.894061,-1.344054,-0.529940,-1.313194,-0.129294,-0.106706
1,-1.631199,-1.982167,-1.578247,-1.252105,-0.894061,-1.344054,-0.529940,-1.029414,-0.129383,-0.106742
2,-1.631199,-1.982167,-1.578247,-1.252105,-0.894061,-1.344054,-0.529940,0.105703,-0.129383,-0.106742
3,-1.631199,-1.982167,-1.578247,-1.252105,-0.894061,-1.344054,-0.529940,0.673262,-0.129383,-0.106742
4,-1.631199,-1.982167,-1.578247,-1.252105,-0.894061,-1.344054,-0.529940,0.957041,-0.129383,-0.106742
...,...,...,...,...,...,...,...,...,...,...
1114446,1.629612,1.519776,1.723410,-0.575394,1.118492,-0.615120,1.887005,-0.461856,-0.129383,-0.106706
1114447,1.629612,1.519776,1.723410,0.101317,-0.894061,0.113814,1.887005,-0.461856,-0.129383,-0.106742
1114448,1.629612,1.519776,1.723410,0.101317,1.118492,0.113814,1.887005,-0.461856,-0.129205,-0.106670
1114449,1.629612,1.519776,1.723410,0.778029,-0.894061,0.842748,1.887005,-0.461856,-0.128402,-0.106200


In [96]:
# df_scaled and AMT are created only in the commented-out scaling cell above,
# so this assignment raised NameError. It is disabled together with that cell.
# df_scaled['AMT']=AMT

In [68]:
# pd.DataFrame(scaler.inverse_transform(df_scaled), columns = cols)

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,0.0,0.0,0.0,1.0,1.0,1.0,2019.0,1.0,4.0,311200.0,4.0
1,0.0,0.0,0.0,1.0,1.0,1.0,2019.0,2.0,3.0,605000.0,3.0
2,0.0,0.0,0.0,1.0,1.0,1.0,2019.0,6.0,3.0,139000.0,3.0
3,0.0,0.0,0.0,1.0,1.0,1.0,2019.0,8.0,3.0,27500.0,3.0
4,0.0,0.0,0.0,1.0,1.0,1.0,2019.0,9.0,3.0,395500.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...
1114446,16.0,40.0,16.0,2.0,2.0,2.0,2020.0,4.0,3.0,28000.0,4.0
1114447,16.0,40.0,16.0,3.0,1.0,3.0,2020.0,4.0,3.0,218850.0,3.0
1114448,16.0,40.0,16.0,3.0,2.0,3.0,2020.0,4.0,5.0,193000.0,5.0
1114449,16.0,40.0,16.0,4.0,1.0,4.0,2020.0,4.0,14.0,720500.0,18.0


# 원본 데이터의 설명력이 높은 것을 알 수 있음

In [266]:
# Baseline data (without the COVID / KOSPI features)


# Shuffle all rows reproducibly before splitting.
train_num = df_num.sample(frac=1, random_state=0)
# Every column except AMT is used as a feature (this includes CSTMR_CNT and CNT).
train_features = train_num.drop(['AMT'], axis=1)
# The target is log(1 + AMT); predictions are mapped back with np.expm1 in the prediction cell.
train_target = np.log1p(train_num['AMT'])


# test_num = kospi_covid_test_num.sample(frac=1, random_state=0)
# test_features = test_num.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
# test_target = np.log1p(test_num['AMT'])

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation.
df_train_x, df_test_x, df_train_y,df_test_y = train_test_split(train_features,train_target,test_size = 0.2 , random_state = 1234)


model = RandomForestRegressor(n_jobs=-1, random_state=0)
model.fit(df_train_x, df_train_y)

# R^2 on the held-out rows, measured on the log1p-transformed target.
model.score(df_test_x,df_test_y)

0.9746128994042205

In [99]:
# Sweep n_estimators over 10, 20, ..., 100 and record the R^2 score on the
# training split and on the held-out split for each forest.
train_score = []
test_score = []
para_n_tree = [n_tree*10 for n_tree in range(1,11)]

for v_n_estimators in para_n_tree:
    # n_jobs=-1 added for consistency with the other sweeps in this notebook:
    # it only parallelises fitting/prediction across cores.
    rf = RandomForestRegressor(n_estimators=v_n_estimators, random_state=1234, n_jobs=-1)
    rf.fit(df_train_x, df_train_y)
    train_score.append(rf.score(df_train_x, df_train_y))
    test_score.append(rf.score(df_test_x, df_test_y))


# One row per tested value, rounded for display.
df_score_n = pd.DataFrame()
df_score_n['n_estimator'] = para_n_tree
df_score_n['TrainScore'] = train_score
df_score_n['TestScore'] = test_score
df_score_n.round(3)

Unnamed: 0,n_estimator,TrainScore,TestScore
0,10,0.978,0.888
1,20,0.983,0.897
2,30,0.984,0.9
3,40,0.985,0.901
4,50,0.985,0.902
5,60,0.986,0.902
6,70,0.986,0.903
7,80,0.986,0.903
8,90,0.986,0.904
9,100,0.986,0.904


In [152]:
# Sweep min_samples_leaf over 2..9 with 100 trees and record the R^2 score on
# the training split and on the held-out split.
para_leaf = list(range(2, 10))
train_score = []
test_score = []

for v_min_samples_leaf in para_leaf:
    rf = RandomForestRegressor(
        n_estimators=100,
        min_samples_leaf=v_min_samples_leaf,
        random_state=1234,
        n_jobs=-1,
    )
    rf.fit(df_train_x, df_train_y)
    train_score.append(rf.score(df_train_x, df_train_y))
    test_score.append(rf.score(df_test_x, df_test_y))


# One row per tested value, rounded for display.
df_score_leaf = pd.DataFrame({
    'min_leaf': para_leaf,
    'TrainScore': train_score,
    'TestScore': test_score,
})
df_score_leaf.round(3)

Unnamed: 0,min_leaf,TrainScore,TestScore
0,2,0.964,0.906
1,3,0.946,0.897
2,4,0.93,0.887
3,5,0.917,0.876
4,6,0.903,0.864
5,7,0.89,0.852
6,8,0.876,0.839
7,9,0.863,0.828


In [129]:
# Sweep min_samples_split over 2, 4, 6, 8 (60 trees, min_samples_leaf=4).
# The range used to be (5, 10), which made this cell an exact duplicate of the
# 10..18 sweep in the following cell and did not match the 2/4/6/8 rows of the
# output recorded below it. range(1, 5) restores the low half of the sweep.
# With min_samples_leaf=4 a node needs at least 8 samples before it can be
# split into two valid leaves, so these four values are expected to score the same.
train_score = []
test_score = []
para_split = [n_split*2 for n_split in range(1,5)]

for v_min_samples_split in para_split:
    rf = RandomForestRegressor(n_estimators=60, min_samples_leaf=4,
                               min_samples_split=v_min_samples_split,
                               random_state=1234, n_jobs=-1)
    rf.fit(df_train_x, df_train_y)
    train_score.append(rf.score(df_train_x, df_train_y))
    test_score.append(rf.score(df_test_x, df_test_y))


# One row per tested value, rounded for display.
df_score_split = pd.DataFrame()
df_score_split['n_split'] = para_split
df_score_split['TrainScore'] = train_score
df_score_split['TestScore'] = test_score
df_score_split.round(3)

Unnamed: 0,n_split,TrainScore,TestScore
0,2,0.926,0.868
1,4,0.926,0.868
2,6,0.926,0.868
3,8,0.926,0.868


In [130]:
# Sweep min_samples_split over 10, 12, ..., 18 (60 trees, min_samples_leaf=4)
# and record the R^2 score on the training split and on the held-out split.
para_split = list(range(10, 19, 2))
train_score = []
test_score = []

for v_min_samples_split in para_split:
    rf = RandomForestRegressor(
        n_estimators=60,
        min_samples_leaf=4,
        min_samples_split=v_min_samples_split,
        random_state=1234,
        n_jobs=-1,
    )
    rf.fit(df_train_x, df_train_y)
    train_score.append(rf.score(df_train_x, df_train_y))
    test_score.append(rf.score(df_test_x, df_test_y))


# One row per tested value, rounded for display.
df_score_split = pd.DataFrame({
    'n_split': para_split,
    'TrainScore': train_score,
    'TestScore': test_score,
})
df_score_split.round(3)

Unnamed: 0,n_split,TrainScore,TestScore
0,10,0.921,0.867
1,12,0.915,0.864
2,14,0.908,0.859
3,16,0.899,0.853
4,18,0.891,0.848


In [217]:
# Rank the training features by the importance reported by the fitted `model`.
v_feature_name = df_train_x.columns
df_importance = pd.DataFrame({
    "Feature": v_feature_name,
    "Importance": model.feature_importances_,
})

# Most important feature first; rounded for display.
df_importance = df_importance.sort_values("Importance", ascending=False)
df_importance.round(3)


Unnamed: 0,Feature,Importance
8,CSTMR_CNT,0.854
1,STD_CLSS_NM,0.083
9,CNT,0.022
0,CARD_SIDO_NM,0.012
3,AGE,0.01
2,HOM_SIDO_NM,0.007
7,month,0.006
5,FLC,0.003
4,SEX_CTGO_CD,0.002
6,year,0.001


In [143]:
# Refit a forest with default hyper-parameters (seed 1234) on the training split.
# This rebinds `model`, which the prediction cell below uses.
model = RandomForestRegressor(random_state = 1234,n_jobs=-1)
model.fit(df_train_x, df_train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=1234, verbose=0, warm_start=False)

In [229]:
# Load the table of feature rows to predict for.
predict_csv = pd.read_csv('predict_table.csv',encoding='UTF-8')
# 'Unnamed: 0' is presumably the index column written when the table was saved — not a feature.
predict_csv = predict_csv.drop('Unnamed: 0',axis=1)
predict_csv


Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,CNT
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,5
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,15
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,58
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,30
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,8
...,...,...,...,...,...,...,...,...,...,...
114109,충북,휴양콘도 운영업,충북,30s,2,2,2020,7,3,4
114110,충북,휴양콘도 운영업,충북,40s,1,3,2020,7,3,3
114111,충북,휴양콘도 운영업,충북,40s,2,3,2020,7,5,5
114112,충북,휴양콘도 운영업,충북,50s,1,4,2020,7,14,18


In [197]:
# predict_csv = predict_csv.merge(covid_df,left_on=['CARD_SIDO_NM','year','month'],right_on=['province','year','month'] ,how='left')
# predict_csv = predict_csv.drop('province',axis=1)
# predict_csv = predict_csv.merge(covid_df,left_on=['HOM_SIDO_NM','year','month'],right_on=['province','year','month'] ,how='left')
# predict_csv = predict_csv.drop('province',axis=1)

In [198]:
# predict_csv = predict_csv.merge(df_scaled,on=['year','month'],how='left')
# predict_csv

In [2]:
# Preprocess the prediction table: encode its string columns with the
# encoders already stored in `encoders`.
predict_num = predict_csv.copy()
for column, encoder in encoders.items():
    predict_num[column] = encoder.transform(predict_num[column])

In [200]:
# pd.DataFrame(scaler.inverse_transform(df_scaled))
#스케일러 풀기

In [201]:

# predict_scale = predict_num.copy()
# cols = predict_scale.columns

# predict_scaled = scaler.transform(predict_scale)
# predict_scaled = pd.DataFrame(predict_scaled, columns = cols)
# predict_scaled.head()

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,CNT
0,-1.631199,-1.982167,-1.578247,-1.252105,-0.894061,-1.344054,1.887005,-0.461856,-0.129116,-0.10667
1,-1.631199,-1.982167,-1.578247,-0.575394,-0.894061,-0.61512,1.887005,-0.461856,-0.128313,-0.106309
2,-1.631199,-1.982167,-1.578247,-0.575394,1.118492,-0.61512,1.887005,-0.461856,-0.124744,-0.104756
3,-1.631199,-1.982167,-1.578247,0.101317,-0.894061,0.113814,1.887005,-0.461856,-0.127331,-0.105767
4,-1.631199,-1.982167,-1.578247,0.101317,-0.894061,0.842748,1.887005,-0.461856,-0.129116,-0.106561


In [231]:
# Predict
# Select the feature columns explicitly, in the order used for training.
# The values were previously passed positionally with no column selection,
# so a reordered table would have been scored silently wrong, and re-running
# this cell failed once AMT / REG_YYMM had been added to predict_num.
pred = model.predict(predict_num[list(df_train_x.columns)])
# The model was trained on log1p(AMT); expm1 maps predictions back to amounts.
pred = np.expm1(pred)

predict_num['AMT'] = np.round(pred, 0)
# Rebuild the YYYYMM key, e.g. 2020 and 4 -> 202004.
predict_num['REG_YYMM'] = predict_num['year']*100 + predict_num['month']
predict_num



Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,CNT,AMT,REG_YYMM
0,0,0,0,1,1,1,2020,4,6,5,375196.0,202004
1,0,0,0,2,1,2,2020,4,15,15,1319470.0,202004
2,0,0,0,2,2,2,2020,4,55,58,5118355.0,202004
3,0,0,0,3,1,3,2020,4,26,30,2228480.0,202004
4,0,0,0,3,1,4,2020,4,6,8,587963.0,202004
...,...,...,...,...,...,...,...,...,...,...,...,...
114109,16,40,16,2,2,2,2020,7,3,4,55439.0,202007
114110,16,40,16,3,1,3,2020,7,3,3,150224.0,202007
114111,16,40,16,3,2,3,2020,7,5,5,160372.0,202007
114112,16,40,16,4,1,4,2020,7,14,18,752394.0,202007


In [232]:
# Keep only the three submission key columns and the predicted amount.
predict_num = predict_num[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]

predict_num


Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,0,0,375196.0
1,202004,0,0,1319470.0
2,202004,0,0,5118355.0
3,202004,0,0,2228480.0
4,202004,0,0,587963.0
...,...,...,...,...
114109,202007,16,40,55439.0
114110,202007,16,40,150224.0
114111,202007,16,40,160372.0
114112,202007,16,40,752394.0


In [233]:
# Sum the predicted AMT per month, card region and business category.
predict_num = predict_num.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop=False)

In [207]:
# predict_scaled is created only in a commented-out cell earlier in this
# notebook, so this ad-hoc lookup raised NameError. It is disabled with that cell.
# predict_scaled[predict_scaled['AMT']==8.132300e+09]

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT


In [205]:
2.445537e+08

244553700.0

In [235]:
# Show the aggregated predictions for one business category.
# NOTE(review): the filter compares against a decoded category name, so this
# presumably has to run after the inverse_transform cell below — confirm cell order.
predict_num[predict_num['STD_CLSS_NM']=='기타 주점업']

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
9,202004,강원,기타 주점업,244283100.0
46,202004,경기,기타 주점업,1003935000.0
85,202004,경남,기타 주점업,575371900.0
123,202004,경북,기타 주점업,242228100.0
158,202004,광주,기타 주점업,352938600.0
192,202004,대구,기타 주점업,169672000.0
225,202004,대전,기타 주점업,87487360.0
258,202004,부산,기타 주점업,1255336000.0
295,202004,서울,기타 주점업,828157500.0
331,202004,세종,기타 주점업,3698973.0


In [234]:
# Map the integer codes back to the original region and business-category names.
predict_num['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(predict_num['CARD_SIDO_NM'])
predict_num['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(predict_num['STD_CLSS_NM'])


In [236]:
# Build the submission file: take the key columns from the template, attach
# the predicted AMT, and write 0 for keys that have no prediction.
submission = pd.read_csv('submission (2).csv', index_col=0).drop(['AMT'], axis=1)
submission = submission.merge(
    predict_num,
    on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'],
    how='left',
).fillna(0)
submission.index.name = 'id'
submission.to_csv('submission.csv', encoding='utf-8-sig')
submission

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,8.839446e+07
1,202004,강원,골프장 운영업,4.535208e+09
2,202004,강원,과실 및 채소 소매업,1.120567e+09
3,202004,강원,관광 민예품 및 선물용품 소매업,1.507826e+07
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.000000e+00
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,1.374682e+09
1390,202007,충북,한식 음식점업,1.893702e+10
1391,202007,충북,호텔업,1.342246e+07
1392,202007,충북,화장품 및 방향제 소매업,4.620118e+08


In [148]:
# Join the template with the generated submission on the key columns for a
# side-by-side look: AMT_x comes from the template, AMT_y from `submission`.
submission2 = pd.read_csv('submission (2).csv', index_col=0)
pd.merge(submission2,submission,on=['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM'], how='inner')

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT_x,AMT_y
0,202004,강원,건강보조식품 소매업,0,8.861782e+07
1,202004,강원,골프장 운영업,0,4.535607e+09
2,202004,강원,과실 및 채소 소매업,0,1.119608e+09
3,202004,강원,관광 민예품 및 선물용품 소매업,0,1.512625e+07
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0,0.000000e+00
...,...,...,...,...,...
1497,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,0,1.373908e+09
1498,202007,충북,한식 음식점업,0,1.893802e+10
1499,202007,충북,호텔업,0,1.342244e+07
1500,202007,충북,화장품 및 방향제 소매업,0,4.621928e+08


In [149]:
# Reload the template file. Note that this rebinds `submission`, replacing
# the merged predictions held in memory.
submission = pd.read_csv('submission (2).csv', index_col=0)
submission

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,202004,강원,건강보조식품 소매업,0
1,202004,강원,골프장 운영업,0
2,202004,강원,과실 및 채소 소매업,0
3,202004,강원,관광 민예품 및 선물용품 소매업,0
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0
...,...,...,...,...
1389,202007,충북,피자 햄버거 샌드위치 및 유사 음식점업,0
1390,202007,충북,한식 음식점업,0
1391,202007,충북,호텔업,0
1392,202007,충북,화장품 및 방향제 소매업,0


In [90]:
# April 2019 rows of the aggregated training frame.
p = df[(df['year']==2019) & (df['month']==4)]
# Group on the same key columns (listed in a different order) and sum the numeric columns.
p = p.groupby(['AGE','CARD_SIDO_NM','STD_CLSS_NM','HOM_SIDO_NM','year','month','SEX_CTGO_CD','FLC']).sum().reset_index()
p

Unnamed: 0,AGE,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year,month,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,10s,강원,그외 기타 종합 소매업,강원,2019,4,2,1,4,38620,6
1,10s,강원,기타 대형 종합 소매업,강원,2019,4,1,1,69,954970,96
2,10s,강원,기타 대형 종합 소매업,강원,2019,4,2,1,156,2820890,247
3,10s,강원,기타 외국식 음식점업,강원,2019,4,1,1,22,312000,24
4,10s,강원,기타 외국식 음식점업,강원,2019,4,2,1,21,294200,29
...,...,...,...,...,...,...,...,...,...,...,...
70982,70s,충북,호텔업,충북,2019,4,2,5,7,406000,10
70983,70s,충북,화장품 및 방향제 소매업,충북,2019,4,1,5,98,5211045,104
70984,70s,충북,화장품 및 방향제 소매업,충북,2019,4,2,5,331,24251550,388
70985,70s,충북,휴양콘도 운영업,서울,2019,4,1,5,9,685200,12


In [89]:
test_df

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,AGE,SEX_CTGO_CD,FLC,year,month,CSTMR_CNT,AMT,CNT
0,강원,건강보조식품 소매업,강원,20s,1,1,2020,4,6,478500,5
1,강원,건강보조식품 소매업,강원,30s,1,2,2020,4,15,1585500,15
2,강원,건강보조식품 소매업,강원,30s,2,2,2020,4,55,5099306,58
3,강원,건강보조식품 소매업,강원,40s,1,3,2020,4,26,1884090,30
4,강원,건강보조식품 소매업,강원,40s,1,4,2020,4,6,617000,8
...,...,...,...,...,...,...,...,...,...,...,...
57052,충북,휴양콘도 운영업,충북,30s,2,2,2020,4,3,28000,4
57053,충북,휴양콘도 운영업,충북,40s,1,3,2020,4,3,218850,3
57054,충북,휴양콘도 운영업,충북,40s,2,3,2020,4,5,193000,5
57055,충북,휴양콘도 운영업,충북,50s,1,4,2020,4,14,720500,18


In [88]:
# Sum gr_test per card region, category, home region, month, sex and FLC.
# NOTE(review): gr_test is assigned in a cell that appears further down in
# this file; that cell has to be run first.
gr_test.groupby(['CARD_SIDO_NM','STD_CLSS_NM','HOM_SIDO_NM','year','month','SEX_CTGO_CD','FLC']).sum().reset_index()


Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year,month,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,강원,건강보조식품 소매업,강원,2020,4,17,43,776,88588988,856
1,강원,건강보조식품 소매업,서울,2020,4,2,4,3,38000,3
2,강원,건강보조식품 소매업,충북,2020,4,2,2,3,197000,3
3,강원,골프장 운영업,강원,2020,4,25,53,4955,763684690,7148
4,강원,골프장 운영업,경기,2020,4,23,53,7171,1313821735,8594
...,...,...,...,...,...,...,...,...,...,...
6101,충북,화장품 및 방향제 소매업,충북,2020,4,36,66,8376,442758791,9531
6102,충북,휴양콘도 운영업,경기,2020,4,9,18,36,4102170,60
6103,충북,휴양콘도 운영업,서울,2020,4,11,21,24,2940200,44
6104,충북,휴양콘도 운영업,충남,2020,4,2,5,6,325800,9


In [62]:

# April 2020 sums per (card region, category, home region, year, month).
gr_test = test_df.groupby(['CARD_SIDO_NM','STD_CLSS_NM','HOM_SIDO_NM','year','month']).sum().reset_index()
# Left-join p with those sums: *_x columns come from p, *_y columns from gr_test.
a= p.merge(gr_test,on=['CARD_SIDO_NM','STD_CLSS_NM','HOM_SIDO_NM'],how='left')
# gr_test
# Keys with no match in gr_test get 0 instead of NaN.
a = a.fillna(0)
#460719139
# Total AMT_x of the rows whose AMT_y is 0. The value is not displayed,
# because a notebook cell only shows its last expression.
a[a['AMT_y']==0].sort_values('AMT_x',ascending=False)['AMT_x'].sum()
# The same rows, largest AMT_x first.
a[a['AMT_y']==0].sort_values('AMT_x',ascending=False)

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year_x,month_x,SEX_CTGO_CD_x,FLC_x,CSTMR_CNT_x,AMT_x,CNT_x,year_y,month_y,SEX_CTGO_CD_y,FLC_y,CSTMR_CNT_y,AMT_y,CNT_y
3000,부산,면세점,부산,2019,4,34,67,9305,1467553396,12654,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3538,서울,면세점,경기,2019,4,34,67,4983,693108521,6325,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2995,부산,면세점,경남,2019,4,32,64,4600,610193207,6015,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5676,전북,정기 항공 운송업,경기,2019,4,27,48,3473,595107531,3584,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4667,인천,면세점,광주,2019,4,32,64,1856,371264338,2746,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1826,광주,관광 민예품 및 선물용품 소매업,대전,2019,4,1,1,3,11300,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1477,경북,버스 운송업,충남,2019,4,1,1,3,11100,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7034,충북,여관업,부산,2019,4,2,3,3,10400,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3989,세종,그외 기타 종합 소매업,광주,2019,4,1,1,3,5800,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [255]:

# Read the submission file and keep only the rows whose REG_YYMM is 202004.
sub = pd.read_csv('submission.csv').loc[lambda frame: frame['REG_YYMM'] == 202004]
sub

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,0,202004,강원,건강보조식품 소매업,9.446666e+07
1,1,202004,강원,골프장 운영업,3.370510e+09
2,2,202004,강원,과실 및 채소 소매업,8.689251e+08
3,3,202004,강원,관광 민예품 및 선물용품 소매업,1.345954e+07
4,4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.000000e+00
...,...,...,...,...,...
692,692,202004,충북,피자 햄버거 샌드위치 및 유사 음식점업,8.816309e+08
693,693,202004,충북,한식 음식점업,1.337014e+10
694,694,202004,충북,호텔업,1.622228e+07
695,695,202004,충북,화장품 및 방향제 소매업,4.382669e+08


In [256]:
# Total AMT in gr_test per (card region, industry).
# Only AMT is selected before summing, so nothing has to be dropped afterwards
# and the result does not depend on which other columns gr_test happens to carry.
tt = (
    gr_test
    .groupby(['CARD_SIDO_NM', 'STD_CLSS_NM'])['AMT']
    .sum()
    .reset_index()
)

# Attach those totals to the submission rows. Both frames have an AMT column,
# so the merge yields AMT_x (from sub) and AMT_y (from tt).
real = sub.merge(tt, on=['CARD_SIDO_NM', 'STD_CLSS_NM'], how='left')

# Submission rows with no matching total get NaN from the left join; use 0 instead.
real = real.fillna(0)

In [257]:
# Row-wise AMT_x minus AMT_y, kept both as a standalone array and as a
# new 'sub_amt' column on real.
sub_amt = real['AMT_x'].to_numpy() - real['AMT_y'].to_numpy()
real['sub_amt'] = sub_amt

In [259]:
# Rows of real whose AMT_y is 0, ordered by sub_amt from largest to smallest.
real.loc[real['AMT_y'].eq(0)].sort_values(by='sub_amt', ascending=False)

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT_x,AMT_y,sub_amt
301,301,202004,부산,면세점,63133289.0,0.0,63133289.0
382,382,202004,세종,마사지업,1875420.0,0.0,1875420.0
643,643,202004,충남,자동차 임대업,416753.0,0.0,416753.0
302,302,202004,부산,버스 운송업,341643.0,0.0,341643.0
254,254,202004,대전,기타 수상오락 서비스업,103306.0,0.0,103306.0
...,...,...,...,...,...,...,...
228,228,202004,대구,여행사업,0.0,0.0,0.0
220,220,202004,대구,버스 운송업,0.0,0.0,0.0
219,219,202004,대구,면세점,0.0,0.0,0.0
217,217,202004,대구,내항 여객 운송업,0.0,0.0,0.0


In [264]:
# Monthly totals per (card region, industry).
# The leading bare `df` expression was removed: only the last expression of a
# cell is displayed, so it had no effect.
# numeric_only=True states explicitly that string columns are left out of the
# sum, instead of relying on pandas dropping them silently (newer pandas
# releases no longer do that).
p = (
    df
    .groupby(['CARD_SIDO_NM', 'STD_CLSS_NM', 'year', 'month'])
    .sum(numeric_only=True)
    .reset_index()
)

# Cell output: the monthly series for industry '면세점' in card region '부산'.
p[(p['STD_CLSS_NM']=='면세점')&(p['CARD_SIDO_NM']=='부산')]

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,year,month,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
4064,부산,면세점,2019,1,280,570,22087,3695671882,31296
4065,부산,면세점,2019,2,284,550,18942,2562238413,25777
4066,부산,면세점,2019,3,288,582,21515,3218109294,29340
4067,부산,면세점,2019,4,265,563,19167,2763529187,25604
4068,부산,면세점,2019,5,260,550,18869,2540790995,25209
4069,부산,면세점,2019,6,263,530,19616,3037134683,27104
4070,부산,면세점,2019,7,266,514,18165,2546105059,24557
4071,부산,면세점,2019,8,228,443,16117,2023225960,21253
4072,부산,면세점,2019,9,216,448,13012,1728476293,17047
4073,부산,면세점,2019,10,224,473,15804,2246719513,21265


In [204]:
# The 30 rows of real with the largest sub_amt (AMT_x minus AMT_y).
real.sort_values(by='sub_amt', ascending=False).head(n=30)

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT_x,AMT_y,sub_amt
363,363,202004,서울,택시 운송업,89420260000.0,80156850000.0,9263405000.0
365,365,202004,서울,한식 음식점업,165530300000.0,160157300000.0,5373035000.0
335,335,202004,서울,기타 대형 종합 소매업,89150110000.0,84095020000.0,5055088000.0
361,361,202004,서울,차량용 주유소 운영업,31010730000.0,27317020000.0,3693716000.0
294,294,202004,부산,기타 대형 종합 소매업,58378790000.0,54992680000.0,3386109000.0
348,348,202004,서울,슈퍼마켓,89070250000.0,86367790000.0,2702458000.0
48,48,202004,경기,기타 대형 종합 소매업,134900800000.0,132731100000.0,2169745000.0
320,320,202004,부산,차량용 주유소 운영업,24485830000.0,22870690000.0,1615138000.0
443,443,202004,울산,차량용 주유소 운영업,12305210000.0,10845540000.0,1459664000.0
354,354,202004,서울,일반유흥 주점업,2918680000.0,1487088000.0,1431592000.0


In [85]:
# All rows of real for the industry '면세점', across every card region.
real.loc[real['STD_CLSS_NM'].eq('면세점')]

Unnamed: 0,id,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT_x,AMT_y,sub_amt
14,14,202004,강원,면세점,153109.0,0.0,153109.0
55,55,202004,경기,면세점,0.0,0.0,0.0
96,96,202004,경남,면세점,0.0,0.0,0.0
137,137,202004,경북,면세점,0.0,0.0,0.0
178,178,202004,광주,면세점,0.0,0.0,0.0
219,219,202004,대구,면세점,166717500.0,0.0,166717500.0
260,260,202004,대전,면세점,0.0,0.0,0.0
301,301,202004,부산,면세점,401992400.0,0.0,401992400.0
342,342,202004,서울,면세점,206751400.0,588529.0,206162900.0
383,383,202004,세종,면세점,0.0,0.0,0.0


In [405]:
# Rows of df for January 2020 only.
q = df[(df['year']==2020) & (df['month']==1)]

# Totals per (card region, industry, home region, year, month).
# numeric_only=True states explicitly that string columns are left out of the
# sum, instead of relying on pandas dropping them silently (newer pandas
# releases no longer do that).
q = (
    q
    .groupby(['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'year', 'month'])
    .sum(numeric_only=True)
    .reset_index()
)

In [411]:
# Left-join q onto p by region, industry, home region and month; columns the
# two frames share get _x (p) / _y (q) suffixes. Rows of p with no partner in q
# come back with NaN on the q side, which is replaced by the sentinel -1.
# NOTE(review): this needs a p that has a HOM_SIDO_NM column; the p built in the
# In [264] cell does not, so this presumably ran against another definition - confirm.
w = (
    p.merge(
        q,
        how='left',
        on=['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'month'],
    )
    .fillna(-1)
)
w

Unnamed: 0,CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year_x,month,SEX_CTGO_CD_x,FLC_x,CSTMR_CNT_x,AMT_x,CNT_x,year_y,SEX_CTGO_CD_y,FLC_y,CSTMR_CNT_y,AMT_y,CNT_y
0,강원,건강보조식품 소매업,강원,2019,1,22,51,1058,147831746,1105,2020.0,21.0,48.0,1309.0,177057268.0,1356.0
1,강원,건강보조식품 소매업,광주,2019,1,2,5,5,127000,5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,강원,건강보조식품 소매업,대구,2019,1,4,10,7,155000,7,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,강원,건강보조식품 소매업,서울,2019,1,2,5,5,151000,6,2020.0,4.0,6.0,9.0,5762000.0,10.0
4,강원,건강보조식품 소매업,전북,2019,1,2,5,5,90000,5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7064,충북,휴양콘도 운영업,울산,2019,1,1,3,9,811950,14,2020.0,4.0,8.0,12.0,1006910.0,20.0
7065,충북,휴양콘도 운영업,인천,2019,1,12,20,112,10798340,198,2020.0,10.0,23.0,111.0,10391250.0,186.0
7066,충북,휴양콘도 운영업,전북,2019,1,1,3,7,462150,10,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7067,충북,휴양콘도 운영업,충남,2019,1,7,14,36,3938700,65,2020.0,9.0,18.0,49.0,5795310.0,75.0


In [414]:
# Keep the rows of w whose AMT_y equals -1, then show their totals per
# (card region, industry, home region, year_x, month).
e = w.loc[w['AMT_y'].eq(-1)]
e.groupby(by=['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'year_x', 'month']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,SEX_CTGO_CD_x,FLC_x,CSTMR_CNT_x,AMT_x,CNT_x,year_y,SEX_CTGO_CD_y,FLC_y,CSTMR_CNT_y,AMT_y,CNT_y
CARD_SIDO_NM,STD_CLSS_NM,HOM_SIDO_NM,year_x,month,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
강원,건강보조식품 소매업,광주,2019,1,2,5,5,127000,5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
강원,건강보조식품 소매업,대구,2019,1,4,10,7,155000,7,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
강원,건강보조식품 소매업,전북,2019,1,2,5,5,90000,5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
강원,과실 및 채소 소매업,제주,2019,1,1,5,3,187000,4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
강원,기타 외국식 음식점업,세종,2019,1,1,2,3,158705,2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
충북,호텔업,충남,2019,1,1,4,3,135500,4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
충북,화장품 및 방향제 소매업,대구,2019,1,2,1,3,31300,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
충북,화장품 및 방향제 소매업,제주,2019,1,2,1,3,46530,3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
충북,화장품 및 방향제 소매업,충남,2019,1,5,5,16,487840,22,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [398]:
# Submission rows for card region '강원' and industry '휴양콘도 운영업'.
submission.loc[
    submission['CARD_SIDO_NM'].eq('강원')
    & submission['STD_CLSS_NM'].eq('휴양콘도 운영업')
]

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
40,202004,강원,휴양콘도 운영업,868499418.0
737,202007,강원,휴양콘도 운영업,998256835.0


In [395]:
# First 50 submission rows whose AMT is exactly 0.
submission.loc[submission['AMT'].eq(0)].head(n=50)

Unnamed: 0_level_0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,202004,강원,그외 기타 분류안된 오락관련 서비스업,0.0
30,202004,강원,정기 항공 운송업,0.0
45,202004,경기,그외 기타 분류안된 오락관련 서비스업,0.0
55,202004,경기,면세점,0.0
71,202004,경기,정기 항공 운송업,0.0
96,202004,경남,면세점,0.0
112,202004,경남,정기 항공 운송업,0.0
137,202004,경북,면세점,0.0
146,202004,경북,여행사업,0.0
153,202004,경북,정기 항공 운송업,0.0
