# Model Code Review

### 1. 사용하고자 하는 패키지 Import

In [1]:
import pandas as pd
import numpy as np

# Show up to 100 columns when displaying frames. The fully-qualified key is
# used because the bare 'max_columns' pattern is ambiguous on pandas versions
# that also define 'styler.render.max_columns' and raises OptionError there.
pd.set_option('display.max_columns', 100)

import warnings

# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

### 2. Data Loading And Data Preprocessing 

* EDA 결과 11월 데이터만 사용하기로 결정
* 추가적인 Feature 생성 및 데이터 전처리 작업 

In [2]:
# Read the three cp949-encoded CSV files in the order train, test, sample.
train, test, sub = (
    pd.read_csv(csv_name, encoding='cp949')
    for csv_name in ('train.csv', 'test.csv', 'sample.csv')
)

In [3]:
# Flat list of alternating entries: even positions hold the column names
# selected from the loaded frames, odd positions hold the Korean labels
# those columns are renamed to.
cols = ['base_ym','접수년월',
'dsas_ltwt_gcd','질병경중등급코드',
'kcd_gcd','KCD등급코드',
'dsas_acd_rst_dcd','질병구분코드',
'ar_rclss_cd','발생지역구분코드',
'blrs_cd','치료행위코드',
'mdct_inu_rclss_dcd','의료기관구분코드',
'nur_hosp_yn','요양병원여부',
'isrd_age_dcd','고객나이구분코드',
'smrtg_5y_passed_yn','부담보5년경과여부',
'urlb_fc_yn','부실모집설계사계약여부',
'mtad_cntr_yn','중도부가계약여부',
'heltp_pf_ntyn','건강인우대계약가입여부',
'fds_cust_yn','보험사기이력고객여부',
'prm_nvcd','보험료구간코드',
'inamt_nvcd','가입금액구간코드',
'optt_nbtm_s','통원횟수',
'bilg_isamt_s','청구보험금',
'hspz_dys_s','입원일수',
'ac_ctr_diff','청구일계약일간기간구분코드',
'ac_rst_diff','청구일부활일간기간구분코드',
'hsp_avg_hspz_bilg_isamt_s','병원별평균입원청구보험금',
'hsp_avg_optt_bilg_isamt_s','병원별평균통원청구보험금',
'hsp_avg_surop_bilg_isamt_s','병원별평균수술청구보험금',
'hsp_avg_diag_bilg_isamt_s','병원별평균진단청구보험금',
'dsas_avg_hspz_bilg_isamt_s','질병별평균입원청구보험금',
'dsas_avg_optt_bilg_isamt_s','질병별평균통원청구보험금',
'dsas_avg_surop_bilg_isamt_s','질병별평균수술청구보험금',
'dsas_avg_diag_bilg_isamt_s','질병별평균진단청구보험금',
'hspz_blcnt_s','입원청구건수',
'surop_blcnt_s','수술청구건수',
'optt_blcnt_s','통원청구건수']

In [4]:
# Split the interleaved `cols` list by position: even indices are the source
# column names (a_cols), odd indices are their replacement labels (b_cols).
# Extended slicing replaces the manual index/parity loop.
a_cols = cols[::2]
b_cols = cols[1::2]

In [5]:
# Keep only the source columns, relabel them with the Korean names, and
# restrict the training rows to those whose 접수년월 equals 201911.
train = train.loc[:, a_cols]
train.columns = b_cols
train = train[train['접수년월'] == 201911]

# The test frame gets the same column selection and relabelling (no row filter).
test = test.loc[:, a_cols]
test.columns = b_cols

# Stack the training rows on top of the test rows.
data = pd.concat([train, test])

In [6]:
# 1 when the 청구일부활일간기간구분코드 value is anything other than 0, else 0.
data['재가입여부'] = (data['청구일부활일간기간구분코드'] != 0).astype('int64')
# Sum of the three per-type claim counts.
data['총청구건수'] = (
    data['입원청구건수']
    + data['수술청구건수']
    + data['통원청구건수']
)
# Outpatient visit count plus inpatient day count.
data['통원_입원일수'] = data['통원횟수'] + data['입원일수']

In [7]:
def 입원(x) :
    """Return 0 when ``x`` is at most 7, otherwise 1."""
    return 0 if x <= 7 else 1
    
def 통원(x) :
    """Return 0 when ``x`` is at most 3 or lies in 8..11 (inclusive), otherwise 1."""
    above_low_band = not (x <= 3)
    outside_mid_band = not (8 <= x <= 11)
    return 1 if (above_low_band and outside_mid_band) else 0
    
def 수술(x) :
    """Return 0 when ``x`` is 1 or lies in 4..5, 8..9 or 12..13 (inclusive), otherwise 1."""
    # NOTE(review): x == 0 yields 1 here, whereas the other three flag helpers
    # in this cell yield 0 for 0 - confirm this is intended.
    zero_spans = ((1, 1), (4, 5), (8, 9), (12, 13))
    return 0 if any(low <= x <= high for low, high in zero_spans) else 1
    
def 진단(x) :
    """Return 1 when ``x % 2 == 1`` holds, otherwise 0."""
    return 1 if x % 2 == 1 else 0

# One 0/1 flag column per helper, each derived from the 치료행위코드 column.
data['행위_입원'] = data['치료행위코드'].map(입원)
data['행위_통원'] = data['치료행위코드'].map(통원)
data['행위_수술'] = data['치료행위코드'].map(수술)
data['행위_진단'] = data['치료행위코드'].map(진단)

# Absolute difference between the claimed amount and the sum of the
# per-hospital averages, each average weighted by its 0/1 flag.
data['병원대비청구보험금'] = (
    data['청구보험금']
    - (
        data['병원별평균입원청구보험금'] * data['행위_입원']
        + data['병원별평균통원청구보험금'] * data['행위_통원']
        + data['병원별평균수술청구보험금'] * data['행위_수술']
        + data['병원별평균진단청구보험금'] * data['행위_진단']
    )
).abs()

In [8]:
# Raw 0/1 flag matrix, shared by both weighted sums below.
treatment_flags = data[['행위_입원', '행위_통원', '행위_수술', '행위_진단']].values

# Flag-weighted row sums of the per-hospital and per-disease average amounts.
data['병원별_평균_청구액'] = (
    treatment_flags
    * data[['병원별평균입원청구보험금', '병원별평균통원청구보험금', '병원별평균수술청구보험금', '병원별평균진단청구보험금']].values
).sum(axis=1)
data['질병별_평균_청구액'] = (
    treatment_flags
    * data[['질병별평균입원청구보험금', '질병별평균통원청구보험금', '질병별평균수술청구보험금', '질병별평균진단청구보험금']].values
).sum(axis=1)

# Signed difference between the claimed amount and the per-disease weighted average.
data['청구-질병평균청구'] = data['청구보험금'] - data['질병별_평균_청구액']

In [9]:
# Mean 청구보험금 per 가입금액구간코드 group, computed on the training rows only.
f = (
    train.groupby(['가입금액구간코드'])['청구보험금']
    .agg([('m1', 'mean')])
    .reset_index()
)
features = [f]

# Left-join every aggregate table onto the combined frame by group key.
for f in features :
    data = data.merge(f, how='left', on=['가입금액구간코드'])

# Absolute gap between each row's claimed amount and its group's training mean.
data['diffs1'] = (data['m1'] - data['청구보험금']).abs()

---------------------------

### 3. 모델 적합 및 예측

In [10]:
# Convert and clean the data so it is ready for model fitting.
# Columns excluded from the feature matrix (identical for train and test).
drop_cols = ['접수년월','m1','건강인우대계약가입여부','요양병원여부','중도부가계약여부','행위_입원','행위_통원','행위_수술','행위_진단']
# The first `n_train` rows of `data` are the training rows, the rest are test rows.
n_train = train.shape[0]

X_train = data.iloc[:n_train, :].drop(drop_cols, axis=1)
# Labels are re-read from train.csv because `train` was reduced to the columns
# listed in `cols`, which do not include the target. The row filter must be
# the same equality filter used to build `train` (base_ym == 201911); the
# previous `>=` filter would select extra rows and misalign X_train / y_train
# whenever the file contains later months.
y_train = pd.read_csv('train.csv',encoding='cp949').query("base_ym == 201911").target
X_test = data.iloc[n_train:, :].drop(drop_cols, axis=1).fillna(0)
y_test = pd.read_csv('y_test.csv',encoding='cp949').target

In [11]:
# Extra-Trees was selected: based on the EDA, rules are explored with a tree-based model.
# Because Extra-Trees is randomised, three models with different seeds are fitted
# and ensembled later to make the predictions more stable.
from sklearn.ensemble import ExtraTreesClassifier

# Build and fit one classifier per seed, in the order ext1, ext2, ext3
# (fit returns the estimator itself).
ext1, ext2, ext3 = (
    ExtraTreesClassifier(
        n_estimators=450, random_state=seed, n_jobs=-1, verbose=1
    ).fit(X_train, y_train)
    for seed in (5187, 1217, 701)
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    5.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    5.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    

ExtraTreesClassifier(n_estimators=450, n_jobs=-1, random_state=701, verbose=1)

In [12]:
# Class-probability predictions on the test matrix, one array per seed model.
y_pred1, y_pred2, y_pred3 = (
    fitted.predict_proba(X_test) for fitted in (ext1, ext2, ext3)
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.6s finished


In [13]:
# Alongside the rule-based search, an experience-based search is run as well.
## A model that looks up the target of the past (previous-month) record most similar
## to the incoming one is used to complement the rule-based model.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    metric='manhattan', weights='distance', n_neighbors=1, n_jobs=-1
)

# fit returns the estimator, so fitting and predicting can be chained.
knn_pred = knn.fit(X_train, y_train).predict_proba(X_test)

In [14]:
# Blend 15% KNN probabilities with 85% of the geometric mean of the three
# Extra-Trees probability arrays, then take the highest-scoring class per row.
y_pred = knn_pred * 0.15 + ((y_pred1 * y_pred2 * y_pred3) ** (1 / 3)) * 0.85
y_pred_ = pd.DataFrame(y_pred)
y_pred_1 = list(np.argmax(y_pred_.values, axis=1))

In [15]:
# Keep the 1-stage class predictions under a separate name: y_pred_1 is
# reassigned in the 2-stage section, and yyyy is read by the final comparison.
yyyy = y_pred_1

In [16]:
from sklearn.metrics import f1_score,precision_score

# F1 of the 1-stage predictions; average=None yields one score per class.
f1_score(y_test,np.array(y_pred_1), average=None)

array([0.83213392, 0.90947095, 0.893134  ])

In [17]:
# Unweighted mean of the per-class F1 scores of the 1-stage predictions.
f1_score(y_test,np.array(y_pred_1), average=None).mean()

0.8782462893477913

In [18]:
# Precision of the 1-stage predictions; average=None yields one score per class.
precision_score(y_test,np.array(y_pred_1), average=None)

array([0.80872738, 0.91540917, 0.94272355])

In [19]:
# Unweighted mean of the per-class precision scores of the 1-stage predictions.
precision_score(y_test,np.array(y_pred_1), average=None).mean()

0.8889533689450638

In [20]:
# Cross-tabulation of true labels (rows) against 1-stage predictions (columns);
# margins=True appends the 'All' totals.
pd.crosstab(y_test,np.array(y_pred_1), rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5319,885,3,6207
1,1251,12618,95,13964
2,7,281,1613,1901
All,6577,13784,1711,22072


-------------

# 2-Stage Model

In [21]:
# Give all four objects a fresh 0..n-1 index so that boolean masks built from
# one of them can be used to select rows of another.
X_train.index = range(X_train.shape[0])
X_test.index = range(X_test.shape[0])
y_train.index = range(y_train.shape[0])
y_test.index = range(y_test.shape[0])

In [22]:
# Stage-1 labels: every value above 1 is capped at 1, other values are kept.
y_train_value1 = y_train.map(lambda label: min(label, 1))
y_test_value1 = y_test.map(lambda label: min(label, 1))

In [23]:
# Stage-2 training set: only the rows whose label is above 0, with the labels
# shifted down by one.
X_train_value = X_train.loc[y_train > 0]
y_train_2 = y_train.loc[y_train > 0] - 1

In [24]:
# Extra-Trees was selected: based on the EDA, rules are explored with a tree-based model.
# Because Extra-Trees is randomised, three models with different seeds are fitted
# and ensembled later to make the predictions more stable.
from sklearn.ensemble import ExtraTreesClassifier

# Stage 1 is trained on the binary labels; one classifier per seed, fitted in
# the order ext1, ext2, ext3 (fit returns the estimator itself).
ext1, ext2, ext3 = (
    ExtraTreesClassifier(
        n_estimators=450, random_state=seed, n_jobs=-1, verbose=1
    ).fit(X_train, y_train_value1)
    for seed in (5187, 1217, 701)
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    3.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    4.4s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    

ExtraTreesClassifier(n_estimators=450, n_jobs=-1, random_state=701, verbose=1)

In [25]:
# Stage-1 class-probability predictions on the test matrix, one array per seed model.
y_pred1, y_pred2, y_pred3 = (
    fitted.predict_proba(X_test) for fitted in (ext1, ext2, ext3)
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.7s finished


In [26]:
# Alongside the rule-based search, an experience-based search is run as well.
## A model that looks up the target of the past (previous-month) record most similar
## to the incoming one is used to complement the rule-based model.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(
    metric='cosine', weights='distance', n_neighbors=1, n_jobs=-1
)

# Stage 1 uses the binary labels; fit returns the estimator, so the calls chain.
knn_pred = knn.fit(X_train, y_train_value1).predict_proba(X_test)

In [27]:
# Blend 15% KNN probabilities with 85% of the geometric mean of the three
# Extra-Trees probability arrays, then take the highest-scoring class per row.
y_pred = knn_pred * 0.15 + ((y_pred1 * y_pred2 * y_pred3) ** (1 / 3)) * 0.85
y_pred_ = pd.DataFrame(y_pred)
y_pred_1 = list(np.argmax(y_pred_.values, axis=1))

In [28]:
from sklearn.metrics import f1_score

# F1 of the stage-1 predictions against the binary test labels.
f1_score(y_test_value1,y_pred_1)

0.9302960119293125

In [29]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the binary test labels versus the stage-1 predictions.
confusion_matrix(y_test_value1,y_pred_1)

array([[ 5214,   993],
       [ 1204, 14661]], dtype=int64)

# Stage - 2

In [30]:
# Wrap the stage-1 class predictions in a one-column DataFrame (column label 0).
pred_1 = pd.DataFrame(y_pred_1)

In [31]:
# Only the test rows that stage 1 predicted as non-zero are passed on to stage 2.
X_test_2 = X_test[pred_1[0]>0]

In [32]:
# Stage-2 models: one Extra-Trees classifier per seed, trained on the rows
# with a non-zero label and fitted in the order ext4, ext5, ext6
# (fit returns the estimator itself).
ext4, ext5, ext6 = (
    ExtraTreesClassifier(
        n_estimators=450, random_state=seed, n_jobs=-1, verbose=1
    ).fit(X_train_value, y_train_2)
    for seed in (5187, 1217, 701)
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    3.5s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    

ExtraTreesClassifier(n_estimators=450, n_jobs=-1, random_state=701, verbose=1)

In [33]:
# Stage-2 class-probability predictions on the routed test rows, one array per seed model.
y_pred4, y_pred5, y_pred6 = (
    fitted.predict_proba(X_test_2) for fitted in (ext4, ext5, ext6)
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 450 out of 450 | elapsed:    0.4s finished


In [34]:
# Stage-2 nearest-neighbour model (two neighbours, distance-weighted, Manhattan metric).
knn_2 = KNeighborsClassifier(
    metric='manhattan', weights='distance', n_neighbors=2, n_jobs=-1
)

# fit returns the estimator, so fitting and predicting can be chained.
knn_pred_2 = knn_2.fit(X_train_value, y_train_2).predict_proba(X_test_2)

In [35]:
# Blend 15% KNN probabilities with 85% of the geometric mean of the three
# stage-2 Extra-Trees probability arrays, then take the highest-scoring class per row.
y_pred_2 = knn_pred_2 * 0.15 + ((y_pred4 * y_pred5 * y_pred6) ** (1 / 3)) * 0.85
y_pred_22 = pd.DataFrame(y_pred_2)
y_pred_222 = list(np.argmax(y_pred_22.values, axis=1))

In [36]:
# Pair each stage-2 prediction with the test-row label it belongs to, giving a
# frame with the columns 'index' and 'target'. The Series is built directly
# instead of temporarily adding and then dropping a 'target' column on
# X_test_2, which avoids mutating a frame obtained by boolean-indexing X_test.
y_predd_2 = pd.Series(y_pred_222, index=X_test_2.index, name='target').reset_index()

In [37]:
# Ground truth for the rows routed to stage 2: a true label of 0 is replaced
# by 1, then every label is shifted down by one.
y_test_2 = y_test.loc[pred_1[0] > 0].map(lambda label: label if label != 0 else 1) - 1

# F1 of the stage-2 predictions on those rows.
f1_score(y_test_2, y_predd_2['target'])

0.889259877573734

In [38]:
# Left-join the stage-2 predictions onto every stage-1 prediction by row label;
# rows without a stage-2 prediction get 0 after the fill.
y_pred_fin = pred_1.reset_index().merge(y_predd_2, how='left', on='index').fillna(0)
y_pred_fin.head()

Unnamed: 0,index,0,target
0,0,1,0.0
1,1,1,0.0
2,2,1,0.0
3,3,1,0.0
4,4,0,0.0


In [39]:
# Final 2-stage class = stage-1 prediction (column 0) + stage-2 prediction ('target').
# average=None yields one F1 score per class.
f1_score(y_test,(y_pred_fin[0] + y_pred_fin['target']).astype('int64'), average=None)

array([0.8259802 , 0.90735236, 0.88876529])

In [40]:
# Unweighted mean of the per-class F1 scores of the combined 2-stage predictions.
f1_score(y_test,(y_pred_fin[0] + y_pred_fin['target']).astype('int64'), average=None).mean()

0.874032618214669

In [41]:
# Per-class precision of the combined 2-stage predictions (stage-1 + stage-2 labels).
precision_score(y_test,(y_pred_fin[0] + y_pred_fin['target']).astype('int64'), average=None)

array([0.81240262, 0.90751486, 0.94277286])

In [42]:
# Unweighted mean of the per-class precision scores of the combined 2-stage predictions.
precision_score(y_test,(y_pred_fin[0] + y_pred_fin['target']).astype('int64'), average=None).mean()

0.8875634479854998

In [43]:
# Cross-tabulation of true labels (rows) against the combined 2-stage predictions
# (columns); margins=True appends the 'All' totals.
pd.crosstab(y_test,(y_pred_fin[0] + y_pred_fin['target']).astype('int64'), rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,2,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5214,990,3,6207
1,1202,12668,94,13964
2,2,301,1598,1901
All,6418,13959,1695,22072


In [47]:
# Composite score per pipeline: 0.7 * precision of the first class (index 0)
# plus 0.3 * mean of the per-class F1 scores.

# 2-stage pipeline: stage-1 prediction + stage-2 prediction.
qq = precision_score(y_test, (y_pred_fin[0] + y_pred_fin['target']).astype('int64'), average=None)[0]
aa = f1_score(y_test, (y_pred_fin[0] + y_pred_fin['target']).astype('int64'), average=None).mean()

# 1-stage pipeline: predictions saved earlier as `yyyy`.
ww = precision_score(y_test, np.array(yyyy), average=None)[0]
bb = f1_score(y_test, np.array(yyyy), average=None).mean()

print(f'2 stage {(0.7 * qq) + (0.3 * aa)}')
print(f'1 stage {(0.7 * ww) + (0.3 * bb)}')

2 stage 0.830891617810926
1 stage 0.8295830551181582
