In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from dateutil.relativedelta import relativedelta
from datetime import *

In [None]:
# Show every column and every row when a DataFrame is displayed (no truncation).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
import os
# Display the current working directory; the relative data paths used below are resolved against it.
os.getcwd()

## 서비스 데이터 읽기 : 69,708 rows, 14 columns
  - Repurchase 정보 포함

In [None]:
# Load the training service (subscription) table, parsing both date columns as datetimes.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 — confirm the target pandas version.
df_service = pd.read_csv('Train/train_service.csv', parse_dates=['registerdate', 'enddate'], infer_datetime_format=True)
# Preview three random rows.
df_service.sample(3)

In [None]:
# Print column dtypes and non-null counts of the raw service table.
df_service.info()

## 서비스 데이터 Unique Value

In [None]:
# For every column, print how many distinct values it has and the first ten of them.
for column in df_service.columns:
    distinct_values = df_service[column].unique()
    print(f'Column <{column}>', len(distinct_values))
    print(distinct_values[:10], '\n')

## 서비스 데이터 결측치 및 이상치 처리

In [None]:
# Work on an independent copy so the cleaning steps below never modify the raw `df_service`
# frame (a plain assignment would only create a second name for the same object).
filled_service = df_service.copy()

In [None]:
# Missing-value summary: null count and null percentage per column, most-missing first.
service_missing_info = (
    filled_service.isnull().sum()
    .sort_values(ascending=False)
    .to_frame('missing num')
)
service_missing_info['%'] = service_missing_info['missing num'] / len(filled_service) * 100
service_missing_info

In [None]:
# gender: replace nulls with the placeholder category 'N', then show the resulting category counts.
filled_service['gender'] = filled_service['gender'].fillna('N')
filled_service['gender'].value_counts()

In [None]:
# agegroup contains outlier codes (age 0 and age 950); inspect the distribution before replacing them.
filled_service['agegroup'].value_counts().sort_index()

In [None]:
# (rows, columns) of the service table being cleaned.
filled_service.shape

In [None]:
# Mean agegroup over plausible values only (strictly greater than 0 and strictly less than 950).
age_mean = filled_service.loc[(filled_service['agegroup']<950) & (filled_service['agegroup']>0), 'agegroup'].mean()
age_mean

In [None]:
# Overwrite the two outlier codes (950 and 0) with age group 30, then re-check the distribution.
# NOTE(review): the literal 30 is used here rather than the `age_mean` value computed earlier — confirm this is intended.
filled_service['agegroup'] = filled_service['agegroup'].replace([950, 0], 30)
filled_service['agegroup'].value_counts().sort_index()

In [None]:
# pgamount mixes US-dollar and KRW amounts; inspect the value distribution before converting.
filled_service['pgamount'].value_counts().sort_index()

In [None]:
# Amounts below 100 are treated as US dollars and multiplied by 1120 to express them in KRW.
# The right-hand side is the full column; `.loc` aligns on the index, so only the masked rows are written.
# NOTE(review): the 100 threshold and the 1120 rate are hard-coded — confirm both.
filled_service.loc[(filled_service['pgamount'] < 100), 'pgamount'] = filled_service['pgamount'] * 1120
filled_service['pgamount'].value_counts().sort_index()

In [None]:
# Remaining nulls are going to be filled with 'X' (those columns hold either 'O' or null); show null counts per column first.
filled_service.isnull().sum()

In [None]:
# Fill every remaining null, in every column, with the string 'X', then verify that no nulls are left.
filled_service = filled_service.fillna('X')
filled_service.isnull().sum()

In [None]:
# Display the current working directory again.
os.getcwd()

In [None]:
# Preview the first five cleaned rows.
filled_service.head(5)

## numeric으로 전환

In [None]:
# Build the modelling frame from the cleaned data: rename the target column
# 'Repurchase' -> 'Unsubscription' and recode it as 'O' -> 0, 'X' -> 1.
# The recoded column is assigned back explicitly. Calling `replace(..., inplace=True)` on a
# selected column is chained assignment, which pandas deprecates and which leaves the frame
# unchanged under copy-on-write.
numeric_service = filled_service.rename(columns={'Repurchase': 'Unsubscription'})
numeric_service['Unsubscription'] = numeric_service['Unsubscription'].replace({'O': 0, 'X': 1})
numeric_service.head(5)

In [None]:
# Print dtypes and non-null counts after recoding the target.
numeric_service.info()

In [None]:
# Label-encode every remaining object (string) column. The identifier columns 'uno' and
# 'productcode' are excluded and keep their raw values.
column_list = list(numeric_service.columns)
for excluded in ('uno', 'productcode'):
    column_list.remove(excluded)

for col in column_list:
    if numeric_service[col].dtype != 'object':
        continue
    lbl = LabelEncoder()
    numeric_service[col] = lbl.fit_transform(numeric_service[col].values)

numeric_service.sample(5)

In [None]:
# Print dtypes again to confirm the label-encoded columns are now numeric.
numeric_service.info()

## numeric_service 저장

In [None]:
# numeric_service.to_csv(path_or_buf="Train/numeric_service.csv", index=False)

## numeric_service 불러오기

In [None]:
# numeric_service = pd.read_csv("Train/numeric_service.csv")

In [None]:
# numeric_service

In [None]:
# numeric_service.columns

In [None]:
# Print dtypes and non-null counts of the frame used for feature engineering below.
numeric_service.info()

## 시청 이력 데이터 읽기 : 7,987,609 rows

In [None]:
# Load the training viewing-history (bookmark) table, parsing 'dates' as datetime.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 — confirm the target pandas version.
df_bookmark = pd.read_csv("Train/train_bookmark.csv", parse_dates=['dates'], infer_datetime_format=True)


In [None]:
# Take the customer id ('uno') of the first bookmark row and show all viewing records of that customer.
ex_uno = df_bookmark.loc[0, 'uno']
df_bookmark[df_bookmark['uno']==ex_uno]

In [None]:
# Print column dtypes and non-null counts of the viewing-history table.
df_bookmark.info()

## 시청 이력 데이터 Unique Value

In [None]:
# For every bookmark column, print how many distinct values it has and the first ten of them.
for column in df_bookmark.columns:
    distinct_values = df_bookmark[column].unique()
    print(f'Column <{column}>', len(distinct_values))
    print(distinct_values[:10], '\n')

## 시청 이력 데이터 가공

In [None]:
# Per-customer aggregate features for the training data. Every frame below has one row per
# customer id ('uno') plus one feature column.

# (1) REG_CNT: number of subscription records per customer
df_feature_1 = (
    numeric_service.groupby(by='uno', as_index=False).registerdate.count()
    .rename(columns={'registerdate': 'REG_CNT'})
)

# (2) PRD_CNT: number of distinct products per customer.
#     The de-duplicated (uno, productcode) pairs must be the frame that is grouped. Grouping
#     `numeric_service` directly counts every subscription record instead of distinct products.
#     Keep this definition in sync with the PRD_CNT computed for the prediction data.
df_feature_2 = (
    numeric_service[['uno', 'productcode']]
    .drop_duplicates()  # one row per (customer, product)
    .groupby(by='uno', as_index=False).productcode.count()
    .rename(columns={'productcode': 'PRD_CNT'})
)

# (3) BM_CNT: number of viewing records per customer (records are hourly)
df_feature_3 = (
    df_bookmark.groupby(by='uno', as_index=False).dates.count()
    .rename(columns={'dates': 'BM_CNT'})
)

# (4) VT_TOT: total viewing time per customer
df_feature_4 = (
    df_bookmark.groupby(by='uno', as_index=False).viewtime.sum()
    .rename(columns={'viewtime': 'VT_TOT'})
)

# (5) VT_AVG: mean viewing time per viewing record, per customer
df_feature_5 = (
    df_bookmark.groupby(by='uno', as_index=False).viewtime.mean()
    .rename(columns={'viewtime': 'VT_AVG'})
)

# (6) CH_CNT: number of distinct channel types per customer
df_feature_6 = (
    df_bookmark[['uno', 'channeltype']]
    .drop_duplicates()  # one row per (customer, channel type)
    .groupby(by='uno', as_index=False).channeltype.count()
    .rename(columns={'channeltype': 'CH_CNT'})
)

# (7) PRG_CNT: number of distinct programmes per customer
df_feature_7 = (
    df_bookmark[['uno', 'programid']]
    .drop_duplicates()  # one row per (customer, programme)
    .groupby(by='uno', as_index=False).programid.count()
    .rename(columns={'programid': 'PRG_CNT'})
)

# (8) DEV_CNT: number of distinct device types per customer
df_feature_8 = (
    df_bookmark[['uno', 'devicetype']]
    .drop_duplicates()  # one row per (customer, device type)
    .groupby(by='uno', as_index=False).devicetype.count()
    .rename(columns={'devicetype': 'DEV_CNT'})
)

In [None]:
# (9) REG_END: subscription length in whole days (enddate - registerdate), keyed by customer
#     and registration date so it can be merged back onto each subscription record.
# `.copy()` makes this an independent frame, so adding REG_END is not a write on a slice of
# `numeric_service`. `.dt.days` reads the day component of the timedelta directly instead of
# converting it to a string and cutting at ' days'. A missing date now yields NaN rather than
# failing in the integer cast.
df_feature_9 = numeric_service[['uno', 'registerdate', 'enddate']].copy()
df_feature_9['REG_END'] = (df_feature_9['enddate'] - df_feature_9['registerdate']).dt.days
df_feature_9 = df_feature_9[['uno', 'registerdate', 'REG_END']]

In [None]:
# # (10) enddate - lastview

# end_enddate = numeric_service.groupby(by='uno', as_index=False).enddate.max()
# lastview = df_bookmark.groupby(by='uno', as_index=False).dates.max()

# lastview_enddate = pd.merge(end_enddate, lastview, how='left')
# lastview_enddate.rename(columns={'dates':'last_view'}, inplace=True)
# lastview_enddate['LAST_V'] = lastview_enddate['enddate'] - lastview_enddate['last_view']
# lastview_enddate['LAST_V'] = lastview_enddate['LAST_V'].dt.days

# # lastview_enddate['LAST_V'] = lastview_enddate['LAST_V'].fillna(int(40))

# df_feature_10 = lastview_enddate[['uno', 'LAST_V']]

# df_feature_10.loc[df_feature_10['LAST_V']<20, 'LAST_V'] = 0
# df_feature_10.loc[df_feature_10['LAST_V']>=20, 'LAST_V'] = 1
# df_feature_10['LAST_V'] = df_feature_10['LAST_V'].fillna(2)

In [None]:
# Attach the per-customer features (1)-(8) by 'uno', then the per-subscription REG_END feature
# by ('uno', 'registerdate'). Left joins keep every service record.
service_bookmark = numeric_service.copy()
feature_list = [df_feature_1, df_feature_2, df_feature_3, df_feature_4,
                df_feature_5, df_feature_6, df_feature_7, df_feature_8]

for feature in feature_list:
    service_bookmark = service_bookmark.merge(feature, how='left', on="uno")

service_bookmark = service_bookmark.merge(df_feature_9, how='left', on=["uno", 'registerdate'])

# The two lengths printed last should be equal; a longer merged frame means a join matched
# more than one feature row per service record.
print(service_bookmark.columns)
print(len(service_bookmark))
print(len(numeric_service))

In [None]:
# Customers without any viewing record get no bookmark-derived values from the left joins;
# treat them as zero activity.
bookmark_feature_columns = ['BM_CNT', 'VT_TOT', 'VT_AVG', 'CH_CNT', 'PRG_CNT', 'DEV_CNT']
service_bookmark[bookmark_feature_columns] = service_bookmark[bookmark_feature_columns].fillna(0)

In [None]:
# Print dtypes and non-null counts of the merged training frame.
service_bookmark.info()

## Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Preview five random rows before scaling.
service_bookmark.sample(5)

In [None]:
# Min-max scale the selected feature columns in place. The fitted `scaler` is reused later
# to transform the prediction data with the same per-column minimum and maximum.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so held-out rows
# influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
selected_columns = ['pgamount', 'chargetypeid', 'concurrentwatchcount', 'promo_100', 'coinReceived',
                  'devicetypeid', 'isauth', 'gender', 'agegroup', 'REG_CNT', 'PRD_CNT', 'BM_CNT', 
                  'VT_TOT', 'VT_AVG', 'CH_CNT', 'PRG_CNT', 'DEV_CNT', 'REG_END']

service_bookmark[selected_columns] = scaler.fit_transform(service_bookmark[selected_columns])

In [None]:
# Summary statistics after scaling. The bare `service_bookmark` expression that used to precede
# this line was removed: only the last expression of a cell is displayed, so it had no effect.
service_bookmark.describe()

## Correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Correlation heatmap over the numeric columns only. The identifier strings and datetime
# columns are excluded explicitly because `DataFrame.corr()` raises on non-numeric data in
# pandas >= 2.0 (older versions dropped such columns silently).
plt.figure(figsize=(18,10))
sns.heatmap(service_bookmark.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Histogram of every column that `DataFrame.hist` can plot, to inspect the distributions.
service_bookmark.hist(figsize=(20,16))
plt.show()

In [None]:
# Snapshot of the merged and scaled training frame, taken before the augmentation step below.
save = service_bookmark.copy()

## Data Augmentation

In [None]:
# Oversample the positive class: append a second copy of every row with Unsubscription == 1.
# The appended rows keep their original index labels, so the index is no longer unique.
# NOTE(review): this runs before the train/test split; unless the split accounts for it,
# copies of one row can end up on both sides of the split.
plus = service_bookmark[service_bookmark['Unsubscription']==1]
service_bookmark = pd.concat([service_bookmark, plus], axis=0)

In [None]:
# (rows, columns) after appending the duplicated positive rows.
service_bookmark.shape

In [None]:
# Preview ten random rows of the augmented frame.
service_bookmark.sample(10)

## Feature 선별

In [None]:
# List the available columns before selecting features.
service_bookmark.columns

In [None]:
# Rank the columns by absolute correlation with the target. Only numeric columns are used,
# because `DataFrame.corr()` raises on string/datetime columns in pandas >= 2.0.
service_bookmark.select_dtypes(include='number').corr().abs().sort_values(by='Unsubscription', ascending=False)['Unsubscription']

In [None]:
# service_bookmark = service_bookmark[['uno', 'registerdate', 'enddate', 'productcode', 'pgamount',
#        'chargetypeid', 'concurrentwatchcount', 'promo_100', 'coinReceived',
#        'Unsubscription', 'devicetypeid', 'gender', 'agegroup',
#        'BM_CNT', 'VT_TOT', 'VT_AVG', 'CH_CNT', 'PRG_CNT',
#        'DEV_CNT', 'REG_END']]

## predict service data 가공

In [None]:
# Load the prediction-period service and viewing-history tables, parsing their date columns.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 — confirm the target pandas version.
predict_service = pd.read_csv("Predict/predict_service.csv", parse_dates=['registerdate','enddate'], infer_datetime_format=True)
predict_bookmark = pd.read_csv("Predict/predict_bookmark.csv", parse_dates=['dates'], infer_datetime_format=True)

# Work on copies so the frames as loaded stay untouched.
df_predict_service = predict_service.copy()
df_predict_bookmark = predict_bookmark.copy()

In [None]:
# predict_missing_info = pd.DataFrame(predict_service.isnull().sum().sort_values(ascending=False), columns=['missing num'])
# predict_missing_info['%'] = predict_missing_info['missing num']/predict_service.shape[0]*100
# predict_missing_info

In [None]:
# Clean the prediction service data with the same rules that were applied to the training data.
# Only the last expression of a cell is displayed, so the intermediate inspection expressions
# (value_counts, isnull) are not repeated here.

# gender: null -> 'N'
df_predict_service['gender'] = df_predict_service['gender'].fillna('N')

# agegroup: mean over plausible values (strictly between 0 and 950)
plausible_age = (df_predict_service['agegroup'] < 950) & (df_predict_service['agegroup'] > 0)
age_mean = df_predict_service.loc[plausible_age, 'agegroup'].mean()

# agegroup: the outlier codes 950 and 0 are overwritten with age group 30
df_predict_service['agegroup'] = df_predict_service['agegroup'].replace([950, 0], 30)

# pgamount: amounts below 100 are treated as US dollars and converted to KRW at a rate of 1120
looks_like_usd = df_predict_service['pgamount'] < 100
df_predict_service.loc[looks_like_usd, 'pgamount'] = df_predict_service['pgamount'] * 1120

# Every remaining null (columns holding 'O' or null) becomes 'X'; then verify nothing is missing
df_predict_service = df_predict_service.fillna('X')
df_predict_service.isnull().sum()

In [None]:
# Rename the target column and recode it as 'O' -> 0, 'X' -> 1. The result is assigned back
# explicitly: `replace(..., inplace=True)` on a selected column is chained assignment, which
# pandas deprecates and which leaves the frame unchanged under copy-on-write.
df_predict_service = df_predict_service.rename(columns={'Repurchase': 'Unsubscription'})
df_predict_service['Unsubscription'] = df_predict_service['Unsubscription'].replace({'O': 0, 'X': 1})

column_list = df_predict_service.columns.values.tolist()
column_list.remove('uno')
column_list.remove('productcode')

# Encode string columns with the SAME integer codes the training data received.
# A LabelEncoder numbers the sorted distinct values of whatever it is fitted on, so fitting a
# fresh encoder on the prediction data gives different codes whenever its set of categories
# differs from the training set. `filled_service` still holds the cleaned training values
# before encoding, so its sorted distinct values reproduce the training code table.
for col in column_list:
    if df_predict_service[col].dtype != 'object':
        continue
    if col in filled_service.columns and filled_service[col].dtype == 'object':
        train_classes = sorted(filled_service[col].unique())
        codes = pd.Categorical(df_predict_service[col], categories=train_classes).codes
        if (codes == -1).any():
            # Values that never occurred in training have no code; they are encoded as -1.
            print(f'Column <{col}>: values not seen in the training data were encoded as -1')
        df_predict_service[col] = codes.astype('int64')
    else:
        # No string column of that name in the training data: fall back to a local encoding.
        lbl = LabelEncoder()
        df_predict_service[col] = lbl.fit_transform(df_predict_service[col].values)

df_predict_service.info()

## predict bookmark data 가공

In [None]:
# Per-customer aggregate features for the prediction data. Every frame below has one row per
# customer id ('uno') plus one feature column.

# (1) REG_CNT: number of subscription records per customer
pd_feature_1 = (
    df_predict_service.groupby(by='uno', as_index=False).registerdate.count()
    .rename(columns={'registerdate': 'REG_CNT'})
)

# (2) PRD_CNT: number of distinct products per customer.
#     The de-duplicated (uno, productcode) pairs must be the frame that is grouped. Grouping
#     `df_predict_service` directly counts every subscription record instead of distinct products.
#     Keep this definition in sync with the PRD_CNT computed for the training data.
pd_feature_2 = (
    df_predict_service[['uno', 'productcode']]
    .drop_duplicates()  # one row per (customer, product)
    .groupby(by='uno', as_index=False).productcode.count()
    .rename(columns={'productcode': 'PRD_CNT'})
)

# (3) BM_CNT: number of viewing records per customer (records are hourly)
pd_feature_3 = (
    df_predict_bookmark.groupby(by='uno', as_index=False).dates.count()
    .rename(columns={'dates': 'BM_CNT'})
)

# (4) VT_TOT: total viewing time per customer
pd_feature_4 = (
    df_predict_bookmark.groupby(by='uno', as_index=False).viewtime.sum()
    .rename(columns={'viewtime': 'VT_TOT'})
)

# (5) VT_AVG: mean viewing time per viewing record, per customer
pd_feature_5 = (
    df_predict_bookmark.groupby(by='uno', as_index=False).viewtime.mean()
    .rename(columns={'viewtime': 'VT_AVG'})
)

# (6) CH_CNT: number of distinct channel types per customer
pd_feature_6 = (
    df_predict_bookmark[['uno', 'channeltype']]
    .drop_duplicates()  # one row per (customer, channel type)
    .groupby(by='uno', as_index=False).channeltype.count()
    .rename(columns={'channeltype': 'CH_CNT'})
)

# (7) PRG_CNT: number of distinct programmes per customer
pd_feature_7 = (
    df_predict_bookmark[['uno', 'programid']]
    .drop_duplicates()  # one row per (customer, programme)
    .groupby(by='uno', as_index=False).programid.count()
    .rename(columns={'programid': 'PRG_CNT'})
)

# (8) DEV_CNT: number of distinct device types per customer
pd_feature_8 = (
    df_predict_bookmark[['uno', 'devicetype']]
    .drop_duplicates()  # one row per (customer, device type)
    .groupby(by='uno', as_index=False).devicetype.count()
    .rename(columns={'devicetype': 'DEV_CNT'})
)

In [None]:
# (9) REG_END: subscription length in whole days (enddate - registerdate), keyed by customer
#     and registration date so it can be merged back onto each subscription record.
# `.copy()` makes this an independent frame, so adding REG_END is not a write on a slice of
# `df_predict_service`. `.dt.days` reads the day component of the timedelta directly instead
# of converting it to a string and cutting at ' days'. A missing date now yields NaN rather
# than failing in the integer cast.
pd_feature_9 = df_predict_service[['uno', 'registerdate', 'enddate']].copy()
pd_feature_9['REG_END'] = (pd_feature_9['enddate'] - pd_feature_9['registerdate']).dt.days
pd_feature_9 = pd_feature_9[['uno', 'registerdate', 'REG_END']]

In [None]:
# # (10) enddate - lastview

# end_enddate = df_predict_service.groupby(by='uno', as_index=False).enddate.max()
# lastview = df_predict_bookmark.groupby(by='uno', as_index=False).dates.max()

# lastview_enddate = pd.merge(end_enddate, lastview, how='left')
# lastview_enddate.rename(columns={'dates':'last_view'}, inplace=True)
# lastview_enddate['LAST_V'] = lastview_enddate['enddate'] - lastview_enddate['last_view']
# lastview_enddate['LAST_V'] = lastview_enddate['LAST_V'].dt.days

# # lastview_enddate['LAST_V'] = lastview_enddate['LAST_V'].fillna(int(40))

# pd_feature_10 = lastview_enddate[['uno', 'LAST_V']]

# pd_feature_10.loc[pd_feature_10['LAST_V']<20, 'LAST_V'] = 0
# pd_feature_10.loc[pd_feature_10['LAST_V']>=20, 'LAST_V'] = 1
# pd_feature_10['LAST_V'] = pd_feature_10['LAST_V'].fillna(2)

In [None]:
# Attach the per-customer features (1)-(8) by 'uno', then the per-subscription REG_END feature
# by ('uno', 'registerdate'). Left joins keep every prediction record.
df_service_bookmark = df_predict_service.copy()
feature_list = [pd_feature_1, pd_feature_2, pd_feature_3, pd_feature_4,
                pd_feature_5, pd_feature_6, pd_feature_7, pd_feature_8]

for feature in feature_list:
    df_service_bookmark = df_service_bookmark.merge(feature, how='left', on=['uno'])

df_service_bookmark = df_service_bookmark.merge(pd_feature_9, how='left', on=['uno', 'registerdate'])

# The merged length should equal the input length; a longer merged frame means a join matched
# more than one feature row per record.
print(df_service_bookmark.columns)
print(len(df_predict_service))
print(len(df_service_bookmark))
print(len(df_predict_service))

# Customers without any viewing record get no bookmark-derived values; treat them as zero activity.
bookmark_feature_columns = ['BM_CNT', 'VT_TOT', 'VT_AVG', 'CH_CNT', 'PRG_CNT', 'DEV_CNT']
df_service_bookmark[bookmark_feature_columns] = df_service_bookmark[bookmark_feature_columns].fillna(0)

In [None]:
# Check that the prediction frame has exactly the training frame's columns, in the same order.
# `Index.equals` returns a single bool and also copes with frames that have a different number
# of columns, where the element-wise `==` raises instead of answering.
service_bookmark.columns.equals(df_service_bookmark.columns)

In [None]:
# Scale the prediction features with the scaler that was fitted on the training data
# (`transform` only — the scaler is not re-fitted here).
selected_columns = ['pgamount', 'chargetypeid', 'concurrentwatchcount', 'promo_100', 'coinReceived',
                  'devicetypeid', 'isauth', 'gender', 'agegroup', 'REG_CNT', 'PRD_CNT', 'BM_CNT', 
                  'VT_TOT', 'VT_AVG', 'CH_CNT', 'PRG_CNT', 'DEV_CNT', 'REG_END']

df_service_bookmark[selected_columns] = scaler.transform(df_service_bookmark[selected_columns])

In [None]:
# Summary statistics of the scaled prediction frame.
df_service_bookmark.describe()

## 예측용 데이터

In [None]:
# Feature matrix for the final prediction: drop the identifiers, the raw date columns and the target.
X_predict = df_service_bookmark.drop(['uno', 'registerdate', 'enddate', 'Unsubscription', 'productcode'], axis=1)

## Train data 쪼개기

In [None]:
# Split BEFORE oversampling.
# By this point `service_bookmark` already contains a second copy of every churn row, appended
# with its original index label. Splitting that frame directly lets copies of the same row land
# in both the training and the test set, which inflates every score measured on the test set.
# The frame had a unique index before the copies were appended, so dropping repeated index
# labels recovers exactly one instance of each original row.
base_rows = service_bookmark[~service_bookmark.index.duplicated(keep='first')]

X = base_rows.drop(['Unsubscription'], axis=1)
y = base_rows['Unsubscription']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

# Re-apply the oversampling to the training part only: one extra copy of every churn row.
# The test part keeps the original class balance.
churn_rows = y_train == 1
X_train = pd.concat([X_train, X_train[churn_rows]], axis=0)
y_train = pd.concat([y_train, y_train[churn_rows]], axis=0)

In [None]:
# Export the training frame as CSV. `to_csv` writes plain comma-separated text, so the file
# gets a `.csv` extension; with an `.xlsx` name spreadsheet tools reject it as a corrupt workbook.
# NOTE(review): if anything outside this notebook reads 'service_bookmark.xlsx', update that path as well.
service_bookmark.to_csv('service_bookmark.csv', index=False)

In [None]:
# Print the shapes of the split feature frames and target vectors.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
# Keep copies that still contain the identifier and date columns, which are dropped from
# X_train / X_test in the next step.
X_train_with = X_train.copy()
X_test_with = X_test.copy()

In [None]:
# Remove the identifier and raw date columns so that only model features remain.
non_feature_columns = ['uno', 'registerdate', 'enddate', 'productcode']
X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test.drop(columns=non_feature_columns)

In [None]:
# Print the shapes again after dropping the non-feature columns.
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

## Xgboost

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier, plot_importance

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

In [None]:
# Hyper-parameter grid for the XGBoost classifier: a fixed depth, three learning rates and
# three positive-class weights (gamma and reg_lambda are currently disabled).

param_grid = {
    'max_depth' : [6],
    'learning_rate' : [0.3, 0.1, 0.05],
#     'gamma' : [0, 0.05, 0.4, 0.5],
#     'reg_lambda' : [0, 1, 10, 20],
    'scale_pos_weight' : [1, 3, 5]    
}

# Exhaustive search over `param_grid`, scored by F1 with 3-fold cross-validation.
# NOTE(review): n_jobs=10 is hard-coded — confirm it suits the machine this runs on.
xgb_model = GridSearchCV(
    estimator=XGBClassifier(objective = 'binary:logistic',
                            subsample = 0.9,
                            colsample_bytree = 0.5),
    param_grid=param_grid,
    scoring='f1',
    verbose=2,
    n_jobs=10,
    cv=3
)

In [None]:
# Run the grid search defined above on the training split; `verbose=True` is forwarded to the
# estimator's fit call.

xgb_model.fit(X_train, y_train, verbose=True)

In [None]:
# Show the estimator refitted with the best parameter combination found by the search.
xgb_model.best_estimator_

In [None]:
# Keep the best estimator under its own name; it is used for evaluation and for the final prediction.
best_xgb = xgb_model.best_estimator_
best_xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=3, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Xgboost result

In [None]:
# Predict the held-out test split with the tuned model.
y_pred = best_xgb.predict(X_test)

In [None]:
# Report F1 and accuracy on the test split.
print("f1 score: ", f1_score(y_test, y_pred), "\nAccuracy: ", accuracy_score(y_test, y_pred))

[LAST_V 빼고]
* f1 score:  0.5419285604783075 
* Accuracy:  0.571367092239277

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

In [None]:
# Print every feature name next to its importance in the tuned XGBoost model.
col = X_train.columns
importance = best_xgb.feature_importances_

for feature_name, feature_importance in zip(col, importance):
    print(feature_name, feature_importance)

## REG_END < 30인 경우 예측값을 1로 변경

In [None]:
# X_test_with.sample(5)

In [None]:
# len(X_test_with), len(y_pred)

In [None]:
# X_test_with = X_test_with.reset_index()
# X_test_with.head(5)

In [None]:
# df_y_pred = pd.DataFrame(y_pred, columns=['y_pred'])
# test_pred = pd.concat([X_test_with[['uno', 'productcode', 'REG_END']], df_y_pred], axis=1)

In [None]:
# test_pred[test_pred['REG_END']<30]

## 답안 Predict

In [None]:
# Feed the final prediction data to the tuned model.
# Note that this reuses the name `y_pred`, replacing the test-split predictions.
y_pred = best_xgb.predict(X_predict)

# Check that the number of predictions equals the number of input rows.
print('예측 대상 건수 = ', len(X_predict), ', 예측 결과 건수 = ', len(y_pred))

In [None]:
# Load the submission answer sheet and discard its existing CHURN column; the predicted
# labels are attached in a later step.
df_sheet = pd.read_csv("Submission/CDS_submission.csv").drop(columns='CHURN')
df_sheet.info()

In [None]:
# Preview five random rows of the answer sheet.
df_sheet.sample(5)

In [None]:
# Write the predictions onto the answer sheet.
# `y_pred` is also reassigned by other cells of this notebook, so make sure it really is the
# prediction for `df_service_bookmark` before attaching it row by row.
if len(y_pred) != len(df_service_bookmark):
    raise ValueError(
        f'y_pred has {len(y_pred)} entries but the prediction frame has '
        f'{len(df_service_bookmark)} rows; re-run the final prediction cell first'
    )

df_result = df_service_bookmark[['uno', 'registerdate', 'productcode']].copy()
# KEY = uno|registerdate|productcode, matching the KEY column of the answer sheet.
# NOTE(review): '%y' is a 2-digit year and '%I' a 12-hour clock — confirm this is the format
# the sheet's KEY column uses.
df_result['KEY']   = df_result['uno'] + '|' + df_result['registerdate'].dt.strftime('%y-%m-%d %I:%M:%S') + '|' + df_result['productcode']
# Positional assignment: row i receives prediction i, independent of the frame's index labels.
df_result['CHURN'] = y_pred
df_result = df_result[['KEY', 'CHURN']]
# Left join keeps every sheet row; keys without a prediction get a null CHURN (visible in info()).
df_answer_sheet = pd.merge(df_sheet, df_result, on='KEY', how='left')
df_answer_sheet.info()

## 답안지 제출 파일 생성하기

In [None]:
# Write the completed answer sheet to the submission file (UTF-8, without the index column).
ds_answer_sheet = "CDS_submission_후렌치파이_3차.csv"
df_answer_sheet.to_csv(ds_answer_sheet, index=False, encoding='utf8')

In [None]:
# Preview the first rows only. Row truncation is disabled at the top of this notebook, so
# displaying the whole frame would render every row of the answer sheet.
df_answer_sheet.head(10)

In [None]:
# Read the written submission file back in to verify it.
submission1 = pd.read_csv("CDS_submission_후렌치파이_3차.csv")

In [None]:
# The reloaded file should have as many rows as the in-memory answer sheet.
len(submission1), len(df_answer_sheet)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
# Fit one decision tree per depth from 7 to 29 and print its test-set F1 and accuracy.
# After the loop, `dt_model` and `y_pred` belong to the last depth tried (29).
for depth in range(7, 30):
    dt_model = DecisionTreeClassifier(max_depth=depth, random_state=47)
    dt_model.fit(X_train, y_train)
    y_pred = dt_model.predict(X_test)
    print("\nmax_depth: ", depth, "\nf1 score: ", f1_score(y_test, y_pred), "\nAccuracy: ", accuracy_score(y_test, y_pred))

In [None]:
# Per-class report for the predictions of the last tree fitted in the loop above.
print(classification_report(y_test, y_pred))

In [None]:
# Number of feature importances, i.e. the number of features the tree was fitted on.
len(dt_model.feature_importances_)

In [None]:
# Map every feature name to its importance in the last fitted decision tree.
# Pairing the two sequences directly works for any number of features; the previous
# hard-coded count of 18 breaks as soon as a feature is added or removed.
columns = X_train.columns
importances = dt_model.feature_importances_
col_iptc = dict(zip(columns, importances))


In [None]:
# Feature names ordered from most to least important.
sorted(col_iptc, key=lambda x: col_iptc[x], reverse=True)

In [None]:
# Show the feature -> importance mapping.
col_iptc

In [None]:
# Confusion matrix for the predictions of the last tree fitted in the depth loop.
confusion_matrix(y_test, y_pred)

## code 데이터 읽기
  - content info
  - movie info

In [None]:
# Load the content code table and preview three random rows.
df_content_info = pd.read_csv("Code/content_info.csv")
df_content_info.sample(3)

In [None]:
# Load the movie code table and preview three random rows.
df_movie_info = pd.read_csv("Code/movie_info.csv")
df_movie_info.sample(3)