## **RF_modeling** 
I1열 결측치 채우는 모델링

In [1]:
# required package import
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report  

import warnings  
warnings.filterwarnings('ignore')

In [4]:
# load data
train_df = pd.read_csv('../../data/preprocessed_data/train_data.csv')
valid_df = pd.read_csv('../../data/preprocessed_data/valid_data.csv')

In [5]:
x_train = train_df.drop(columns = ['cust_no', 'label'])
x_valid = valid_df.drop(columns = ['cust_no', 'label'])
y_train = train_df['label']
y_valid = valid_df['label']

print("x_train.shape: ", x_train.shape)
print("x_valid.shape: ", x_valid.shape)
print("y_train.shape: ", y_train.shape)
print("y_valid.shape: ", y_valid.shape)

x_train.shape:  (125290, 83)
x_valid.shape:  (10000, 83)
y_train.shape:  (125290,)
y_valid.shape:  (10000,)


### I1 결측치 채우기

In [6]:
# data for train
tmp_train = x_train.dropna()
tmp_train_x = tmp_train.drop('I1', axis = 1)
tmp_train_y = tmp_train['I1']

tmp_valid = x_valid.dropna()
tmp_valid_x = tmp_valid.drop('I1', axis = 1)
tmp_valid_y = tmp_valid['I1']

# model training
# (train note)
# logistic regression : no convergence
# random forest(default) : train acc = 1.0 , valid acc = 0.65
# random forest(n_estimators = 100, max_depth = 15) : train acc = 0.77, valid acc = 0.63
# random forest(n_estimators = 500(default), max_depth = 15) : train acc = 0.77, valid acc = 0.64
start_T = time.time()
rf = RandomForestClassifier(max_depth = 15)
rf.fit(tmp_train_x, tmp_train_y)
end_T = time.time()

print("model training time : ", end_T - start_T)

# prediction and evaluation
# train data
tmp_train_pred_y = rf.predict(tmp_train_x)
print('Accuracy: {:.2f}'.format(accuracy_score(tmp_train_y, tmp_train_pred_y)))
print(confusion_matrix(tmp_train_y, tmp_train_pred_y))
print(classification_report(tmp_train_y, tmp_train_pred_y))

# valid data
tmp_valid_pred_y = rf.predict(tmp_valid_x)
print('Accuracy: {:.2f}'.format(accuracy_score(tmp_valid_y, tmp_valid_pred_y)))
print(confusion_matrix(tmp_valid_y, tmp_valid_pred_y))
print(classification_report(tmp_valid_y, tmp_valid_pred_y))

model training time :  22.604636907577515
Accuracy: 0.78
[[60242  7632]
 [20415 36947]]
              precision    recall  f1-score   support

         0.0       0.75      0.89      0.81     67874
         1.0       0.83      0.64      0.72     57362

    accuracy                           0.78    125236
   macro avg       0.79      0.77      0.77    125236
weighted avg       0.78      0.78      0.77    125236

Accuracy: 0.64
[[4144 1295]
 [2301 2257]]
              precision    recall  f1-score   support

         0.0       0.64      0.76      0.70      5439
         1.0       0.64      0.50      0.56      4558

    accuracy                           0.64      9997
   macro avg       0.64      0.63      0.63      9997
weighted avg       0.64      0.64      0.63      9997



In [7]:
# data having null data
x_train.loc[x_train.I1.isna(), 'I1'] = rf.predict(x_train[x_train.I1.isna()].drop('I1', axis = 1))
x_valid.loc[x_valid.I1.isna(), 'I1'] = rf.predict(x_valid[x_valid.I1.isna()].drop('I1', axis = 1))

In [8]:
print("train null data num : ", x_train.isna().sum().sum())
print("valid null data num : ", x_valid.isna().sum().sum())

train null data num :  0
valid null data num :  0


In [10]:
# tmp data save
x_train.to_csv('./tmp_data/x_train.csv', index = None)
x_valid.to_csv('./tmp_data/x_valid.csv', index = None)
y_train.to_csv('./tmp_data/y_train.csv', index = None)
y_valid.to_csv('./tmp_data/y_valid.csv', index = None)

## **RF_modeling**

In [12]:
x_train = pd.read_csv('./tmp_data/x_train.csv') 
x_valid = pd.read_csv('./tmp_data/x_valid.csv')  
y_train = pd.read_csv('./tmp_data/y_train.csv')
y_valid = pd.read_csv('./tmp_data/y_valid.csv')  

In [14]:
model = RandomForestClassifier()  
model 

RandomForestClassifier()

In [15]:
start_T = time.time()  
rf_clf = RandomForestClassifier(random_state = 0)  
rf_clf.fit(x_train, y_train)  
end_T = time.time()
pred = rf_clf.predict(x_valid)  
accuracy = accuracy_score(y_valid, pred)  
print('랜덤 포레스트 정확도 : {:.4f}'.format(accuracy))  

  
print("model training time : ", end_T - start_T)

  rf_clf.fit(x_train, y_train)


랜덤 포레스트 정확도 : 0.7230
model training time :  33.23121213912964


### **GridSearchCV를 통한 랜덤포레스트의 하이퍼 파라미터 튜닝**

In [23]:
from sklearn.model_selection import GridSearchCV  

params = {'n_estimators' : [200,300], 
          'max_features' : ['auto', 'sqrt', 'log2'],
          'max_depth' : [8,10,12],
          'min_samples_leaf' :[8,12],
          }  

rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)  
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 3, n_jobs = -1)  
grid_cv.fit(x_train, y_train)  


print('최적의 하이퍼 파라미터 : ', grid_cv.best_params_)  
print('최고 예측 정확도 : {:.4f}'.format(grid_cv.best_score_))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

최적의 하이퍼 파라미터 :  {'max_depth': 12, 'max_features': 'auto', 'min_samples_leaf': 8, 'n_estimators': 200}
최고 예측 정확도 : 0.7017


In [None]:
''' 
rf_c1f1 = RandomForestClassifier(n_estimators = , 
                                 max_depth = ,
                                 max_features = , 
                                 min_samples_leaf = , 
                                 random_state = 0, 
                                 n_jobs = -1)  
rf_c1f1.fit(x_train, y_train)  
pred = rf_c1f1.predict(x_valid)  
print('예측 정확도 : {:.4f}'.format(accuracy_score(y_valid,pred)))
'''

### **RF 각 피처의 중요도 시각화** 

In [None]:

import matplotlib.pyplot as plt 
import seaborn as sns  
%matplotlib inline 
ftr_importances_values = rf_clf.feature_importances_  
ftr_importances = pd.Series(ftr_importances_values, index = x_train.columns)  
ftr_top20 = ftr_importances_values.sort_values(ascending = False)[:20]  

plt.figure(figsize=(8,6))  
plt.tile('Top 20 Features Importances')  
sns.barplot(x = ftr_top20, y = ftr_top20.index)  
plt.show()  
