In [1]:
# author=cxf
# date=2020-8-8
# file for model training
# Warning control: pull in the stdlib helper that installs a blanket filter.
from warnings import simplefilter

# No `category` is given, so this silences EVERY warning class for the rest
# of the session (not only FutureWarning).
simplefilter("ignore")

import pandas as pd
import numpy as np
import sklearn.model_selection as ms
import sklearn.ensemble as se
import matplotlib.pyplot as mp
import sklearn.utils as su

# Load the feature table and the label table; the first CSV column is the index.
df_input = pd.read_csv('../2.extract_features/input_run1_run2.csv', index_col=0)
df_output = pd.read_csv('../2.extract_features/output_run1_run2.csv', index_col=0)
x, y = df_input, df_output['max_cutoff']
header = x.columns

# Shuffle features and labels together (fixed seed), then cut positionally:
# the first 75% of rows form the training set, the remaining 25% the test set.
x, y = su.shuffle(x, y, random_state=999)
train_size = int(len(x) * 0.75)
train_x, test_x = x.iloc[:train_size], x.iloc[train_size:]
train_y, test_y = y.iloc[:train_size], y.iloc[train_size:]

# Exhaustive 5-fold grid search over tree depth (1..5) and forest size
# (10..190 step 10). The depth / size passed to the base estimator are only
# placeholders: the grid overrides both for every candidate.
param_grid = [{'max_depth': range(1, 6), 'n_estimators': range(10, 200, 10)}]
model = ms.GridSearchCV(
    se.RandomForestClassifier(max_depth=4, n_estimators=150, random_state=999),
    param_grid,
    cv=5,
)
model.fit(train_x, train_y)

# Report the winning parameters, their mean CV score and the refitted estimator.
print(model.best_params_, model.best_score_, model.best_estimator_, sep='\n')

{'max_depth': 4, 'n_estimators': 50}
0.7945876655747045
RandomForestClassifier(max_depth=4, n_estimators=50, random_state=999)


In [2]:
import pandas as pd
import numpy as np
import sklearn.model_selection as ms
import sklearn.ensemble as se
import matplotlib.pyplot as mp
import sklearn.utils as su
from sklearn import metrics 


# Metrics helpers; the alias `sm` is also used by later cells.
import sklearn.metrics as sm

# Reload features and labels so this cell can run without the grid-search cell.
df_input = pd.read_csv('../2.extract_features/input_run1_run2.csv', index_col=0)
df_output = pd.read_csv('../2.extract_features/output_run1_run2.csv', index_col=0)
x, y = df_input, df_output['max_cutoff']
header = x.columns

# Same seeded shuffle and positional 3:1 split as in the grid-search cell.
x, y = su.shuffle(x, y, random_state=999)
train_size = int(len(x) * 0.75)
train_x, test_x = x.iloc[:train_size], x.iloc[train_size:]
train_y, test_y = y.iloc[:train_size], y.iloc[train_size:]

# Build the forest with the parameters chosen by the grid search.
model = se.RandomForestClassifier(max_depth=4, n_estimators=50, random_state=999)

# 5-fold cross validation on the training set (weighted F1), then fit the
# final model on the whole training set.
score = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
model.fit(train_x, train_y)
print(f'5x cross validation mean score :{score.mean()}')

# Evaluate on the held-out test set.
pred_test_y = model.predict(test_x)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = sm.confusion_matrix(test_y, pred_test_y)
print('confusion matrix', cm, sep='\n')

# Per-class precision / recall / F1 summary.
cr = sm.classification_report(test_y, pred_test_y)
print('classification_report', cr, sep='\n')



5x cross validation mean score :0.7639587318727842
confusion matrix
[[  0  10   0   0   0   0   0   0]
 [  0 102   5   0   0   0   0   0]
 [  0  15  42   0   0   0   0   0]
 [  0   2  12   3   0   0   0   0]
 [  0   1   3   0   0   0   0   0]
 [  0   1   0   0   0   0   0   0]
 [  0   1   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0]]
classification_report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.77      0.95      0.85       107
           2       0.67      0.74      0.70        57
           3       1.00      0.18      0.30        17
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1

    accuracy                           0.74       198
   macro avg       0.30      0.23      0.23       198
weighted avg       0.70      0.74

In [3]:
# Persist the fitted model to disk so it can be reused without retraining.
# `model` is whatever estimator is currently bound to that name in the kernel.
import pickle

with open('RF_pick_cutoff.pkl', 'wb') as f:
    pickle.dump(model, f)

# Report success only after the `with` block has flushed and closed the file;
# if the dump or the close fails, the exception propagates and nothing is printed.
print('the model has been saved')

the model has been saved


In [4]:
# Round-trip check: load the pickled model back and predict on the FULL data
# set (train + test rows together), just to confirm the saved model works.
# Most of these rows were used for fitting, so the scores below are optimistic
# and are not a substitute for the held-out test metrics.
#
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook or another trusted source.
with open('RF_pick_cutoff.pkl', 'rb') as f:
    model = pickle.load(f)

# The file is closed at this point; prediction and scoring need only the model.
pred_y = model.predict(df_input)

# Select the label column explicitly instead of passing the whole DataFrame:
# once 'pred_cutoff' has been added to df_output, the frame has two columns
# and the metric functions would reject it as multi-output on a re-run.
cm = sm.confusion_matrix(df_output['max_cutoff'], pred_y)
print('confusion matrix')
print(cm)

# classification_report
cr = sm.classification_report(df_output['max_cutoff'], pred_y)
print('classification_report')
print(cr)

confusion matrix
[[  0  26   0   0   0   0   0   0   0]
 [  0 416  14   0   0   0   0   0   0]
 [  0  50 210   0   0   0   0   0   0]
 [  0   8  23  22   0   0   0   0   0]
 [  0   3  12   1   0   0   0   0   0]
 [  0   2   1   0   0   0   0   0   0]
 [  0   2   0   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0   0]]
classification_report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.82      0.97      0.89       430
           2       0.80      0.81      0.80       260
           3       0.96      0.42      0.58        53
           4       0.00      0.00      0.00        16
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1

    accuracy                           0.82       792
   macro avg 

In [5]:
# Attach the predictions next to the expected cutoff and write both to CSV
# so the predicted cutoff can be assessed downstream.
# Note: this adds (or overwrites) the 'pred_cutoff' column on df_output in place.
df_output['pred_cutoff'] = pred_y
df_output.to_csv('test_cutoff.csv')

# Export the single 'precise' feature column, with its column name as header.
df_feature = df_input.loc[:, 'precise']
df_feature.to_csv('test_feature.csv', header=True)

In [6]:
# Plain-text dump of the label table for a quick visual check; pandas
# truncates the middle rows of a long frame in this representation.
print(df_output)

           max_cutoff  pred_cutoff
sample                            
sample107           1            1
sample105           1            1
sample103           0            1
sample108           0            1
sample106           0            1
...               ...          ...
sample299           2            2
sample301           2            2
sample300           1            1
sample302           1            1
sample303           1            1

[792 rows x 2 columns]
