In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Raw GitHub URL of the CSV dataset that the next cell loads into a DataFrame.
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter04/Dataset/phpB0xrNj.csv'

In [3]:
# Fetch the CSV at file_url and parse it into a DataFrame.
df = pd.read_csv(file_url)

In [4]:
# Preview the first rows to sanity-check the load: numeric feature columns
# named f1, f2, ... followed by a quoted string 'class' label column.
df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f609,f610,f611,f612,f613,f614,f615,f616,f617,class
0,-0.4394,-0.093,0.1718,0.462,0.6226,0.4704,0.3578,0.0478,-0.1184,-0.231,...,0.4102,0.2052,0.3846,0.359,0.5898,0.3334,0.641,0.5898,-0.4872,'1'
1,-0.4348,-0.1198,0.2474,0.4036,0.5026,0.6328,0.4948,0.0338,-0.052,-0.1302,...,0.0,0.2954,0.2046,0.4772,0.0454,0.2046,0.4318,0.4546,-0.091,'1'
2,-0.233,0.2124,0.5014,0.5222,-0.3422,-0.584,-0.7168,-0.6342,-0.8614,-0.8318,...,-0.1112,-0.0476,-0.1746,0.0318,-0.0476,0.1112,0.254,0.1588,-0.4762,'2'
3,-0.3808,-0.0096,0.2602,0.2554,-0.429,-0.6746,-0.6868,-0.665,-0.841,-0.9614,...,-0.0504,-0.036,-0.1224,0.1366,0.295,0.0792,-0.0072,0.0936,-0.151,'2'
4,-0.3412,0.0946,0.6082,0.6216,-0.1622,-0.3784,-0.4324,-0.4358,-0.4966,-0.5406,...,0.1562,0.3124,0.25,-0.0938,0.1562,0.3124,0.3124,0.2188,-0.25,'3'


In [5]:
# Split off the target: pop() returns the 'class' column and removes it from
# df in place, so df no longer contains the label afterwards.
# NOTE: because df is mutated, re-running this cell without reloading df
# raises KeyError ('class' is already gone).
y = df.pop('class')

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    random_state=888,
)

In [7]:
def train_rf(X_train, y_train, random_state=888, n_estimators=10, max_depth=None, min_samples_leaf=1, max_features='sqrt'):
    """Build a RandomForestClassifier, fit it on the training data, and return it.

    Args:
        X_train: Training features, handed to ``fit`` unchanged.
        y_train: Training labels, handed to ``fit`` unchanged.
        random_state: Forwarded to the classifier's ``random_state``.
        n_estimators: Forwarded to the classifier's ``n_estimators``.
        max_depth: Forwarded to the classifier's ``max_depth``.
        min_samples_leaf: Forwarded to the classifier's ``min_samples_leaf``.
        max_features: Forwarded to the classifier's ``max_features``.

    Returns:
        The classifier instance after ``fit`` has been called on it.
    """
    # Collect the hyperparameters once so the constructor call stays short.
    hyperparams = dict(
        random_state=random_state,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
    )
    forest = RandomForestClassifier(**hyperparams)
    forest.fit(X_train, y_train)
    return forest

In [8]:
# Baseline model: train with train_rf's default hyperparameters.
rf_1 = train_rf(X_train, y_train)
# Show the estimator's full parameter set as the cell output.
rf_1.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 888,
 'verbose': 0,
 'warm_start': False}

In [0]:
def get_preds(rf_model, X_train, X_test):
    """Run ``rf_model.predict`` on the training and test features.

    Args:
        rf_model: Any object exposing a ``predict(X)`` method.
        X_train: Training features.
        X_test: Test features.

    Returns:
        A ``(train_preds, test_preds)`` tuple; the training set is
        predicted first, then the test set.
    """
    return rf_model.predict(X_train), rf_model.predict(X_test)

In [0]:
# Predict on both splits with the baseline model rf_1.
trn_preds, tst_preds = get_preds(rf_1, X_train, X_test)

In [0]:
def print_accuracy(y_train, y_test, train_preds, test_preds):
    """Compute, print, and return the training and test accuracy.

    Args:
        y_train: True labels of the training set.
        y_test: True labels of the test set.
        train_preds: Predictions for the training set.
        test_preds: Predictions for the test set.

    Returns:
        A ``(train_acc, test_acc)`` tuple of the two ``accuracy_score``
        results. Each score is also printed on its own line, training first.
    """
    scores = (
        accuracy_score(y_train, train_preds),
        accuracy_score(y_test, test_preds),
    )
    for score in scores:
        print(score)
    return scores

In [15]:
# Score the baseline predictions. The second return value is the test
# accuracy, so it is bound to tst_acc; binding it to tst_preds would replace
# the test predictions from get_preds with a single float.
trn_acc, tst_acc = print_accuracy(y_train, y_test, trn_preds, tst_preds)

0.9981674912955837
0.8935897435897436


In [0]:
def fit_predict_rf(X_train, X_test, y_train, y_test, random_state=888, n_estimators=10, max_depth=None, min_samples_leaf=1, max_features='sqrt'):
    """Train a forest, predict on both splits, and report accuracy in one call.

    Chains ``train_rf`` -> ``get_preds`` -> ``print_accuracy``; the two
    accuracy values are printed as a side effect of ``print_accuracy``.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        random_state: Forwarded to ``train_rf``.
        n_estimators: Forwarded to ``train_rf``.
        max_depth: Forwarded to ``train_rf``.
        min_samples_leaf: Forwarded to ``train_rf``.
        max_features: Forwarded to ``train_rf``.

    Returns:
        ``(rf_model, train_preds, test_preds, train_acc, test_acc)``.
    """
    # All hyperparameters are passed to train_rf by keyword.
    hyperparams = {
        'random_state': random_state,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features,
    }
    fitted = train_rf(X_train, y_train, **hyperparams)
    preds = get_preds(fitted, X_train, X_test)
    accs = print_accuracy(y_train, y_test, *preds)
    train_preds, test_preds = preds
    train_acc, test_acc = accs
    return fitted, train_preds, test_preds, train_acc, test_acc

In [17]:
# Experiment 1: 20 trees (fit_predict_rf's default is 10); every other
# hyperparameter is left at its default value.
rf_model_1, trn_preds_1, tst_preds_1, trn_acc_1, tst_acc_1 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=20,
    max_depth=None,
    min_samples_leaf=1,
    max_features='sqrt',
)

0.9998167491295583
0.9192307692307692


In [18]:
# Experiment 2: grow the forest to 50 trees, still with unlimited depth.
rf_model_2, trn_preds_2, tst_preds_2, trn_acc_2, tst_acc_2 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=50,
    max_depth=None,
    min_samples_leaf=1,
    max_features='sqrt',
)

1.0
0.9333333333333333


In [19]:
# Experiment 3: keep 50 trees and cap each tree's depth at 5.
rf_model_3, trn_preds_3, tst_preds_3, trn_acc_3, tst_acc_3 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=50,
    max_depth=5,
    min_samples_leaf=1,
    max_features='sqrt',
)

0.8552318123511087
0.8213675213675213


In [20]:
# Experiment 4: 50 trees with the depth cap relaxed from 5 to 10.
rf_model_4, trn_preds_4, tst_preds_4, trn_acc_4, tst_acc_4 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=1,
    max_features='sqrt',
)

0.9844236760124611
0.9260683760683761


In [21]:
# Experiment 5: same forest (50 trees, depth 10) but with min_samples_leaf
# raised from 1 to 10.
rf_model_5, trn_preds_5, tst_preds_5, trn_acc_5, tst_acc_5 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=10,
    max_features='sqrt',
)

0.9622503206890233
0.9192307692307692


In [22]:
# Experiment 6: raise min_samples_leaf further, from 10 to 50.
rf_model_6, trn_preds_6, tst_preds_6, trn_acc_6, tst_acc_6 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=50,
    max_features='sqrt',
)

0.9184533626534725
0.8940170940170941


In [23]:
# Experiment 7: switch max_features from 'sqrt' to the float 0.5
# (scikit-learn treats a float as a fraction of the features per split).
rf_model_7, trn_preds_7, tst_preds_7, trn_acc_7, tst_acc_7 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=50,
    max_features=0.5,
)

0.8926149899212021
0.867948717948718


In [24]:
# Experiment 8: lower max_features from 0.5 to 0.3; other settings unchanged.
rf_model_8, trn_preds_8, tst_preds_8, trn_acc_8, tst_acc_8 = fit_predict_rf(
    X_train,
    X_test,
    y_train,
    y_test,
    random_state=888,
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=50,
    max_features=0.3,
)

0.9008612790910757
0.8717948717948718
