# RandomForest

### Mix

In [15]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

train = pd.read_csv('./data/etrain.csv')
test = pd.read_csv('./data/etest.csv')
validation = pd.read_csv('./data/evalidation.csv')
class_list = np.load('./class_list.npy')

train_target = train['target']
train = train.drop(columns = ['target'])
test_target = test['target']
test = test.drop(columns = ['target'])

In [3]:
classifier = RandomForestClassifier()

param_dist = {
    'max_depth': range(3, 100),
    'n_estimators': range(10, 100),
    "max_features": range(1, 60),
}

tree_cv = RandomizedSearchCV(classifier, param_dist, cv=5, n_iter=30, n_jobs=-1)

tree_cv.fit(train, train_target)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [5]:
print("Tuned Random Forest Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Decision Tree Parameters: {'n_estimators': 95, 'max_features': 49, 'max_depth': 67}
Best score is 0.7593649425943861


In [7]:
predict_target = tree_cv.predict(test)
print(tree_cv.score(test, test_target))
print(classification_report(test_target, predict_target))

0.7554459020339391
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      2051
           1       0.64      0.68      0.66      2090
           2       0.69      0.68      0.68      2110
           3       0.88      0.84      0.86      2058

    accuracy                           0.76      8309
   macro avg       0.76      0.76      0.76      8309
weighted avg       0.76      0.76      0.76      8309



### Only numeric feature

In [8]:
train = pd.read_csv('./data/etrain.csv')
test = pd.read_csv('./data/etest.csv')
validation = pd.read_csv('./data/evalidation.csv')
class_list = np.load('./class_list.npy')

train_target = train['target']
train = train.drop(columns = ['target'])
test_target = test['target']
test = test.drop(columns = ['target'])


train = pd.read_csv('./data/etrain.csv')
test = pd.read_csv('./data/etest.csv')
validation = pd.read_csv('./data/evalidation.csv')
numeric_feature = np.load('./numeric_features.npy')

#train_target = train['target']
#train = train.drop(columns = ['target'])
#test_target = test['target']
#test = test.drop(columns = ['target'])
train = train[numeric_feature]
test = test[numeric_feature]
validation = validation[numeric_feature]

In [9]:
classifier = RandomForestClassifier()

param_dist = {
    'max_depth': range(3, 100),
    'n_estimators': range(10, 100),
    "max_features": range(1, 60),
}

tree_cv = RandomizedSearchCV(classifier, param_dist, cv=5, n_iter=30, n_jobs=-1)

tree_cv.fit(train, train_target)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [10]:
print("Tuned Random Forest Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Random Forest Parameters: {'n_estimators': 79, 'max_features': 22, 'max_depth': 94}
Best score is 0.759252238468463


In [11]:
predict_target = tree_cv.predict(test)
print(tree_cv.score(test, test_target))
print(classification_report(test_target, predict_target))

0.7566494162955831
              precision    recall  f1-score   support

           0       0.82      0.84      0.83      2051
           1       0.64      0.68      0.66      2090
           2       0.69      0.67      0.68      2110
           3       0.88      0.84      0.86      2058

    accuracy                           0.76      8309
   macro avg       0.76      0.76      0.76      8309
weighted avg       0.76      0.76      0.76      8309



### Only Categorical

In [3]:
train = pd.read_csv('./data/etrain.csv')
test = pd.read_csv('./data/etest.csv')
validation = pd.read_csv('./data/evalidation.csv')
numeric_feature = np.load('./numeric_features.npy')

train_target = train['target']
train = train.drop(columns = ['target'])
test_target = test['target']
test = test.drop(columns = ['target'])

train = train.drop(columns=numeric_feature)
test = test.drop(columns=numeric_feature)
validation = validation.drop(columns=numeric_feature)

In [13]:
classifier = RandomForestClassifier()

param_dist = {
    'max_depth': range(3, 100),
    'n_estimators': range(10, 100),
    "max_features": range(1, 60),
}

tree_cv = RandomizedSearchCV(classifier, param_dist, cv=5, n_iter=30, n_jobs=-1)

tree_cv.fit(train, train_target)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [14]:
print("Tuned Random Forest Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

Tuned Random Forest Parameters: {'n_estimators': 74, 'max_features': 11, 'max_depth': 18}
Best score is 0.4869115172172281


In [15]:
predict_target = tree_cv.predict(test)
print(tree_cv.score(test, test_target))
print(classification_report(test_target, predict_target))

0.4797207846912986
              precision    recall  f1-score   support

           0       0.54      0.62      0.58      2051
           1       0.38      0.29      0.33      2090
           2       0.37      0.31      0.34      2110
           3       0.55      0.72      0.63      2058

    accuracy                           0.48      8309
   macro avg       0.46      0.48      0.47      8309
weighted avg       0.46      0.48      0.46      8309



### aaaaaaaaaaa

In [16]:
forest = RandomForestClassifier(n_jobs=-1)
forest.fit(train, train_target)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [17]:
predict_target = forest.predict(test)
print(forest.score(test, test_target))
print(classification_report(test_target, predict_target))

0.7469009507762667
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      2051
           1       0.63      0.68      0.65      2090
           2       0.68      0.65      0.67      2110
           3       0.88      0.83      0.85      2058

    accuracy                           0.75      8309
   macro avg       0.75      0.75      0.75      8309
weighted avg       0.75      0.75      0.75      8309

