# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


## Libraries and datasets

In [1]:
from sklearn.ensemble import RandomForestClassifier

from t3_constants import *
from t3_utility import *

# Random Forest
## Training

In [2]:
# Load the training features (train_x) and labels (train_y) from the pickle
# files whose paths are built from PICKLE_FOLDER, TRAIN and X / Y.
# NOTE: pickle.load can execute arbitrary code; only use it on files produced
# by this project.
train_x_path = f'{PICKLE_FOLDER}{TRAIN}{X}.pkl'
with open(train_x_path, 'rb') as features_file:
    train_x = pickle.load(features_file)

train_y_path = f'{PICKLE_FOLDER}{TRAIN}{Y}.pkl'
with open(train_y_path, 'rb') as labels_file:
    train_y = pickle.load(labels_file)

In [12]:
# Hyper-parameter grid search for the random forest.
# Every combination of the values in `param_d` is scored with
# `cross_validation_summary` and collected, one row per combination, into
# `results_df` (parameter columns first, then mean / std validation score).
param_d = {
    CRT : ['gini', 'entropy'],
    DPT : range(6, 9),
    MSL : range(1, 6),
    NEST : np.logspace(1, 2.5, num=10).astype(int).tolist(),
}

# Fix the parameter order once: it drives the grid, the per-row values and the
# DataFrame columns, so the three can never drift apart.
param_names = sorted(param_d)

# Seed for the forests. Without it each forest is grown from different random
# bootstrap samples / feature subsets and the ranking changes on every run.
# NOTE(review): cross_validation_summary may add its own randomness (e.g. fold
# shuffling) -- confirm it is seeded as well.
rf_random_state = 42

results = []
for combo in itertools.product(*(param_d[name] for name in param_names)):
    # Look parameters up by name instead of relying on positional unpacking
    # matching the sorted key order.
    params = dict(zip(param_names, combo))
    model = RandomForestClassifier(
        criterion=params[CRT],
        n_estimators=params[NEST],
        max_depth=params[DPT],
        min_samples_leaf=params[MSL],
        random_state=rf_random_state,
    )
    mean_val_score, std_val_score = cross_validation_summary(model, train_x.values, train_y.values)
    results.append((*combo, mean_val_score, std_val_score))

# show the 8 best combinations by mean validation score
results_df = pd.DataFrame(results, columns=param_names + [MVS, SVS])
results_df.sort_values(MVS, ascending=False).head(8)

Unnamed: 0,criterion,max_depth,min_samples_leaf,n_estimators,mean_val_score,std_val_score
109,gini,8,1,316,0.889714,0.019429
57,gini,7,1,146,0.888114,0.023384
106,gini,8,1,100,0.888102,0.030096
257,entropy,8,1,146,0.888089,0.028348
55,gini,7,1,68,0.888089,0.034088
107,gini,8,1,146,0.888089,0.033328
112,gini,8,2,21,0.883327,0.025783
108,gini,8,1,215,0.883314,0.026292


In [13]:
# Average the validation mean / std over all grid rows sharing the same split
# criterion, best mean score first.
(
    results_df
    .groupby(CRT)[[MVS, SVS]]
    .mean()
    .sort_values(by=MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1
entropy,0.859351,0.034243
gini,0.859023,0.033599


In [14]:
# Average the validation mean / std over all grid rows sharing the same
# maximum tree depth, best mean score first.
(
    results_df
    .groupby(DPT)[[MVS, SVS]]
    .mean()
    .sort_values(by=MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1
8,0.862002,0.032838
7,0.860574,0.034385
6,0.854984,0.034541


In [15]:
# Average the validation mean / std over all grid rows sharing the same
# minimum number of samples per leaf, best mean score first.
(
    results_df
    .groupby(MSL)[[MVS, SVS]]
    .mean()
    .sort_values(by=MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
min_samples_leaf,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.870521,0.030821
2,0.865568,0.032293
3,0.860423,0.033789
4,0.85179,0.03598
5,0.847631,0.036723


In [16]:
# Average the validation mean / std over all grid rows sharing the same
# number of trees, best mean score first.
(
    results_df
    .groupby(NEST)[[MVS, SVS]]
    .mean()
    .sort_values(by=MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1
146,0.865289,0.034429
68,0.864648,0.035674
316,0.863647,0.031852
215,0.862573,0.035026
100,0.862364,0.034298
31,0.859009,0.034186
46,0.85895,0.035247
21,0.858066,0.030351
14,0.851771,0.035552
10,0.845552,0.032597


### Test Model

In [17]:
# Grid-search row with the highest mean validation score (kept as a one-row
# DataFrame), plus the same row as a Series for convenient lookup by name.
best_result = results_df.sort_values(MVS).tail(1)
best_params = best_result.iloc[0]

# Refit a forest with the winning hyper-parameters on the whole training set.
# The integer parameters come out of the DataFrame as numpy scalars, so they
# are converted to plain ints. The fixed random_state makes the fitted model,
# and therefore the test score, reproducible across runs.
test_model = RandomForestClassifier(
    criterion=best_params[CRT],
    n_estimators=int(best_params[NEST]),
    max_depth=int(best_params[DPT]),
    min_samples_leaf=int(best_params[MSL]),
    random_state=42,
)
# Trailing semicolon hides the estimator repr without overwriting IPython's `_`.
test_model.fit(train_x, train_y);

## Testing

In [18]:
# Load the test features (test_x) and labels (test_y) from the pickle files
# whose paths are built from PICKLE_FOLDER, TEST and X / Y.
# NOTE: pickle.load can execute arbitrary code; only use it on files produced
# by this project.
test_x_path = f'{PICKLE_FOLDER}{TEST}{X}.pkl'
with open(test_x_path, 'rb') as features_file:
    test_x = pickle.load(features_file)

test_y_path = f'{PICKLE_FOLDER}{TEST}{Y}.pkl'
with open(test_y_path, 'rb') as labels_file:
    test_y = pickle.load(labels_file)

In [19]:
# Mean accuracy of the refitted forest on the test split.
# For reference, the best cross-validated mean score in the grid search was
# about 0.89; the test score is expected to land in the same neighbourhood.
test_model.score(X=test_x, y=test_y)

0.910828025477707