# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


## Libraries and datasets

In [13]:
from sklearn.ensemble import RandomForestClassifier

from t3_constants import *
from t3_utility import *

# Random Forest
## Training

In [14]:
with open(f'{PICKLE_FOLDER}{TRAIN}{X}.pkl', 'rb') as f:
    train_x = pickle.load(f)
with open(f'{PICKLE_FOLDER}{TRAIN}{Y}.pkl', 'rb') as f:
    train_y = pickle.load(f)

In [15]:
# Perform the hyper-parameters grid-search on random forest
param_d = {
    CRT : ['gini', 'entropy'],
    DPT : range(2, 5),
    FT : ['sqrt', 'log2'],
    MSL : range(1, 6),
    NEST : np.logspace(1, 2.5, num=10).astype(int).tolist(),
}
# iteration on parameters to collect results
grid = itertools.product(*[param_d[key] for key in sorted(param_d)])
results = []
for criterion, max_depth, max_features, min_samples_leaf, n_estimators in grid:
    model = RandomForestClassifier(
        criterion=criterion,
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf
    )
    mean_val_score, std_val_score = cross_validation_summary(model, train_x.values, train_y.values)
    res = criterion, max_depth, max_features, min_samples_leaf, n_estimators, mean_val_score, std_val_score
    results.append(res)

# print top 5 combination results
results_df = pd.DataFrame(results, columns=sorted(param_d) + [MVS, SVS])
results_df.sort_values(MVS, ascending=False).head(8)

Unnamed: 0,criterion,max_depth,max_features,min_samples_leaf,n_estimators,mean_val_score,std_val_score
256,gini,4,log2,1,100,0.85614,0.029768
523,entropy,4,sqrt,3,31,0.85454,0.036497
575,entropy,4,log2,3,68,0.85454,0.037192
513,entropy,4,sqrt,2,31,0.854514,0.039859
259,gini,4,log2,1,316,0.85294,0.033657
574,entropy,4,log2,3,46,0.85294,0.036573
502,entropy,4,sqrt,1,21,0.852927,0.039604
566,entropy,4,log2,2,100,0.851352,0.029603


In [16]:
results_df.groupby(CRT)[[MVS, SVS]].mean().sort_values(MVS, ascending=False)

Unnamed: 0_level_0,mean_val_score,std_val_score
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1
entropy,0.818777,0.035844
gini,0.816787,0.035922


In [17]:
results_df.groupby(DPT)[[MVS, SVS]].mean().sort_values(MVS, ascending=False)

Unnamed: 0_level_0,mean_val_score,std_val_score
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1
4,0.837658,0.034975
3,0.819648,0.034374
2,0.79604,0.038299


In [18]:
results_df.groupby(FT)[[MVS, SVS]].mean().sort_values(MVS, ascending=False)

Unnamed: 0_level_0,mean_val_score,std_val_score
max_features,Unnamed: 1_level_1,Unnamed: 2_level_1
sqrt,0.818075,0.035616
log2,0.817489,0.03615


In [19]:
results_df.groupby(MSL)[[MVS, SVS]].mean().sort_values(MVS, ascending=False)

Unnamed: 0_level_0,mean_val_score,std_val_score
min_samples_leaf,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.819406,0.035277
3,0.818539,0.035703
2,0.818473,0.036451
4,0.816501,0.035751
5,0.815991,0.036232


In [20]:
results_df.groupby(NEST)[[MVS, SVS]].mean().sort_values(MVS, ascending=False)

Unnamed: 0_level_0,mean_val_score,std_val_score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1
316,0.822805,0.032096
68,0.822262,0.036493
146,0.82216,0.034227
100,0.821948,0.034154
46,0.820852,0.03636
215,0.82083,0.033689
31,0.819277,0.038828
21,0.815421,0.0373
14,0.809431,0.038292
10,0.802834,0.03739


### Test Model

In [24]:
best_result = results_df.sort_values(MVS).tail(1)

# best model
test_model = RandomForestClassifier(
        criterion=best_result[CRT].values[0],
        n_estimators=best_result[NEST].values[0],
        max_features=best_result[FT].values[0],
        max_depth=best_result[DPT].values[0],
        min_samples_leaf=best_result[MSL].values[0]
)
_ = test_model.fit(train_x, train_y)

## Testing

In [25]:
with open(f'{PICKLE_FOLDER}{TEST}{X}.pkl', 'rb') as f:
    test_x = pickle.load(f)
with open(f'{PICKLE_FOLDER}{TEST}{Y}.pkl', 'rb') as f:
    test_y = pickle.load(f)

In [27]:
# Around 0.88
test_model.score(test_x, test_y)

0.8789808917197452