# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


## Libraries and datasets

In [1]:
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

from t3_constants import *
from t3_utility import *

# Bagging
## Training

In [2]:
with open(f'{PICKLE_FOLDER}{TRAIN}{X}.pkl', 'rb') as f:
    train_x = pickle.load(f)
with open(f'{PICKLE_FOLDER}{TRAIN}{Y}.pkl', 'rb') as f:
    train_y = pickle.load(f)

In [3]:
# parameters dictionary for Bagging grid search
param_d = {
    CRT: ['gini', 'entropy'],
    MSL : range(1,6),
    NEST: np.logspace(1, 2.5, num=10).astype(int).tolist()
}

# iteration on parameters to collect results
combinations = itertools.product(*(param_d[key] for key in sorted(param_d)))

results = []

for criterion, min_samples_leaf, n_estimators in combinations:
    model = BaggingClassifier(
        base_estimator=tree.DecisionTreeClassifier(criterion=criterion, min_samples_leaf=min_samples_leaf),
        n_estimators=n_estimators
    )
    
    mean_val_score, std_val_score = cross_validation_summary(model, train_x.values, train_y.values)
    res = criterion, min_samples_leaf, n_estimators, mean_val_score, std_val_score
    results.append(res)

# print top 5 combination results
bagging_results_df = pd.DataFrame(results, columns=sorted(param_d)+[MVS, SVS])

In [11]:
bagging_results_df.sort_values(MVS, ascending = False).head(8)

Unnamed: 0,criterion,min_samples_leaf,n_estimators,mean_val_score,std_val_score
65,entropy,2,68,0.878514,0.0281
8,gini,1,215,0.876914,0.039536
64,entropy,2,46,0.87534,0.029935
59,entropy,1,316,0.875327,0.025322
7,gini,1,146,0.875302,0.048022
6,gini,1,100,0.873752,0.028064
51,entropy,1,14,0.873752,0.017355
14,gini,2,46,0.87374,0.032721


In [5]:
bagging_results_df.groupby(CRT)[[MVS, SVS]].mean()

Unnamed: 0_level_0,mean_val_score,std_val_score
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1
entropy,0.860425,0.033158
gini,0.855501,0.039126


In [6]:
bagging_results_df.groupby(MSL)[[MVS, SVS]].mean()

Unnamed: 0_level_0,mean_val_score,std_val_score
min_samples_leaf,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.865013,0.035298
2,0.865977,0.033388
3,0.858211,0.038546
4,0.851907,0.035119
5,0.848707,0.038357


In [7]:
bagging_results_df.groupby(NEST)[[MVS, SVS]].mean()

Unnamed: 0_level_0,mean_val_score,std_val_score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1
10,0.844477,0.035753
14,0.851677,0.031917
21,0.850225,0.037607
31,0.859176,0.037904
46,0.860613,0.037543
68,0.859651,0.03838
100,0.863496,0.035848
146,0.862532,0.0359
215,0.863808,0.036533
316,0.863973,0.034029


## Testing

In [8]:
# gini result to be better
best_result = bagging_results_df.sort_values(MVS).tail(1)

# best model
test_model = BaggingClassifier(
    base_estimator=tree.DecisionTreeClassifier(criterion=best_result[CRT].values[0], min_samples_leaf=best_result[MSL].values[0]),
    n_estimators=best_result[NEST].values[0]
)

_ = test_model.fit(train_x, train_y)

In [9]:
with open(f'{PICKLE_FOLDER}{TEST}{X}.pkl', 'rb') as f:
    test_x = pickle.load(f)
with open(f'{PICKLE_FOLDER}{TEST}{Y}.pkl', 'rb') as f:
    test_y = pickle.load(f)

In [10]:
# Around 
test_model.score(test_x, test_y)

0.9426751592356688