# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


## Libraries and datasets

In [1]:
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

from t3_constants import *
from t3_utility import *

# Bagging
## Training

In [2]:
# Training features, later passed as X to the estimators.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'{PICKLE_FOLDER}{TRAIN}{X}.pkl', 'rb') as features_file:
    train_x = pickle.load(features_file)

# Training targets, later passed as y to the estimators.
with open(f'{PICKLE_FOLDER}{TRAIN}{Y}.pkl', 'rb') as targets_file:
    train_y = pickle.load(targets_file)

In [3]:
# Seed shared by every candidate ensemble: bagging draws bootstrap samples at
# random, so without a fixed seed the table below changes on every run.
# NOTE(review): cross_validation_summary may add its own randomness (e.g. a
# shuffled split) — confirm it is seeded as well.
BAGGING_SEED = 42

# parameters dictionary for Bagging grid search
param_d = {
    CRT: ['gini', 'entropy'],
    MSL: range(1, 6),
    # 10 log-spaced ensemble sizes from 10**1 to 10**2.5, truncated to integers
    NEST: np.logspace(1, 2.5, num=10).astype(int).tolist()
}

# Parameter names in one fixed (sorted) order; the same list drives the
# cartesian product and the result columns, so the two cannot drift apart.
param_names = sorted(param_d)
combinations = itertools.product(*(param_d[name] for name in param_names))

results = []

for combination in combinations:
    # Look the values up by name instead of relying on the positional order
    # produced by sorting the parameter names.
    params = dict(zip(param_names, combination))

    # NOTE(review): `base_estimator` was renamed to `estimator` in newer
    # scikit-learn releases; kept as is for the version this notebook targets.
    model = BaggingClassifier(
        base_estimator=tree.DecisionTreeClassifier(
            criterion=params[CRT],
            min_samples_leaf=params[MSL]
        ),
        n_estimators=params[NEST],
        random_state=BAGGING_SEED
    )

    # The helper returns a (mean, std) pair of validation scores for the model.
    mean_val_score, std_val_score = cross_validation_summary(model, train_x.values, train_y.values)
    results.append((*combination, mean_val_score, std_val_score))

# show the top 8 combinations by mean validation score
bagging_results_df = pd.DataFrame(results, columns=param_names + [MVS, SVS])
bagging_results_df.sort_values(MVS, ascending=False).head(8)

Unnamed: 0,criterion,min_samples_leaf,n_estimators,mean_val_score,std_val_score
5,gini,1,68,0.884902,0.038202
65,entropy,2,68,0.878527,0.026201
59,entropy,1,316,0.878527,0.025707
55,entropy,1,68,0.876952,0.024227
3,gini,1,31,0.876927,0.03079
17,gini,2,146,0.875352,0.022023
69,entropy,2,316,0.875302,0.031632
53,entropy,1,31,0.875302,0.033973


In [11]:
# Validation score mean and std averaged per split criterion, best first
(
    bagging_results_df
    .groupby(CRT)[[MVS, SVS]]
    .mean()
    .sort_values(MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1
entropy,0.865359,0.032956
gini,0.860988,0.038524


In [12]:
# Validation score mean and std averaged per min_samples_leaf value, best first
(
    bagging_results_df
    .groupby(MSL)[[MVS, SVS]]
    .mean()
    .sort_values(MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
min_samples_leaf,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.865438,0.035351
3,0.860908,0.036129


In [13]:
# Validation score mean and std averaged per ensemble size, best first
(
    bagging_results_df
    .groupby(NEST)[[MVS, SVS]]
    .mean()
    .sort_values(MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1
215,0.870257,0.034022
46,0.868923,0.03578
100,0.868387,0.036785
146,0.868383,0.037833
316,0.867326,0.034848
68,0.865596,0.037287
31,0.861061,0.036927
14,0.859611,0.034565
21,0.85495,0.035572
10,0.847237,0.033782


### Test Model

In [7]:
# Grid-search row with the highest mean validation score
# (kept as a one-row DataFrame, as before).
best_result = bagging_results_df.sort_values(MVS).tail(1)
# Same row as a Series, so each hyper-parameter is read once by name.
best_params = best_result.iloc[0]

# best model: rebuilt with the winning hyper-parameters.
# random_state is fixed so that the fitted ensemble, and therefore the test
# score, is the same on every run (bagging draws bootstrap samples at random).
# NOTE(review): `base_estimator` was renamed to `estimator` in newer
# scikit-learn releases; kept as is for the version this notebook targets.
test_model = BaggingClassifier(
    base_estimator=tree.DecisionTreeClassifier(
        criterion=best_params[CRT],
        # cast numpy integers coming out of the DataFrame to plain ints
        min_samples_leaf=int(best_params[MSL])
    ),
    n_estimators=int(best_params[NEST]),
    random_state=42
)

# Trailing semicolon hides the estimator repr without overwriting `_`.
test_model.fit(train_x, train_y);

## Testing

In [8]:
# Test features, passed as X when scoring the fitted model.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'{PICKLE_FOLDER}{TEST}{X}.pkl', 'rb') as features_file:
    test_x = pickle.load(features_file)

# Test targets, passed as y when scoring the fitted model.
with open(f'{PICKLE_FOLDER}{TEST}{Y}.pkl', 'rb') as targets_file:
    test_y = pickle.load(targets_file)

### Score

In [9]:
# Mean accuracy of the fitted ensemble on the test split
# (around 0.89 in the recorded run)
test_model.score(X=test_x, y=test_y)

0.89171974522293