# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


## Libraries and datasets

In [1]:
from sklearn.ensemble import BaggingClassifier
from sklearn import tree

from t3_constants import *
from t3_utility import *

# Bagging
## Training

In [2]:
# Read the pickled training inputs (X) and targets (Y), in that order.
# The files are produced by this project; pickle must not be used on untrusted files.
train_parts = []
for part in (X, Y):
    with open(f'{PICKLE_FOLDER}{TRAIN}{part}.pkl', 'rb') as f:
        train_parts.append(pickle.load(f))
train_x, train_y = train_parts

In [3]:
# Hyper-parameter grid for the Bagging grid search:
#   CRT  - split criterion of the base decision tree
#   MSL  - minimum number of samples per leaf of the base decision tree
#   NEST - ensemble size: 10 log-spaced values from 10**1 to 10**2.5, truncated to int
param_d = {
    CRT: ['gini', 'entropy'],
    MSL: range(1, 6),
    NEST: np.logspace(1, 2.5, num=10).astype(int).tolist()
}

# Fix the axis order explicitly. The loop below unpacks each combination as
# (criterion, min_samples_leaf, n_estimators), so the order must not depend on
# how the constant strings happen to sort alphabetically.
param_order = [CRT, MSL, NEST]
combinations = itertools.product(*(param_d[key] for key in param_order))

results = []

for criterion, min_samples_leaf, n_estimators in combinations:
    # NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2;
    # kept as-is to match the version this notebook was written against.
    model = BaggingClassifier(
        base_estimator=tree.DecisionTreeClassifier(criterion=criterion, min_samples_leaf=min_samples_leaf),
        n_estimators=n_estimators,
        # Fixed seed so the ensemble's resampling is repeatable between runs.
        # cross_validation_summary may add randomness of its own (not visible here).
        random_state=42,
    )

    # project helper; unpacked here as (mean, std) of the validation score
    mean_val_score, std_val_score = cross_validation_summary(model, train_x.values, train_y.values)
    res = criterion, min_samples_leaf, n_estimators, mean_val_score, std_val_score
    results.append(res)

# show the 8 best combinations, ranked by mean validation score
bagging_results_df = pd.DataFrame(results, columns=param_order + [MVS, SVS])
bagging_results_df.sort_values(MVS, ascending=False).head(8)

Unnamed: 0,criterion,min_samples_leaf,n_estimators,mean_val_score,std_val_score
5,gini,1,68,0.884902,0.038202
65,entropy,2,68,0.878527,0.026201
59,entropy,1,316,0.878527,0.025707
55,entropy,1,68,0.876952,0.024227
3,gini,1,31,0.876927,0.03079
17,gini,2,146,0.875352,0.022023
69,entropy,2,316,0.875302,0.031632
53,entropy,1,31,0.875302,0.033973


In [4]:
# Average the validation mean/std over all grid points that share the same split criterion.
bagging_results_df[[CRT, MVS, SVS]].groupby(CRT).mean()

Unnamed: 0_level_0,mean_val_score,std_val_score
criterion,Unnamed: 1_level_1,Unnamed: 2_level_1
entropy,0.860326,0.03381
gini,0.857615,0.035867


In [5]:
# Average the validation mean/std over all grid points that share the same minimum leaf size.
bagging_results_df[[MSL, MVS, SVS]].groupby(MSL).mean()

Unnamed: 0_level_0,mean_val_score,std_val_score
min_samples_leaf,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.868771,0.034476
2,0.864217,0.032231
3,0.859174,0.03491
4,0.854142,0.035058
5,0.848547,0.03752


In [6]:
# Average the validation mean/std over all grid points that share the same ensemble size.
bagging_results_df[[NEST, MVS, SVS]].groupby(NEST).mean()

Unnamed: 0_level_0,mean_val_score,std_val_score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1
10,0.847514,0.035169
14,0.856453,0.038436
21,0.854546,0.034205
31,0.860455,0.034396
46,0.859018,0.032899
68,0.860935,0.03545
100,0.859648,0.037379
146,0.864783,0.031761
215,0.863017,0.034007
316,0.863333,0.034686


### Test Model

In [7]:
# Pick the grid-search row with the highest mean validation score
# (in the recorded run: gini, min_samples_leaf=1, n_estimators=68).
best_result = bagging_results_df.sort_values(MVS).tail(1)

# Build a Bagging ensemble with the selected hyper-parameters.
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2;
# kept as-is to match the version this notebook was written against.
test_model = BaggingClassifier(
    base_estimator=tree.DecisionTreeClassifier(
        criterion=best_result[CRT].values[0],
        min_samples_leaf=best_result[MSL].values[0],
    ),
    n_estimators=best_result[NEST].values[0],
    # Fixed seed so the fitted ensemble, and the score computed from it, is repeatable.
    random_state=42,
)

# Fit on the training data; the trailing ';' hides the estimator repr
# without rebinding IPython's `_` (last-output) variable.
test_model.fit(train_x, train_y);

## Testing

In [8]:
# Read the pickled test inputs (X) and targets (Y), in that order.
# The files are produced by this project; pickle must not be used on untrusted files.
test_parts = []
for part in (X, Y):
    with open(f'{PICKLE_FOLDER}{TEST}{part}.pkl', 'rb') as f:
        test_parts.append(pickle.load(f))
test_x, test_y = test_parts

### Score

In [9]:
# Mean accuracy of the fitted model on the test set (about 0.89 in the recorded run).
test_accuracy = test_model.score(test_x, test_y)
test_accuracy

0.89171974522293