# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022


## Libraries and datasets

In [1]:
from sklearn.ensemble import GradientBoostingClassifier

from t3_constants import *
from t3_utility import *

In [2]:
# Load the training features (X) and labels (Y) from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
train_x_path = f'{PICKLE_FOLDER}{TRAIN}{X}.pkl'
train_y_path = f'{PICKLE_FOLDER}{TRAIN}{Y}.pkl'

with open(train_x_path, 'rb') as features_file:
    train_x = pickle.load(features_file)

with open(train_y_path, 'rb') as labels_file:
    train_y = pickle.load(labels_file)

# Gradient Boosting
## Training

In [3]:
## Hyper-parameters grid search for Gradient Boosting

# Fixed seed so the search is repeatable after Restart & Run All:
# GradientBoostingClassifier uses its random_state when building each tree,
# so an unseeded model may score slightly differently on every run.
GB_RANDOM_STATE = 42

param_d = {
    DPT: [2, 3, 4],
    MSL: range(1, 6),
    # 10 log-spaced tree counts from 10**1.8 (63) to 10**2.8 (630)
    NEST: np.logspace(1.8, 2.8, num=10).astype(int).tolist()
}

# Fix the parameter order once: it drives both the cartesian product below and
# the column order of the results frame.
param_names = sorted(param_d)

# iteration on parameters to collect results
combinations = itertools.product(*(param_d[name] for name in param_names))

results = []

for values in combinations:
    # Bind each value to its parameter by name instead of by position, so the
    # search stays correct whatever order sorted() gives the grid keys.
    params = dict(zip(param_names, values))
    model = GradientBoostingClassifier(random_state=GB_RANDOM_STATE, **params)

    # cross_validation_summary is defined in t3_utility; it is used here as
    # returning the mean and standard deviation of the validation score.
    mean_val_score, std_val_score = cross_validation_summary(model, train_x.values, train_y.values)
    results.append((*values, mean_val_score, std_val_score))

# One row per combination: the hyper-parameters followed by the two score columns
gradient_boosting_results_df = pd.DataFrame(results, columns=param_names + [MVS, SVS])

# Show the 8 combinations with the highest mean validation score
gradient_boosting_results_df.sort_values(MVS, ascending=False).head(8)

Unnamed: 0,max_depth,min_samples_leaf,n_estimators,mean_val_score,std_val_score
44,2,5,175,0.902502,0.023002
28,2,3,488,0.902489,0.022451
47,2,5,378,0.900902,0.023668
48,2,5,488,0.900902,0.020788
49,2,5,630,0.900902,0.020788
46,2,5,292,0.900889,0.024213
9,2,1,630,0.900889,0.024213
45,2,5,226,0.899314,0.019504


In [4]:
# Average the validation-score mean and std over all grid points sharing the
# same min_samples_leaf value, best average first.
(
    gradient_boosting_results_df
    .groupby(MSL)[[MVS, SVS]]
    .mean()
    .sort_values(by=MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
min_samples_leaf,Unnamed: 1_level_1,Unnamed: 2_level_1
5,0.888782,0.03435
4,0.884369,0.032912
3,0.882934,0.032292
2,0.881594,0.037007
1,0.879461,0.036913


In [5]:
# Average the validation-score mean and std over all grid points sharing the
# same max_depth value, best average first.
(
    gradient_boosting_results_df
    .groupby(DPT)[[MVS, SVS]]
    .mean()
    .sort_values(by=MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.888783,0.027148
4,0.880931,0.038603
3,0.88057,0.038334


In [6]:
# Average the validation-score mean and std over all grid points sharing the
# same n_estimators value, best average first.
(
    gradient_boosting_results_df
    .groupby(NEST)[[MVS, SVS]]
    .mean()
    .sort_values(by=MVS, ascending=False)
)

Unnamed: 0_level_0,mean_val_score,std_val_score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1
292,0.886927,0.033302
175,0.885965,0.033877
378,0.885647,0.033189
226,0.885647,0.03394
135,0.885328,0.033837
630,0.884905,0.03069
488,0.884797,0.032082
105,0.881275,0.036197
81,0.880412,0.039323
63,0.873377,0.040509


### Test Model

In [15]:
# Fixed seed so refitting gives the same model (and therefore the same test
# score) on every run instead of a value that drifts between executions.
GB_RANDOM_STATE = 42

# Grid-search row with the highest mean validation score
# (ascending sort, so the best row is the last one).
best_result = gradient_boosting_results_df.sort_values(MVS).tail(1)

# best model: rebuilt with the selected hyper-parameters; values are converted
# from numpy scalars to plain ints before being handed to the estimator.
test_model = GradientBoostingClassifier(
    max_depth=int(best_result[DPT].iloc[0]),
    min_samples_leaf=int(best_result[MSL].iloc[0]),
    n_estimators=int(best_result[NEST].iloc[0]),
    random_state=GB_RANDOM_STATE
)

# Assign the fitted estimator to `_` only to suppress the cell's repr output.
_ = test_model.fit(train_x, train_y)

## Testing

In [16]:
# Load the test features (X) and labels (Y) from their pickle files.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
test_x_path = f'{PICKLE_FOLDER}{TEST}{X}.pkl'
test_y_path = f'{PICKLE_FOLDER}{TEST}{Y}.pkl'

with open(test_x_path, 'rb') as features_file:
    test_x = pickle.load(features_file)

with open(test_y_path, 'rb') as labels_file:
    test_y = pickle.load(labels_file)

### Score

In [18]:
# Mean accuracy of the fitted model on the test set
# (around 0.91-0.92 in the recorded runs).
test_model.score(X=test_x, y=test_y)

0.9171974522292994