In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
try:
    import matplotlib.pyplot as plt
except ImportError:
    # Not required for demo
    pass

from qboost import QBoostClassifier, qboost_lambda_sweep
from datasets import *
import pandas as pd

In [2]:
credit=pd.read_csv('German_Preprocessed.csv').sample(frac=1,random_state=3)
credit=credit.groupby('classification').head(300).sample(frac=1,random_state=3)
X, Y = credit.drop('classification',axis=1).values, credit.classification.squeeze().values
#Y=[i if i ==1 else 0 for i in Y]
id_dataset='GermanCredit'

In [3]:
X_train=X[:500,:]
X_test=X[500:,:]
Y_train=Y[:500]
Y_test=Y[500:]

In [6]:
n_features = np.size(X_train, 1)

In [7]:
normalized_lambdas = np.linspace(0.0, 0.5, 20)

In [8]:
lambdas = normalized_lambdas / n_features

In [9]:
#cv-search on lambda parameter
qboost, lam = qboost_lambda_sweep(
    X_train, Y_train, lambdas, verbose=True)

lambda  n_features score:
trial 1 for lambda 0.0 gave score 0.68
trial 2 for lambda 0.0 gave score 0.78
trial 3 for lambda 0.0 gave score 0.56
0.0000  54         0.673 
trial 1 for lambda 0.0004314063848144952 gave score 0.74
trial 2 for lambda 0.0004314063848144952 gave score 0.72
trial 3 for lambda 0.0004314063848144952 gave score 0.66
0.0004  50         0.707 
trial 1 for lambda 0.0008628127696289904 gave score 0.7
trial 2 for lambda 0.0008628127696289904 gave score 0.64
trial 3 for lambda 0.0008628127696289904 gave score 0.66
0.0009  41         0.667 
trial 1 for lambda 0.0012942191544434857 gave score 0.64
trial 2 for lambda 0.0012942191544434857 gave score 0.62
trial 3 for lambda 0.0012942191544434857 gave score 0.62
0.0013  36         0.627 
trial 1 for lambda 0.0017256255392579809 gave score 0.68
trial 2 for lambda 0.0017256255392579809 gave score 0.76
trial 3 for lambda 0.0017256255392579809 gave score 0.8
0.0017  24         0.747 
trial 1 for lambda 0.0021570319240724763 gave

In [11]:
#fitting con il best parameter lambda = 0.00173
qboost_final = QBoostClassifier(X_train, Y_train, 0.00173)

In [86]:
print('Number of selected features:',len(qboost_final.get_selected_features()))
print('Selected features:',qboost_final.get_selected_features())
print('Score on test set: {:.3f}'.format(qboost_final.score(X_test, Y_test)))

Number of selected features: 22
Selected features: [0, 1, 2, 4, 7, 8, 10, 15, 16, 17, 20, 26, 29, 30, 35, 38, 43, 46, 47, 49, 50, 51]
Score on test set: 0.700


# Calcolo Reference Classica

## classic RF - best performance con random forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [71]:
rfc = RandomForestClassifier()
param_grid = {
    'n_estimators': [20,50,100,150,200,500],
    'max_depth' : [2,3,4,5,6,8,10],
    'min_samples_split' : [1,2,3,5],
    'bootstrap':[True,False]
}

In [72]:
search_rfc = GridSearchCV(rfc, param_grid,scoring='accuracy',refit='accuracy', n_jobs=-1,cv=3,verbose=2)
search_rfc.fit(X_train, Y_train)

Fitting 3 folds for each of 336 candidates, totalling 1008 fits


 0.67804151 0.71007383 0.69208571 0.70807782 0.70407378 0.69808576
 0.66400933 0.69607772 0.70409783 0.69207368 0.6900416  0.69409374
 0.6961138  0.68010966 0.70008176 0.70611788 0.69410576 0.69405767
        nan        nan        nan        nan        nan        nan
 0.66798932 0.69607772 0.70006974 0.69605368 0.7080658  0.70206575
 0.66208547 0.68803357 0.69607772 0.69207368 0.69807373 0.69206166
 0.69007768 0.69007768 0.69403362 0.7040858  0.69006565 0.69406969
        nan        nan        nan        nan        nan        nan
 0.68604959 0.69806171 0.69804968 0.70808984 0.70407378 0.70206575
 0.68805762 0.70202968 0.70805377 0.71606185 0.6960176  0.6960657
 0.69599356 0.68006156 0.70608181 0.70608181 0.70003367 0.71209388
        nan        nan        nan        nan        nan        nan
 0.6759974  0.68803357 0.7100618  0.70404973 0.69807373 0.70006974
 0.68802155 0.6900416  0.71808191 0.70004569 0.7080658  0.70606979
 0.69007768 0.71202174 0.71007383 0.69202559 0.70006974 0.70810

GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [2, 3, 4, 5, 6, 8, 10],
                         'min_samples_split': [1, 2, 3, 5],
                         'n_estimators': [20, 50, 100, 150, 200, 500]},
             refit='accuracy', scoring='accuracy', verbose=2)

In [73]:
print("Best parameter (CV score=%0.3f):" % search_rfc.best_score_)
print(search_rfc.best_params_)

Best parameter (CV score=0.730):
{'bootstrap': False, 'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 150}


In [89]:
preds_rfc=search_rfc.predict(X_test)
score_rf =sum([1 if (pred==tru) else 0 for pred, tru in zip(preds_rfc,Y_test)])/len(preds_rfc)
print(f'Random Forest Accuracy: {score_rf}')

Random Forest Accuracy: 0.66


## Classic SVC - best performance con support vector classifier

In [76]:
minmax=MinMaxScaler()
X_train_SVC = minmax.fit_transform(X_train)
X_test_SVC = minmax.transform(X_test)

In [82]:
svc = SVC()
param_grid = {'C': [0.1,0.5,0.8,1,2, 5,10], 'gamma': [1,0.8,0.5,0.3,0.2,0.1,0.001, 0.0001], 'kernel': ['rbf']}

In [83]:
search_svm = GridSearchCV(svc, param_grid,scoring='accuracy',refit='accuracy', n_jobs=-1,cv=3,verbose=2)
search_svm.fit(X_train_SVC, Y_train)

Fitting 3 folds for each of 56 candidates, totalling 168 fits


GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 0.5, 0.8, 1, 2, 5, 10],
                         'gamma': [1, 0.8, 0.5, 0.3, 0.2, 0.1, 0.001, 0.0001],
                         'kernel': ['rbf']},
             refit='accuracy', scoring='accuracy', verbose=2)

In [84]:
print("Best parameter (CV score=%0.3f):" % search_svm.best_score_)
print(search_svm.best_params_)

Best parameter (CV score=0.720):
{'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}


In [90]:
preds_rfc=search_rfc.predict(X_test_SVC)
score_svc=sum([1 if (pred==tru) else 0 for pred, tru in zip(preds_rfc,Y_test)])/len(preds_rfc)
print(f'Support Vector Classifier Accuracy: {score_svc}')

Support Vector Classifier Accuracy: 0.65


# Gradient Boosting: Best performance con gradient boosting

In [16]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.05, 0.1,  0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 2),
    #"min_samples_leaf": np.linspace(0.1, 0.5, 2),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5,  0.8, 0.9, 1.0],
    "n_estimators":[10,50,100]
    }

In [8]:
parameters = {
    #"loss":["deviance"],
    "learning_rate": [0.01, 0.05, 0.1,  0.2,0.5],
    "min_samples_split": np.linspace(0.01, 0.5, 8),
    "min_samples_leaf": np.linspace(0.01, 0.5, 8),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mse"],
    "subsample":[0.5,  0.8, 0.9, 1.0],
    "n_estimators":[50,100,200,500,1000]
    }

In [9]:
search_gb = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1,verbose=3)

search_gb.fit(X_train, Y_train)

Fitting 10 folds for each of 76800 candidates, totalling 768000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 208 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 432 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 720 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 1072 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 1488 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 1968 tasks      | elapsed:   42.8s
[Parallel(n_jobs=-1)]: Done 2512 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 3120 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done 3792 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 4528 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 5328 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 6192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 7120 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 8112 tasks      |

[Parallel(n_jobs=-1)]: Done 524208 tasks      | elapsed: 106.6min
[Parallel(n_jobs=-1)]: Done 532432 tasks      | elapsed: 108.6min
[Parallel(n_jobs=-1)]: Done 540720 tasks      | elapsed: 110.1min
[Parallel(n_jobs=-1)]: Done 549072 tasks      | elapsed: 111.7min
[Parallel(n_jobs=-1)]: Done 557488 tasks      | elapsed: 113.5min
[Parallel(n_jobs=-1)]: Done 565968 tasks      | elapsed: 115.2min
[Parallel(n_jobs=-1)]: Done 574512 tasks      | elapsed: 116.8min
[Parallel(n_jobs=-1)]: Done 583120 tasks      | elapsed: 118.7min
[Parallel(n_jobs=-1)]: Done 591792 tasks      | elapsed: 120.5min
[Parallel(n_jobs=-1)]: Done 600528 tasks      | elapsed: 122.1min
[Parallel(n_jobs=-1)]: Done 609328 tasks      | elapsed: 124.1min
[Parallel(n_jobs=-1)]: Done 618192 tasks      | elapsed: 125.8min
[Parallel(n_jobs=-1)]: Done 627120 tasks      | elapsed: 127.5min
[Parallel(n_jobs=-1)]: Done 636112 tasks      | elapsed: 129.3min
[Parallel(n_jobs=-1)]: Done 645168 tasks      | elapsed: 131.2min
[Parallel(

GridSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'criterion': ['friedman_mse', 'mse'],
                         'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5],
                         'max_depth': [3, 5, 8],
                         'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': array([0.01, 0.08, 0.15, 0.22, 0.29, 0.36, 0.43, 0.5 ]),
                         'min_samples_split': array([0.01, 0.08, 0.15, 0.22, 0.29, 0.36, 0.43, 0.5 ]),
                         'n_estimators': [50, 100, 200, 500, 1000],
                         'subsample': [0.5, 0.8, 0.9, 1.0]},
             verbose=3)

In [57]:
print("Best parameter (CV score=%0.3f):" % search_gb.best_score_)
print(search_gb.best_params_)

Best parameter (CV score=0.742):
{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 0.01, 'min_samples_split': 0.15, 'n_estimators': 50, 'subsample': 0.5}


In [58]:
preds_gb=search_gb.predict(X_test)
score_gb=sum([1 if (pred==tru) else 0 for pred, tru in zip(preds_gb,Y_test)])/len(preds_gb)
print(f'Gradient Boosting Accuracy: {score_gb}')

Gradient Boosting Accuracy: 0.71
