In [32]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV, ParameterGrid, train_test_split, StratifiedShuffleSplit
import ml_utilities


In [33]:
TESTS_PATH = "./tests/"
INFO_PATH = "./tests/experiments_info"
DATASET_TEST_PATH = 'DBs/PenDigits/pendigits_te.txt'
DATASET_PATH = 'DBs/PenDigits/pendigits_tr.txt'

In [35]:
def read_data():
    feat_count = 16
    data_path = DATASET_PATH # Impostare il percorso corretto
    patterns, labels = ml_utilities.load_labeled_dataset_from_txt(data_path, feat_count)
    test_path = DATASET_TEST_PATH
    x_test = ml_utilities.load_unlabeled_dataset_from_txt(test_path, feature_count)
    return patterns, labels, x_test

In [36]:
def normalize_data(x_train, x_test):
    scaler = MinMaxScaler(feature_range = (0,1))
    scaler.fit(dataset_patterns)
    x_train_normalized = scaler.fit_transform(x_train)
    x_test_normalized = scaler.fit_transform(x_test)

    return x_train_normalized, x_test_normalized

In [37]:
# def feature_extractor(x_grid):
#     SIZE = x_grid.shape[1]
#     activation = 'sigmoid'
#     feature = Sequential()
#
#     feature.add(Conv2D(32, 3, activation = activation, padding = 'same', input_shape = (SIZE, SIZE, 1)))
#     feature.add(BatchNormalization())
#
#     feature.add(Conv2D(32, 3, activation = activation, padding = 'same', kernel_initializer = 'he_uniform'))
#     feature.add(BatchNormalization())
#     feature.add(MaxPool2D())
#
#     feature.add(Flatten())
#
#     return feature

In [65]:
model_params = {
    'SVC': {
        'model': SVC( ),
        'params' : {
            'C': [2**i for i in range(-5,16)] + [1],  #Regularization parameter. Providing only two as SVM is slow
            'kernel': ['rbf','linear'],
            'gamma': [2**i for i in range(-5,16)] + [0.00009111627561154887],
            'class_weight':['balanced', None],
            'degree' : [1,2,3,4,5,6]
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params' : {
            'n_neighbors': [1,3,5,7,9,11,13,15,17,19,21,23,25,27],
            'weights': ['uniform','distance'],
            'algorithm': ["ball_tree","kd_tree","brute"],
            'metric': ["minkowski","euclidean","l1", "l2","manhattan"],

        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [2**i for i in range(-5,16)] + [1],  #Regularization. . Providing only two as LR can be slow
            'penalty': ['l1', 'l2']
        }
    }
}
scores=[]

In [46]:
# Caricamento del dataset
feature_count = 16
dataset_path = 'DBs/PenDigits/pendigits_tr.txt'  # Impostare il percorso corretto

dataset_patterns, dataset_labels = ml_utilities.load_labeled_dataset_from_txt(dataset_path, feature_count)
print('Shape dataset:', dataset_patterns.shape)
print('Shape labels:', dataset_labels.shape)

Shape dataset: (442, 16)
Shape labels: (442,)


In [48]:
!conda create -n xgboost_env -c nvidia -c rapidsai py-xgboost cudatoolkit=10.2
np.random.seed(42)

x_train_raw, y_train, x_test_raw = read_data()
x_train, x_test = normalize_data(x_train_raw,x_test_raw)
for data_test_size in [0.2]:
    for folds in [1]:
        cross_val = StratifiedShuffleSplit(n_splits=folds, test_size=data_test_size, random_state=42)
        xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic', nthread=1,tree_method='gpu_hist')
        params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.02, 0.05]
        }
        for model_name, mp in model_params.items():
            # pipe = Pipeline([('classifier' , mp['model'])])
            grid =  RandomizedSearchCV(xgb,
                                       param_distributions=params,
                                       cv=cross_val,
                                       n_jobs=16,
                                       n_iter=2,
                                       return_train_score=False,
                                       random_state=42)

            grid.fit(x_train,y_train)
            scores.append({
                'data_test_size': data_test_size,
                'folds': folds,
                'model': model_name,
                'best_score': grid.best_score_,
                'mean_test_score': grid.cv_results_['mean_test_score'],
                'best_params': grid.best_params_,
            })

        df = pd.DataFrame(scores,columns=['data_test_size','folds','model','best_score','mean_test_score','best_params',])

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


2 fits failed out of a total of 2.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\David\.conda\envs\Test\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\David\.conda\envs\Test\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\David\.conda\envs\Test\lib\site-packages\xgboost\sklearn.py", line 1250, in fit
    self._Booster = train(
  File "C:\Users\David\.conda\envs\Test\lib\site-packages\xgboost\training.py", line 188, in train
    bst = _train_internal(params, dtrain,
  File "C:\Users\David\.con

XGBoostError: [12:37:58] c:\windows\temp\abs_557yfx631l\croots\recipe\xgboost-split_1659548953302\work\src\common\common.h:157: XGBoost version not compiled with GPU support.

In [69]:
np.random.seed(42)

x_train, y_train, x_test = read_data()
x_train, x_test = normalize_data(x_train,x_test)

for data_test_size in[0.05]:
    for folds in  np.arange(1, 10, 1):
        cross_val = StratifiedShuffleSplit(n_splits=folds, test_size=data_test_size, random_state=42)
        for model_name, mp in model_params.items():
            grid =  GridSearchCV(estimator=mp['model'],
                                 param_grid=mp['params'],
                                 cv=cross_val,
                                 n_jobs=16,
                                 return_train_score=False)

            grid.fit(x_train,y_train)
            scores.append({
                'data_test_size': data_test_size,
                'folds': folds,
                'model': model_name,
                'best_score': grid.best_score_,
                'mean_test_score': grid.cv_results_['mean_test_score'],
                'best_params': grid.best_params_,
            })
            print(f"fit completed")
        dd = pd.DataFrame(scores,columns=['data_test_size','folds','model','best_score','mean_test_score','best_params',])
        print(f"data_test_size= {data_test_size} and folds = {folds}")
        print(f"fit completed")

fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 1
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 2
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 3
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 4
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 5
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 6
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 7
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 8
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 9
fit completed


In [141]:
dataframe = pd.read_pickle(TESTS_PATH+"/Training")
dataframe

Unnamed: 0,data_test_size,folds,model,best_score,mean_test_score,best_params
0,0.10,1,SVC,0.933333,"[0.15555555555555556, 0.3111111111111111, 0.15...","{'C': 0.125, 'class_weight': 'balanced', 'degr..."
1,0.10,1,KNeighbors,0.888889,"[0.7555555555555555, 0.7555555555555555, 0.711...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."
2,0.10,1,logistic_regression,0.777778,"[0.13333333333333333, 0.4666666666666667, 0.26...","{'C': 0.5, 'penalty': 'l2'}"
3,0.10,2,SVC,0.866667,"[0.14444444444444443, 0.28888888888888886, 0.1...","{'C': 1, 'class_weight': 'balanced', 'degree':..."
4,0.10,2,KNeighbors,0.855556,"[0.7333333333333334, 0.7333333333333334, 0.766...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."
...,...,...,...,...,...,...
508,0.05,8,KNeighbors,0.864130,"[0.7663043478260869, 0.7663043478260869, 0.826...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."
509,0.05,8,logistic_regression,0.782609,"[0.13043478260869565, 0.4782608695652174, 0.35...","{'C': 1, 'penalty': 'l1'}"
510,0.05,9,SVC,0.864734,"[0.08695652173913043, 0.2657004830917874, 0.08...","{'C': 1, 'class_weight': None, 'degree': 1, 'g..."
511,0.05,9,KNeighbors,0.855072,"[0.7536231884057971, 0.7536231884057971, 0.806...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."


In [143]:
dataframe['score'] = 0
# for SVC
datawork = dataframe.loc[dataframe['model'] == "SVC"]
dataframe['score'].update(datawork['best_params'].map(lambda x: SVC(**x).fit(x_train, y_train).score(x_train, y_train)))
# for KNN
datawork = dataframe.loc[dataframe['model'] == "KNeighbors"]
dataframe['score'].update(datawork['best_params'].map(lambda x: KNeighborsClassifier(**x).fit(x_train, y_train).score(x_train, y_train)))# for KNN
dataframe.sort_values(by='score', ascending=False)

Unnamed: 0,data_test_size,folds,model,best_score,mean_test_score,best_params,score
256,0.55,5,KNeighbors,0.799180,"[0.7475409836065573, 0.7475409836065573, 0.768...","{'algorithm': 'ball_tree', 'metric': 'minkowsk...",1.0
292,0.60,8,KNeighbors,0.783365,"[0.743421052631579, 0.743421052631579, 0.75187...","{'algorithm': 'ball_tree', 'metric': 'minkowsk...",1.0
145,0.35,4,KNeighbors,0.830645,"[0.7709677419354839, 0.7709677419354839, 0.779...","{'algorithm': 'ball_tree', 'metric': 'minkowsk...",1.0
430,0.85,9,KNeighbors,0.713357,"[0.6959219858156028, 0.6959219858156028, 0.688...","{'algorithm': 'ball_tree', 'metric': 'minkowsk...",1.0
148,0.35,5,KNeighbors,0.823226,"[0.76, 0.76, 0.7819354838709678, 0.79483870967...","{'algorithm': 'ball_tree', 'metric': 'minkowsk...",1.0
...,...,...,...,...,...,...,...
212,0.45,8,logistic_regression,0.758166,"[0.13190954773869348, 0.4491206030150754, 0.13...","{'C': 4, 'penalty': 'l2'}",0.0
398,0.80,7,logistic_regression,0.732849,"[0.13276836158192087, 0.23930589184826473, 0.1...","{'C': 4, 'penalty': 'l2'}",0.0
209,0.45,7,logistic_regression,0.758794,"[0.1313711414213927, 0.4465183058147882, 0.144...","{'C': 2, 'penalty': 'l2'}",0.0
206,0.45,6,logistic_regression,0.758794,"[0.13149078726968175, 0.44388609715242877, 0.1...","{'C': 2, 'penalty': 'l2'}",0.0


In [124]:
databest = dataframe.sort_values(by='best_score', ascending=True)
best_classifier = SVC(**databest['best_params'][0])
# Addestramento del classificatore
best_classifier.fit(x_train, y_train)
# Calcolo delle prediction
predictions = best_classifier.predict(x_test)
predictions

array([8., 4., 8., ..., 0., 8., 2.])