In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,StratifiedShuffleSplit
import ml_utilities

In [1]:
TESTS_PATH = "./tests/"
INFO_PATH = "./tests/experiments_info"
DATASET_TEST_PATH = 'DBs/PenDigits/pendigits_te.txt'
DATASET_PATH = 'DBs/PenDigits/pendigits_tr.txt'
RESULT_PATH = 'Es3Predictions.txt'

In [35]:
def read_data():
    feat_count = 16
    data_path = DATASET_PATH # Impostare il percorso corretto
    patterns, labels = ml_utilities.load_labeled_dataset_from_txt(data_path, feat_count)
    test_path = DATASET_TEST_PATH
    x_test = ml_utilities.load_unlabeled_dataset_from_txt(test_path, feature_count)
    return patterns, labels, x_test

In [36]:
def normalize_data(x_train, x_test):
    scaler = MinMaxScaler(feature_range = (0,1))
    scaler.fit(dataset_patterns)
    x_train_normalized = scaler.fit_transform(x_train)
    x_test_normalized = scaler.fit_transform(x_test)

    return x_train_normalized, x_test_normalized

In [65]:
model_params = {
    'SVC': {
        'model': SVC( ),
        'params' : {
            'C': [2**i for i in range(-5,16)] + [1],  #Regularization parameter. Providing only two as SVM is slow
            'kernel': ['rbf','linear'],
            'gamma': [2**i for i in range(-5,16)] + [0.00009111627561154887],
            'class_weight':['balanced', None],
            'degree' : [1,2,3,4,5,6]
        }
    },
    'KNeighbors': {
        'model': KNeighborsClassifier(),
        'params' : {
            'n_neighbors': [1,3,5,7,9,11,13,15,17,19,21,23,25,27],
            'weights': ['uniform','distance'],
            'algorithm': ["ball_tree","kd_tree","brute"],
            'metric': ["minkowski","euclidean","l1", "l2","manhattan"],

        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [2**i for i in range(-5,16)] + [1],  #Regularization. . Providing only two as LR can be slow
            'penalty': ['l1', 'l2']
        }
    }
}
scores=[]

In [46]:
# Caricamento del dataset
feature_count = 16
dataset_path = 'DBs/PenDigits/pendigits_tr.txt'  # Impostare il percorso corretto

dataset_patterns, dataset_labels = ml_utilities.load_labeled_dataset_from_txt(dataset_path, feature_count)
print('Shape dataset:', dataset_patterns.shape)
print('Shape labels:', dataset_labels.shape)

Shape dataset: (442, 16)
Shape labels: (442,)


In [None]:
np.random.seed(42)

x_train, y_train, x_test = read_data()
x_train, x_test = normalize_data(x_train,x_test)

In [69]:
for data_test_size in[0.05]:
    for folds in  np.arange(1, 10, 1):
        cross_val = StratifiedShuffleSplit(n_splits=folds, test_size=data_test_size, random_state=42)
        for model_name, mp in model_params.items():
            grid =  GridSearchCV(estimator=mp['model'],
                                 param_grid=mp['params'],
                                 cv=cross_val,
                                 n_jobs=16,
                                 return_train_score=False)

            grid.fit(x_train,y_train)
            scores.append({
                'data_test_size': data_test_size,
                'folds': folds,
                'model': model_name,
                'best_score': grid.best_score_,
                'mean_test_score': grid.cv_results_['mean_test_score'],
                'best_params': grid.best_params_,
            })
            print(f"fit completed")
        dd = pd.DataFrame(scores,columns=['data_test_size','folds','model','best_score','mean_test_score','best_params',])
        print(f"data_test_size= {data_test_size} and folds = {folds}")
        print(f"fit completed")

fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 1
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 2
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 3
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 4
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 5
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 6
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 7
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 8
fit completed
fit completed
fit completed
fit completed
data_test_size= 0.05 and folds = 9
fit completed


In [148]:
dataframe = pd.read_pickle(TESTS_PATH+"/Training")
dataframe.sort_values(by='best_score', ascending=False)
dataframe

Unnamed: 0,data_test_size,folds,model,best_score,mean_test_score,best_params
0,0.10,1,SVC,0.933333,"[0.15555555555555556, 0.3111111111111111, 0.15...","{'C': 0.125, 'class_weight': 'balanced', 'degr..."
1,0.10,1,KNeighbors,0.888889,"[0.7555555555555555, 0.7555555555555555, 0.711...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."
2,0.10,1,logistic_regression,0.777778,"[0.13333333333333333, 0.4666666666666667, 0.26...","{'C': 0.5, 'penalty': 'l2'}"
3,0.10,2,SVC,0.866667,"[0.14444444444444443, 0.28888888888888886, 0.1...","{'C': 1, 'class_weight': 'balanced', 'degree':..."
4,0.10,2,KNeighbors,0.855556,"[0.7333333333333334, 0.7333333333333334, 0.766...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."
...,...,...,...,...,...,...
508,0.05,8,KNeighbors,0.864130,"[0.7663043478260869, 0.7663043478260869, 0.826...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."
509,0.05,8,logistic_regression,0.782609,"[0.13043478260869565, 0.4782608695652174, 0.35...","{'C': 1, 'penalty': 'l1'}"
510,0.05,9,SVC,0.864734,"[0.08695652173913043, 0.2657004830917874, 0.08...","{'C': 1, 'class_weight': None, 'degree': 1, 'g..."
511,0.05,9,KNeighbors,0.855072,"[0.7536231884057971, 0.7536231884057971, 0.806...","{'algorithm': 'ball_tree', 'metric': 'minkowsk..."


In [149]:
dataframe['score'] = 0
# for SVC
datawork = dataframe.loc[dataframe['model'] == "SVC"]
dataframe['score'].update(datawork['best_params'].map(lambda x: SVC(**x).fit(x_train, y_train).score(x_train, y_train)))
# for KNN
datawork = dataframe.loc[dataframe['model'] == "KNeighbors"]
dataframe['score'].update(datawork['best_params'].map(lambda x: KNeighborsClassifier(**x).fit(x_train, y_train).score(x_train, y_train)))# for KNN
dataframe.sort_values(by='best_score', ascending=False)

Unnamed: 0,data_test_size,folds,model,best_score,mean_test_score,best_params,score
0,0.10,1,SVC,0.933333,"[0.15555555555555556, 0.3111111111111111, 0.15...","{'C': 0.125, 'class_weight': 'balanced', 'degr...",0.843891
486,0.05,1,SVC,0.913043,"[0.08695652173913043, 0.21739130434782608, 0.0...","{'C': 0.125, 'class_weight': 'balanced', 'degr...",0.857466
1,0.10,1,KNeighbors,0.888889,"[0.7555555555555555, 0.7555555555555555, 0.711...","{'algorithm': 'ball_tree', 'metric': 'minkowsk...",0.866516
507,0.05,8,SVC,0.880435,"[0.08695652173913043, 0.2663043478260869, 0.08...","{'C': 1, 'class_weight': 'balanced', 'degree':...",0.990950
9,0.10,4,SVC,0.877778,"[0.13888888888888887, 0.26111111111111107, 0.1...","{'C': 0.5, 'class_weight': 'balanced', 'degree...",0.902715
...,...,...,...,...,...,...,...
473,0.95,5,logistic_regression,0.611429,"[0.08571428571428572, 0.2680952380952381, 0.08...","{'C': 32, 'penalty': 'l2'}",0.000000
484,0.95,9,KNeighbors,0.607407,"[0.6074074074074074, 0.6074074074074074, 0.552...","{'algorithm': 'ball_tree', 'metric': 'minkowsk...",1.000000
482,0.95,8,logistic_regression,0.605060,"[0.08571428571428572, 0.28482142857142856, 0.0...","{'C': 16, 'penalty': 'l2'}",0.000000
485,0.95,9,logistic_regression,0.603968,"[0.08571428571428572, 0.2783068783068783, 0.08...","{'C': 16, 'penalty': 'l2'}",0.000000


In [150]:
databest = dataframe.sort_values(by='best_score', ascending=True)
best_classifier = SVC(**databest['best_params'][0])
# Addestramento del classificatore
best_classifier.fit(x_train, y_train)
# Calcolo delle prediction
predictions = best_classifier.predict(x_test)
predictions

array([8., 4., 8., ..., 0., 8., 2.])

In [151]:
RESULT_PATH = 'Es3Predictions.txt'
with open(RESULT_PATH, "w") as f:
    for prediction in predictions:
        f.write(str(int(prediction)) + '\n')
print('Salvataggio delle prediction riuscito')

Salvataggio delle prediction riuscito
