In [2]:
import mock
import time
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from functions import data_recovery

date = time.strftime("%Y_%b_%d_%H_%M_%S", time.localtime(round(time.time())))

dataset_list=["abalone20","abalone17",\
                    "abalone8","segmentation","wine4",\
                    "german","vehicle","pima","balance","autompg","libras",\
                    "iono","glass","wine","hayes"]

opt = mock.Mock()
dataset_stats=[]
datasets=[]
opt.dataset = None
for ds in dataset_list:
	setattr(opt, 'dataset', ds)
	X, y, dim = data_recovery(opt, date)
	size = X.shape[0]
	unique, counts = np.unique(y, return_counts=True)
	stat=dict(zip(unique, counts))
	if(1 in stat.keys()):
		pos_part=stat[1]/size*100
		neg_part=100-pos_part
	else:
		pos_part=0
		neg_part=100
	r_n_part=round(neg_part,2)
	r_p_part=round(pos_part,2)
	if(pos_part>0):
		r_IR=round(neg_part/r_p_part,2)
	else:
		r_IR=0
	scaler = MinMaxScaler(feature_range=(-1,1))
	X=scaler.fit_transform(X)
	datasets.append({"dataset":ds,"size":size,"dim":dim,"pos_part":r_p_part,"neg_part":r_n_part,"IR": r_IR,"X":X,"y":y})
    

In [3]:
from numpy import mean
from numpy import std
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

def HyperParamTuning(model,grid_param,X,y):
    

    # To be used within GridSearch (5 in your case)
    inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)

    # To be used in outer CV (you asked for 10)
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)


    # Pass the gridSearch estimator to cross_val_score
    # This will be your required 10 x 5 cvs
    # 10 for outer cv and 5 for gridSearch's internal CV
    clf = GridSearchCV(estimator=model, param_grid=grid_param, cv=inner_cv, scoring='f1',verbose=1)
    scores = cross_val_score(clf, X=X, y=y, cv=outer_cv)
    
    return (scores.mean(),scores.std())

In [14]:
from sklearn.model_selection import train_test_split
from models_perso import GammaKnn, SimpleKnn, wKnn, dupKnn, LMNN_perso, cwKnn
from time import perf_counter

models={"knn":SimpleKnn(nb_nn=3),"wKnn":wKnn(nb_nn=3),"cwKnn":cwKnn(nb_nn=3),"dupKnn":dupKnn(nb_nn=3),"lmnn":LMNN_perso(nb_nn=3),"GammaKnn":GammaKnn(nb_nn=3)}
i=456
#results=[]
for ds in datasets:
    print(ds["dataset"])
    print(ds["size"])
    print(ds["IR"])
    current_res = {}
    X_train, X_test, y_train, y_test = train_test_split(ds["X"], ds["y"], test_size=0.2, random_state=i)
    for m in models.keys():
        print(m)
        debut = perf_counter()
        if(m == "GammaKnn"):
            f_measure_mean,f_measure_std=HyperParamTuning(models[m],{"gamma":np.arange(0, 1.1, 0.1)},X_train,y_train)
        elif(m == "lmnn"):
            f_measure_mean,f_measure_std=HyperParamTuning(models[m],{"lr":np.arange(0, 1.1, 0.1)},X_train,y_train)
        else:
            score = cross_val_score(models[m], X=X_train, y=y_train, cv=5, scoring='f1')
            f_measure_mean = score.mean()
            f_measure_std = score.std()
        fin = perf_counter()
        current_res[m] = {"f_measure_mean":f_measure_mean,"f_measure_std":f_measure_std,"perf":(fin-debut)}
        print(current_res[m])
    results.append(current_res)


abalone20
4177
160.29
knn
{'f_measure_mean': 0.0, 'f_measure_std': 0.0, 'perf': 0.1891654999999446}
wKnn
{'f_measure_mean': 0.0, 'f_measure_std': 0.0, 'perf': 0.10821159999977681}
cwKnn
{'f_measure_mean': 0.0, 'f_measure_std': 0.0, 'perf': 101.103514800001}
dupKnn
{'f_measure_mean': 0.025, 'f_measure_std': 0.05000000000000001, 'perf': 0.22700390000318293}
lmnn
Fitting 10 folds for each of 11 candidates, totalling 110 fits
Fitting 10 folds for each of 11 candidates, totalling 110 fits
Fitting 10 folds for each of 11 candidates, totalling 110 fits
Fitting 10 folds for each of 11 candidates, totalling 110 fits
Fitting 10 folds for each of 11 candidates, totalling 110 fits
{'f_measure_mean': 0.0, 'f_measure_std': 0.0, 'perf': 9930.790534800002}
GammaKnn
Fitting 10 folds for each of 11 candidates, totalling 110 fits
Fitting 10 folds for each of 11 candidates, totalling 110 fits
Fitting 10 folds for each of 11 candidates, totalling 110 fits
Fitting 10 folds for each of 11 candidates, totalli

In [1]:
analyze=[]
j=0
for i in range(len(datasets)):
    ds=datasets[i]
    for k in results[j].keys():
        m=results[j][k]
        current={"dataset":ds["dataset"],"size":ds["size"],"IR":ds["IR"],"method":k,"f_measure":m["f_measure_mean"],"perf":m["perf"]}
        analyze.append(current)
    j+=1


NameError: name 'datasets' is not defined

In [18]:
results

[{'knn': {'f_measure_mean': 0.8658482442362487,
   'f_measure_std': 0.03284894729863545,
   'perf': 0.0476839},
  'wKnn': {'f_measure_mean': 0.8692723004258888,
   'f_measure_std': 0.030336765208957787,
   'perf': 0.020306099999999994},
  'cwKnn': {'f_measure_mean': 0.852456081873511,
   'f_measure_std': 0.016945921191321288,
   'perf': 12.7922184},
  'dupKnn': {'f_measure_mean': 0.8753993728525853,
   'f_measure_std': 0.03350492454773057,
   'perf': 0.1265705999999973},
  'lmnn': {'f_measure_mean': 0.883028706733711,
   'f_measure_std': 0.026853288452379477,
   'perf': 1777.280083},
  'GammaKnn': {'f_measure_mean': 0.8918645276292334,
   'f_measure_std': 0.014111927396069283,
   'perf': 4.708573999999999}},
 {'knn': {'f_measure_mean': 0.5752015743006063,
   'f_measure_std': 0.019281737391040495,
   'perf': 0.11514469999997345},
  'wKnn': {'f_measure_mean': 0.5714398086401065,
   'f_measure_std': 0.014996478169223008,
   'perf': 0.06134739999993144},
  'cwKnn': {'f_measure_mean': 0.517

In [13]:
df = pd.DataFrame(analyze)
df.to_csv(f'../Outputs/benchmark_analyse.csv',sep = ';')