# kNN06 - Normalizer + SMOTE + RFE

In [1]:
import numpy as np
import pandas as pd
import sklearn

#Model selection
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

#Pipelines
from imblearn.pipeline import Pipeline as imbpipeline

#Balanceo de clases
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE

#Preprocessing
from sklearn.preprocessing import StandardScaler, Normalizer

#Feature selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, KernelPCA

#Classifiers
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#Others
from collections import Counter
import datetime
import warnings

warnings.filterwarnings('ignore')

# Cargar datos

In [2]:
ds = pd.read_csv('train.csv')
ds.head()

Unnamed: 0,KeyT,BackT,DelT,EntT,SpbT,ClicT,LClicT,RClicT,ScrT,ScrUT,...,FloT,HRA,StpA,DistA,FloA,HRSD,StpSD,DistSD,FloSD,clase
0,751,125,6,11,4,586,581,1,732614,282773,...,0,79.833333,1.841667,0.001322,0.0,4.398879,6.305921,0.004527,0.0,High
1,2222,236,28,46,23,706,702,4,193660,84133,...,2,75.0,1.583333,0.001085,0.016667,5.746038,7.41516,0.00508,0.128556,Medium
2,1370,97,24,51,24,943,932,11,1,1,...,1,72.041667,1.116667,0.000757,0.008333,7.117732,4.348637,0.002952,0.091287,Low
3,4953,230,76,3052,139,508,491,7,312581,82231,...,0,80.258333,0.533333,0.000383,0.0,5.557183,2.643951,0.001898,0.0,High
4,184,35,8,10,4,225,225,0,42270,16939,...,0,100.515152,2.825,0.002107,0.0,10.311163,11.964698,0.008901,0.0,High


In [3]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1387 entries, 0 to 1386
Data columns (total 60 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   KeyT     1387 non-null   int64  
 1   BackT    1387 non-null   int64  
 2   DelT     1387 non-null   int64  
 3   EntT     1387 non-null   int64  
 4   SpbT     1387 non-null   int64  
 5   ClicT    1387 non-null   int64  
 6   LClicT   1387 non-null   int64  
 7   RClicT   1387 non-null   int64  
 8   ScrT     1387 non-null   int64  
 9   ScrUT    1387 non-null   int64  
 10  ScrDT    1387 non-null   int64  
 11  PixT     1387 non-null   int64  
 12  WinT     1387 non-null   int64  
 13  ChgeT    1387 non-null   int64  
 14  KeyA     1387 non-null   float64
 15  BackA    1387 non-null   float64
 16  DelA     1387 non-null   float64
 17  EntA     1387 non-null   float64
 18  SpbA     1387 non-null   float64
 19  ClicA    1387 non-null   float64
 20  LClicA   1387 non-null   float64
 21  RClicA   1387 

In [4]:
X = ds.drop('clase', axis=1)
X.shape

(1387, 59)

In [5]:
y = ds['clase']
y.shape

(1387,)

In [6]:
y.value_counts()

clase
Low       822
Medium    334
High      231
Name: count, dtype: int64

In [7]:
Counter(y)

Counter({'Low': 822, 'Medium': 334, 'High': 231})

# kNN

In [8]:
clf = KNeighborsClassifier()

Preprocesamiento

In [9]:
#prep1 = StandardScaler() 
prep2 = Normalizer() 

Balanceo de clases

In [10]:
smt1 = SMOTE(sampling_strategy='auto',n_jobs=-1,random_state=42)
smt2 = BorderlineSMOTE(sampling_strategy='auto',n_jobs=-1,random_state=42)

Feature selection

In [11]:
#fs1 = SelectKBest()
fs2 = RFE(RandomForestClassifier())
#fs3 = SelectFromModel(clf)

Pipelines

In [12]:
#smt + clf
pipeline01 = imbpipeline([('prep', prep2), ('smote', smt1), ('fs', fs2), ('classifier', clf)])
pipeline02 = imbpipeline([('prep', prep2), ('smote', smt2), ('fs', fs2), ('classifier', clf)])


Parameters

In [13]:
#Hyper-parameter tuning
#SMOTE
params01_01 = {}
params01_01['smote__k_neighbors'] =  [3]#[3, 5, 7]
params01_01['fs__n_features_to_select'] = [55]#range(55, 59)
params01_01['classifier__n_neighbors'] = [3]#[3,5,7]
params01_01['classifier__weights'] = ['uniform']#,'distance']
params01_01['classifier__algorithm'] = ['ball_tree']#,'kd_tree','brute']
params01_01['classifier__metric'] = ['minkowski']#,'euclidean','manhattan']
params01_01['classifier'] = [clf]

params01_02 = {}
params01_02['smote__k_neighbors'] =  [9]#[9, 11, 13]
params01_02['fs__n_features_to_select'] = [55]#range(55, 59)
params01_02['classifier__n_neighbors'] = [7]#[3,5,7]
params01_02['classifier__weights'] = ['distance']#['uniform','distance']
params01_02['classifier__algorithm'] = ['ball_tree']#['ball_tree','kd_tree','brute']
params01_02['classifier__metric'] = ['manhattan']#['minkowski','euclidean','manhattan']
params01_02['classifier'] = [clf]

#BordelineSMOTE
params02_01 = {}
params02_01['smote__k_neighbors'] =  [3, 5, 7]
params02_01['smote__m_neighbors'] = [3, 5, 7]
params02_01['smote__kind'] = ['borderline-1','borderline-2']
params02_01['fs__n_features_to_select'] = range(55, 59)
params02_01['classifier__n_neighbors'] = [3,5,7]
params02_01['classifier__weights'] = ['uniform','distance']
params02_01['classifier__algorithm'] = ['ball_tree','kd_tree','brute']
params02_01['classifier__metric'] = ['minkowski','euclidean','manhattan']
params02_01['classifier'] = [clf]

params02_02 = {}
params02_02['smote__k_neighbors'] =  [9, 11, 13]
params02_02['smote__m_neighbors'] = [3, 5, 7]
params02_02['smote__kind'] = ['borderline-1','borderline-2']
params02_02['fs__n_features_to_select'] = range(55, 59)
params02_02['classifier__n_neighbors'] = [3,5,7]
params02_02['classifier__weights'] = ['uniform','distance']
params02_02['classifier__algorithm'] = ['ball_tree','kd_tree','brute']
params02_02['classifier__metric'] = ['minkowski','euclidean','manhattan']
params02_02['classifier'] = [clf]

params02_03 = {}
params02_03['smote__k_neighbors'] =  [3, 5, 7]
params02_03['smote__m_neighbors'] = [9, 10, 11]
params02_03['smote__kind'] = ['borderline-1','borderline-2']
params02_03['fs__n_features_to_select'] = range(55, 59)
params02_03['classifier__n_neighbors'] = [3,5,7]
params02_03['classifier__weights'] = ['uniform','distance']
params02_03['classifier__algorithm'] = ['ball_tree','kd_tree','brute']
params02_03['classifier__metric'] = ['minkowski','euclidean','manhattan']
params02_03['classifier'] = [clf]

params02_04 = {}
params02_04['smote__k_neighbors'] =  [9, 11, 13]
params02_04['smote__m_neighbors'] = [9, 10, 11]
params02_04['smote__kind'] = ['borderline-1','borderline-2']
params02_04['fs__n_features_to_select'] = range(55, 59)
params02_04['classifier__n_neighbors'] = [3,5,7]
params02_04['classifier__weights'] = ['uniform','distance']
params02_04['classifier__algorithm'] = ['ball_tree','kd_tree','brute']
params02_04['classifier__metric'] = ['minkowski','euclidean','manhattan']
params02_04['classifier'] = [clf]




In [14]:
#Aqui es donde hay que cambiarle manualmente los datos
# pipeline01, luego pipeline02
#al igual en params y file2

pipeline = pipeline02
params = params01_02
file1='kNN06-test-'
file2='params01_02_'
file3='v24.csv'

CV

In [15]:
cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10,random_state=42)
scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# GridSearchCV

In [16]:
ct = datetime.datetime.now()
print("Ini: ", ct, " ")
grid=GridSearchCV(pipeline, params, cv=cv, scoring=scoring, n_jobs=-1, refit=False, verbose=3)
grid.fit(X, y)
ct = datetime.datetime.now()
print("Fin: ", ct, " ")

Ini:  2024-07-11 12:21:47.279558  
Fitting 100 folds for each of 1 candidates, totalling 100 fits
[CV 1/100] END classifier=KNeighborsClassifier(), classifier__algorithm=ball_tree, classifier__metric=manhattan, classifier__n_neighbors=7, classifier__weights=distance, fs__n_features_to_select=55, smote__k_neighbors=9; accuracy: (test=0.511) f1_weighted: (test=0.524) precision_weighted: (test=0.559) recall_weighted: (test=0.511) total time=   5.7s
[CV 2/100] END classifier=KNeighborsClassifier(), classifier__algorithm=ball_tree, classifier__metric=manhattan, classifier__n_neighbors=7, classifier__weights=distance, fs__n_features_to_select=55, smote__k_neighbors=9; accuracy: (test=0.446) f1_weighted: (test=0.474) precision_weighted: (test=0.528) recall_weighted: (test=0.446) total time=   5.3s
[CV 3/100] END classifier=KNeighborsClassifier(), classifier__algorithm=ball_tree, classifier__metric=manhattan, classifier__n_neighbors=7, classifier__weights=distance, fs__n_features_to_select=55,

In [17]:
results=pd.DataFrame(grid.cv_results_)
results.shape

(1, 424)

In [18]:
results.to_csv(file1+file2+file3, index=False, encoding='utf-8-sig')

In [19]:
# Este es el resultado que hay que anotar en excel
results['mean_test_f1_weighted'].max()

0.4555000627311603

In [20]:
print("Finished!")

Finished!
