# NB06 - Normalizer + SMOTE + RFE

In [1]:
import numpy as np
import pandas as pd
import sklearn

#Model selection
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

#Pipelines
from imblearn.pipeline import Pipeline as imbpipeline

#Balanceo de clases
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE

#Preprocessing
from sklearn.preprocessing import StandardScaler, Normalizer

#Feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

#Classifiers
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#Others
from collections import Counter
import datetime
import warnings

# Install a process-wide filter that ignores every warning category from
# this point on.
warnings.simplefilter('ignore')


# Cargar datos

In [2]:
# Read the training set (train.csv in the working directory) into a DataFrame.
ds = pd.read_csv(filepath_or_buffer='train.csv')
# Uncomment to preview the first rows:
# ds.head()

In [None]:
#ds.info()

In [3]:
# Feature matrix: every column of the dataset except the target 'clase'.
X = ds.drop(columns='clase')
# Displayed as the cell output: (rows, columns) of the feature matrix.
X.shape

(1387, 59)

In [8]:
# Target vector: the 'clase' column of the dataset.
y = ds.loc[:, 'clase']
# Uncomment for a pandas view of the class distribution:
# y.value_counts()

In [9]:
# Displayed as the cell output: number of samples per class label.
Counter(y.tolist())

Counter({'Low': 822, 'Medium': 334, 'High': 231})

# NB (Gaussian Naive Bayes)

In [None]:
# Classifier under evaluation: GaussianNB created with its default arguments.
# The same instance is used as the 'classifier' pipeline step and in the
# parameter grids.
clf = GaussianNB()

Preprocesamiento

In [None]:
# Preprocessing step for the pipelines: Normalizer with default arguments.
# Alternative kept for reference: prep1 = StandardScaler()
prep2 = Normalizer() 

Balanceo de clases

In [None]:
# Two oversamplers created from identical settings (same strategy, same seed):
# plain SMOTE and BorderlineSMOTE.
_sampler_settings = {'sampling_strategy': 'auto', 'random_state': 42}
smt1 = SMOTE(**_sampler_settings)
smt2 = BorderlineSMOTE(**_sampler_settings)

Feature selection

In [None]:
# Feature selection: recursive feature elimination driven by a random forest.
# The forest is seeded so that the selected feature subset, and therefore the
# cross-validation scores, are reproducible between runs. An unseeded forest
# would make the results vary even though the samplers and the CV splitter
# already use a fixed seed (42).
# Alternatives kept for reference:
#   fs1 = SelectKBest()
#   fs3 = SelectFromModel(clf)
fs2 = RFE(RandomForestClassifier(random_state=42))

Pipelines

In [None]:
# Sampler + classifier pipelines.
# Both share the preprocessing, feature-selection and classifier steps and
# differ only in the 'smote' step: pipeline01 uses smt1, pipeline02 uses smt2.
pipeline01, pipeline02 = (
    imbpipeline(steps=[
        ('prep', prep2),
        ('smote', sampler),
        ('fs', fs2),
        ('classifier', clf),
    ])
    for sampler in (smt1, smt2)
)


Parameters

In [None]:

# Parameter grids for GridSearchCV, keyed by '<pipeline step>__<parameter>'.
# Each grid is currently pinned to a single combination; the trailing notes
# record other candidate values for the same parameter.

# Grids for the SMOTE sampler
params01_01 = {
    'smote__k_neighbors': [5],             # other candidates: 5, 7
    'fs__n_features_to_select': [55],      # candidate range: range(55, 59)
    'classifier__var_smoothing': [1e-11],  # other candidates: 1e-10, 1e-9
    'classifier': [clf],
}

params01_02 = {
    'smote__k_neighbors': [13],            # candidates: 9, 11, 13
    'fs__n_features_to_select': [55],      # candidate range: range(55, 59)
    'classifier__var_smoothing': [1e-11],  # other candidates: 1e-10, 1e-9
    'classifier': [clf],
}

# Grids for the BorderlineSMOTE sampler (these also set m_neighbors and kind)
params02_01 = {
    'smote__k_neighbors': [7],             # candidates: 3, 5, 7
    'smote__m_neighbors': [3],             # other candidates: 5, 7
    'smote__kind': ['borderline-2'],       # candidates: 'borderline-1', 'borderline-2'
    'fs__n_features_to_select': [55],      # candidate range: range(55, 59)
    'classifier__var_smoothing': [1e-11],  # other candidates: 1e-10, 1e-9
    'classifier': [clf],
}

params02_02 = {
    'smote__k_neighbors': [9],             # other candidates: 11, 13
    'smote__m_neighbors': [3],             # other candidates: 5, 7
    'smote__kind': ['borderline-2'],       # candidates: 'borderline-1', 'borderline-2'
    'fs__n_features_to_select': [55],      # candidate range: range(55, 59)
    'classifier__var_smoothing': [1e-11],  # other candidates: 1e-10, 1e-9
    'classifier': [clf],
}

params02_03 = {
    'smote__k_neighbors': [3],             # candidates: 3, 5, 7
    'smote__m_neighbors': [9],             # other candidates: 10, 11
    'smote__kind': ['borderline-1'],       # candidates: 'borderline-1', 'borderline-2'
    'fs__n_features_to_select': [55],      # candidate range: range(55, 59)
    'classifier__var_smoothing': [1e-11],  # other candidates: 1e-10, 1e-9
    'classifier': [clf],
}

params02_04 = {
    'smote__k_neighbors': [13],            # candidates: 9, 11, 13
    'smote__m_neighbors': [10],            # other candidates: 10, 11
    'smote__kind': ['borderline-2'],
    'fs__n_features_to_select': [55],      # candidate range: range(55, 59)
    'classifier__var_smoothing': [1e-11],  # other candidates: 1e-10, 1e-9
    'classifier': [clf],
}




In [None]:
# Manual run selection: edit these values by hand before each run.
# Use pipeline01 first, then pipeline02, and change params and file2
# accordingly so they refer to the same grid.
pipeline, params = pipeline02, params02_04
file1, file2 = 'NB06-test-', 'params02_04.csv'

CV

In [None]:
# Cross-validation splitter: 10 stratified folds, repeated 10 times, with a
# fixed seed.
_cv_settings = {'n_splits': 10, 'n_repeats': 10, 'random_state': 42}
cv = RepeatedStratifiedKFold(**_cv_settings)
# Metrics recorded for every fit of the grid search.
scoring = [
    'accuracy',
    'precision_weighted',
    'recall_weighted',
    'f1_weighted',
]

# GridSearchCV

In [None]:
# Run the grid search on the selected pipeline and grid, printing a timestamp
# before and after so the duration can be read off the output.
ct = datetime.datetime.now()
print("Ini: ", ct, " ")
# refit=False: no final model is trained; later cells only read cv_results_.
_search_options = {
    'cv': cv,
    'scoring': scoring,
    'n_jobs': -1,
    'refit': False,
    'verbose': 3,
}
grid = GridSearchCV(pipeline, params, **_search_options)
grid.fit(X, y)
ct = datetime.datetime.now()
print("Fin: ", ct, " ")

In [None]:
# Collect the search's cv_results_ into a DataFrame and display its dimensions.
results = pd.DataFrame(data=grid.cv_results_)
results.shape

In [None]:
# Save the results table; the output name is the file1 prefix followed by file2.
results.to_csv(f'{file1}{file2}', index=False, encoding='utf-8-sig')

In [None]:
# This is the value to record in the Excel sheet: the highest mean weighted
# F1 test score among the evaluated parameter combinations.
results.loc[:, 'mean_test_f1_weighted'].max()

In [None]:
# Marker printed when the notebook has run through to its last cell.
print("Finished!")