In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helpers.Utilidades as utilidades

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score, confusion_matrix, classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [20]:
dataset = pd.read_csv("./data/fe_dataset.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,QUANTITYORDERED,PRICEEACH,SALES,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,MSRP,PRODUCTCODE,CUSTOMERNAME,PHONE,ADDRESSLINE1,CITY,POSTALCODE,COUNTRY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,320,54.0,100.0,205.5,1737,2,5,2005,198,118.0,18,178,178,178,207,178.0,237,178,178,0
1,20,20.0,72.55,205.5,1737,4,12,2004,218,95.0,18,35,35,35,35,35.0,120,35,35,1
2,2435,31.0,65.77,205.5,1737,1,2,2003,161,72.0,16,21,21,21,21,21.0,40,21,21,1
3,1913,39.0,100.0,205.5,1737,3,8,2004,610,118.0,23,109,109,109,109,126.0,650,123,165,0
4,933,27.0,100.0,205.5,1737,4,11,2004,610,169.0,25,18,18,18,32,32.0,650,46,18,0


## Dvisión del dataset

In [21]:
X = dataset.drop(['DEALSIZE'], axis=1)
y = dataset['DEALSIZE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=2022)

## Modelos

In [22]:
lr = 'LogisticRegression().fit(X_train, y_train)'
svm = 'SVC(gamma="scale", kernel="rbf").fit(X_train, y_train)'
dt = 'DecisionTreeClassifier(criterion="gini", max_depth=4).fit(X_train, y_train)'
knn = 'KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)'
nb = 'GaussianNB().fit(X_train, y_train)'
lda = 'LinearDiscriminantAnalysis(solver="svd", store_covariance=True).fit(X_train, y_train)'
qda = 'QuadraticDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)'

In [23]:
lr = GridSearchCV(LogisticRegression(),{'C': [0.1,1.0, 10,100], 'solver': ['sag', 'lbfgs'], 'max_iter': [10000]}, cv=10).fit(X_train, y_train)

In [24]:
svm = GridSearchCV(SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}, cv=10).fit(X_train, y_train)

In [25]:
dt = GridSearchCV(DecisionTreeClassifier(),{'criterion': ['gini', 'entropy'], 'max_depth':[3,4,5]}, cv= 10).fit(X_train, y_train)

In [26]:
knn = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': [3,4,5]}, cv = 10).fit(X_train, y_train)

In [27]:
nb = GridSearchCV(GaussianNB(), {},cv = 10).fit(X_train, y_train)

In [28]:
lda = GridSearchCV(LinearDiscriminantAnalysis(), {"solver": ["svd", "lsqr"], "store_covariance": [True]},cv = 10).fit(X_train, y_train)

In [29]:
qda = GridSearchCV(QuadraticDiscriminantAnalysis(), {"store_covariance": [True]},cv = 10).fit(X_train, y_train)



In [30]:
randomForest = GridSearchCV(RandomForestClassifier(), {'n_estimators': [20, 50, 100, 500, 1000], 
                                                        'max_depth': [10, 100, 500]}).fit(X_train, y_train)

In [31]:
modelos = {"lr": lr, "svm": svm, "dt": dt, "knn": knn, "nb": nb, "lda": lda, "qda": qda, "rf": randomForest}
modelos

{'lr': GridSearchCV(cv=10, estimator=LogisticRegression(),
              param_grid={'C': [0.1, 1.0, 10, 100], 'max_iter': [10000],
                          'solver': ['sag', 'lbfgs']}),
 'svm': GridSearchCV(cv=10, estimator=SVC(),
              param_grid={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
 'dt': GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
              param_grid={'criterion': ['gini', 'entropy'],
                          'max_depth': [3, 4, 5]}),
 'knn': GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
              param_grid={'n_neighbors': [3, 4, 5]}),
 'nb': GridSearchCV(cv=10, estimator=GaussianNB(), param_grid={}),
 'lda': GridSearchCV(cv=10, estimator=LinearDiscriminantAnalysis(),
              param_grid={'solver': ['svd', 'lsqr'], 'store_covariance': [True]}),
 'qda': GridSearchCV(cv=10, estimator=QuadraticDiscriminantAnalysis(),
              param_grid={'store_covariance': [True]}),
 'rf': GridSearchCV(estimator=RandomForestClassifier(),

In [32]:
aucScores = {}
matrizResultados = pd.DataFrame(columns=np.arange(0, len(y_test), 1).tolist())
for nomreModelo, modelo in modelos.items():
    predicciones = modelo.predict(X_test)
    auc = roc_auc_score(y_test, predicciones)
    aucScores[nomreModelo] = auc
    tempDf = pd.DataFrame(predicciones).T
    matrizResultados = pd.concat([matrizResultados, tempDf])
    
matrizResultados.index=list(modelos.keys())

matrizResultados

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,536,537,538,539,540,541,542,543,544,545
lr,0,0,1,0,0,1,1,1,0,0,...,0,0,1,1,1,1,0,1,1,1
svm,0,0,1,0,0,1,1,0,0,0,...,0,0,1,1,1,1,0,1,1,1
dt,0,0,1,0,0,1,1,0,1,0,...,0,1,1,1,1,1,0,1,1,1
knn,0,1,1,0,0,0,0,1,1,0,...,1,1,1,0,1,1,0,1,0,1
nb,0,1,1,0,0,1,1,1,0,0,...,0,0,1,1,1,1,0,1,1,1
lda,0,0,1,0,0,1,1,0,0,0,...,0,0,1,1,1,1,0,1,1,1
qda,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
rf,0,1,1,0,0,1,1,0,0,0,...,0,1,1,1,1,1,0,1,1,1


In [33]:
aucScores

{'lr': 0.9303723582690373,
 'svm': 0.9375914122777591,
 'dt': 0.9487688695068769,
 'knn': 0.6426635357262662,
 'nb': 0.8514927876551494,
 'lda': 0.9245420999664542,
 'qda': 0.5289567259308957,
 'rf': 0.9321368668232136}

In [34]:
for nombreModelo, modelo in modelos.items():
    predicciones = modelo.predict(X_test)
    utilidades.MatrizConfusion(nombreModelo, y_test, predicciones)


Matriz de Confusión - lr: 

 Predicción     0    1
observación          
0            257   18
1             20  251

Sentitividad:  0.9261992619926199
Especificidad:  0.9345454545454546

Matriz de Confusión - svm: 

 Predicción     0    1
observación          
0            263   12
1             22  249

Sentitividad:  0.9188191881918819
Especificidad:  0.9563636363636364

Matriz de Confusión - dt: 

 Predicción     0    1
observación          
0            259   16
1             12  259

Sentitividad:  0.955719557195572
Especificidad:  0.9418181818181818

Matriz de Confusión - knn: 

 Predicción     0    1
observación          
0            184   91
1            104  167

Sentitividad:  0.6162361623616236
Especificidad:  0.6690909090909091

Matriz de Confusión - nb: 

 Predicción     0    1
observación          
0            240   35
1             46  225

Sentitividad:  0.8302583025830258
Especificidad:  0.8727272727272727

Matriz de Confusión - lda: 

 Predicción     0    1
observ

# Elección

De acuerdo a la matriz de confusión el mejor modelo es con una regresión logistica, con los parametros: C = 0.1, max_iter = 10000 y solver = lbfgs

In [37]:
lrResultados = pd.DataFrame(lr.cv_results_)
lrResultados.sort_values('rank_test_score', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_max_iter,param_solver,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.58832,0.255628,0.001609,0.000293,0.1,10000,lbfgs,"{'C': 0.1, 'max_iter': 10000, 'solver': 'lbfgs'}",0.96875,0.945312,...,0.929134,0.96063,0.929134,0.937008,0.937008,0.929134,0.937008,0.936374,0.01992,1
0,0.822375,0.031146,0.003598,0.006448,0.1,10000,sag,"{'C': 0.1, 'max_iter': 10000, 'solver': 'sag'}",0.96875,0.953125,...,0.913386,0.968504,0.929134,0.937008,0.937008,0.937008,0.913386,0.935575,0.022146,2
2,0.821584,0.021715,0.001548,0.000323,1.0,10000,sag,"{'C': 1.0, 'max_iter': 10000, 'solver': 'sag'}",0.96875,0.953125,...,0.913386,0.968504,0.929134,0.937008,0.937008,0.937008,0.913386,0.935575,0.022146,2
4,0.842143,0.067626,0.001692,0.000209,10.0,10000,sag,"{'C': 10, 'max_iter': 10000, 'solver': 'sag'}",0.96875,0.953125,...,0.913386,0.968504,0.929134,0.937008,0.937008,0.937008,0.913386,0.935575,0.022146,2
6,1.0074,0.088559,0.001985,0.000612,100.0,10000,sag,"{'C': 100, 'max_iter': 10000, 'solver': 'sag'}",0.96875,0.953125,...,0.913386,0.968504,0.929134,0.937008,0.937008,0.937008,0.913386,0.935575,0.022146,2
3,0.685703,0.300908,0.001447,0.00026,1.0,10000,lbfgs,"{'C': 1.0, 'max_iter': 10000, 'solver': 'lbfgs'}",0.953125,0.945312,...,0.929134,0.96063,0.92126,0.937008,0.937008,0.937008,0.937008,0.934031,0.020113,6
7,1.555816,1.118673,0.001649,0.000641,100.0,10000,lbfgs,"{'C': 100, 'max_iter': 10000, 'solver': 'lbfgs'}",0.953125,0.945312,...,0.929134,0.96063,0.92126,0.937008,0.937008,0.937008,0.937008,0.934031,0.020113,6
5,1.235886,0.630723,0.001712,0.000469,10.0,10000,lbfgs,"{'C': 10, 'max_iter': 10000, 'solver': 'lbfgs'}",0.953125,0.945312,...,0.929134,0.96063,0.92126,0.937008,0.937008,0.929134,0.937008,0.933243,0.020135,8
