## Cargamos Librerias

In [30]:
from sklearn.model_selection import train_test_split

#Weak Modeles
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

#Metrics
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler #Estandarizador

from sklearn.utils import resample #para bootstraping.


import pandas as pd
import numpy as np

import modulos as mm
import preprocessors as pp
from sklearn.pipeline import Pipeline

## Cargamos los Datos

In [46]:
# Definimos algunas constantes a utilizar para mejor mantenimiento del programa
dataset_entrada = 'marketing_campaign.csv'
columna_target = 'Response'

dataset = pd.read_csv(dataset_entrada, sep=';')
dataset.dropna(inplace=True)

In [48]:
#obtenemos tipos de datos de columnas de dataframe
categoric_vars, discrete_vars , continues_vars = mm.getColumnsDataTypes(df=dataset)

In [49]:
# Definimos los pasos del pipeline 
my_feature_engineering_pipeline = Pipeline(steps = [
    # hacemos encoding de las variables categorias utilizando el tipo de mapeo de frecuencia
    ('categorical_variables_codification',
        pp.CategoricalEncoderOperator(varNames = categoric_vars, map_type='freq'))
])

# Ejecutamos el pipeline sobre la data
procesed_data = my_feature_engineering_pipeline.fit_transform(dataset)
# mostramos la data imputada.
print(procesed_data)

print('\n---------- Despues  de la Imputacion ----------')

cat_cols = mm.getCategoricalCols(procesed_data)
print('Columnas categoricas: ',cat_cols)

         ID  Year_Birth  Education  Marital_Status   Income  Kidhome  \
0      5524        1957     1116.0           471.0  58138.0        0   
1      2174        1954     1116.0           471.0  46344.0        1   
2      4141        1965     1116.0           573.0  71613.0        0   
3      6182        1984     1116.0           573.0  26646.0        1   
4      5324        1981      481.0           857.0  58293.0        1   
...     ...         ...        ...             ...      ...      ...   
2235  10870        1967     1116.0           857.0  61223.0        0   
2236   4001        1946      481.0           573.0  64014.0        2   
2237   7270        1981     1116.0           232.0  56981.0        0   
2238   8235        1956      365.0           573.0  69245.0        0   
2239   9405        1954      481.0           857.0  52869.0        1   

      Teenhome  Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  \
0            0          4.0       58       635  ...          

In [50]:
dataset = procesed_data

## Estandarización de Variables Numericas

In [51]:
df_scaled = dataset.copy()

cols_to_scale = continues_vars

features = df_scaled[cols_to_scale]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)

df_scaled[cols_to_scale] = features
df_scaled

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,-0.019809,-0.986443,1116.0,471.0,0.234063,0,0,4.0,0.310532,0.978226,...,7,0,0,0,0,0,0,3,11,1
1,-1.051009,-1.236801,1116.0,471.0,-0.234559,1,1,5.0,-0.380509,-0.872024,...,5,0,0,0,0,0,0,3,11,0
2,-0.445526,-0.318822,1116.0,573.0,0.769478,0,0,4.0,-0.795134,0.358511,...,4,0,0,0,0,0,0,3,11,0
3,0.182737,1.266777,1116.0,573.0,-1.017239,1,0,5.0,-0.795134,-0.872024,...,6,0,0,0,0,0,0,3,11,0
4,-0.081373,1.016420,481.0,857.0,0.240221,1,0,3.0,1.554407,-0.391671,...,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,1.625801,-0.151917,1116.0,857.0,0.356642,0,1,5.0,-0.104093,1.197646,...,5,0,0,0,0,0,0,3,11,0
2236,-0.488620,-1.904422,481.0,573.0,0.467539,2,1,3.0,0.241428,0.299208,...,7,0,0,0,1,0,0,3,11,0
2237,0.517646,1.016420,1116.0,232.0,0.188091,0,0,5.0,1.450751,1.787710,...,6,0,1,0,0,0,0,3,11,0
2238,0.814693,-1.069896,365.0,573.0,0.675388,0,1,3.0,-1.417072,0.364441,...,3,0,0,0,0,0,0,3,11,0


## Seleccionamos Variable Target y Variables Predictoras

In [52]:
#X = dataset.loc[:, dataset.columns != 'target']
#y = dataset.loc[:, dataset.columns == 'target']

X = df_scaled.drop(columna_target,axis = 1)
y = df_scaled[columna_target]

## Separamos en data para Entrenamiento y Prueba

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2020)

## Creamos Configuración de Modelos

In [54]:
lr = 'LogisticRegression().fit(X_train, y_train)'
svm = 'SVC(gamma="scale", kernel="rbf").fit(X_train, y_train)'
dt = 'DecisionTreeClassifier(criterion="gini", max_depth=4).fit(X_train, y_train)'
knn = 'KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)'
nb = 'GaussianNB().fit(X_train, y_train)'
lda = 'LinearDiscriminantAnalysis(solver="svd", store_covariance=True).fit(X_train, y_train)'
qda = 'QuadraticDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)'

models_list = {"lr" :lr, "svm":svm, "dt": dt,  "knn":knn, "nb":nb, "lda":lda, "qda":qda}
models_list

{'lr': 'LogisticRegression().fit(X_train, y_train)',
 'svm': 'SVC(gamma="scale", kernel="rbf").fit(X_train, y_train)',
 'dt': 'DecisionTreeClassifier(criterion="gini", max_depth=4).fit(X_train, y_train)',
 'knn': 'KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)',
 'nb': 'GaussianNB().fit(X_train, y_train)',
 'lda': 'LinearDiscriminantAnalysis(solver="svd", store_covariance=True).fit(X_train, y_train)',
 'qda': 'QuadraticDiscriminantAnalysis(store_covariance=True).fit(X_train, y_train)'}

## Creamos Dataset Nuevamente

In [55]:
dataset = pd.concat([X_train, y_train], axis=1)
dataset

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
1482,-0.765351,1.016420,200.0,573.0,-1.304595,1,0,6.0,-0.587822,-0.898710,...,8,0,0,0,0,0,0,3,11,0
2208,-0.535101,-0.485727,54.0,573.0,-1.483239,0,1,5.0,0.103220,-0.895745,...,6,0,0,0,0,0,0,3,11,0
1808,-1.221850,-1.069896,1116.0,857.0,1.514728,0,0,4.0,-1.624384,-0.038818,...,1,0,0,0,0,0,0,3,11,1
1146,-1.489654,1.016420,1116.0,573.0,-0.400727,1,0,3.0,-0.622374,-0.768244,...,6,1,0,0,0,0,0,3,11,1
1448,-1.340976,1.099872,481.0,573.0,0.706897,0,0,7.0,0.172324,0.835899,...,2,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993,1.408479,-0.902990,1116.0,471.0,1.677756,0,1,3.0,0.897917,2.110911,...,5,0,1,1,0,0,0,3,11,0
1678,-0.468920,0.599157,481.0,857.0,-0.107411,1,0,5.0,-1.693488,0.008624,...,7,0,0,0,0,0,0,3,11,0
2162,-1.419471,-0.986443,1116.0,573.0,-0.195779,0,1,5.0,-1.347968,-0.311612,...,6,0,0,0,0,0,0,3,11,0
405,-1.660494,-0.318822,1116.0,857.0,-0.168522,1,1,3.0,0.206876,-0.836442,...,6,0,0,0,0,0,0,3,11,0


## Creamos Boost y Modelos

In [56]:
train_models_list = {}

for model_name, model in models_list.items():
    boot = resample(dataset, replace=True, n_samples=200, random_state=2020)
    X_train = boot.drop(columna_target, axis = 1)
    y_train = boot[columna_target]
    train_model = eval(model)
    train_models_list[model_name] = train_model
    
train_models_list

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'lr': LogisticRegression(),
 'svm': SVC(),
 'dt': DecisionTreeClassifier(max_depth=4),
 'knn': KNeighborsClassifier(),
 'nb': GaussianNB(),
 'lda': LinearDiscriminantAnalysis(store_covariance=True),
 'qda': QuadraticDiscriminantAnalysis(store_covariance=True)}

## Realizamos Predicciones con cada Modelo

In [57]:
auc_scores = {}
results_matrix = pd.DataFrame(columns=np.arange(0, len(y_test), 1).tolist())

for model_name, train_model in train_models_list.items():
    predicciones = train_model.predict(X_test)
    auc = roc_auc_score(y_test, predicciones)
    auc_scores[model_name] = auc
    tempDf = pd.DataFrame(predicciones).T
    results_matrix = results_matrix.append(tempDf)
    
    
    
results_matrix.index=list(train_models_list.keys())

 
results_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,655,656,657,658,659,660,661,662,663,664
lr,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
svm,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dt,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
knn,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
nb,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
lda,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
qda,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Ralizamos Votación resultados

In [58]:
votacion = results_matrix.apply(pd.value_counts)
votacion

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,655,656,657,658,659,660,661,662,663,664
0,7.0,7.0,6,7.0,7.0,7.0,4,7.0,7.0,7.0,...,7.0,7.0,7.0,3,7.0,5,7.0,7.0,7.0,7.0
1,,,1,,,,3,,,,...,,,,4,,2,,,,


In [59]:
final_predictions = []

for (columnName, columnData) in votacion.iteritems():
    column_result = columnData.values
    final_predictions.append(np.nanargmax(column_result, axis=0))

final_predictions

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [60]:
auc = roc_auc_score(y_test, final_predictions)
auc

0.5538053097345133

In [61]:
auc_scores

{'lr': 0.589646017699115,
 'svm': 0.5,
 'dt': 0.623141592920354,
 'knn': 0.5099557522123894,
 'nb': 0.6815486725663717,
 'lda': 0.6537610619469028,
 'qda': 0.5}