## 1. Preparando el problema

### Cargando datos
Cargamos el conjunto de datos y mostramos los 20 primeros elementos


In [91]:
from itertools import product

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn import datasets

#Importamos modelos:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn import metrics
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier



datos = pd.read_csv('german_data_original.csv')
datos.head()

Unnamed: 0,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,Guarantors,...,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker,Creditability
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


## Transformando data
Transformaremos los datos cualitativos en datos numericos

In [92]:
datos['Account Balance'] = datos['Account Balance'].map({'A11':1,'A12':2,'A13':3,'A14':4})
datos['Payment Status of Previous Credit'] = datos['Payment Status of Previous Credit'].map({'A30':0,'A31':1,'A32':2,'A33':3,'A34':4})
datos['Purpose'] = datos['Purpose'].map({'A40':0,'A41':1,'A42':2,'A43':3,'A44':4,'A45':5,'A46':6,'A47':7,'A48':8,'A49':9,'A410':10})
datos['Value Savings/Stocks'] = datos['Value Savings/Stocks'].map({'A61':1,'A62':2,'A63':3,'A64':4,'A65':5})
datos['Length of current employment'] = datos['Length of current employment'].map({'A71':1,'A72':2,'A73':3,'A74':4,'A75':5})
datos['Sex & Marital Status'] = datos['Sex & Marital Status'].map({'A91':1,'A92':2,'A93':3,'A94':4,'A95':5})
datos['Guarantors'] = datos['Guarantors'].map({'A101':1,'A102':2,'A103':3})
datos['Most valuable available asset'] = datos['Most valuable available asset'].map({'A121':1,'A122':2,'A123':3,'A124':4})
datos['Concurrent Credits'] = datos['Concurrent Credits'].map({'A141':1,'A142':2,'A143':3})
datos['Type of apartment'] = datos['Type of apartment'].map({'A151':1,'A152':2,'A153':3})
datos['Occupation'] = datos['Occupation'].map({'A171':1,'A172':2,'A173':3,'A174':4})
datos['Telephone'] = datos['Telephone'].map({'A191':1,'A192':2})
datos['Foreign Worker'] = datos['Foreign Worker'].map({'A201':1,'A202':2})
datos['Creditability'] = datos['Creditability'].map({1:1,2:0})
datos.head()


Unnamed: 0,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,Guarantors,...,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker,Creditability
0,1,6,4,3,1169,5,5,4,3,1,...,1,67,3,2,2,3,1,2,1,1
1,2,48,2,3,5951,1,3,2,2,1,...,1,22,3,2,1,3,1,1,1,0
2,4,12,4,6,2096,1,4,2,3,1,...,1,49,3,2,1,2,2,1,1,1
3,1,42,2,2,7882,1,4,2,3,3,...,2,45,3,3,1,3,2,1,1,1
4,1,24,3,0,4870,1,3,3,3,1,...,4,53,3,3,2,3,2,1,1,0


### Balanceo de datos
Observamos si los datos estan balanceados

In [93]:
datos['Creditability'].value_counts()

1    700
0    300
Name: Creditability, dtype: int64

En vista de que los datos estan desvalanceados (70% credito aprovado, 30% credito denegado), aplicaremos la tecnica de oversampling para tener un balanceado correcto.

### Aplicación de oversampling y undersampling

In [94]:
#OVERSAMPLING
num_cred_aprobado = datos['Creditability'].value_counts()[1]
num_cred_denegado = datos['Creditability'].value_counts()[0]

#Muestras faltantes, count 400
num_muestras = num_cred_aprobado - num_cred_denegado 

#Guardamos en denegados la data con Creditability=0, count 300 
denegados = datos[datos['Creditability'] == 0] 
aprobados = datos[datos['Creditability'] == 1] 

muestra_aprobados = aprobados.sample(n=500, random_state=0,replace='false') 

#Guardamos una muestra de con datos duplicacdos de denegados para completar el faltante
#replace= false (el numero de datos duplicados no debe ser mayor al de la poblacion total)
muestra_denegados = denegados.sample(n=200, random_state=0,replace='false') 

data_sample = muestra_aprobados #500
data_sample = data_sample.append(denegados) #300
data_sample = data_sample.append(muestra_denegados) #200
#Verificamos la distribución de la clase
data_sample['Creditability'].value_counts()



1    500
0    500
Name: Creditability, dtype: int64

In [95]:
data_sample.head()

Unnamed: 0,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,Guarantors,...,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker,Creditability
982,3,21,2,0,2923,2,3,1,2,1,...,3,28,1,2,1,4,1,2,1,1
798,4,24,3,0,717,5,5,4,4,1,...,3,54,3,2,2,3,1,2,1,1
896,1,21,2,3,2606,1,2,4,2,1,...,2,28,3,1,1,4,1,2,1,1
264,4,10,4,0,1231,1,5,3,3,1,...,1,32,3,2,2,2,2,1,2,1
493,2,6,2,3,368,5,5,4,3,1,...,2,38,3,2,1,3,1,1,1,1


In [96]:
data_sample.describe()

Unnamed: 0,Account Balance,Duration of Credit (month),Payment Status of Previous Credit,Purpose,Credit Amount,Value Savings/Stocks,Length of current employment,Instalment per cent,Sex & Marital Status,Guarantors,...,Most valuable available asset,Age (years),Concurrent Credits,Type of apartment,No of Credits at this Bank,Occupation,No of dependents,Telephone,Foreign Worker,Creditability
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.372,21.523,2.515,2.737,3333.18,2.008,3.332,3.034,2.637,1.132,...,2.401,34.916,2.65,1.914,1.403,2.914,1.159,1.375,1.03,0.5
std,1.246262,12.343331,1.091314,2.741048,3006.695016,1.525865,1.223631,1.103658,0.713961,0.441328,...,1.045606,10.953763,0.723895,0.557597,0.592404,0.651943,0.365859,0.484365,0.170673,0.50025
min,1.0,4.0,0.0,0.0,250.0,1.0,1.0,1.0,1.0,1.0,...,1.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,1.0,12.0,2.0,0.0,1293.0,1.0,2.0,2.0,2.0,1.0,...,1.0,26.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0,0.0
50%,2.0,18.0,2.0,2.0,2327.0,1.0,3.0,4.0,3.0,1.0,...,3.0,32.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0,0.5
75%,4.0,24.0,4.0,3.0,4025.5,3.0,4.0,4.0,3.0,1.0,...,3.0,42.0,3.0,2.0,2.0,3.0,1.0,2.0,1.0,1.0
max,4.0,72.0,4.0,10.0,18424.0,5.0,5.0,4.0,4.0,3.0,...,4.0,75.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,1.0


### Separacion del conjunto de validación
Separamos el atributo "Creditability" del resto de atributos

In [97]:
X_train_val = data_sample.drop('Creditability', axis=1)
y_train_val = data_sample['Creditability']
X_train_val.count()

Account Balance                      1000
Duration of Credit (month)           1000
Payment Status of Previous Credit    1000
Purpose                              1000
Credit Amount                        1000
Value Savings/Stocks                 1000
Length of current employment         1000
Instalment per cent                  1000
Sex & Marital Status                 1000
Guarantors                           1000
Duration in Current address          1000
Most valuable available asset        1000
Age (years)                          1000
Concurrent Credits                   1000
Type of apartment                    1000
No of Credits at this Bank           1000
Occupation                           1000
No of dependents                     1000
Telephone                            1000
Foreign Worker                       1000
dtype: int64

Separamos el conjunto de entrenamiento (80%) y el de validación (20%)

In [98]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=0)
y_train.value_counts()




1    379
0    371
Name: Creditability, dtype: int64

In [99]:
y_train.value_counts()

1    379
0    371
Name: Creditability, dtype: int64

## Modelo de clasificación 

Definiremos un modelo de clasificacion para realizar un validacion cruzada.

In [100]:
#Generic function for making a classification model and accessing the performance. 
# From AnalyticsVidhya tutorial
def classification_model(model, data, predictors, outcome):
  #Fit the model:
  model.fit(data[predictors],data[outcome])
  
  #Make predictions on training set:
  predictions = model.predict(data[predictors])
  
  #Print accuracy
  accuracy = metrics.accuracy_score(predictions,data[outcome])
  print("Accuracy : %s" % "{0:.3%}".format(accuracy))

  #Perform k-fold cross-validation with 5 folds
  kf = KFold(data.shape[0], n_folds=5)
  error = []
  for train, test in kf:
    # Filter training data
    train_predictors = (data[predictors].iloc[train,:])
    
    # The target we're using to train the algorithm.
    train_target = data[outcome].iloc[train]
    
    # Training the algorithm using the predictors and target.
    model.fit(train_predictors, train_target)
    
    #Record error from each cross-validation run
    error.append(model.score(data[predictors].iloc[test,:], data[outcome].iloc[test]))
    
    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(error)))
    
  #Fit the model again so that it can be refered outside the function:
  model.fit(data[predictors],data[outcome]) 

## Aplicación del modelo
Aplicaremos el clasificador de arboles de decisión.
 - max_depth=none para visualizar sin limites de particiones.
 - random_state=0 sólo para asegurar resultados reproducibles. 

In [166]:
predictor_var = ['Account Balance','Duration of Credit (month)','Payment Status of Previous Credit','Purpose','Credit Amount','Value Savings/Stocks','Length of current employment','Instalment per cent','Sex & Marital Status','Guarantors','Duration in Current address','Most valuable available asset','Age (years)','Concurrent Credits','Type of apartment','No of Credits at this Bank','Occupation','No of dependents','Telephone','Foreign Worker']
outcome_var = 'Creditability'
model = DecisionTreeClassifier()
classification_model(model,data_sample,predictor_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 73.000%
Cross-Validation Score : 76.000%
Cross-Validation Score : 78.333%
Cross-Validation Score : 78.375%
Cross-Validation Score : 82.700%


In [163]:
modelo = LogisticRegression().fit(X_train, y_train)


predicciones_test = modelo.predict(X_test)
num_predicciones_correctas = (y_test == predicciones_test).sum()
num_total_de_muestras = len(y_test)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )

# Usando scikit-learn
print ( 'Exactitud (score)      : ', modelo.score(X_test, y_test) )
print ( 'Exactitud (metrics)    : ', metrics.accuracy_score(y_test, predicciones_test) )

Predicciones correctas :  174
Número de muestras     :  250
Exactitud (manual)     :  0.696
Exactitud (score)      :  0.696
Exactitud (metrics)    :  0.696


## Mejorando data de entrenamiento
Aplicaremos one hot encoding a las caracteristicas cualitativas, para obtener un mejor resultado.


In [103]:
datos_original = pd.read_csv('german_data_original.csv')

datos_numericos = datos_original[['Duration of Credit (month)','Credit Amount','Instalment per cent','Duration in Current address','Age (years)','No of Credits at this Bank','No of dependents','Creditability']]
datos_numericos.head()


Unnamed: 0,Duration of Credit (month),Credit Amount,Instalment per cent,Duration in Current address,Age (years),No of Credits at this Bank,No of dependents,Creditability
0,6,1169,4,4,67,2,1,1
1,48,5951,2,2,22,1,1,2
2,12,2096,2,3,49,1,2,1
3,42,7882,2,4,45,1,2,1
4,24,4870,3,4,53,2,2,2


In [104]:

data_categorica = datos_original[['Account Balance','Payment Status of Previous Credit','Purpose','Value Savings/Stocks','Length of current employment','Sex & Marital Status','Guarantors','Most valuable available asset','Concurrent Credits','Type of apartment','Occupation','Telephone','Foreign Worker']]
data_categorica.head()
#data_sample2 = pd.get_dummies(datos_original['Account Balance'])
#data_sample2.head()

Unnamed: 0,Account Balance,Payment Status of Previous Credit,Purpose,Value Savings/Stocks,Length of current employment,Sex & Marital Status,Guarantors,Most valuable available asset,Concurrent Credits,Type of apartment,Occupation,Telephone,Foreign Worker
0,A11,A34,A43,A65,A75,A93,A101,A121,A143,A152,A173,A192,A201
1,A12,A32,A43,A61,A73,A92,A101,A121,A143,A152,A173,A191,A201
2,A14,A34,A46,A61,A74,A93,A101,A121,A143,A152,A172,A191,A201
3,A11,A32,A42,A61,A74,A93,A103,A122,A143,A153,A173,A191,A201
4,A11,A33,A40,A61,A73,A93,A101,A124,A143,A153,A173,A191,A201


In [105]:
data_categorica = pd.get_dummies(data_categorica)
data_categorica.head()

Unnamed: 0,Account Balance_A11,Account Balance_A12,Account Balance_A13,Account Balance_A14,Payment Status of Previous Credit_A30,Payment Status of Previous Credit_A31,Payment Status of Previous Credit_A32,Payment Status of Previous Credit_A33,Payment Status of Previous Credit_A34,Purpose_A40,...,Type of apartment_A152,Type of apartment_A153,Occupation_A171,Occupation_A172,Occupation_A173,Occupation_A174,Telephone_A191,Telephone_A192,Foreign Worker_A201,Foreign Worker_A202
0,1,0,0,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,1,1,0
1,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,1,0
2,0,0,0,1,0,0,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0
3,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,1,0,1,0,1,0
4,1,0,0,0,0,0,0,1,0,1,...,0,1,0,0,1,0,1,0,1,0


In [106]:
data_final = data_categorica.join(datos_numericos)
data_final.head()

Unnamed: 0,Account Balance_A11,Account Balance_A12,Account Balance_A13,Account Balance_A14,Payment Status of Previous Credit_A30,Payment Status of Previous Credit_A31,Payment Status of Previous Credit_A32,Payment Status of Previous Credit_A33,Payment Status of Previous Credit_A34,Purpose_A40,...,Foreign Worker_A201,Foreign Worker_A202,Duration of Credit (month),Credit Amount,Instalment per cent,Duration in Current address,Age (years),No of Credits at this Bank,No of dependents,Creditability
0,1,0,0,0,0,0,0,0,1,0,...,1,0,6,1169,4,4,67,2,1,1
1,0,1,0,0,0,0,1,0,0,0,...,1,0,48,5951,2,2,22,1,1,2
2,0,0,0,1,0,0,0,0,1,0,...,1,0,12,2096,2,3,49,1,2,1
3,1,0,0,0,0,0,1,0,0,0,...,1,0,42,7882,2,4,45,1,2,1
4,1,0,0,0,0,0,0,1,0,1,...,1,0,24,4870,3,4,53,2,2,2


## Aplicamos undersampling y oversampling 

In [107]:
#OVERSAMPLING
num_cred_aprobado = data_final['Creditability'].value_counts()[1]
num_cred_denegado = data_final['Creditability'].value_counts()[2]

#Muestras faltantes, count 400
num_muestras = num_cred_aprobado - num_cred_denegado 

#Guardamos en denegados la data con Creditability=0, count 300 
denegados = data_final[data_final['Creditability'] == 2] 
aprobados = data_final[data_final['Creditability'] == 1] 

muestra_aprobados = aprobados.sample(n=500, random_state=0,replace='false') 

#Guardamos una muestra de con datos duplicacdos de denegados para completar el faltante
#replace= false (el numero de datos duplicados no debe ser mayor al de la poblacion total)
muestra_denegados = denegados.sample(n=200, random_state=0,replace='false') 

data_sample_final = muestra_aprobados #500
data_sample_final = data_sample_final.append(denegados) #300
data_sample_final = data_sample_final.append(muestra_denegados) #200
#Verificamos la distribución de la clase
data_sample_final['Creditability'].value_counts()

2    500
1    500
Name: Creditability, dtype: int64

In [108]:
data_sample_final.head()

Unnamed: 0,Account Balance_A11,Account Balance_A12,Account Balance_A13,Account Balance_A14,Payment Status of Previous Credit_A30,Payment Status of Previous Credit_A31,Payment Status of Previous Credit_A32,Payment Status of Previous Credit_A33,Payment Status of Previous Credit_A34,Purpose_A40,...,Foreign Worker_A201,Foreign Worker_A202,Duration of Credit (month),Credit Amount,Instalment per cent,Duration in Current address,Age (years),No of Credits at this Bank,No of dependents,Creditability
982,0,0,1,0,0,0,1,0,0,1,...,1,0,21,2923,1,1,28,1,1,1
798,0,0,0,1,0,0,0,1,0,1,...,1,0,24,717,4,4,54,2,1,1
896,1,0,0,0,0,0,1,0,0,0,...,1,0,21,2606,4,4,28,1,1,1
264,0,0,0,1,0,0,0,0,1,1,...,0,1,10,1231,3,4,32,2,2,1
493,0,1,0,0,0,0,1,0,0,0,...,1,0,6,368,4,4,38,1,1,1


In [109]:
#my_df = pd.DataFrame(a)
data_sample_final.to_csv('output.csv', index=False, header=True)

###  Dividimos la data en test y entrenamiento

In [110]:
X_train_val_final = data_sample_final.drop('Creditability', axis=1)
y_train_val_final = data_sample_final['Creditability']


X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(X_train_val_final, y_train_val_final, test_size=0.25, random_state=0)


### Aplicamos el modelo

In [111]:
modelo = LogisticRegression().fit(X_train_final, y_train_final)


predicciones_test_final = modelo.predict(X_test_final)
num_predicciones_correctas = (y_test_final == predicciones_test_final).sum()
num_total_de_muestras = len(y_test_final)
exactitud = num_predicciones_correctas / num_total_de_muestras

print ( 'Predicciones correctas : ', num_predicciones_correctas )
print ( 'Número de muestras     : ', num_total_de_muestras )
print ( 'Exactitud (manual)     : ', exactitud )

# Usando scikit-learn
print ( 'Exactitud (score)      : ', modelo.score(X_test_final, y_test_final) )
print ( 'Exactitud (metrics)    : ', metrics.accuracy_score(y_test_final, predicciones_test_final) )

Predicciones correctas :  176
Número de muestras     :  250
Exactitud (manual)     :  0.704
Exactitud (score)      :  0.704
Exactitud (metrics)    :  0.704


In [112]:
predictor_var = ['Account Balance_A11','Account Balance_A12','Account Balance_A13','Account Balance_A14','Payment Status of Previous Credit_A30','Payment Status of Previous Credit_A31','Payment Status of Previous Credit_A32','Payment Status of Previous Credit_A33','Payment Status of Previous Credit_A34','Purpose_A40','Purpose_A41','Purpose_A410','Purpose_A42','Purpose_A43','Purpose_A44','Purpose_A45','Purpose_A46','Purpose_A48','Purpose_A49','Value Savings/Stocks_A61','Value Savings/Stocks_A62','Value Savings/Stocks_A63','Value Savings/Stocks_A64','Value Savings/Stocks_A65','Length of current employment_A71','Length of current employment_A72','Length of current employment_A73','Length of current employment_A74','Length of current employment_A75','Sex & Marital Status_A91','Sex & Marital Status_A92','Sex & Marital Status_A93','Sex & Marital Status_A94','Guarantors_A101','Guarantors_A102','Guarantors_A103','Most valuable available asset_A121','Most valuable available asset_A122','Most valuable available asset_A123','Most valuable available asset_A124','Concurrent Credits_A141','Concurrent Credits_A142','Concurrent Credits_A143','Type of apartment_A151','Type of apartment_A152','Type of apartment_A153','Occupation_A171','Occupation_A172','Occupation_A173','Occupation_A174','Telephone_A191','Telephone_A192','Foreign Worker_A201','Foreign Worker_A202','Duration of Credit (month)','Credit Amount','Instalment per cent','Duration in Current address','Age (years)','No of Credits at this Bank','No of dependents']
outcome_var = 'Creditability'
model = DecisionTreeClassifier()
classification_model(model,data_sample_final,predictor_var,outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 77.500%
Cross-Validation Score : 76.250%
Cross-Validation Score : 78.500%
Cross-Validation Score : 78.625%
Cross-Validation Score : 82.900%


In [113]:
from sklearn.neural_network import MLPClassifier


A continuación, creamos una instancia del modelo, hay una gran cantidad de parámetros que puede elegir para definir y personalizar aquí, sólo vamos a definir el hidden_layer_sizes. Para este parámetro se pasa una tupla que consiste en el número de neuronas que se desea en cada capa, donde la enésima entrada en la tupla representa el número de neuronas en la capa n-ésima del modelo MLP. Hay muchas maneras de elegir estos números, pero por simplicidad vamos a elegir 3 capas con el mismo número de características que hay  en nuestro conjunto de datos:

In [114]:
mlp = MLPClassifier(hidden_layer_sizes=(20,20,20))

Ahora que el modelo se ha hecho podemos ajustar los datos de entrenamiento a nuestro modelo, recuerde que estos datos ya han sido procesados y escalados:

In [115]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', 
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 30, 30), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

Ahora que tenemos un modelo, es hora de usarlo para obtener predicciones! Podemos hacer esto simplemente con el método predict fuera de nuestro modelo ajustado:

In [116]:
predictions = mlp.predict(X_test)

Ahora podemos usar las métricas integradas de SciKit-Learn como un informe de clasificación y una matriz de confusión para evaluar qué tan bien nuestro modelo ha sido realizado:

In [117]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))

[[129   0]
 [121   0]]


In [159]:
matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_nn=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   79 |   42   |
| real  +-----+--------+--------+
|       |  -  |   36  |   93  |
+-------+-----+--------+--------+

Exactitud    :  0.688


In [148]:
svc = SVC()
svc.fit(X_train, y_train)
predictions = svc.predict(X_test)

matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_svc=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   120 |   1   |
| real  +-----+--------+--------+
|       |  -  |   56  |   73  |
+-------+-----+--------+--------+

Exactitud    :  0.772


In [154]:
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)
matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_knn=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   86 |   35   |
| real  +-----+--------+--------+
|       |  -  |   42  |   87  |
+-------+-----+--------+--------+

Exactitud    :  0.692


In [155]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
predictions = logreg.predict(X_test)
matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_logreg=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   91 |   30   |
| real  +-----+--------+--------+
|       |  -  |   46  |   83  |
+-------+-----+--------+--------+

Exactitud    :  0.696


In [156]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
predictions = gaussian.predict(X_test)
matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_gausNB=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   79 |   42   |
| real  +-----+--------+--------+
|       |  -  |   36  |   93  |
+-------+-----+--------+--------+

Exactitud    :  0.688


In [150]:
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
predictions = linear_svc.predict(X_test)
matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_linearsvc=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   1 |   120   |
| real  +-----+--------+--------+
|       |  -  |   0  |   129  |
+-------+-----+--------+--------+

Exactitud    :  0.52


In [151]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
predictions = decision_tree.predict(X_test)
matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_dt=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   101 |   20   |
| real  +-----+--------+--------+
|       |  -  |   23  |   106  |
+-------+-----+--------+--------+

Exactitud    :  0.828


In [152]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
predictions = random_forest.predict(X_test)
matriz_confusion = metrics.confusion_matrix(y_test, predictions)
TN = matriz_confusion[0,0]
FN = matriz_confusion[1,0]
FP = matriz_confusion[0,1]
TP = matriz_confusion[1,1]
exactitud_RF=(TP+TN)/(TP+FN+FP+TN)

print ('              +-----------------+')
print ('              |   Predicción    |')
print ('              +-----------------+')
print ('              |    +   |    -   |')
print ('+-------+-----+--------+--------+')
print ('| Valor |  +  |   %d |   %d   |'   % (TP, FN) )
print ('| real  +-----+--------+--------+')
print ('|       |  -  |   %d  |   %d  |'    % (FP, TN) )
print ('+-------+-----+--------+--------+')
print ()
print ( 'Exactitud    : ', (TP+TN)/(TP+FN+FP+TN) )

              +-----------------+
              |   Predicción    |
              +-----------------+
              |    +   |    -   |
+-------+-----+--------+--------+
| Valor |  +  |   105 |   16   |
| real  +-----+--------+--------+
|       |  -  |   14  |   115  |
+-------+-----+--------+--------+

Exactitud    :  0.88


In [160]:
models = pd.DataFrame({
    'Model': ['Redes Neuronales','Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 
              'Linear SVC', 
              'Decision Tree'],
    'Score': [exactitud_nn,exactitud_svc, exactitud_knn, exactitud_logreg, 
              exactitud_RF, exactitud_gausNB,
              exactitud_linearsvc, exactitud_dt]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
4,Random Forest,0.88
7,Decision Tree,0.828
1,Support Vector Machines,0.772
3,Logistic Regression,0.696
2,KNN,0.692
0,Redes Neuronales,0.688
5,Naive Bayes,0.688
6,Linear SVC,0.52
