# Modelo utilizando Naive Bayes

## Importación de las librerias

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## Importamos el dataset

In [2]:
dataset = pd.read_csv('/content/datasetParcial.csv')

## Preprocesamiento de los datos

In [3]:
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

In [4]:
print(X)

[[2.17600000e+01 3.11333333e+01 4.37333333e+02 1.02966667e+03
  5.02101089e-03]
 [2.17900000e+01 3.10000000e+01 4.37333333e+02 1.00000000e+03
  5.00858127e-03]
 [2.17675000e+01 3.11225000e+01 4.34000000e+02 1.00375000e+03
  5.02156913e-03]
 ...
 [2.08900000e+01 2.77450000e+01 4.23500000e+02 1.52150000e+03
  4.23681810e-03]
 [2.08900000e+01 2.80225000e+01 4.18750000e+02 1.63200000e+03
  4.27948547e-03]
 [2.10000000e+01 2.81000000e+01 4.09000000e+02 1.86400000e+03
  4.32073200e-03]]


In [5]:
print(y)

[1 1 1 ... 1 1 1]


## Particion de los datos entre el conjunto de entrenamiento y el conjunto para test

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [7]:
print(X_train)

[[1.98900000e+01 3.54000000e+01 0.00000000e+00 5.29000000e+02
  5.08894379e-03]
 [2.00000000e+01 3.57000000e+01 0.00000000e+00 5.40000000e+02
  5.16782401e-03]
 [2.07900000e+01 2.62000000e+01 0.00000000e+00 1.49800000e+03
  3.97467591e-03]
 ...
 [2.05000000e+01 2.37000000e+01 0.00000000e+00 5.81500000e+02
  3.52920391e-03]
 [2.01000000e+01 3.22000000e+01 0.00000000e+00 5.69666667e+02
  4.68652701e-03]
 [2.15000000e+01 2.88233333e+01 0.00000000e+00 6.22333333e+02
  4.57184298e-03]]


In [8]:
print(X_test)

[[2.03900000e+01 3.27900000e+01 0.00000000e+00 6.59000000e+02
  4.86008397e-03]
 [2.10000000e+01 3.37000000e+01 0.00000000e+00 1.50000000e+03
  5.18898657e-03]
 [2.00000000e+01 3.05000000e+01 0.00000000e+00 7.16500000e+02
  4.40975111e-03]
 ...
 [2.06000000e+01 2.19700000e+01 6.00000000e+00 9.26750000e+02
  3.29057323e-03]
 [2.01000000e+01 3.35000000e+01 0.00000000e+00 5.51000000e+02
  4.87721823e-03]
 [2.06000000e+01 2.50000000e+01 0.00000000e+00 5.50000000e+02
  3.74712771e-03]]


In [9]:
print(y_train)

[0 0 0 ... 0 0 0]


In [10]:
print(y_test)

[0 0 0 ... 0 0 0]


## Escalando las caracteristicas

In [11]:
sc = StandardScaler()
X_train[:, 1:-1] = sc.fit_transform(X_train[:, 1:-1])
X_test[:, 1:-1] = sc.transform(X_test[:, 1:-1])

## Entrenando el modelo de Naive Bayes con el conjunto de entrenamiento

In [12]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

## Prediciendo los resultados del test

In [13]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Evaluación de las metricas

##### Realizando la matriz de confusión

In [14]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1451   75]
 [   2  423]]


##### Accuracy Score

In [15]:
accuracy_score(y_test, y_pred)

0.9605330599692465

##### Precision

In [16]:
precision_score(y_test, y_pred)

0.8493975903614458

##### Recall

In [17]:
recall_score(y_test, y_pred)

0.9952941176470588

##### F1-Score

In [18]:
f1_score(y_test, y_pred)

0.9165763813651138

## Score de Naive Bayes

In [19]:
classifier.score(X_test, y_test)

0.9605330599692465

## Calculamos la tasa de error

In [20]:
1 - accuracy_score(y_test, y_pred)

0.03946694003075346

# Optimización usando Grid Search

### Definicion de los hiperparametros

In [21]:
params_grid = {
    'var_smoothing': np.logspace(0, -9, num=100),
    'priors': [None, [0.3, 0.7], [0.4, 0.6]],
}

### Realizamos Grid Search con validación cruzada.

In [22]:
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=params_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

### Se ajusta el mejor modelo

In [23]:
grid_search.fit(X_train, y_train)

### Obtenemos el mejor modelo y los mejores hiperparametros

In [24]:
best_model = grid_search.best_estimator_
best_parameters = grid_search.best_params_

In [57]:
print(best_parameters)

{'priors': None, 'var_smoothing': 0.657933224657568}


### Prediciendo los resultados del test optimizado

In [25]:
y_predOpt = best_model.predict(X_test)

### Evaluación de las metricas del modelo optimizado

##### Realizando la matriz de confusión

In [26]:
cm = confusion_matrix(y_test, y_predOpt)
print(cm)

[[1462   64]
 [   4  421]]


##### Accuracy Score

In [27]:
accuracy_score(y_test, y_predOpt)

0.96514607893388

##### Precision

In [28]:
precision_score(y_test, y_predOpt)

0.8680412371134021

##### Recall

In [29]:
recall_score(y_test, y_predOpt)

0.9905882352941177

##### F1-Score

In [30]:
f1_score(y_test, y_predOpt)

0.9252747252747252

##### Tabla metricas

In [31]:
print(classification_report(y_test, y_predOpt,  labels=[0,1], target_names=['No ocupado', ' ocupado']))

              precision    recall  f1-score   support

  No ocupado       1.00      0.96      0.98      1526
     ocupado       0.87      0.99      0.93       425

    accuracy                           0.97      1951
   macro avg       0.93      0.97      0.95      1951
weighted avg       0.97      0.97      0.97      1951



### Score de naive bayes optimizado

In [32]:
best_model.score(X_test, y_test)

0.96514607893388

## Calculamos la tasa de error del modelo optimizado

In [33]:
1 - accuracy_score(y_test, y_predOpt)

0.03485392106611995

## Preprocesamiento de los datos mediante clustering

### Nos creamos un pipeline para combinar la ejecución, primero K-Means y luego Naive Bayes

In [34]:
pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=60, random_state=42)),
    ("naiveBayes", GaussianNB()),
])
pipeline.fit(X_train, y_train)



## Prediciendo los resultados del test

In [35]:
y_predK = pipeline.predict(X_test)
print(np.concatenate((y_predK.reshape(len(y_predK),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Evaluación de las metricas

##### Realizando la matriz de confusión

In [36]:
cmK = confusion_matrix(y_test, y_predK)
print(cmK)

[[1426  100]
 [   1  424]]


##### Accuracy Score

In [37]:
accuracy_score(y_test, y_predK)

0.9482316760635572

##### Precision

In [38]:
precision_score(y_test, y_predK)

0.8091603053435115

##### Recall

In [39]:
recall_score(y_test, y_predK)

0.9976470588235294

##### F1-Score

In [40]:
f1_score(y_test, y_predK)

0.8935721812434141

## Score de Naive Bayes usando clustering

In [41]:
pipeline.score(X_test, y_test)

0.9482316760635572

## Calculamos la tasa de error usando clustering

In [42]:
1 - accuracy_score(y_test, y_predK)

0.05176832393644282

# Optimización usando Grid Search con clustering

### Definición de los hiperparametros

In [43]:
param_grid = dict(kmeans__n_clusters=range(50, 90))

### Realizamos Grid Search

In [44]:
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)

### Se ajusta el mejor modelo

In [45]:
grid_clf.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits




[CV] END ..............................kmeans__n_clusters=50; total time=   3.4s




[CV] END ..............................kmeans__n_clusters=50; total time=   2.4s




[CV] END ..............................kmeans__n_clusters=50; total time=   2.1s




[CV] END ..............................kmeans__n_clusters=51; total time=   1.6s




[CV] END ..............................kmeans__n_clusters=51; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=51; total time=   0.9s




[CV] END ..............................kmeans__n_clusters=52; total time=   0.9s




[CV] END ..............................kmeans__n_clusters=52; total time=   2.0s




[CV] END ..............................kmeans__n_clusters=52; total time=   1.9s




[CV] END ..............................kmeans__n_clusters=53; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=53; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=53; total time=   1.0s




[CV] END ..............................kmeans__n_clusters=54; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=54; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=54; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=55; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=55; total time=   1.0s




[CV] END ..............................kmeans__n_clusters=55; total time=   1.5s




[CV] END ..............................kmeans__n_clusters=56; total time=   2.6s




[CV] END ..............................kmeans__n_clusters=56; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=56; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=57; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=57; total time=   1.0s




[CV] END ..............................kmeans__n_clusters=57; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=58; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=58; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=58; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=59; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=59; total time=   2.7s




[CV] END ..............................kmeans__n_clusters=59; total time=   1.4s




[CV] END ..............................kmeans__n_clusters=60; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=60; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=60; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=61; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=61; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=61; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=62; total time=   1.0s




[CV] END ..............................kmeans__n_clusters=62; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=62; total time=   2.2s




[CV] END ..............................kmeans__n_clusters=63; total time=   1.8s




[CV] END ..............................kmeans__n_clusters=63; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=63; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=64; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=64; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=64; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=65; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=65; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=65; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=66; total time=   2.5s




[CV] END ..............................kmeans__n_clusters=66; total time=   1.5s




[CV] END ..............................kmeans__n_clusters=66; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=67; total time=   1.0s




[CV] END ..............................kmeans__n_clusters=67; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=67; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=68; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=68; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=68; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=69; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=69; total time=   2.9s




[CV] END ..............................kmeans__n_clusters=69; total time=   1.5s




[CV] END ..............................kmeans__n_clusters=70; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=70; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=70; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=71; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=71; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=71; total time=   1.4s




[CV] END ..............................kmeans__n_clusters=72; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=72; total time=   2.4s




[CV] END ..............................kmeans__n_clusters=72; total time=   1.8s




[CV] END ..............................kmeans__n_clusters=73; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=73; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=73; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=74; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=74; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=74; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=75; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=75; total time=   1.8s




[CV] END ..............................kmeans__n_clusters=75; total time=   2.4s




[CV] END ..............................kmeans__n_clusters=76; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=76; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=76; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=77; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=77; total time=   1.4s




[CV] END ..............................kmeans__n_clusters=77; total time=   1.6s




[CV] END ..............................kmeans__n_clusters=78; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=78; total time=   1.7s




[CV] END ..............................kmeans__n_clusters=78; total time=   2.5s




[CV] END ..............................kmeans__n_clusters=79; total time=   1.0s




[CV] END ..............................kmeans__n_clusters=79; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=79; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=80; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=80; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=80; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=81; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=81; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=81; total time=   2.5s




[CV] END ..............................kmeans__n_clusters=82; total time=   1.8s




[CV] END ..............................kmeans__n_clusters=82; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=82; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=83; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=83; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=83; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=84; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=84; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=84; total time=   1.9s




[CV] END ..............................kmeans__n_clusters=85; total time=   2.4s




[CV] END ..............................kmeans__n_clusters=85; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=85; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=86; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=86; total time=   1.1s




[CV] END ..............................kmeans__n_clusters=86; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=87; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=87; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=87; total time=   1.8s




[CV] END ..............................kmeans__n_clusters=88; total time=   2.5s




[CV] END ..............................kmeans__n_clusters=88; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=88; total time=   1.3s




[CV] END ..............................kmeans__n_clusters=89; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=89; total time=   1.2s




[CV] END ..............................kmeans__n_clusters=89; total time=   1.3s




### Se obtiene el mejor modelo y los mejores hiperparametros

In [46]:
best_modelK = grid_clf.best_estimator_
best_parametersK = grid_clf.best_params_

In [56]:
print(best_parametersK)

{'kmeans__n_clusters': 86}


### Prediciendo los resultados del test optimizado con clustering

In [47]:
y_predKOpt = best_modelK.predict(X_test)

### Evaluación de las metricas del modelo optimizado

##### Realizando la matriz de confusión

In [48]:
cm = confusion_matrix(y_test, y_predKOpt)
print(cm)

[[1428   98]
 [   1  424]]


##### Accuracy Score

In [49]:
accuracy_score(y_test, y_predKOpt)

0.9492567913890313

##### Precision

In [50]:
precision_score(y_test, y_predKOpt)

0.8122605363984674

##### Recall

In [51]:
recall_score(y_test, y_predKOpt)

0.9976470588235294

##### F1-Score

In [52]:
f1_score(y_test, y_predKOpt)

0.8954593453009504

##### Tablas metricas

In [53]:
print(classification_report(y_test, y_predKOpt,  labels=[0,1], target_names=['No ocupado', ' ocupado']))

              precision    recall  f1-score   support

  No ocupado       1.00      0.94      0.97      1526
     ocupado       0.81      1.00      0.90       425

    accuracy                           0.95      1951
   macro avg       0.91      0.97      0.93      1951
weighted avg       0.96      0.95      0.95      1951



## Score de naive bayes usando clustering optimizado

In [54]:
best_modelK.score(X_test, y_test)

0.9492567913890313

## Calculamos la tasa de error usando clustering optimizado

In [55]:
1 - accuracy_score(y_test, y_predKOpt)

0.050743208610968704