# Modelo utilizando K-Nearest Neighbors (K-NN)

## Importación de las librerías

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Importamos el dataset

In [None]:
# Location of the input CSV, kept in one named constant so it is easy to change.
# NOTE(review): '/content/' looks like a Colab working directory — adjust the
# path when running the notebook elsewhere.
DATASET_PATH = '/content/datasetParcial.csv'
dataset = pd.read_csv(DATASET_PATH)

## Preprocesamiento de los datos

In [None]:
# Features: every column except the first and the last.
# Target: the last column.
features = dataset.iloc[:, 1:-1]
labels = dataset.iloc[:, -1]
X, y = features.values, labels.values

In [None]:
# Print the feature matrix X for a quick visual check.
print(X)

[[2.17600000e+01 3.11333333e+01 4.37333333e+02 1.02966667e+03
  5.02101089e-03]
 [2.17900000e+01 3.10000000e+01 4.37333333e+02 1.00000000e+03
  5.00858127e-03]
 [2.17675000e+01 3.11225000e+01 4.34000000e+02 1.00375000e+03
  5.02156913e-03]
 ...
 [2.08900000e+01 2.77450000e+01 4.23500000e+02 1.52150000e+03
  4.23681810e-03]
 [2.08900000e+01 2.80225000e+01 4.18750000e+02 1.63200000e+03
  4.27948547e-03]
 [2.10000000e+01 2.81000000e+01 4.09000000e+02 1.86400000e+03
  4.32073200e-03]]


In [None]:
# Print the target vector y for a quick visual check.
print(y)

[1 1 1 ... 1 1 1]


## Partición de los datos entre el conjunto de entrenamiento y el conjunto para test

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Print the training features.
print(X_train)

[[1.98900000e+01 3.54000000e+01 0.00000000e+00 5.29000000e+02
  5.08894379e-03]
 [2.00000000e+01 3.57000000e+01 0.00000000e+00 5.40000000e+02
  5.16782401e-03]
 [2.07900000e+01 2.62000000e+01 0.00000000e+00 1.49800000e+03
  3.97467591e-03]
 ...
 [2.05000000e+01 2.37000000e+01 0.00000000e+00 5.81500000e+02
  3.52920391e-03]
 [2.01000000e+01 3.22000000e+01 0.00000000e+00 5.69666667e+02
  4.68652701e-03]
 [2.15000000e+01 2.88233333e+01 0.00000000e+00 6.22333333e+02
  4.57184298e-03]]


In [None]:
# Print the test features.
print(X_test)

[[2.03900000e+01 3.27900000e+01 0.00000000e+00 6.59000000e+02
  4.86008397e-03]
 [2.10000000e+01 3.37000000e+01 0.00000000e+00 1.50000000e+03
  5.18898657e-03]
 [2.00000000e+01 3.05000000e+01 0.00000000e+00 7.16500000e+02
  4.40975111e-03]
 ...
 [2.06000000e+01 2.19700000e+01 6.00000000e+00 9.26750000e+02
  3.29057323e-03]
 [2.01000000e+01 3.35000000e+01 0.00000000e+00 5.51000000e+02
  4.87721823e-03]
 [2.06000000e+01 2.50000000e+01 0.00000000e+00 5.50000000e+02
  3.74712771e-03]]


In [None]:
# Print the training labels.
print(y_train)

[0 0 0 ... 0 0 0]


In [None]:
# Print the test labels.
print(y_test)

[0 0 0 ... 0 0 0]


## Escalando las características

In [None]:
# Standardise EVERY feature column. KNN is distance-based, so any column left on
# its raw scale is weighted differently from the standardised ones. The previous
# slice [:, 1:-1] skipped the first and the last feature.
# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split so that no test statistics leak into training.
# NOTE: outputs recorded by later cells were produced with the partial scaling;
# re-run the notebook to refresh them.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Entrenando el modelo de K-Nearest Neighbors (K-NN) con el conjunto de entrenamiento

In [None]:
# Baseline KNN: 5 neighbours, Minkowski metric with p=2.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

## Prediciendo los resultados del test

In [None]:
# Predict the test split and print predictions (left column) next to the
# true labels (right column).
y_pred = classifier.predict(X_test)
pred_vs_true = np.column_stack((y_pred, y_test))
print(pred_vs_true)

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Evaluación de las métricas

##### Realizando la matriz de confusión

In [None]:
# Confusion matrix of the baseline model on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[1515   11]
 [   5  420]]


##### Accuracy Score

In [None]:
# Fraction of test samples the baseline model classified correctly.
accuracy_score(y_test, y_pred)

0.9917990773962071

##### Precision

In [None]:
# Confusion-matrix counts for the binary problem:
#   TP: true positives  (positive class predicted correctly)
#   TN: true negatives  (negative class predicted correctly)
#   FP: false positives (negative samples predicted as positive)
#   FN: false negatives (positive samples predicted as negative)
#
# scikit-learn lays the 2x2 matrix out as [[TN, FP], [FN, TP]], so the counts
# are read straight from `cm` instead of being copied by hand from a previous
# run's output (hand-copied numbers silently go stale when the data or the
# model changes). `.tolist()` yields plain Python ints.
TN, FP, FN, TP = cm.ravel().tolist()

# Precision = TP / (TP + FP): share of positive predictions that are correct.
precision = TP / (TP + FP)
print("Precisión:", precision)

Precisión: 0.974477958236659


##### Recall

In [None]:
# Recall = TP / (TP + FN): share of actual positives the model recovered.
recall = TP / (FN + TP)
print("Recall:", recall)

Recall: 0.9882352941176471


##### F1-Score

In [None]:
# F1 is the harmonic mean of precision and recall: 2PR / (P + R).
pr_product = precision * recall
pr_sum = precision + recall
f1_score = 2 * pr_product / pr_sum
print("Puntaje F1:", f1_score)

Puntaje F1: 0.9813084112149534


## Score de K-Nearest Neighbors (K-NN)

In [None]:
# Score of the baseline classifier on the test split.
classifier.score(X_test, y_test)

0.9917990773962071

## Calculamos la tasa de error

In [None]:
# Error rate = 1 - accuracy (share of misclassified test samples).
1 - accuracy_score(y_test, y_pred)

0.008200922603792904

# Modelo Optimizado con Grid_Search para encontrar los mejores parámetros y modelo

In [None]:
# Hyper-parameter grid explored for KNN.
knn_parameters = dict(
    n_neighbors=[3, 5, 7],            # number of neighbours
    weights=['uniform', 'distance'],  # neighbour weighting strategy
    p=[1, 2],                         # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# 5-fold cross-validated search scored by accuracy; refit=True retrains the
# best configuration on the whole training split, n_jobs=-1 uses all cores.
search_options = dict(scoring='accuracy', cv=5, refit=True, n_jobs=-1)
knn_grid_search = GridSearchCV(classifier, knn_parameters, **search_options)
knn_grid_search.fit(X_train, y_train)

# Keep the winning model and its cross-validation summary.
best_model = knn_grid_search.best_estimator_
best_parameters_knn = knn_grid_search.best_params_
best_accuracy_knn = knn_grid_search.best_score_
print("Best Accuracy (K-Nearest Neighbors): {:.2f} %".format(best_accuracy_knn * 100))
print("Best Parameters (K-Nearest Neighbors):", best_parameters_knn)


Best Accuracy (K-Nearest Neighbors): 99.56 %
Best Parameters (K-Nearest Neighbors): {'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}


In [None]:
# Evaluate the best estimator found by the grid search on the test split.
# Note: this rebinds y_pred, which from here on holds the tuned model's predictions.
y_pred = best_model.predict(X_test)
cm_best = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm_best)

[[1516   10]
 [   6  419]]


## Evaluación de las métricas del modelo optimizado

In [None]:
# Per-class precision / recall / F1 for the tuned model.
class_labels = [0, 1]
class_names = ['No ocupado', ' Ocupado']
report = classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=class_names,
)
print(report)

              precision    recall  f1-score   support

  No ocupado       1.00      0.99      0.99      1526
     Ocupado       0.98      0.99      0.98       425

    accuracy                           0.99      1951
   macro avg       0.99      0.99      0.99      1951
weighted avg       0.99      0.99      0.99      1951



##### Realizando la matriz de confusión

In [None]:
# Show the tuned model's confusion matrix again for reference.
print(cm_best)

[[1516   10]
 [   6  419]]


##### Accuracy Score

In [None]:
# Accuracy on the test split; y_pred holds the tuned model's predictions at this point.
accuracy_score(y_test, y_pred)

0.9917990773962071

##### Precision

In [None]:
# Confusion-matrix counts for the tuned model:
#   TP: true positives, TN: true negatives,
#   FP: false positives, FN: false negatives.
#
# scikit-learn lays the 2x2 matrix out as [[TN, FP], [FN, TP]], so the counts
# are read from `cm_best` rather than typed in by hand, which keeps them in
# sync with the model whenever the notebook is re-run.
TN, FP, FN, TP = cm_best.ravel().tolist()

# Precision = TP / (TP + FP): share of positive predictions that are correct.
precision = TP / (TP + FP)
print("Precisión:", precision)

Precisión: 0.9766899766899767


##### Recall

In [None]:
# Recall = TP / (TP + FN): share of actual positives the model recovered.
recall = TP / (FN + TP)
print("Recall:", recall)

Recall: 0.9858823529411764


##### F1-Score

In [None]:
# F1 is the harmonic mean of precision and recall: 2PR / (P + R).
pr_product = precision * recall
pr_sum = precision + recall
f1_score = 2 * pr_product / pr_sum
print("Puntaje F1:", f1_score)

Puntaje F1: 0.9812646370023418


## Score de KNN optimizado

In [None]:
# Test-split score of the grid search (configured with scoring='accuracy' and refit=True).
knn_grid_search.score(X_test, y_test)

0.9917990773962071

Calculamos la tasa de error de la versión optimizada

In [None]:
# Error rate = 1 - accuracy for the tuned model's predictions.
1 - accuracy_score(y_test, y_pred)

0.008200922603792904

## Preprocesamiento de los datos mediante clustering

### Creamos un pipeline que ejecuta primero K-Means y luego K-Nearest Neighbors (K-NN)

In [None]:
# Two-stage pipeline: KMeans (60 clusters, fixed seed) is the intermediate
# step and its output feeds the same baseline KNN configuration
# (5 neighbours, Minkowski metric with p=2).
kmeans_step = KMeans(n_clusters=60, random_state=42)
knn_step = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
pipeline = Pipeline(steps=[
    ("kmeans", kmeans_step),
    ("knn", knn_step),
])
pipeline.fit(X_train, y_train)



## Prediciendo los resultados del test

In [None]:
# Predict with the KMeans+KNN pipeline and print predictions (left column)
# next to the true labels (right column).
y_predK = pipeline.predict(X_test)
pred_vs_true_K = np.column_stack((y_predK, y_test))
print(pred_vs_true_K)

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Evaluación de las métricas

##### Realizando la matriz de confusión

In [None]:
# Confusion matrix of the KMeans+KNN pipeline on the test split
# (rows: true class, columns: predicted class).
cmK = confusion_matrix(y_true=y_test, y_pred=y_predK)
print(cmK)

[[1514   12]
 [   5  420]]


##### Accuracy Score

In [None]:
# Fraction of test samples the KMeans+KNN pipeline classified correctly.
accuracy_score(y_test, y_predK)

0.99128651973347

##### Precision

In [None]:
# Confusion-matrix counts for the KMeans+KNN pipeline:
#   TPK: true positives  (positive class predicted correctly)
#   TNK: true negatives  (negative class predicted correctly)
#   FPK: false positives (negative samples predicted as positive)
#   FNK: false negatives (positive samples predicted as negative)
#
# scikit-learn lays the 2x2 matrix out as [[TN, FP], [FN, TP]], so the counts
# are read from `cmK` rather than typed in by hand, which keeps them in sync
# with the model whenever the notebook is re-run.
TNK, FPK, FNK, TPK = cmK.ravel().tolist()

# Precision = TP / (TP + FP): share of positive predictions that are correct.
precisionK = TPK / (TPK + FPK)
print("Precisión:", precisionK)

Precisión: 0.9722222222222222


##### Recall

In [None]:
# Recall = TP / (TP + FN): share of actual positives the pipeline recovered.
recallK = TPK / (FNK + TPK)
print("Recall:", recallK)

Recall: 0.9882352941176471


##### F1-Score

In [None]:
# F1 is the harmonic mean of precision and recall: 2PR / (P + R).
pr_productK = precisionK * recallK
pr_sumK = precisionK + recallK
f1_scoreK = 2 * pr_productK / pr_sumK
print("Puntaje F1:", f1_scoreK)

Puntaje F1: 0.9801633605600933


## Score de K-Nearest Neighbors (K-NN) usando clustering

In [None]:
# Score of the KMeans+KNN pipeline on the test split.
pipeline.score(X_test, y_test)

0.99128651973347

## Calculamos la tasa de error usando clustering

In [None]:
# Error rate = 1 - accuracy for the KMeans+KNN pipeline.
1 - accuracy_score(y_test, y_predK)

0.00871348026652996

# Modelo con K-Means optimizado con Grid_Search para encontrar los mejores parámetros y modelo

In [None]:
# Hyper-parameter grid for the "knn" step of the pipeline; the "knn__" prefix
# routes each parameter to that step.
knn_parameters = dict(
    knn__n_neighbors=[3, 5, 7],            # number of neighbours
    knn__weights=['uniform', 'distance'],  # neighbour weighting strategy
    knn__p=[1, 2],                         # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# 5-fold cross-validated search scored by accuracy; refit=True retrains the
# best configuration on the whole training split, n_jobs=-1 uses all cores.
search_options = dict(scoring='accuracy', cv=5, refit=True, n_jobs=-1)
knn_grid_search_Kmeans = GridSearchCV(pipeline, knn_parameters, **search_options)
knn_grid_search_Kmeans.fit(X_train, y_train)

# Keep the winning pipeline and its cross-validation summary.
# Note: these names are rebound here, so any earlier values they held are replaced.
best_model = knn_grid_search_Kmeans.best_estimator_
best_parameters_knn = knn_grid_search_Kmeans.best_params_
best_accuracy_knn = knn_grid_search_Kmeans.best_score_
print("Best Accuracy (K-Nearest Neighbors): {:.2f} %".format(best_accuracy_knn * 100))
print("Best Parameters (K-Nearest Neighbors):", best_parameters_knn)



Best Accuracy (K-Nearest Neighbors): 99.53 %
Best Parameters (K-Nearest Neighbors): {'knn__n_neighbors': 3, 'knn__p': 2, 'knn__weights': 'uniform'}


In [None]:
# Evaluate the best pipeline found by the grid search on the test split.
y_predKOptimizado = best_model.predict(X_test)
cm_bestKmeans = confusion_matrix(y_true=y_test, y_pred=y_predKOptimizado)
print(cm_bestKmeans)

[[1515   11]
 [   5  420]]


## Evaluación de las métricas con el modelo optimizado

In [None]:
# Per-class precision / recall / F1 for the tuned KMeans+KNN pipeline.
class_labels = [0, 1]
class_names = ['No ocupado', ' Ocupado']
report = classification_report(
    y_test,
    y_predKOptimizado,
    labels=class_labels,
    target_names=class_names,
)
print(report)

              precision    recall  f1-score   support

  No ocupado       1.00      0.99      0.99      1526
     Ocupado       0.97      0.99      0.98       425

    accuracy                           0.99      1951
   macro avg       0.99      0.99      0.99      1951
weighted avg       0.99      0.99      0.99      1951



##### Realizando la matriz de confusión

In [None]:
# Show the tuned pipeline's confusion matrix again for reference.
print(cm_bestKmeans)

[[1515   11]
 [   5  420]]


##### Accuracy Score

In [None]:
# Fraction of test samples the tuned KMeans+KNN pipeline classified correctly.
accuracy_score(y_test, y_predKOptimizado)

0.9917990773962071

##### Precision

In [None]:
# Confusion-matrix counts for the tuned KMeans+KNN pipeline:
#   TPKO: true positives  (positive class predicted correctly)
#   TNKO: true negatives  (negative class predicted correctly)
#   FPKO: false positives (negative samples predicted as positive)
#   FNKO: false negatives (positive samples predicted as negative)
#
# scikit-learn lays the 2x2 matrix out as [[TN, FP], [FN, TP]], so the counts
# are read from `cm_bestKmeans` rather than typed in by hand, which keeps them
# in sync with the model whenever the notebook is re-run.
TNKO, FPKO, FNKO, TPKO = cm_bestKmeans.ravel().tolist()

# Precision = TP / (TP + FP): share of positive predictions that are correct.
precisionKOpti = TPKO / (TPKO + FPKO)
print("Precisión:", precisionKOpti)

Precisión: 0.974477958236659


##### Recall

In [None]:
# Recall = TP / (TP + FN): share of actual positives the pipeline recovered.
recallKOpti = TPKO / (FNKO + TPKO)
print("Recall:", recallKOpti)

Recall: 0.9882352941176471


##### F1-Score

In [None]:
# F1 is the harmonic mean of precision and recall: 2PR / (P + R).
pr_productKOpti = precisionKOpti * recallKOpti
pr_sumKOpti = precisionKOpti + recallKOpti
f1_scoreKOpti = 2 * pr_productKOpti / pr_sumKOpti
print("Puntaje F1:", f1_scoreKOpti)

Puntaje F1: 0.9813084112149534


## Score de KNN con K-Means optimizado

In [None]:
# Test-split score of the pipeline grid search (configured with scoring='accuracy' and refit=True).
knn_grid_search_Kmeans.score(X_test, y_test)

0.9917990773962071

## Calculamos la tasa de error usando clustering (modelo optimizado)

In [None]:
# Error rate = 1 - accuracy for the tuned KMeans+KNN pipeline.
1 - accuracy_score(y_test, y_predKOptimizado)

0.008200922603792904