# Modelo Utilizando Random Forest

## Importación de las librerias

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV


## Importamos el dataset

In [None]:
dataset = pd.read_csv('/content/datasetParcial.csv')

## Preprocesamiento de los datos

In [None]:
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

In [None]:
print(X)

[[2.17600000e+01 3.11333333e+01 4.37333333e+02 1.02966667e+03
  5.02101089e-03]
 [2.17900000e+01 3.10000000e+01 4.37333333e+02 1.00000000e+03
  5.00858127e-03]
 [2.17675000e+01 3.11225000e+01 4.34000000e+02 1.00375000e+03
  5.02156913e-03]
 ...
 [2.08900000e+01 2.77450000e+01 4.23500000e+02 1.52150000e+03
  4.23681810e-03]
 [2.08900000e+01 2.80225000e+01 4.18750000e+02 1.63200000e+03
  4.27948547e-03]
 [2.10000000e+01 2.81000000e+01 4.09000000e+02 1.86400000e+03
  4.32073200e-03]]


In [None]:
print(y)

[1 1 1 ... 1 1 1]


## Particion de los datos entre el conjunto de entrenamiento y el conjunto para test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [None]:
print(X_train)

[[1.98900000e+01 3.54000000e+01 0.00000000e+00 5.29000000e+02
  5.08894379e-03]
 [2.00000000e+01 3.57000000e+01 0.00000000e+00 5.40000000e+02
  5.16782401e-03]
 [2.07900000e+01 2.62000000e+01 0.00000000e+00 1.49800000e+03
  3.97467591e-03]
 ...
 [2.05000000e+01 2.37000000e+01 0.00000000e+00 5.81500000e+02
  3.52920391e-03]
 [2.01000000e+01 3.22000000e+01 0.00000000e+00 5.69666667e+02
  4.68652701e-03]
 [2.15000000e+01 2.88233333e+01 0.00000000e+00 6.22333333e+02
  4.57184298e-03]]


In [None]:
print(X_test)

[[2.03900000e+01 3.27900000e+01 0.00000000e+00 6.59000000e+02
  4.86008397e-03]
 [2.10000000e+01 3.37000000e+01 0.00000000e+00 1.50000000e+03
  5.18898657e-03]
 [2.00000000e+01 3.05000000e+01 0.00000000e+00 7.16500000e+02
  4.40975111e-03]
 ...
 [2.06000000e+01 2.19700000e+01 6.00000000e+00 9.26750000e+02
  3.29057323e-03]
 [2.01000000e+01 3.35000000e+01 0.00000000e+00 5.51000000e+02
  4.87721823e-03]
 [2.06000000e+01 2.50000000e+01 0.00000000e+00 5.50000000e+02
  3.74712771e-03]]


In [None]:
print(y_train)

[0 0 0 ... 0 0 0]


In [None]:
print(y_test)

[0 0 0 ... 0 0 0]


## Escalando las caracteristicas

In [None]:
sc = StandardScaler()
X_train[:, 1:-1] = sc.fit_transform(X_train[:, 1:-1])
X_test[:, 1:-1] = sc.transform(X_test[:, 1:-1])

## Entrenando el modelo Random Forest con el conjunto de entrenamiento

In [None]:
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

## Prediciendo los resultados del test

In [None]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Evaluación de las metricas

##### Realizando la matriz de confusión

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1518    8]
 [   6  419]]


##### Accuracy Score

In [None]:
accuracy_score(y_test, y_pred)

0.9928241927216812

##### Precision

In [None]:
# Valores desde la matriz de confusión
# TP: Verdaderos Positivos
# TN: Verdaderos Negativos
# FP: Falsos Positivos
# FN: Falsos Negativos

TP = 419  # Verdaderos Positivos
TN = 1518  # Verdaderos Negativos
FP = 8  # Falsos Positivos
FN = 6  # Falsos Negativos

# Cálculo de la precisión
precision = TP / (TP + FP)
print("Precisión:", precision)

Precisión: 0.9812646370023419


##### Recall

In [None]:
recall = TP / (TP + FN)
print("Recall:", recall)

Recall: 0.9858823529411764


##### F1-Score

In [None]:
f1_score = 2 * (precision * recall) / (precision + recall)
print("Puntaje F1:", f1_score)

Puntaje F1: 0.9835680751173709


## Score de los arboles de decision

In [None]:
classifier.score(X_test, y_test)

0.9928241927216812

## Calculamos la tasa de error

In [None]:
1 - accuracy_score(y_test, y_pred)

0.007175807278318791

####obtimizado sin reduccion

Obtimizado

In [None]:
param_distribs = {
    'n_estimators': [10, 20, 30],
    'criterion': ['entropy', 'gini'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'random_state': [0]
}

In [None]:
classifier = RandomForestClassifier()

# Realiza una búsqueda aleatoria de hiperparámetros
rnd_search_cv = RandomizedSearchCV(classifier, param_distribs, n_iter=10, cv=5, scoring='accuracy')
rnd_search_cv.fit(X_train, y_train)

# Obtiene el mejor modelo con la mejor configuración de hiperparámetros
best_rf_model = rnd_search_cv.best_estimator_


  warn(
  warn(
  warn(
  warn(
  warn(


## Prediciendo los resultados del test

In [None]:
y_predK = pipeline.predict(X_test)
print(np.concatenate((y_predK.reshape(len(y_predK),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


Evaluación de las metricas

In [None]:
cmK = confusion_matrix(y_test, y_predK)
print(cmK)

[[1516   10]
 [   6  419]]


Accuracy Score

In [None]:
accuracy_score(y_test, y_predK)

0.9917990773962071

Precision

In [None]:
# Valores desde la matriz de confusión
# TP: Verdaderos Positivos (Predicciones correctas de la clase positiva)
# TN: Verdaderos Negativos (Predicciones correctas de la clase negativa)
# FP: Falsos Positivos (Predicciones incorrectas de la clase positiva)
# FN: Falsos Negativos (Predicciones incorrectas de la clase negativa)

# Valores que proporcionaste
TPK = 419
TNK = 1516
FPK = 10
FNK = 6

# Cálculo de la precisión
precisionK = TPK / (TPK + FPK)
print("Precisión:", precisionK)

Precisión: 0.9766899766899767


Recall

In [None]:
recallK = TPK / (TPK + FNK)
print("Recall:", recallK)

Recall: 0.9858823529411764


F1-Score

In [None]:
f1_scoreK = 2 * (precisionK * recallK) / (precisionK + recallK)
print("Puntaje F1:", f1_scoreK)

Puntaje F1: 0.9812646370023418


Score de la regresion logistica usando clustering

In [None]:
pipeline.score(X_test, y_test)

0.9917990773962071

Calculamos la tasa de error usando clustering

In [None]:
1 - accuracy_score(y_test, y_pred)

0.007175807278318791

## Evaluación de las metricas

### Nos creamos un pipeline para combinar la ejecución, primero K-Means y luego random forest

In [None]:
pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=60, random_state=42)),
    ("random_forest", classifier)
])
pipeline.fit(X_train, y_train)



##### Realizando la matriz de confusión

In [None]:
cmK = confusion_matrix(y_test, y_predK)
print(cmK)

[[1516   10]
 [   6  419]]


##### Accuracy Score

In [None]:
accuracy_score(y_test, y_predK)

0.9917990773962071

##### Precision

In [None]:
# Valores desde la matriz de confusión
# TP: Verdaderos Positivos (Predicciones correctas de la clase positiva)
# TN: Verdaderos Negativos (Predicciones correctas de la clase negativa)
# FP: Falsos Positivos (Predicciones incorrectas de la clase positiva)
# FN: Falsos Negativos (Predicciones incorrectas de la clase negativa)

# Valores que proporcionaste
TPK = 419
TNK = 1516
FPK = 10
FNK = 6

# Cálculo de la precisión
precisionK = TPK / (TPK + FPK)
print("Precisión:", precisionK)

Precisión: 0.9766899766899767


##### Recall

In [None]:
recallK = TPK / (TPK + FNK)
print("Recall:", recallK)

Recall: 0.9858823529411764


##### F1-Score

In [None]:
f1_scoreK = 2 * (precisionK * recallK) / (precisionK + recallK)
print("Puntaje F1:", f1_scoreK)

Puntaje F1: 0.9812646370023418


## Score de la regresion logistica usando clustering

In [None]:
pipeline.score(X_test, y_test)

0.99128651973347





## Calculamos la tasa de error usando clustering

In [None]:
1 - accuracy_score(y_test, y_pred)

0.007175807278318791

## Obtimizado

In [None]:
classifier = RandomForestClassifier()

# Realiza una búsqueda aleatoria de hiperparámetros
rnd_search_cv = RandomizedSearchCV(classifier, param_distribs, n_iter=10, cv=5, scoring='accuracy')
rnd_search_cv.fit(X_train, y_train)

# Obtiene el mejor modelo con la mejor configuración de hiperparámetros
best_rf_model = rnd_search_cv.best_estimator_


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Prediciendo los resultados del test

In [None]:
y_predK = pipeline.predict(X_test)
print(np.concatenate((y_predK.reshape(len(y_predK),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


Evaluación de las metricas

In [None]:
cmK = confusion_matrix(y_test, y_predK)
print(cmK)

[[1515   11]
 [   6  419]]


Accuracy Score

In [None]:
accuracy_score(y_test, y_predK)

0.99128651973347

Precision

In [None]:
# Valores desde la matriz de confusión
# TP: Verdaderos Positivos (Predicciones correctas de la clase positiva)
# TN: Verdaderos Negativos (Predicciones correctas de la clase negativa)
# FP: Falsos Positivos (Predicciones incorrectas de la clase positiva)
# FN: Falsos Negativos (Predicciones incorrectas de la clase negativa)

# Valores que proporcionaste
TPK = 419
TNK = 1516
FPK = 10
FNK = 6

# Cálculo de la precisión
precisionK = TPK / (TPK + FPK)
print("Precisión:", precisionK)

Precisión: 0.9766899766899767


Recall

In [None]:
recallK = TPK / (TPK + FNK)
print("Recall:", recallK)

Recall: 0.9858823529411764


F1-Score

In [None]:
f1_scoreK = 2 * (precisionK * recallK) / (precisionK + recallK)
print("Puntaje F1:", f1_scoreK)

Puntaje F1: 0.9812646370023418


Score de la regresion logistica usando clustering

In [None]:
pipeline.score(X_test, y_test)

0.99128651973347

Calculamos la tasa de error usando clustering

In [None]:
1 - accuracy_score(y_test, y_pred)

0.007175807278318791