## **FASE IV: MODELADO**

### **KNN (K-Nearest Neighbors, Regresión)**

In [26]:
# NVIDIA libraries for GPU processing
import cudf

# Load the preprocessed parquet dataset
path = 'monopoly_cleaned_parquet'
monopoly = cudf.read_parquet(path)

In [27]:
# Preview the first rows of the loaded dataset
monopoly.head()

Unnamed: 0,Subsegmento,Sexo,Region,Edad,Renta,Antiguedad,Internauta,Adicional,Dualidad,Monoproducto,...,ColL2AC_avg,ColL2CC_avg,ColMx_avg,PagoNac_avg,PagoInt_avg,EeccNac_avg,EeccInt_avg,UsoL1_avg,UsoL2_avg,UsoLI_avg
0,160.0,1.0,13.0,43.0,601932.8,130.0,1.0,1.0,0.0,0.0,...,0.0,34699.666667,0.0,29333.333333,0.0,908079.416667,0.0,913045.583333,53874.25,0.0
1,160.0,0.0,13.0,46.0,143640.0,69.0,1.0,0.0,0.0,0.0,...,0.0,17597.25,45699.2125,167416.666667,0.0,460136.5,58.904167,428734.833333,119963.916667,84.508333
2,170.0,0.0,13.0,45.0,929106.0,24.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,68325.75,0.0,8510.75,0.0,25294.5,0.0,0.0
3,151.0,0.0,13.0,46.0,172447.0,134.0,0.0,1.0,0.0,1.0,...,0.0,2706.5,0.0,57083.333333,0.0,341916.833333,0.0,344417.5,31825.25,0.0
4,170.0,0.0,13.0,46.0,805250.0,116.0,0.0,1.0,1.0,0.0,...,0.0,6227.666667,0.0,291317.166667,0.0,291684.5,0.0,83849.25,402597.666667,0.0


In [28]:
# Feature matrix: every column of the dataset except the regression target 'Renta'
X = monopoly.drop(columns=['Renta'])
X.head()

Unnamed: 0,Subsegmento,Sexo,Region,Edad,Antiguedad,Internauta,Adicional,Dualidad,Monoproducto,Ctacte,...,ColL2AC_avg,ColL2CC_avg,ColMx_avg,PagoNac_avg,PagoInt_avg,EeccNac_avg,EeccInt_avg,UsoL1_avg,UsoL2_avg,UsoLI_avg
0,160.0,1.0,13.0,43.0,130.0,1.0,1.0,0.0,0.0,1.0,...,0.0,34699.666667,0.0,29333.333333,0.0,908079.416667,0.0,913045.583333,53874.25,0.0
1,160.0,0.0,13.0,46.0,69.0,1.0,0.0,0.0,0.0,1.0,...,0.0,17597.25,45699.2125,167416.666667,0.0,460136.5,58.904167,428734.833333,119963.916667,84.508333
2,170.0,0.0,13.0,45.0,24.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,68325.75,0.0,8510.75,0.0,25294.5,0.0,0.0
3,151.0,0.0,13.0,46.0,134.0,0.0,1.0,0.0,1.0,0.0,...,0.0,2706.5,0.0,57083.333333,0.0,341916.833333,0.0,344417.5,31825.25,0.0
4,170.0,0.0,13.0,46.0,116.0,0.0,1.0,1.0,0.0,1.0,...,0.0,6227.666667,0.0,291317.166667,0.0,291684.5,0.0,83849.25,402597.666667,0.0


In [34]:
# Regression target: the 'Renta' column
y = monopoly['Renta']
y.head()

0    601932.8
1    143640.0
2    929106.0
3    172447.0
4    805250.0
Name: Renta, dtype: float64

#### **Modelo N°1**

In [35]:
from cuml.neighbors import KNeighborsRegressor

# Create the model: KNN regressor using 5 neighbors
knn_1 = KNeighborsRegressor(n_neighbors=5)

In [36]:
# Fit on the full dataset (no train/test split is applied for this model).
# NOTE(review): the X.head() preview shows features on very different scales;
# distance-based models are typically scale-sensitive — confirm whether
# standardization was intended.
knn_1 = knn_1.fit(X, y)

In [37]:
# Predict on the same X the model was fitted on, so Y_hat holds in-sample predictions
Y_hat = knn_1.predict(X)

In [40]:
# Model evaluation
from math import sqrt

from cuml.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score Modelo N°1: y and Y_hat both refer to the data the model was fitted on,
# so these are in-sample metrics.
MAE_1, MSE_1, r2_1 = (
    metric(y, Y_hat)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
RMSE_1 = sqrt(MSE_1)  # RMSE is the square root of the MSE

print("MAE: %.2f" % MAE_1)
print("MSE: %.2f" % MSE_1)
print("RMSE: %.2f" % RMSE_1)
print('r2: %.2f' % r2_1)

MAE: 168665.58
MSE: 79441744940.96
RMSE: 281854.12
r2: 0.45


#### **Modelo N°2**

In [45]:
from cuml.model_selection import train_test_split

# Split into train and test sets: 80% for training, fixed random_state for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=52)

In [63]:
# KNN regressor: 7 neighbors, euclidean distance, uniform neighbor weights
knn_2 = KNeighborsRegressor(n_neighbors=7, metric='euclidean', weights='uniform')

In [64]:
# Fit on the training split only
knn_2 = knn_2.fit(X_train, y_train)

In [65]:
# Predict on the test split.
# NOTE: this reuses the name Y_hat, replacing the predictions stored by Modelo N°1.
Y_hat = knn_2.predict(X_test)

In [66]:
# Score Modelo N°2 on the test split (y_test vs. the current Y_hat)
MAE_2, MSE_2, r2_2 = (
    metric(y_test, Y_hat)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
RMSE_2 = sqrt(MSE_2)  # RMSE is the square root of the MSE

print("MAE: %.2f" % MAE_2)
print("MSE: %.2f" % MSE_2)
print("RMSE: %.2f" % RMSE_2)
print('r2: %.2f' % r2_2)

MAE: 201465.16
MSE: 104804196651.52
RMSE: 323734.76
r2: 0.21


#### **Modelo N°3**

In [67]:
# KNN regressor: 12 neighbors, Minkowski distance with p=5
knn_3 = KNeighborsRegressor(n_neighbors=12, metric='minkowski', p=5)

In [68]:
# Fit on the training split, then predict on the test split
# (Y_hat is overwritten with this model's predictions)
knn_3 = knn_3.fit(X_train, y_train)
Y_hat = knn_3.predict(X_test)

In [69]:
# Score Modelo N°3 on the test split (y_test vs. the current Y_hat)
MAE_3, MSE_3, r2_3 = (
    metric(y_test, Y_hat)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
RMSE_3 = sqrt(MSE_3)  # RMSE is the square root of the MSE

print("MAE: %.2f" % MAE_3)
print("MSE: %.2f" % MSE_3)
print("RMSE: %.2f" % RMSE_3)
print('r2: %.2f' % r2_3)

MAE: 202024.48
MSE: 102324658332.41
RMSE: 319882.26
r2: 0.22


#### **Modelo N°4**

In [71]:
# KNN regressor: 17 neighbors, manhattan distance, uniform neighbor weights
knn_4 = KNeighborsRegressor(n_neighbors=17, metric='manhattan', weights='uniform')

In [72]:
# Fit on the training split, then predict on the test split
# (Y_hat is overwritten with this model's predictions)
knn_4 = knn_4.fit(X_train, y_train)
Y_hat = knn_4.predict(X_test)

In [73]:
# Score Modelo N°4 on the test split (y_test vs. the current Y_hat)
MAE_4, MSE_4, r2_4 = (
    metric(y_test, Y_hat)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
RMSE_4 = sqrt(MSE_4)  # RMSE is the square root of the MSE

print("MAE: %.2f" % MAE_4)
print("MSE: %.2f" % MSE_4)
print("RMSE: %.2f" % RMSE_4)
print('r2: %.2f' % r2_4)

MAE: 200080.00
MSE: 100557116507.42
RMSE: 317107.42
r2: 0.24


#### **Modelo N°5**

In [None]:
from cuml.model_selection import GridSearchCV

# Hyperparameter grid to search.
# A list of sub-grids is used so that `p` is only varied for the Minkowski
# metric. Plain 'minkowski' with the default p=2 is the same distance as
# 'euclidean' (and p=1 the same as 'manhattan'), so listing it without `p`
# would only re-fit duplicate candidates on every fold.
param_grid = [
    {
        'n_neighbors': [3, 5, 7, 9, 12],  # values of K
        'metric': ['euclidean', 'manhattan'],  # distance metrics
    },
    {
        'n_neighbors': [3, 5, 7, 9, 12],  # values of K
        'metric': ['minkowski'],
        'p': [3, 5],  # higher-order Minkowski norms; p=5 matches Modelo N°3
    },
]

# Base KNN regressor whose hyperparameters are tuned
knn = KNeighborsRegressor()

# Grid search with 5-fold cross-validation; candidates are ranked by negated MSE
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)  # tune on the training split only

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_
best_params