<a href="https://colab.research.google.com/github/daneelsan/INF648-Project/blob/main/notebooks/DanielS_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Inicialización

In [2]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

## Lectura y limpieza de datos

In [3]:
# Load the sgemm_product.csv dataset directly from the project's GitHub repository.
raw_data_url = (
    "https://raw.githubusercontent.com/daneelsan/INF648-Project/main/dataset/sgemm_product.csv"
)
raw_data = pd.read_csv(filepath_or_buffer=raw_data_url)

In [4]:
# Keep the original frame ('raw_data') untouched for later use.
# A plain assignment would only create a second name for the same object, so
# every column added to 'data' would also appear in 'raw_data'; work on an
# independent copy instead.
data = raw_data.copy()

In [5]:
# The four 'RunN (ms)' columns are repeated timings of the same configuration.
# Collapse them into a single target: first the row-wise mean, then its natural
# logarithm.
data['average_run'] = raw_data[
    [
        'Run1 (ms)',
        'Run2 (ms)',
        'Run3 (ms)',
        'Run4 (ms)',
    ]
].mean(axis='columns')
data['log_average_run'] = np.log(data['average_run'])

In [6]:
# Discard the raw timing columns and the intermediate mean; only the
# log-transformed target is kept alongside the remaining columns.
data = data.drop(
    ['Run1 (ms)', 'Run2 (ms)', 'Run3 (ms)', 'Run4 (ms)', 'average_run'],
    axis='columns',
)

## Preparar los datos de entrenamiento y prueba

In [7]:
# Target vector: the log of the mean run time.
y = data['log_average_run']
# Feature matrix: every remaining column except the target.
X = data.drop(columns=[y.name])

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Modelo de regresión polinómica (grado 2)

In [24]:
# Baseline model: expand the features to polynomial terms of the chosen degree
# (2 as a first example) and fit an ordinary least-squares regression on them.
degree = 2
model = make_pipeline(
    PolynomialFeatures(degree=degree),
    LinearRegression(),
)

In [25]:
# Fit the baseline pipeline on the training split.
model.fit(X=X_train, y=y_train)

In [26]:
# Predict the log run time for the held-out rows.
y_pred = model.predict(X=X_test)

In [27]:
# Score the baseline on the test split (MSE is in log-milliseconds squared,
# because the target is the log of the mean run time).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Error Cuadrático Medio: {mse}")

Error Cuadrático Medio: 0.23522183534728414


In [28]:
# Optional sanity check: compare the first five predictions with the true values.
print(f"Predicciones: {y_pred[:5]}")
print(f"Valores reales: {y_test.values[:5]}")

Predicciones: [4.48377952 5.21391126 3.84304001 5.44636378 3.62699878]
Valores reales: [4.60968498 5.52616981 4.19203826 5.46351396 3.58317166]


## Optimizar el modelo usando validación cruzada para seleccionar el mejor grado del polinomio

In [9]:
# Candidate polynomial degrees for the grid search. The key follows the
# '<step name>__<parameter>' convention and must match the pipeline step name.
param_grid = {
    'polynomialfeatures__degree': list(range(1, 4)),
}

In [10]:
# Polynomial-regression pipeline used by the grid search.
# make_pipeline names each step after its lowercased class name, which yields
# 'polynomialfeatures' and 'linearregression' -- the names param_grid refers to.
pipeline = make_pipeline(PolynomialFeatures(), LinearRegression())

In [13]:
# Grid-search the polynomial degree with 5-fold cross-validation, ranking the
# candidates by negative mean squared error (closer to zero is better).
# verbose=3 still reports the score and fit time of every fold, but without the
# extra START line per fit that higher levels print, so the log stays readable.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=3,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START polynomialfeatures__degree=1................................
[CV 1/5; 1/3] END polynomialfeatures__degree=1;, score=-0.562 total time=   0.4s
[CV 2/5; 1/3] START polynomialfeatures__degree=1................................
[CV 2/5; 1/3] END polynomialfeatures__degree=1;, score=-0.561 total time=   0.4s
[CV 3/5; 1/3] START polynomialfeatures__degree=1................................
[CV 3/5; 1/3] END polynomialfeatures__degree=1;, score=-0.554 total time=   0.2s
[CV 4/5; 1/3] START polynomialfeatures__degree=1................................
[CV 4/5; 1/3] END polynomialfeatures__degree=1;, score=-0.564 total time=   0.2s
[CV 5/5; 1/3] START polynomialfeatures__degree=1................................
[CV 5/5; 1/3] END polynomialfeatures__degree=1;, score=-0.556 total time=   0.2s
[CV 1/5; 2/3] START polynomialfeatures__degree=2................................
[CV 1/5; 2/3] END polynomialfeatures__degree=2;, 

In [14]:
# Report the degree that obtained the best cross-validated score.
best_degree = (
    grid_search
    .best_params_['polynomialfeatures__degree']
)
print(f"Mejor grado encontrado: {best_degree}")

Mejor grado encontrado: 3


In [15]:
# Use the best pipeline found by the grid search as the final model.
# No new model is built here: with GridSearchCV's default refit=True (no refit
# argument is passed when grid_search is created), best_estimator_ is the
# winning pipeline already refitted on the data given to grid_search.fit.
best_model = grid_search.best_estimator_

In [16]:
# No extra training step is needed: GridSearchCV was run with its default
# refit=True, so best_estimator_ has already been refitted on the whole of
# X_train / y_train. Fitting it again would repeat the most expensive fit of the
# notebook and produce the same model. Display the fitted pipeline instead.
best_model

In [17]:
# Predict the log run time for the held-out rows with the tuned model.
y_pred = best_model.predict(X=X_test)

In [18]:
# Evaluate the tuned model on the test split with three complementary metrics:
# squared error, absolute error and the coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Error Cuadrático Medio (MSE): {mse}")
print(f"Error Absoluto Medio (MAE): {mae}")
print(f"Coeficiente de Determinación (R²): {r2}")

Error Cuadrático Medio (MSE): 0.15183736930709554
Error Absoluto Medio (MAE): 0.31260194160852195
Coeficiente de Determinación (R²): 0.8795824717964692


In [19]:
# Compare the first five predictions of the tuned model with the true values.
print(f"Predicciones: {y_pred[:5]}")
print(f"Valores reales: {y_test.values[:5]}")

Predicciones: [4.66184977 5.51113583 4.28428899 5.33158676 3.61351962]
Valores reales: [4.60968498 5.52616981 4.19203826 5.46351396 3.58317166]
