<a href="https://colab.research.google.com/github/cristianmunoz1/Intro_ML_project/blob/main/09_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Implementación de PCA

Importación de las librerías necesarias

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import time

Cargamos los datos al notebook

In [None]:
# Load the dataset and separate the feature matrix from the regression target
# (the 'TimeInShelterDays' column).
data = pd.read_csv('data.csv')
X = data.drop(['TimeInShelterDays'], axis=1).to_numpy()
y_raw = data['TimeInShelterDays'].values.reshape(-1, 1)

# Split BEFORE scaling: fitting the scaler on the full target would leak the
# mean/variance of the held-out rows into the training target.
# The same random_state and sample count yield the same train/test partition as before.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X, y_raw, test_size=0.2, random_state=42
)

# Fit the target scaler on the training target only, then reuse it for the test target.
scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train_raw).ravel()
y_test = scaler_y.transform(y_test_raw).ravel()

# Full standardized target, kept under its original name for any later cell that reads it.
y = scaler_y.transform(y_raw).ravel()

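Se define la función que evalúa, mediante validación cruzada K-Fold, el error de un modelo de regresión entrenado sobre un número dado de componentes principales. Con ella se busca el número de componentes óptimo para cada modelo.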

In [None]:
def entrenamiento_pca(modelo, n_comp, n_sets, X, Y):
    """Cross-validate ``modelo`` on a PCA projection of ``X``.

    For each of the ``n_sets`` K-Fold splits, a PCA with ``n_comp`` components
    is fitted on the training fold only, both folds are projected, ``modelo``
    is fitted on the projected training fold and scored on the projected
    validation fold with the mean absolute error.

    Parameters
    ----------
    modelo : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is re-fitted in place on every fold.
    n_comp : value forwarded to ``PCA(n_components=...)``.
    n_sets : number of K-Fold splits.
    X : array-like feature matrix, indexed by row.
    Y : array-like target, aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(mean MAE, standard deviation of the MAE, mean elapsed seconds)``
        across folds. The elapsed time covers PCA fit/transform, model fit
        and prediction.
    """
    # Row indexing with integer arrays needs ndarray semantics; np.asarray is
    # a no-op for ndarrays and lets DataFrame/Series/list inputs work too.
    X, Y = np.asarray(X), np.asarray(Y)

    Errores, times = np.zeros(n_sets), np.zeros(n_sets)
    kf = KFold(n_splits=n_sets)
    for j, (train_idx, test_idx) in enumerate(kf.split(X)):
        X_tr, X_te = X[train_idx], X[test_idx]
        y_tr, y_te = Y[train_idx], Y[test_idx]

        pca = PCA(n_components=n_comp)
        # perf_counter is monotonic and high-resolution, so the measured
        # interval is not affected by system clock adjustments.
        start = time.perf_counter()
        # Fit the projection on the training fold only, then apply it to the
        # validation fold.
        X_tr_pca = pca.fit_transform(X_tr)
        X_te_pca = pca.transform(X_te)

        modelo.fit(X_tr_pca, y_tr)
        y_pred = modelo.predict(X_te_pca)
        times[j] = time.perf_counter() - start
        Errores[j] = mean_absolute_error(y_te, y_pred)

    return np.mean(Errores), np.std(Errores), np.mean(times)

In [None]:
def experimentar_PCA(modelo, n_feats, X, Y):
    """Run the 5-fold PCA evaluation for each component count in ``n_feats``.

    Returns a DataFrame with one row per entry of ``n_feats`` and the columns
    NUM_VAR (component count), T_EJECUCION (mean time reported by
    ``entrenamiento_pca``), ERROR_VALIDACION (mean validation error) and
    STD_ERROR_VALIDACION (standard deviation of the validation error).
    """
    def _fila(n_componentes):
        # One summary record for a single component count.
        mae_medio, mae_std, t_medio = entrenamiento_pca(modelo, n_componentes, 5, X, Y)
        return {
            'NUM_VAR': n_componentes,
            'T_EJECUCION': t_medio,
            'ERROR_VALIDACION': mae_medio,
            'STD_ERROR_VALIDACION': mae_std,
        }

    return pd.DataFrame([_fila(k) for k in n_feats])

In [None]:
# Estimators evaluated in the PCA sweeps below.
modelo_lr = LinearRegression()
modelo_svr = SVR(kernel='rbf', C=0.01, gamma='scale', epsilon=0.0001, max_iter=10000)

# Linear regression: sweep over a handful of component counts.
print("Regresión lineal con PCA:")
resultados_lr = experimentar_PCA(
    modelo=modelo_lr,
    n_feats=[2, 5, 10, 15, 20],
    X=X_train,
    Y=y_train,
)
print(resultados_lr)

# SVR: sweep over every component count from 2 to 20 inclusive.
print("\nSVR con PCA:")
resultados_svr = experimentar_PCA(
    modelo=modelo_svr,
    n_feats=[*range(2, 21)],
    X=X_train,
    Y=y_train,
)
print(resultados_svr)


Regresión lineal con PCA:
   NUM_VAR  T_EJECUCION  ERROR_VALIDACION  STD_ERROR_VALIDACION
0        2     0.079885          0.870019              0.016651
1        5     0.023586          0.871865              0.015563
2       10     0.051164          0.875042              0.014676
3       15     0.057296          0.875208              0.014558
4       20     0.037178          0.875325              0.013476

SVR con PCA:
    NUM_VAR  T_EJECUCION  ERROR_VALIDACION  STD_ERROR_VALIDACION
0         2     0.307708          0.869439              0.016728
1         3     0.268848          0.870319              0.015883
2         4     0.303056          0.869833              0.016137
3         5     0.277407          0.869114              0.016754
4         6     0.307419          0.869380              0.016830
5         7     0.211977          0.868830              0.016720
6         8     0.179140          0.868868              0.017285
7         9     0.178806          0.868277              

In [None]:
# Final linear regression trained on a 2-component PCA projection.
# The projection is fitted on the training split and reused for the test split.
pca_final_lr = PCA(n_components=2)
X_train_pca_lr = pca_final_lr.fit_transform(X_train)
X_test_pca_lr = pca_final_lr.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
modelo_final_lr = LinearRegression().fit(X_train_pca_lr, y_train)
y_pred_lr = modelo_final_lr.predict(X_test_pca_lr)

# Undo the target scaling so the error is reported in the target's original units.
y_test_orig = scaler_y.inverse_transform(y_test[:, np.newaxis])
y_pred_lr_orig = scaler_y.inverse_transform(y_pred_lr[:, np.newaxis])

mae_lr = mean_absolute_error(y_true=y_test_orig, y_pred=y_pred_lr_orig)
print(f"\nMAE Regresión Lineal (PCA=2): {mae_lr:.4f}")


MAE Regresión Lineal (PCA=2): 22.2246


In [None]:
# Final SVR trained on a 9-component PCA projection.
# The projection is fitted on the training split and reused for the test split.
pca_final_svr = PCA(n_components=9)
X_train_pca_svr = pca_final_svr.fit_transform(X_train)
X_test_pca_svr = pca_final_svr.transform(X_test)

# Same hyperparameters as the SVR used in the component sweep.
modelo_final_svr = SVR(
    kernel='rbf',
    C=0.01,
    gamma='scale',
    epsilon=0.0001,
    max_iter=10000,
).fit(X_train_pca_svr, y_train)
y_pred_svr = modelo_final_svr.predict(X_test_pca_svr)

# Undo the target scaling; y_test_orig is the unscaled test target computed in
# the linear-regression cell.
y_pred_svr_orig = scaler_y.inverse_transform(y_pred_svr[:, np.newaxis])
mae_svr = mean_absolute_error(y_true=y_test_orig, y_pred=y_pred_svr_orig)
print(f"MAE SVR (PCA=9): {mae_svr:.4f}")

MAE SVR (PCA=9): 22.1764
