**Universidad Autónoma Metropolitana - Unidad Iztapalapa (UAM-I)**

**Maestría en Matemáticas Aplicadas e Industriales (MCMAI)**

**Taller de Modelado Matemático II - Parte I**

> Trimestre 25-P

**Profesor**: 
    
> Dr. Alejandro Román Vásquez

**Alumnos**: 
    
> Alan Badillo Salas
> 
> Brandon Eduardo Antonio Gómez
> 
> Diego Armando Arce Montes de Oca

# Fase 1 - Adquisición de los datos

In [376]:
import numpy
import pandas

import matplotlib.pyplot as pyplot
import seaborn

In [377]:
# Carga de datos

casas = pandas.read_csv("Casas.csv")

# Selección de Columnas de Análisis

columnas_analisis = [
    "MSZoning",
    "LotArea",
    "Street",
    "Neighborhood",
    "YearBuilt",
    "OverallCond",
    "ExterQual",
    "GrLivArea",
    "FullBath",
    "GarageArea",
    "BsmtCond",
    "FireplaceQu",
    "Electrical",
    "LotFrontage",
    "KitchenQual",
    "PavedDrive",
    "SalePrice",
]

casas_analisis = casas[columnas_analisis]

# Selección de Ejes de Datos

MSZoning = casas_analisis["MSZoning"]
LotArea = casas_analisis["LotArea"]
Street = casas_analisis["Street"]
Neighborhood = casas_analisis["Neighborhood"]
YearBuilt = casas_analisis["YearBuilt"]
OverallCond = casas_analisis["OverallCond"]
ExterQual = casas_analisis["ExterQual"]
GrLivArea = casas_analisis["GrLivArea"]
FullBath = casas_analisis["FullBath"]
GarageArea = casas_analisis["GarageArea"]
BsmtCond = casas_analisis["BsmtCond"]
FireplaceQu = casas_analisis["FireplaceQu"]
Electrical = casas_analisis["Electrical"]
LotFrontage = casas_analisis["LotFrontage"]
KitchenQual = casas_analisis["KitchenQual"]
PavedDrive = casas_analisis["PavedDrive"]
SalePrice = casas_analisis["SalePrice"]

# Mean Encoder

Ejes_Cats = [
    ("MSZoning", MSZoning),
    ("Neighborhood", Neighborhood),
    ("OverallCond", OverallCond),
    ("BsmtCond", BsmtCond),
    ("FireplaceQu", FireplaceQu),
    ("Electrical", Electrical),
]

for nombre, eje in Ejes_Cats:
    eje = eje.fillna("NA")
    casas_analisis.loc[:, [nombre]] = eje
    eje_mean = pandas.merge(left=eje, right=pandas.DataFrame([eje, SalePrice]).T.groupby(nombre).mean(), on=nombre)["SalePrice"]
    casas_analisis.loc[:, [f"{nombre}_mean"]] = eje_mean

# One-Hot Encoder (Dummies)

Ejes_Dums = [
    ("ExterQual", ExterQual),
    ("FullBath", FullBath),
    ("KitchenQual", KitchenQual),
    ("PavedDrive", PavedDrive),
]

for nombre, eje in Ejes_Dums:
    columnas = []
    eje = eje.fillna("NA")
    casas_analisis.loc[:, [nombre]] = eje
    for i, cat in enumerate(eje.unique()):
        eje_dummy = (eje == cat).astype(int)
        casas_analisis.loc[:, [f"{nombre}_{cat}_dummy{i}"]] = eje_dummy
        columnas.append(f"{nombre}_{cat}_dummy{i}")

# Selección de variables

x1 = casas_analisis["MSZoning_mean"]            # mean encoder
x2 = casas_analisis["LotArea"]                  # continua
x3 = casas_analisis["Neighborhood_mean"]        # mean encoder
x4 = casas_analisis["YearBuilt"]                # continua
x5 = casas_analisis["OverallCond_mean"]         # mean encoder
x6 = casas_analisis["ExterQual_Gd_dummy0"]      # dummy
x7 = casas_analisis["ExterQual_Ex_dummy2"]      # dummy
x8 = casas_analisis["ExterQual_Fa_dummy3"]      # dummy
x9 = casas_analisis["GrLivArea"]                # continua
x10 = casas_analisis["FullBath_1_dummy1"]       # dummy
x11 = casas_analisis["FullBath_3_dummy2"]       # dummy
x12 = casas_analisis["FullBath_0_dummy3"]       # dummy
x13 = casas_analisis["GarageArea"]              # continua
x14 = casas_analisis["BsmtCond_mean"]           # mean encoder
x15 = casas_analisis["FireplaceQu_mean"]        # mean encoder
x16 = casas_analisis["Electrical_mean"]         # mean encoder
x17 = casas_analisis["LotFrontage"]             # continua*
x18 = casas_analisis["KitchenQual_Gd_dummy0"]   # dummy
x19 = casas_analisis["KitchenQual_Ex_dummy2"]   # dummy
x20 = casas_analisis["KitchenQual_Fa_dummy3"]   # dummy
x21 = casas_analisis["PavedDrive_N_dummy1"]     # dummy
x22 = casas_analisis["PavedDrive_P_dummy2"]     # dummy

X = pandas.DataFrame([
    x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
    x11, x12, x13, x14, x15, x16, x17, x18, x19, x20,
    x21, x22
], index=[
    "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
    "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20",
    "x21", "x22"
]).T

# Imputación de Datos

X1 = X.copy().dropna(subset=["x17"])

y17 = X1["x17"]

from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(random_state=123)

reg.fit(X1, y17)

print(reg.score(X1, y17))

# Obtenemos los registros con datos faltantes
Xmiss = X[X1.columns][X["x17"].isna()]

# Predecimos los datos faltantes
yp = reg.predict(Xmiss)

# Reintegramos los datos faltantes (imputados) a la matriz original de variables
X.loc[:, ["x17_imp"]] = X["x17"]
X.loc[X["x17"].isna(), ["x17_imp"]] = yp

x17 = X["x17_imp"]

casas_analisis.loc[:, ["LotFrontage_imp"]] = X["x17_imp"]

# Eliminación de Puntos Atípicos

xs = [
    ("x2", x2), 
    ("x4", x4), 
    ("x9", x9), 
    ("x13", x13), 
    ("x17_imp", x17)
]

for i, (nombre, x) in enumerate(xs):
    Q1 = x.quantile(0.25)
    Q3 = x.quantile(0.75)
    IQR = Q3 - Q1
    xmin = Q1 - 1.5 * IQR
    xmax = Q3 + 1.5 * IQR
    xp = x.copy().astype(float)
    xp[xp >= xmax] = xmax
    xp[xp <= xmin] = xmin
    
    X.loc[:, [f"{nombre}_in"]] = xp 

X2 = X.copy()

del X2["x2"]
del X2["x4"]
del X2["x9"]
del X2["x13"]
del X2["x17"]
del X2["x17_imp"]

# print(X2.columns)

columns = [
    "x1", "x2_in", "x3", "x4_in", "x5", "x6", "x7", "x8", "x9_in", "x10",
    "x11", "x12", "x13_in", "x14", "x15", "x16", "x17_imp_in", "x18", "x19", "x20",
    "x21", "x22"
]

X2 = X2[columns]

X2.columns = [f"x{j + 1}" for j in range(len(columns))]

# print(X2.head())

# Normalización

n, m = X2.shape

X3 = numpy.zeros((n, m))

for j, column in enumerate(X2.columns):
    xj = X2[column]
    X3[:, j] = (xj - xj.mean()) / xj.std()

X3 = pandas.DataFrame(X3, columns=X2.columns)
y = SalePrice

# Guardamos los datos transformados y el regresor de la imputación de x17

casas_analisis.to_csv("Casas_analisis.csv", index=False)
X2.to_csv("Casas_X2.csv", index=False)
X3.to_csv("Casas_X3.csv", index=False)
y.to_csv("Casas_y.csv", index=False)

# Guardamos el regresor de la imputación para x17

import pickle

pickle.dump(reg, open("reg_imp_x17.pickle", "wb"))

0.9963294003653655


# Fase 2 - Transformación de los datos de Kaggle

In [378]:
casas_analisis_original = pandas.read_csv("Casas_analisis.csv")

casas_analisis_original.head()

Unnamed: 0,MSZoning,LotArea,Street,Neighborhood,YearBuilt,OverallCond,ExterQual,GrLivArea,FullBath,GarageArea,...,FullBath_3_dummy2,FullBath_0_dummy3,KitchenQual_Gd_dummy0,KitchenQual_TA_dummy1,KitchenQual_Ex_dummy2,KitchenQual_Fa_dummy3,PavedDrive_Y_dummy0,PavedDrive_N_dummy1,PavedDrive_P_dummy2,LotFrontage_imp
0,RL,8450,Pave,CollgCr,2003,5,Gd,1710,2,548,...,0,0,1,0,0,0,1,0,0,65.0
1,RL,9600,Pave,Veenker,1976,8,TA,1262,2,460,...,0,0,0,1,0,0,1,0,0,80.0
2,RL,11250,Pave,CollgCr,2001,5,Gd,1786,2,608,...,0,0,1,0,0,0,1,0,0,68.0
3,RL,9550,Pave,Crawfor,1915,5,TA,1717,1,642,...,0,0,1,0,0,0,1,0,0,60.0
4,RL,14260,Pave,NoRidge,2000,5,Gd,2198,2,836,...,0,0,1,0,0,0,1,0,0,84.0


In [379]:
def transformaciones(casas, casas_analisis_original):
    # Selección de Columnas de Análisis

    columnas_analisis = [
        "MSZoning",
        "LotArea",
        "Street",
        "Neighborhood",
        "YearBuilt",
        "OverallCond",
        "ExterQual",
        "GrLivArea",
        "FullBath",
        "GarageArea",
        "BsmtCond",
        "FireplaceQu",
        "Electrical",
        "LotFrontage",
        "KitchenQual",
        "PavedDrive",
    ]

    casas_analisis = casas[columnas_analisis]

    # Mean Encoder

    Ejes_Cats = [
        "MSZoning",
        "Neighborhood",
        "OverallCond",
        "BsmtCond",
        "FireplaceQu",
        "Electrical",
    ]

    for nombre in Ejes_Cats:
        eje = casas_analisis[nombre]
        casas_analisis.loc[:, [nombre]] = eje
        # NOTA: Recuperamos la media de la categoría del conjunto original
        eje_mean = pandas.merge(left=eje, right=casas_analisis_original[[nombre, f"{nombre}_mean"]], on=nombre)[f"{nombre}_mean"]
        casas_analisis.loc[:, [f"{nombre}_mean"]] = eje_mean

    # One-Hot Encoder (Dummies)

    Ejes_Dums = [
        "ExterQual",
        "FullBath",
        "KitchenQual",
        "PavedDrive",
    ]

    for nombre in Ejes_Dums:
        eje = casas_analisis[nombre]
        casas_analisis.loc[:, [nombre]] = eje
        # NOTA: Recuperamos el orden de las categorías de la original
        for i, cat in enumerate(casas_analisis_original[nombre].unique()):
            eje_dummy = (eje == cat).astype(int)
            casas_analisis.loc[:, [f"{nombre}_{cat}_dummy{i}"]] = eje_dummy

    # print(casas_analisis.head())

    # Selección de variables

    x1 = casas_analisis["MSZoning_mean"]            # mean encoder
    x2 = casas_analisis["LotArea"]                  # continua
    x3 = casas_analisis["Neighborhood_mean"]        # mean encoder
    x4 = casas_analisis["YearBuilt"]                # continua
    x5 = casas_analisis["OverallCond_mean"]         # mean encoder
    x6 = casas_analisis["ExterQual_Gd_dummy0"]      # dummy
    x7 = casas_analisis["ExterQual_Ex_dummy2"]      # dummy
    x8 = casas_analisis["ExterQual_Fa_dummy3"]      # dummy
    x9 = casas_analisis["GrLivArea"]                # continua
    x10 = casas_analisis["FullBath_1_dummy1"]       # dummy
    x11 = casas_analisis["FullBath_3_dummy2"]       # dummy
    x12 = casas_analisis["FullBath_0_dummy3"]       # dummy
    x13 = casas_analisis["GarageArea"]              # continua
    x14 = casas_analisis["BsmtCond_mean"]           # mean encoder
    x15 = casas_analisis["FireplaceQu_mean"]        # mean encoder
    x16 = casas_analisis["Electrical_mean"]         # mean encoder
    x17 = casas_analisis["LotFrontage"]             # continua*
    x18 = casas_analisis["KitchenQual_Gd_dummy0"]   # dummy
    x19 = casas_analisis["KitchenQual_Ex_dummy2"]   # dummy
    x20 = casas_analisis["KitchenQual_Fa_dummy3"]   # dummy
    x21 = casas_analisis["PavedDrive_N_dummy1"]     # dummy
    x22 = casas_analisis["PavedDrive_P_dummy2"]     # dummy

    X = pandas.DataFrame([
        x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
        x11, x12, x13, x14, x15, x16, x17, x18, x19, x20,
        x21, x22
    ], index=[
        "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
        "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20",
        "x21", "x22"
    ]).T

    # print(X.head())

    # Imputación de Datos

    X1 = X.copy().dropna(subset=["x17"])

    # Obtenemos los registros con datos faltantes
    Xmiss = X[X1.columns][X["x17"].isna()]

    # NOTA: Usamos el regresor de imputación del conjunto original
    import pickle

    reg = pickle.load(open("reg_imp_x17.pickle", "rb"))
    # reg = pickle.load(open("reg_imp_best.pickle", "rb"))

    # Predecimos los datos faltantes
    yp = reg.predict(Xmiss)

    # Reintegramos los datos faltantes (imputados) a la matriz original de variables
    X.loc[:, ["x17_imp"]] = X["x17"]
    X.loc[X["x17"].isna(), ["x17_imp"]] = yp

    x17 = X["x17"]

    # Eliminación de Puntos Atípicos

    xs = [
        ("x2", "LotArea"),
        ("x4", "YearBuilt"),
        ("x9", "GrLivArea"),
        ("x13", "GarageArea"),
        ("x17_imp", "LotFrontage_imp"),
    ]

    for i, (nombre, column) in enumerate(xs):
        # print(nombre, column, casas_analisis_original.columns[-1])
        # NOTA: Recuperamos los rangos del conjunto original
        x = casas_analisis_original[column]
        Q1 = x.quantile(0.25)
        Q3 = x.quantile(0.75)
        IQR = Q3 - Q1
        xmin = Q1 - 1.5 * IQR
        xmax = Q3 + 1.5 * IQR
        # xp = x.copy().astype(float)
        xp = X[nombre].copy().astype(float)
        xp[xp >= xmax] = xmax
        xp[xp <= xmin] = xmin
        
        X.loc[:, [f"{nombre}_in"]] = xp 

    # print(X.head())

    X2 = X.copy()

    del X2["x2"]
    del X2["x4"]
    del X2["x9"]
    del X2["x13"]
    del X2["x17"]
    del X2["x17_imp"]

    # print(X2.columns)

    columns = [
        "x1", "x2_in", "x3", "x4_in", "x5", "x6", "x7", "x8", "x9_in", "x10",
        "x11", "x12", "x13_in", "x14", "x15", "x16", "x17_imp_in", "x18", "x19", "x20",
        "x21", "x22"
    ]

    X2 = X2[columns]

    X2.columns = [f"x{j + 1}" for j in range(len(columns))]

    # print(X2.head())

    # Normalización

    n, m = X2.shape

    X3 = numpy.zeros((n, m))

    for j, column in enumerate(X2.columns):
        xj = X2[column]
        X3[:, j] = (xj - xj.mean()) / xj.std()

    X3 = pandas.DataFrame(X3, columns=X2.columns)

    return casas_analisis, X2, X3

In [380]:
casas_kaggle = pandas.read_csv("Casas_Kaggle.csv")

casas_analisis_kaggle, X2, X3 = transformaciones(casas_kaggle, casas_analisis_original)

casas_analisis_kaggle.to_csv("Casas_Kaggle_analisis.csv", index=False)
X2.to_csv("Casas_Kaggle_X2.csv", index=False)
X3.to_csv("Casas_Kaggle_X3.csv", index=False)

X2.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22
0,131558.375,11622.0,145847.08,1961.0,153961.59127,0.0,0.0,0.0,896.0,1.0,...,730.0,183632.6209,141331.482609,186825.113193,80.0,0.0,0.0,0.0,0.0,0.0
1,131558.375,14267.0,145847.08,1958.0,153961.59127,0.0,0.0,0.0,1329.0,1.0,...,312.0,183632.6209,141331.482609,186825.113193,81.0,1.0,0.0,0.0,0.0,0.0
2,131558.375,13830.0,145847.08,1997.0,153961.59127,0.0,0.0,0.0,1629.0,0.0,...,482.0,183632.6209,141331.482609,186825.113193,74.0,0.0,0.0,0.0,0.0,0.0
3,131558.375,9978.0,145847.08,1998.0,153961.59127,0.0,0.0,0.0,1604.0,0.0,...,470.0,183632.6209,141331.482609,186825.113193,78.0,1.0,0.0,0.0,0.0,0.0
4,131558.375,5005.0,145847.08,1992.0,153961.59127,1.0,0.0,0.0,1280.0,0.0,...,506.0,183632.6209,141331.482609,186825.113193,43.0,1.0,0.0,0.0,0.0,0.0


# Fase 3 - Ajuste de los modelos

In [381]:
X2 = pandas.read_csv("Casas_X2.csv")

X2.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22
0,191004.994787,8450.0,197965.773333,2003.0,203146.914738,1.0,0.0,0.0,1710.0,0.0,...,548.0,183632.6209,141331.482609,186825.113193,65.0,1.0,0.0,0.0,0.0,0.0
1,191004.994787,9600.0,238772.727273,1976.0,155651.736111,0.0,0.0,0.0,1262.0,0.0,...,460.0,183632.6209,205723.488818,186825.113193,80.0,0.0,0.0,0.0,0.0,0.0
2,191004.994787,11250.0,197965.773333,2001.0,203146.914738,1.0,0.0,0.0,1786.0,0.0,...,608.0,183632.6209,205723.488818,186825.113193,68.0,1.0,0.0,0.0,0.0,0.0
3,191004.994787,9550.0,210624.72549,1915.0,203146.914738,0.0,0.0,0.0,1717.0,1.0,...,642.0,213599.907692,226351.415789,186825.113193,60.0,1.0,0.0,0.0,0.0,0.0
4,191004.994787,14260.0,335295.317073,2000.0,203146.914738,1.0,0.0,0.0,2198.0,0.0,...,836.0,183632.6209,205723.488818,186825.113193,84.0,1.0,0.0,0.0,0.0,0.0


In [382]:
X2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      1460 non-null   float64
 1   x2      1460 non-null   float64
 2   x3      1460 non-null   float64
 3   x4      1460 non-null   float64
 4   x5      1460 non-null   float64
 5   x6      1460 non-null   float64
 6   x7      1460 non-null   float64
 7   x8      1460 non-null   float64
 8   x9      1460 non-null   float64
 9   x10     1460 non-null   float64
 10  x11     1460 non-null   float64
 11  x12     1460 non-null   float64
 12  x13     1460 non-null   float64
 13  x14     1460 non-null   float64
 14  x15     1460 non-null   float64
 15  x16     1460 non-null   float64
 16  x17     1460 non-null   float64
 17  x18     1460 non-null   float64
 18  x19     1460 non-null   float64
 19  x20     1460 non-null   float64
 20  x21     1460 non-null   float64
 21  x22     1460 non-null   float64
dtype

In [383]:
X3 = pandas.read_csv("Casas_X3.csv")

X3.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x13,x14,x15,x16,x17,x18,x19,x20,x21,x22
0,0.387032,-0.33313,0.290473,1.052885,0.79008,1.410829,-0.192111,-0.098363,0.428489,-0.8955,...,0.373381,0.15055,-0.919147,0.30418,-0.14721,1.220838,-0.27107,-0.16561,-0.25622,-0.144792
1,0.387032,-0.013184,0.985904,0.156125,-0.898279,-0.708318,-0.192111,-0.098363,-0.502177,-0.8955,...,-0.051523,0.15055,0.57583,0.30418,0.707824,-0.818548,-0.27107,-0.16561,-0.25622,-0.144792
2,0.387032,0.445869,0.290473,0.986459,0.79008,1.410829,-0.192111,-0.098363,0.58637,-0.8955,...,0.663088,0.15055,0.57583,0.30418,0.023797,1.220838,-0.27107,-0.16561,-0.25622,-0.144792
3,0.387032,-0.027095,0.506207,-1.869888,0.79008,-0.708318,-0.192111,-0.098363,0.443031,1.11593,...,0.827255,1.814463,1.054744,0.30418,-0.432222,1.220838,-0.27107,-0.16561,-0.25622,-0.144792
4,0.387032,1.283293,2.630839,0.953245,0.79008,1.410829,-0.192111,-0.098363,1.44225,-0.8955,...,1.763975,0.15055,0.57583,0.30418,0.935834,1.220838,-0.27107,-0.16561,-0.25622,-0.144792


In [384]:
y = pandas.read_csv("Casas_y.csv")

y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


## Partición de los datos

In [385]:
from sklearn.model_selection import train_test_split

X2_train, X2_test, y_train, y_test = train_test_split(X2, y["SalePrice"], random_state=123)

X2_train.shape, X2_test.shape, y_train.shape, y_test.shape

((1095, 22), (365, 22), (1095,), (365,))

## Ajuste por Ridge

In [386]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=1, random_state=123)

reg.fit(X2_train, y_train)

pandas.DataFrame(reg.coef_)

Unnamed: 0,0
0,-0.034046
1,1.918044
2,0.391997
3,324.405605
4,-0.035692
5,10068.533714
6,50227.120961
7,-8999.511127
8,52.524308
9,9889.608682


In [387]:
y_pred = reg.predict(X2_test)

e = (y_test - y_pred) ** 2

rmse = e.mean() ** 0.5

rmse

np.float64(32637.017820570316)

In [411]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

reg = Ridge(random_state=123)

cv = GridSearchCV(reg, {
    "alpha": numpy.logspace(-3, 3, 100)
})

cv.fit(X2, y)

pandas.DataFrame(cv.cv_results_).sort_values(by="rank_test_score")[["param_alpha", "mean_test_score", "rank_test_score"]]

Unnamed: 0,param_alpha,mean_test_score,rank_test_score
60,4.328761,0.817391,1
61,4.977024,0.817384,2
59,3.764936,0.817379,3
58,3.274549,0.817355,4
62,5.722368,0.817354,5
...,...,...,...
95,572.236766,0.775354,96
96,657.933225,0.774271,97
97,756.463328,0.773284,98
98,869.749003,0.772390,99


In [412]:
cv.best_estimator_.alpha

np.float64(4.328761281083062)

In [415]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=cv.best_estimator_.alpha, random_state=123)

reg.fit(X2_train, y_train)

pandas.DataFrame(reg.coef_)

Unnamed: 0,0
0,-0.038076
1,1.917155
2,0.397941
3,332.416204
4,-0.033388
5,8901.982604
6,46172.298551
7,-6389.746901
8,53.082704
9,9539.051817


In [420]:
y_pred = reg.predict(X2_test)

e = (y_test - y_pred) ** 2

rmse = e.mean() ** 0.5

rmse

np.float64(32864.41824337965)

In [425]:
from sklearn.linear_model import RidgeCV

cv = RidgeCV(
    alphas=numpy.logspace(-3, 3, 100),
    cv=5
)

cv.fit(X2, y)

cv.best_score_

np.float64(0.8173907120973855)

In [422]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=cv.best_score_, random_state=123)

reg.fit(X2_train, y_train)

pandas.DataFrame(reg.coef_)

Unnamed: 0,0
0,-0.033826
1,1.917767
2,0.391656
3,323.955055
4,-0.035813
5,10140.50367
6,50478.123075
7,-9203.60847
8,52.493172
9,9914.455492


In [423]:
y_pred = reg.predict(X2_test)

e = (y_test - y_pred) ** 2

rmse = e.mean() ** 0.5

rmse

np.float64(32624.641912308733)