In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [3]:
# Read the data
ames = pd.read_csv("AmesHousing.csv")

# Get rid of columns with mostly NaN values
good_cols = ames.isna().sum() < 100
ames = ames.loc[:,good_cols]

# Drop other NAs
ames = ames.dropna()

In [4]:
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

In [5]:
cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')
#large negative numbers indicate a crazy overfitting of the data

array([-2.59303720e+21, -1.13145211e+19, -7.57138616e+20, -4.47669752e+18,
       -2.55949915e+20])

In [9]:
lr_pipeline_2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", Ridge())]
)

cross_val_score(lr_pipeline_2, X, y, cv = 5, scoring = 'r2')

array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

In [17]:
from sklearn.model_selection import GridSearchCV

lr_pipeline_2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", Ridge())]
)

degrees = {'linear_regression__alpha': np.logspace(-3, 1, num=5) }

gscv = GridSearchCV(lr_pipeline_2, degrees, cv = 5, scoring='r2')


{'linear_regression': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])}

In [38]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'mean_fit_time': array([1.33802881, 0.81871724, 0.81581178, 1.44770331, 0.84443917,
        1.21154027, 0.88926725, 0.76802297, 1.06707673, 1.0889504 ,
        0.87793756, 0.18248401, 0.46044827, 0.61236515, 0.92429557,
        1.62563429, 0.12389855, 0.14770436, 0.14092731, 0.70239983,
        1.46591191, 0.1063818 , 0.11967888, 0.11260509, 0.28916807]),
 'std_fit_time': array([0.73250309, 0.07476696, 0.04186669, 0.78395608, 0.01590609,
        0.8176724 , 0.37072495, 0.03042483, 0.48252685, 0.47912355,
        0.01876917, 0.0108315 , 0.14809504, 0.40851639, 0.2975515 ,
        0.62588885, 0.01165604, 0.00412696, 0.01424078, 0.11146847,
        0.693045  , 0.01196052, 0.02456815, 0.01911365, 0.08681502]),
 'mean_score_time': array([0.04262409, 0.03062572, 0.03024802, 0.03340044, 0.02737274,
        0.03202233, 0.02851195, 0.03063149, 0.04079208, 0.03170118,
        0.0284421 , 0.0285871 , 0.04939809, 0.03707933, 0.03201141,
        0.03262916, 0.03458786, 0.02898035, 0.02872148, 0.02

In [39]:
gscv_fitted.cv_results_['mean_test_score']

array([0.86319233, 0.86281308, 0.86215515, 0.86064079, 0.85549892,
       0.86368653, 0.86405936, 0.86426789, 0.86405067, 0.85560578,
       0.85162965, 0.85403034, 0.85698683, 0.86088123, 0.85661828,
       0.81213194, 0.82116844, 0.8307003 , 0.84185231, 0.85715222,
       0.53627566, 0.59867086, 0.67323913, 0.75882319, 0.86063224])

In [37]:
pd.DataFrame(data = {"degrees": np.logspace(-3, 1, num = 5), "scores": gscv_fitted.cv_results_['mean_test_score']}).sort_values(by = "scores", ascending = False).head(1)

ValueError: All arrays must be of the same length

In [23]:
#using lasso instead of ridge
from sklearn.model_selection import GridSearchCV

lr_pipeline_2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", Lasso())]
)

degrees = {'linear_regression__alpha': np.logspace(-3, 1, num=5) }

gscv = GridSearchCV(lr_pipeline_2, degrees, cv = 5, scoring='r2')


In [26]:
# gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([1.23605466, 0.89538627, 1.33932385, 1.26627584, 0.30861526]),
 'std_fit_time': array([0.73482926, 0.00948104, 0.67018306, 0.50530739, 0.1165053 ]),
 'mean_score_time': array([0.02865081, 0.0314332 , 0.03957882, 0.03501143, 0.02886138]),
 'std_score_time': array([0.0022772 , 0.00246166, 0.01070411, 0.01262916, 0.00170874]),
 'param_linear_regression__alpha': masked_array(data=[0.001, 0.01, 0.1, 1.0, 10.0],
              mask=[False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'linear_regression__alpha': 0.001},
  {'linear_regression__alpha': 0.01},
  {'linear_regression__alpha': 0.1},
  {'linear_regression__alpha': 1.0},
  {'linear_regression__alpha': 10.0}],
 'split0_test_score': array([0.8972019 , 0.89720561, 0.89725821, 0.89774385, 0.90077569]),
 'split1_test_score': array([0.9103958 , 0.91040134, 0.91045103, 0.91093785, 0.91506699]),
 'split2_test_score': array([0.79032004, 0.79085941, 0.79595065, 0.79691806, 0.80141962]),
 'split3_t

In [25]:
gscv_fitted.cv_results_['mean_test_score']

array([0.85549892, 0.85560578, 0.85661828, 0.85715222, 0.86063224])

In [28]:
#elastic net tuning

lr_pipeline_2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", ElasticNet())]
)

degrees = {
    'linear_regression__alpha': np.logspace(-3, 1, num=5),
    'linear_regression__l1_ratio': np.linspace(0, 1, num=5)
     }

gscv = GridSearchCV(lr_pipeline_2, degrees, cv = 5, scoring='r2')


In [36]:
# gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([1.40802474, 0.82172384, 0.83770356, 1.39689145, 0.86147265,
        1.37794037, 0.84352455, 0.77637458, 1.28926158, 0.87690043,
        1.31312151, 0.26258893, 0.19854741, 0.24252186, 0.84102535,
        1.52111754, 0.1324563 , 0.14164205, 0.1539371 , 0.71318569,
        1.5223681 , 0.10950193, 0.11021819, 0.11824675, 0.29897509]),
 'std_fit_time': array([0.38262799, 0.05436514, 0.0334284 , 0.79775532, 0.02065184,
        0.8345447 , 0.21833267, 0.04108655, 0.74930414, 0.02936257,
        0.95696092, 0.13685251, 0.01544117, 0.0131766 , 0.09882871,
        0.82388147, 0.0133888 , 0.01035912, 0.0079781 , 0.12507164,
        0.75062709, 0.01942079, 0.01526828, 0.00634105, 0.10563343]),
 'mean_score_time': array([0.03375039, 0.03006663, 0.03038697, 0.03807764, 0.02984176,
        0.04473648, 0.03183417, 0.03202434, 0.02930822, 0.02990642,
        0.0406939 , 0.03416071, 0.03026185, 0.03374901, 0.03136792,
        0.03713675, 0.02963743, 0.02916074, 0.02868972, 0.02

In [35]:
r2_score = gscv_fitted.cv_results_['mean_test_score']
specifications = gscv_fitted.cv_results_['params']
best_model = pd.DataFrame(data = {"r2_score": r2_score, "specifications": specifications})
best_model.sort_values(by = "r2_score", ascending = False).head(1)

Unnamed: 0,r2_score,specifications
7,0.864268,"{'linear_regression__alpha': 0.01, 'linear_reg..."
