## Notebook #3: Final Modeling and Metrics

In this notebook, I will finalize the model I finished with in Notebook #2. This is by far my best performing model and the model off which I base my Kaggle submissions. 

This model is built upon the ones I covered in notebook #2. The transformations I used are SimpleImputer for replacing NAs with 0, OneHotEncoder for replacing categorical variables with numeric columns, and StandardScaler to get everything on the same scale to move through the LassoCV. After running this dataframe through the LassoCV, I used TransformedTargetRegressor to log transform the target variable, saleprice, due to its distribution lacking normality. 

I replaced the NAs with 0 because that is what lined up with the data dictionary ordinal values. 

Let's dive in! 

In [129]:
# All packages at the top

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.linear_model import LassoCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [89]:
df_house = pd.read_csv('./datasets/train.csv')
pd.set_option('display.max_columns', None)
df_house.head(3)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,1Fam,2Story,6,8,1976,2005,Gable,CompShg,HdBoard,Plywood,BrkFace,289.0,Gd,TA,CBlock,TA,TA,No,GLQ,533.0,Unf,0.0,192.0,725.0,GasA,Ex,Y,SBrkr,725,754,0,1479,0.0,0.0,2,1,3,1,Gd,6,Typ,0,,Attchd,1976.0,RFn,2.0,475.0,TA,TA,Y,0,44,0,0,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,SawyerW,Norm,Norm,1Fam,2Story,7,5,1996,1997,Gable,CompShg,VinylSd,VinylSd,BrkFace,132.0,Gd,TA,PConc,Gd,TA,No,GLQ,637.0,Unf,0.0,276.0,913.0,GasA,Ex,Y,SBrkr,913,1209,0,2122,1.0,0.0,2,1,4,1,Gd,8,Typ,1,TA,Attchd,1997.0,RFn,2.0,559.0,TA,TA,Y,0,74,0,0,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,7,1953,2007,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,Gd,CBlock,TA,TA,No,GLQ,731.0,Unf,0.0,326.0,1057.0,GasA,TA,Y,SBrkr,1057,0,0,1057,1.0,0.0,1,0,3,1,Gd,5,Typ,0,,Detchd,1953.0,Unf,1.0,246.0,TA,TA,Y,0,52,0,0,0,0,,,,0,1,2010,WD,109000


In [90]:
df_house.columns = df_house.columns.str.lower().str.replace(' ', '_')

In [91]:
df_house.drop(columns=['pid'], inplace=True)

In [93]:
df_house = df_house[df_house['gr_liv_area'] < 4000]  # Making sure I remove those outliers

In [94]:
df_nums = df_house.select_dtypes(include=[np.number])
df_nums.head(3)

Unnamed: 0,id,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,totrms_abvgrd,fireplaces,garage_yr_blt,garage_cars,garage_area,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,saleprice
0,109,60,,13517,6,8,1976,2005,289.0,533.0,0.0,192.0,725.0,725,754,0,1479,0.0,0.0,2,1,3,1,6,0,1976.0,2.0,475.0,0,44,0,0,0,0,0,3,2010,130500
1,544,60,43.0,11492,7,5,1996,1997,132.0,637.0,0.0,276.0,913.0,913,1209,0,2122,1.0,0.0,2,1,4,1,8,1,1997.0,2.0,559.0,0,74,0,0,0,0,0,4,2009,220000
2,153,20,68.0,7922,5,7,1953,2007,0.0,731.0,0.0,326.0,1057.0,1057,0,0,1057,1.0,0.0,1,0,3,1,5,0,1953.0,1.0,246.0,0,52,0,0,0,0,0,1,2010,109000


In [97]:
df_house.select_dtypes(include=[np.object_]).head(3)

Unnamed: 0,ms_zoning,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_type_2,heating,heating_qc,central_air,electrical,kitchen_qual,functional,fireplace_qu,garage_type,garage_finish,garage_qual,garage_cond,paved_drive,pool_qc,fence,misc_feature,sale_type
0,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,1Fam,2Story,Gable,CompShg,HdBoard,Plywood,BrkFace,Gd,TA,CBlock,TA,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,,,WD
1,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,SawyerW,Norm,Norm,1Fam,2Story,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD
2,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,Gable,CompShg,VinylSd,VinylSd,,TA,Gd,CBlock,TA,TA,No,GLQ,Unf,GasA,TA,Y,SBrkr,Gd,Typ,,Detchd,Unf,TA,TA,Y,,,,WD


In [98]:
df_house.head(3)

Unnamed: 0,id,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,saleprice
0,109,60,RL,,13517,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,1Fam,2Story,6,8,1976,2005,Gable,CompShg,HdBoard,Plywood,BrkFace,289.0,Gd,TA,CBlock,TA,TA,No,GLQ,533.0,Unf,0.0,192.0,725.0,GasA,Ex,Y,SBrkr,725,754,0,1479,0.0,0.0,2,1,3,1,Gd,6,Typ,0,,Attchd,1976.0,RFn,2.0,475.0,TA,TA,Y,0,44,0,0,0,0,,,,0,3,2010,WD,130500
1,544,60,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,SawyerW,Norm,Norm,1Fam,2Story,7,5,1996,1997,Gable,CompShg,VinylSd,VinylSd,BrkFace,132.0,Gd,TA,PConc,Gd,TA,No,GLQ,637.0,Unf,0.0,276.0,913.0,GasA,Ex,Y,SBrkr,913,1209,0,2122,1.0,0.0,2,1,4,1,Gd,8,Typ,1,TA,Attchd,1997.0,RFn,2.0,559.0,TA,TA,Y,0,74,0,0,0,0,,,,0,4,2009,WD,220000
2,153,20,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,7,1953,2007,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,Gd,CBlock,TA,TA,No,GLQ,731.0,Unf,0.0,326.0,1057.0,GasA,TA,Y,SBrkr,1057,0,0,1057,1.0,0.0,1,0,3,1,Gd,5,Typ,0,,Detchd,1953.0,Unf,1.0,246.0,TA,TA,Y,0,52,0,0,0,0,,,,0,1,2010,WD,109000


I had an issue here when I ran the categorical variables I wanted through the model. It always registered as a 1D array, no matter what I did or found on Stack Overflow. So, for the sake of time and unsure of a clear solution, I manually copied and pasted the column titles from the dataframe above. This was time consuming, of course, but was an easy solution for the issue I was having.

In [99]:
df_cat = df_house[['id', 
                   'utilities', 
                   'neighborhood', 
                   'bldg_type', 
                   'house_style', 
                   'roof_style', 
                   'mas_vnr_type', 
                   'exter_qual', 
                   'exter_cond', 
                   'foundation', 
                   'bsmt_qual', 
                   'bsmt_cond', 
                   'kitchen_qual', 
                   'functional', 
                   'garage_type', 
                   'garage_finish', 
                   'garage_qual', 
                   'garage_cond', 
                   'paved_drive', 
                   'alley',
                   'fireplace_qu',
                   'pool_qc',
                   'fence',
                   'misc_feature'
                   ]]

In [100]:
df_cat.head(3)

Unnamed: 0,id,utilities,neighborhood,bldg_type,house_style,roof_style,mas_vnr_type,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,kitchen_qual,functional,garage_type,garage_finish,garage_qual,garage_cond,paved_drive,alley,fireplace_qu,pool_qc,fence,misc_feature
0,109,AllPub,Sawyer,1Fam,2Story,Gable,BrkFace,Gd,TA,CBlock,TA,TA,Gd,Typ,Attchd,RFn,TA,TA,Y,,,,,
1,544,AllPub,SawyerW,1Fam,2Story,Gable,BrkFace,Gd,TA,PConc,Gd,TA,Gd,Typ,Attchd,RFn,TA,TA,Y,,TA,,,
2,153,AllPub,NAmes,1Fam,1Story,Gable,,TA,Gd,CBlock,TA,TA,Gd,Typ,Detchd,Unf,TA,TA,Y,,,,,


In [101]:
df_combined = pd.merge(left = df_nums, right = df_cat, how = 'inner')

In [102]:
df_combined.head(3)

Unnamed: 0,id,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,totrms_abvgrd,fireplaces,garage_yr_blt,garage_cars,garage_area,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,saleprice,utilities,neighborhood,bldg_type,house_style,roof_style,mas_vnr_type,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,kitchen_qual,functional,garage_type,garage_finish,garage_qual,garage_cond,paved_drive,alley,fireplace_qu,pool_qc,fence,misc_feature
0,109,60,,13517,6,8,1976,2005,289.0,533.0,0.0,192.0,725.0,725,754,0,1479,0.0,0.0,2,1,3,1,6,0,1976.0,2.0,475.0,0,44,0,0,0,0,0,3,2010,130500,AllPub,Sawyer,1Fam,2Story,Gable,BrkFace,Gd,TA,CBlock,TA,TA,Gd,Typ,Attchd,RFn,TA,TA,Y,,,,,
1,544,60,43.0,11492,7,5,1996,1997,132.0,637.0,0.0,276.0,913.0,913,1209,0,2122,1.0,0.0,2,1,4,1,8,1,1997.0,2.0,559.0,0,74,0,0,0,0,0,4,2009,220000,AllPub,SawyerW,1Fam,2Story,Gable,BrkFace,Gd,TA,PConc,Gd,TA,Gd,Typ,Attchd,RFn,TA,TA,Y,,TA,,,
2,153,20,68.0,7922,5,7,1953,2007,0.0,731.0,0.0,326.0,1057.0,1057,0,0,1057,1.0,0.0,1,0,3,1,5,0,1953.0,1.0,246.0,0,52,0,0,0,0,0,1,2010,109000,AllPub,NAmes,1Fam,1Story,Gable,,TA,Gd,CBlock,TA,TA,Gd,Typ,Detchd,Unf,TA,TA,Y,,,,,


In [103]:
X = df_combined.drop(columns=['id', 'saleprice'])
y = df_combined['saleprice']

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

The whole time I was creating my first models, I used the code in the cell below (104). However, as I was noticing my model get better, I wanted to ensure I could test every feature possible. This is in line with my problem statement because my problem is centered around giving realtors and clients the best possible prediction for their home based on the features that it has. With this model, I needed to input as many features as I could. I ran into another issue here as well - there were some columns of categorical data that just would not run through the model. Again, no matter what I did, I could not get them to be encoded. Therefore, I was not able to include every categorical column in my model, just the overlap between the ones I felt most important and the ones that would actually work.

In [58]:
# df_house.drop(columns=['alley',
#                  'fireplace_qu',
#                  'pool_qc',
#                  'fence',
#                  'misc_feature',
#                  'pid'], inplace=True)

In [104]:
ohe = OneHotEncoder(handle_unknown='ignore')
smart_encoder = make_column_transformer((ohe, ['overall_qual',   # Again here, I had to manually input these 
                                               'utilities', 
                                               'neighborhood', 
                                               'bldg_type', 
                                               'house_style', 
                                               'roof_style', 
                                               'mas_vnr_type', 
                                               'exter_qual', 
                                               'exter_cond', 
                                               'foundation', 
                                               'bsmt_qual', 
                                               'bsmt_cond', 
                                               'kitchen_qual', 
                                               'functional', 
                                               'garage_type', 
                                               'garage_finish', 
                                               'garage_qual', 
                                               'garage_cond', 
                                               'paved_drive',
                                               'alley',
                                               'fireplace_qu',
                                               'pool_qc',
                                               'fence',
                                               'misc_feature']),
                                        remainder='passthrough',
                                        verbose_feature_names_out=False)

X_train_enc = smart_encoder.fit_transform(X_train)
X_val_enc = smart_encoder.transform(X_val)

In [105]:
X_train_enc = pd.DataFrame(X_train_enc, columns = smart_encoder.get_feature_names_out())
X_val_enc = pd.DataFrame(X_val_enc, columns = smart_encoder.get_feature_names_out())

In [106]:
X_train_enc.shape

(1536, 185)

In [107]:
si = SimpleImputer(strategy = 'constant', fill_value=0)

X_train_fill = si.fit_transform(X_train_enc)
X_val_fill = si.transform(X_val_enc)

In [108]:
X_train_fill = pd.DataFrame(X_train_fill, columns = si.feature_names_in_)
X_val_fill = pd.DataFrame(X_val_fill, columns = si.feature_names_in_)

In [109]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train_fill)
X_val_sc = ss.transform(X_val_fill)

X_train_sc = pd.DataFrame(X_train_sc, columns = ss.get_feature_names_out())
X_val_sc = pd.DataFrame(X_val_sc, columns = ss.get_feature_names_out())

In [111]:
lasso_cv = LassoCV(cv = 10).fit(X_train_sc, y_train) 

print('best alpha:', lasso_cv.alpha_)
print('score:', lasso_cv.score(X_train_sc, y_train))

best alpha: 567.9607901002692
score: 0.9279680118839857


In [112]:
lasso_cv.score(X_val_sc, y_val)

0.9344318425844886

In [113]:
tt = TransformedTargetRegressor(regressor = lasso_cv, func = np.log, inverse_func = np.exp)

In [114]:
tt.fit(X_train_sc, y_train)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=LassoCV(cv=10))

In [115]:
tt.score(X_train_sc, y_train)

0.9424373847158217

In [116]:
tt.score(X_val_sc, y_val)

0.9376050739263987

In [117]:
y_train_preds = tt.predict(X_train_sc)

In [118]:
mean_squared_error(y_train, y_train_preds)

361050075.80118006

In [119]:
(mean_squared_error(y_train, y_train_preds))**0.5

19001.317738545928

In [120]:
y_val_preds = tt.predict(X_val_sc)
mean_squared_error(y_val, y_val_preds)

394467870.71098816

In [121]:
(mean_squared_error(y_val, y_val_preds))**0.5

19861.21523751727

In [132]:
y_train_look = list(zip(y_train, y_train_preds))

In [133]:
y_val_look = list(zip(y_val, y_val_preds))

In [131]:
mean_absolute_error(y_train, y_train_preds), mean_absolute_error(y_val, y_val_preds)

(12902.615440200454, 13425.913183469822)

As we can see, this model is working pretty well. My prediction percentage is in the 90s, the train score is not too much different from the test score, and the MSE/RMSE are both similar in value. The MSE tested here (360 million approx) is also much less than the null model MSE (6.2 billion). These metrics help me understand the following about my model: 
* The bias variance trade off is not perfect, but is pretty good. I know this because when I looked at the comparison between y (actuals) and my predicted values (y_train_preds and y_val_preds), the predicted neither looked all the same nor looked exactly like the actuals. Further, the $R^2$ scores are quite similar, showing minimal signs of either over (high variance) or under (high bias) fitting.
* I can also tell this from my $R^2$ scores. The $R^2$ of the validation set is 93.7%. This means that I can explain 93.7% of the variation in my data with my model.

---

In this notebook, I presented my final model that is able to predict the house sale price in 92-94% of the data. I want to explain here why I did not use a pipeline. As I was building my model, I attempted to put everything through a pipeline; however, it was difficult to force it to do exactly what I wanted it to do here written out manually. It ended up becoming a much larger task than I had time or space for and actually writing it out was more efficient for me in this project. That being said, I want to continue improving my code and model building abilities, and will continue to try out pipelines in my own time.

In the next notebook, I will examine my insights and present visualizations for my data. 

---

**APPENDIX:** 

I tried taking out the zero coefficient features, but nothing changed with my model metrics  as seen below. 

In [47]:
df_new = X_train_sc[new_X_train]

In [48]:
df_new

Unnamed: 0,overall_qual_4,overall_qual_5,overall_qual_7,overall_qual_8,overall_qual_9,overall_qual_10,neighborhood_Blmngtn,neighborhood_BrkSide,neighborhood_ClearCr,neighborhood_Crawfor,neighborhood_GrnHill,neighborhood_MeadowV,neighborhood_NWAmes,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_Somerst,neighborhood_StoneBr,bldg_type_1Fam,bldg_type_TwnhsE,house_style_2.5Unf,house_style_SLvl,roof_style_Flat,roof_style_Gable,roof_style_Mansard,mas_vnr_type_BrkCmn,mas_vnr_type_BrkFace,mas_vnr_type_Stone,exter_qual_Ex,exter_qual_Fa,exter_qual_TA,exter_cond_Po,foundation_CBlock,foundation_PConc,bsmt_qual_Ex,bsmt_qual_TA,bsmt_cond_Ex,bsmt_cond_Fa,kitchen_qual_Ex,kitchen_qual_Fa,kitchen_qual_TA,functional_Maj1,functional_Mod,functional_Sal,functional_Typ,garage_type_2Types,garage_type_BuiltIn,garage_finish_Fin,garage_qual_Ex,garage_qual_Fa,garage_qual_Gd,paved_drive_Y,fireplace_qu_Gd,pool_qc_Gd,pool_qc_TA,fence_GdWo,misc_feature_Othr,misc_feature_TenC,lot_frontage,lot_area,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmtfin_sf_2,total_bsmt_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,fireplaces,garage_cars,garage_area,wood_deck_sf,3ssn_porch,screen_porch,yr_sold
0,-0.280449,-0.629574,1.945520,-0.372319,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,-0.154919,-0.255318,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,-0.652828,-0.293735,-0.198096,-0.111914,0.806126,-0.036108,1.162850,-0.890112,-0.312933,1.152073,-0.036108,-0.181528,-0.276377,-0.154919,0.985778,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,1.747218,-0.044237,-0.206809,-0.092389,0.321634,1.696699,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,1.499720,0.531987,2.165189,-1.124198,-0.198475,-0.555218,-0.347634,-0.291180,-0.121222,-0.103165,1.581291,1.097024,-1.039912,1.284713,0.203869,-0.193684,2.258117,0.295154,0.113717,3.311222,-0.094554,-0.288676,-0.587158
1,-0.280449,1.588375,-0.514001,-0.372319,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,-0.154919,-0.255318,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,-0.652828,-0.293735,-0.198096,-0.111914,0.806126,-0.036108,1.162850,-0.890112,-0.312933,1.152073,-0.036108,-0.181528,-0.276377,-0.154919,-1.014427,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,-0.572338,-0.044237,-0.206809,-0.092389,0.321634,-0.589380,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,0.062187,-0.519838,2.165189,-0.696674,0.851277,-0.555218,-0.098234,0.584287,-0.451673,-0.103165,-1.292329,1.097024,-1.039912,-0.732996,0.203869,-0.193684,-0.926885,0.295154,1.145397,0.761326,-0.094554,-0.288676,-0.587158
2,-0.280449,-0.629574,-0.514001,2.685871,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,-0.154919,3.916686,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,1.531798,-0.293735,-0.198096,-0.111914,-1.240501,-0.036108,-0.859956,1.123455,-0.312933,-0.868000,-0.036108,-0.181528,-0.276377,-0.154919,-1.014427,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,-0.572338,-0.044237,-0.206809,-0.092389,0.321634,1.696699,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,0.826832,0.119061,-0.509661,1.046307,0.946709,1.190625,1.547809,2.523241,5.037097,-0.103165,0.274541,1.097024,0.789761,-0.732996,0.203869,-0.193684,0.665616,1.609229,1.888950,-0.751457,-0.094554,-0.288676,0.933710
3,-0.280449,-0.629574,-0.514001,2.685871,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,6.454972,-0.255318,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,1.531798,-0.293735,-0.198096,-0.111914,-1.240501,-0.036108,-0.859956,1.123455,-0.312933,-0.868000,-0.036108,-0.181528,-0.276377,-0.154919,-1.014427,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,-0.572338,-0.044237,-0.206809,-0.092389,0.321634,-0.589380,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,-1.772961,0.354649,1.273573,0.783215,0.517265,2.193304,-1.000610,-0.291180,0.434218,-0.103165,2.060569,-0.819960,0.789761,1.284713,1.417603,-0.193684,0.665616,1.609229,1.289460,0.745730,-0.094554,-0.288676,0.173276
4,-0.280449,-0.629574,-0.514001,-0.372319,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,-0.154919,-0.255318,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,-0.652828,-0.293735,-0.198096,-0.111914,0.806126,-0.036108,-0.859956,-0.890112,-0.312933,1.152073,-0.036108,-0.181528,-0.276377,-0.154919,0.985778,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,-0.572338,-0.044237,-0.206809,-0.092389,0.321634,1.696699,-0.044237,-0.036108,4.835385,-0.044237,-0.025524,0.062187,-0.020554,1.273573,-1.025539,-1.629955,-0.555218,-1.000610,-0.291180,-0.653224,-0.103165,-0.012207,-0.819960,0.789761,-0.732996,0.203869,-0.193684,0.665616,-1.018922,-1.085262,1.860822,-0.094554,-0.288676,0.933710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,-0.280449,1.588375,-0.514001,-0.372319,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,-0.154919,-0.255318,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,-0.652828,-0.293735,-0.198096,-0.111914,0.806126,-0.036108,1.162850,-0.890112,-0.312933,1.152073,-0.036108,-0.181528,-0.276377,-0.154919,0.985778,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,-0.572338,-0.044237,-0.206809,-0.092389,0.321634,-0.589380,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,0.062187,0.169914,0.381956,-0.861106,-1.629955,-0.555218,0.314410,-0.291180,-0.336835,-0.103165,0.129119,1.097024,-1.039912,1.284713,0.203869,-0.193684,0.665616,-1.018922,-0.862196,-0.751457,-0.094554,-0.288676,0.173276
1532,-0.280449,-0.629574,-0.514001,-0.372319,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,-0.154919,-0.255318,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,-1.904339,-0.062622,-0.076772,1.531798,-0.293735,-0.198096,-0.111914,0.806126,-0.036108,1.162850,-0.890112,-0.312933,1.152073,-0.036108,-0.181528,-0.276377,-0.154919,0.985778,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,1.747218,-0.044237,-0.206809,-0.092389,0.321634,-0.589380,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,0.490388,-0.400750,1.273573,0.224146,-0.293907,0.642100,-0.526749,-0.291180,-0.901649,-0.103165,-0.194496,-0.819960,-1.039912,1.284713,0.203869,-0.193684,0.665616,0.295154,-0.155821,-0.751457,-0.094554,-0.288676,-1.347591
1533,-0.280449,-0.629574,-0.514001,-0.372319,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,10.069757,-0.216875,-0.154919,-0.255318,-0.302794,-0.256761,-0.141139,-2.236068,3.419681,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,-0.652828,-0.293735,-0.198096,-0.111914,0.806126,-0.036108,1.162850,-0.890112,-0.312933,-0.868000,-0.036108,-0.181528,-0.276377,-0.154919,0.985778,13.025616,-0.125988,-0.036108,-3.710691,-0.095908,-0.253868,1.747218,-0.044237,-0.206809,-0.092389,0.321634,1.696699,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,-0.488357,-1.118240,-1.401278,0.059714,-0.532487,-0.555218,0.897100,-0.291180,-0.268870,-0.103165,2.101533,1.097024,0.789761,1.284713,2.631337,-0.193684,0.665616,0.295154,0.476199,4.925377,-0.094554,-0.288676,-1.347591
1534,3.565710,-0.629574,-0.514001,-0.372319,-0.199865,-0.102598,-0.10579,-0.19452,-0.120545,-0.190885,-0.025524,-0.099307,-0.216875,-0.154919,-0.255318,-0.302794,-0.256761,-0.141139,0.447214,-0.292425,-0.08493,-0.211891,-0.088736,0.525117,-0.062622,-0.076772,-0.652828,-0.293735,-0.198096,-0.111914,0.806126,-0.036108,-0.859956,-0.890112,-0.312933,-0.868000,-0.036108,-0.181528,-0.276377,-0.154919,0.985778,-0.076772,-0.125988,-0.036108,0.269492,-0.095908,-0.253868,-0.572338,-0.044237,-0.206809,-0.092389,-3.109126,-0.589380,-0.044237,-0.036108,-0.206809,-0.044237,-0.025524,1.102105,-0.139088,-1.401278,-1.453062,-1.629955,-0.555218,-1.000610,-0.291180,-0.929772,-0.103165,-1.513535,-0.819960,-1.039912,-0.732996,-1.009864,-0.193684,-0.926885,0.295154,-0.155821,-0.127629,-0.094554,-0.288676,1.694143


In [49]:
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(cv = 10).fit(df_new, y_train)

print('best alpha:', lasso_cv.alpha_)
print('score:', lasso_cv.score(df_new, y_train))

best alpha: 92.56345716977143
score: 0.9307237994058908


In [26]:
lasso_cv.score(X_val_sc, y_val)

0.9344318425844886

In [50]:
from sklearn.compose import TransformedTargetRegressor

tt = TransformedTargetRegressor(regressor = lasso_cv, func = np.log, inverse_func = np.exp)

In [51]:
tt.fit(X_train_sc, y_train)

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=LassoCV(cv=10))

In [52]:
tt.score(X_train_sc, y_train)

0.9424373847158217

In [53]:
tt.score(X_val_sc, y_val)

0.9376050739263987