### Importing Libraries

In [361]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor

In [294]:
# By contatenating both training and testint dataset we ensure that
# all the feature engineering and preprocesseing is mantained in both
# datasets.  
df_train = pd.read_csv('./datasets/train.csv')
df_test = pd.read_csv('./datasets/test.csv')
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500.0
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000.0
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000.0
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000.0
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500.0


In [295]:
df_test.shape

(878, 80)

In [296]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2929 entries, 0 to 2928
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2929 non-null   int64  
 1   PID              2929 non-null   int64  
 2   MS SubClass      2929 non-null   int64  
 3   MS Zoning        2929 non-null   object 
 4   Lot Frontage     2439 non-null   float64
 5   Lot Area         2929 non-null   int64  
 6   Street           2929 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2929 non-null   object 
 9   Land Contour     2929 non-null   object 
 10  Utilities        2929 non-null   object 
 11  Lot Config       2929 non-null   object 
 12  Land Slope       2929 non-null   object 
 13  Neighborhood     2929 non-null   object 
 14  Condition 1      2929 non-null   object 
 15  Condition 2      2929 non-null   object 
 16  Bldg Type        2929 non-null   object 
 17  House Style   

### Data Cleaning and Feature Engineering

In [297]:
df.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [298]:
df.corr()['SalePrice'].sort_values()

PID               -0.255052
Enclosed Porch    -0.135656
Kitchen AbvGr     -0.125444
Overall Cond      -0.097019
MS SubClass       -0.087335
Id                -0.051398
Bsmt Half Bath    -0.045328
Low Qual Fin SF   -0.041594
Yr Sold           -0.015203
Misc Val          -0.007375
BsmtFin SF 2       0.016255
Pool Area          0.023106
Mo Sold            0.032735
3Ssn Porch         0.048732
Screen Porch       0.134581
Bedroom AbvGr      0.137067
Bsmt Unf SF        0.190210
2nd Flr SF         0.248452
Half Bath          0.283001
Bsmt Full Bath     0.283662
Lot Area           0.296566
Wood Deck SF       0.326490
Open Porch SF      0.333476
Lot Frontage       0.341842
BsmtFin SF 1       0.423519
Fireplaces         0.471093
TotRms AbvGrd      0.504014
Mas Vnr Area       0.512230
Garage Yr Blt      0.533922
Full Bath          0.537969
Year Remod/Add     0.550370
Year Built         0.571849
1st Flr SF         0.618486
Total Bsmt SF      0.628925
Garage Cars        0.648220
Garage Area        0

In [299]:
# Selecting top 5 Numerical Features with high corr
features_high_corr = ['Overall Qual', 'Gr Liv Area', 'Garage Area', 'Total Bsmt SF', '1st Flr SF', 'SalePrice','Full Bath']
df_first_iter = df[features_high_corr]
df[features_high_corr].dtypes

Overall Qual       int64
Gr Liv Area        int64
Garage Area      float64
Total Bsmt SF    float64
1st Flr SF         int64
SalePrice        float64
Full Bath          int64
dtype: object

In [300]:
df_first_iter

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,Total Bsmt SF,1st Flr SF,SalePrice,Full Bath
0,6,1479,475.0,725.0,725,130500.0,2
1,7,2122,559.0,913.0,913,220000.0,2
2,5,1057,246.0,1057.0,1057,109000.0,1
3,5,1444,400.0,384.0,744,174000.0,2
4,6,1445,484.0,676.0,831,138500.0,2
...,...,...,...,...,...,...,...
2924,6,1877,488.0,1084.0,1084,,2
2925,6,1988,480.0,1104.0,1104,,2
2926,5,1211,322.0,952.0,1211,,1
2927,4,864,528.0,864.0,864,,1


In [301]:
df_first_iter.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,Total Bsmt SF,1st Flr SF,SalePrice,Full Bath
0,6,1479,475.0,725.0,725,130500.0,2
1,7,2122,559.0,913.0,913,220000.0,2
2,5,1057,246.0,1057.0,1057,109000.0,1
3,5,1444,400.0,384.0,744,174000.0,2
4,6,1445,484.0,676.0,831,138500.0,2


In [302]:
df_first_iter.describe()

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,Total Bsmt SF,1st Flr SF,SalePrice,Full Bath
count,2929.0,2929.0,2928.0,2928.0,2929.0,2051.0,2929.0
mean,6.093547,1498.606009,472.679303,1050.901981,1158.882212,181469.701609,1.566064
std,1.40942,502.174926,214.948914,438.999166,390.248168,79258.659352,0.5524
min,1.0,334.0,0.0,0.0,334.0,12789.0,0.0
25%,5.0,1126.0,320.0,793.0,876.0,129825.0,1.0
50%,6.0,1442.0,480.0,990.0,1084.0,162500.0,2.0
75%,7.0,1742.0,576.0,1300.5,1384.0,214000.0,2.0
max,10.0,5642.0,1488.0,6110.0,5095.0,611657.0,4.0


In [303]:
df_first_iter.isna().sum()

Overall Qual       0
Gr Liv Area        0
Garage Area        1
Total Bsmt SF      1
1st Flr SF         0
SalePrice        878
Full Bath          0
dtype: int64

In [304]:
df_first_iter.shape

(2929, 7)

In [305]:
df_first_iter = df_first_iter.dropna(subset=['Garage Area', 'Total Bsmt SF'])

In [306]:
df_first_iter.isna().sum()

Overall Qual       0
Gr Liv Area        0
Garage Area        0
Total Bsmt SF      0
1st Flr SF         0
SalePrice        878
Full Bath          0
dtype: int64

In [307]:
df_first_iter.shape

(2927, 7)

In [308]:
df_first_iter.isna().sum()

Overall Qual       0
Gr Liv Area        0
Garage Area        0
Total Bsmt SF      0
1st Flr SF         0
SalePrice        878
Full Bath          0
dtype: int64

### Define feature Matrix and Target

In [309]:
proc_train = df_first_iter.loc[df['SalePrice'].notna()]
proc_test = df_first_iter.loc[df['SalePrice'].isna()]

In [370]:
X = proc_train.drop(columns=['SalePrice'])
y = proc_train['SalePrice']

In [371]:
poly = PolynomialFeatures(include_bias=False)

In [372]:
X_poly = poly.fit_transform(X)

### Instantiate, Fit and Model 

In [373]:
lr = LassoCV()

In [374]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42, test_size=.3)

In [375]:
lr.fit(X_poly, y)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

LassoCV()

In [376]:
#lr.score(X_train, y_train), lr.score(X_test, y_test)

In [377]:
#cross_val_score(lr, X_poly, y, cv=5).mean()

### Categorical model implementation

In [378]:
temp_set = None
categorical_features = []
for i in df.columns:
    add_col = True
    if df[i].dtypes == 'object':
        # print(f'Column name: {i}\n set: {set(df[i])}')
        temp_set = set(df[i])
        for x in temp_set:
            if x != x:
                add_col = False
                break
        if add_col:
            categorical_features.append(i)

categorical_features.append('SalePrice')

In [379]:
def dummies_list_df(list_cols, df):
    for i in list_cols:
        pd.get_dummies(df, columns=[i], prefix=i, inplace=True)

In [380]:
df_sec_mod = df[categorical_features]
df_sec_mod.dtypes

MS Zoning        object
Street           object
Lot Shape        object
Land Contour     object
Utilities        object
Lot Config       object
Land Slope       object
Neighborhood     object
Condition 1      object
Condition 2      object
Bldg Type        object
House Style      object
Roof Style       object
Roof Matl        object
Exterior 1st     object
Exterior 2nd     object
Exter Qual       object
Exter Cond       object
Foundation       object
Heating          object
Heating QC       object
Central Air      object
Kitchen Qual     object
Functional       object
Paved Drive      object
Sale Type        object
SalePrice       float64
dtype: object

In [381]:
df_sec_mod = pd.get_dummies(df_sec_mod, columns=categorical_features)

In [382]:
df_sec_mod.dtypes

MS Zoning_A (agr)     uint8
MS Zoning_C (all)     uint8
MS Zoning_FV          uint8
MS Zoning_I (all)     uint8
MS Zoning_RH          uint8
                      ...  
SalePrice_556581.0    uint8
SalePrice_582933.0    uint8
SalePrice_584500.0    uint8
SalePrice_591587.0    uint8
SalePrice_611657.0    uint8
Length: 1015, dtype: object

In [383]:
df_sec_mod.isna().sum().sort_values()

MS Zoning_A (agr)     0
SalePrice_202900.0    0
SalePrice_203000.0    0
SalePrice_203135.0    0
SalePrice_203160.0    0
                     ..
SalePrice_118900.0    0
SalePrice_119000.0    0
SalePrice_119164.0    0
SalePrice_115400.0    0
SalePrice_611657.0    0
Length: 1015, dtype: int64

pipe_2_params = {'ss__': [1, 2], 
                 'knn__weights': ['uniform', 'distance'],
                 'knn__n_neighbors': [3, 5, 10]}

In [384]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lasso', LassoCV())
])


alphas = np.logspace(-5, 2, 30)
grid = GridSearchCV(estimator=Lasso(),
param_grid=dict(alpha=alphas), cv = ShuffleSplit(n=len(X), n_iter=10, test_size=.3), scoring='r2')
grid.fit(self.X, self.Y)

In [385]:
pipe.fit(X_poly, y)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


Pipeline(steps=[('ss', StandardScaler()), ('lasso', LassoCV())])

In [386]:
#pipe.score(X_test, y_test)

In [387]:
#pipe.score(X_train, y_train)

### Piping 3rd model 

In [388]:
'''
pipe2 = Pipeline(steps = [
    ('ss', StandardScaler()),
    ('lr', LogisticRegression())])

pipe_params = { 'lr__C': [0.1, 0.3], 
              'lr__penalty': ['l1', 'l2']}
'''


"\npipe2 = Pipeline(steps = [\n    ('ss', StandardScaler()),\n    ('lr', LogisticRegression())])\n\npipe_params = { 'lr__C': [0.1, 0.3], \n              'lr__penalty': ['l1', 'l2']}\n"

In [389]:
#gs = GridSearchCV(pipe2, pipe_params, cv = 3)

In [390]:
#gs.fit(X_poly, y)

### Voting Classifier 

In [391]:
#create a dictionary of our models

ensamble = VotingRegressor([('lasso_1', lr), ('lasso_2', pipe)])

In [392]:
#fit model to training data
ensamble.fit(X_train, y_train) #test our model on the test data
ensamble.score(X_test, y_test)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

0.8106641540199717

### Run model through test.csv

In [404]:
X_TEST = proc_test.drop(columns=['SalePrice'])

In [405]:
poly_TEST = PolynomialFeatures(include_bias=False)
X_TEST_poly  = poly_TEST.fit_transform(X_TEST)

In [415]:
preds = pipe.predict(X_TEST_poly)

In [416]:
df_test['y_hat'] = preds

In [417]:
df.iloc[2051,:]

Id                   2658
PID             902301120
MS SubClass           190
MS Zoning              RM
Lot Frontage           69
                  ...    
Misc Val                0
Mo Sold                 4
Yr Sold              2006
Sale Type             WD 
SalePrice             NaN
Name: 2051, Length: 81, dtype: object

In [418]:
proc_test.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,Total Bsmt SF,1st Flr SF,SalePrice,Full Bath
2051,6,1928,440.0,1020.0,908,,2
2052,5,1967,580.0,1967.0,1967,,2
2053,7,1496,426.0,654.0,664,,2
2054,5,968,480.0,968.0,968,,1
2055,6,1394,514.0,1394.0,1394,,1


In [419]:
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,y_hat
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,195781.272856
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,165979.884203
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,161338.803422
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,128938.548978
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,184273.795315


In [420]:
df_submission = df_test[['Id','y_hat']]

In [421]:
df_submission = df_submission.rename(columns={'y_hat':'SalePrice'})

In [422]:
df_submission

Unnamed: 0,Id,SalePrice
0,2658,195781.272856
1,2718,165979.884203
2,2414,161338.803422
3,1989,128938.548978
4,625,184273.795315
...,...,...
873,1662,197295.353267
874,1234,202733.440541
875,1373,133793.632103
876,1672,107254.902543


### Export as CSV

In [424]:
# df_submission.to_csv('my_submission_v17.csv', index=False)