### Importing Libraries

In [280]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

In [281]:
# By contatenating both training and testint dataset we ensure that
# all the feature engineering and preprocesseing is mantained in both
# datasets.  
df_train = pd.read_csv('./datasets/train.csv')
df_test = pd.read_csv('./datasets/test.csv')
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500.0
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000.0
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000.0
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000.0
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500.0


In [282]:
df_test.shape

(878, 80)

In [283]:
set(df['MS Zoning'])

{'A (agr)', 'C (all)', 'FV', 'I (all)', 'RH', 'RL', 'RM'}

In [284]:
temp_set = None
obj_to_hot = []
for i in df.columns:
    add_col = True
    if df[i].dtypes == 'object':
        print(f'Column name: {i}\n set: {set(df[i])}')
        temp_set = set(df[i])
        for x in temp_set:
            if x != x:
                add_col = False
                break
        if add_col:
            obj_to_hot.append(i)

Column name: MS Zoning
 set: {'RM', 'C (all)', 'RL', 'RH', 'A (agr)', 'I (all)', 'FV'}
Column name: Street
 set: {'Pave', 'Grvl'}
Column name: Alley
 set: {nan, 'Pave', 'Grvl'}
Column name: Lot Shape
 set: {'IR2', 'IR1', 'Reg', 'IR3'}
Column name: Land Contour
 set: {'Lvl', 'Bnk', 'Low', 'HLS'}
Column name: Utilities
 set: {'AllPub', 'NoSeWa', 'NoSewr'}
Column name: Lot Config
 set: {'Inside', 'FR2', 'Corner', 'FR3', 'CulDSac'}
Column name: Land Slope
 set: {'Mod', 'Sev', 'Gtl'}
Column name: Neighborhood
 set: {'NWAmes', 'Timber', 'Landmrk', 'Sawyer', 'Edwards', 'IDOTRR', 'Somerst', 'Mitchel', 'NoRidge', 'BrkSide', 'CollgCr', 'SWISU', 'NAmes', 'NridgHt', 'Blmngtn', 'GrnHill', 'Blueste', 'BrDale', 'Greens', 'MeadowV', 'Gilbert', 'StoneBr', 'OldTown', 'NPkVill', 'Veenker', 'Crawfor', 'ClearCr', 'SawyerW'}
Column name: Condition 1
 set: {'Norm', 'PosA', 'RRNe', 'RRAn', 'Feedr', 'Artery', 'PosN', 'RRAe', 'RRNn'}
Column name: Condition 2
 set: {'Norm', 'PosA', 'RRAn', 'Feedr', 'Artery', 'Po

In [285]:
obj_to_hot

['MS Zoning',
 'Street',
 'Lot Shape',
 'Land Contour',
 'Utilities',
 'Lot Config',
 'Land Slope',
 'Neighborhood',
 'Condition 1',
 'Condition 2',
 'Bldg Type',
 'House Style',
 'Roof Style',
 'Roof Matl',
 'Exterior 1st',
 'Exterior 2nd',
 'Exter Qual',
 'Exter Cond',
 'Foundation',
 'Heating',
 'Heating QC',
 'Central Air',
 'Kitchen Qual',
 'Functional',
 'Paved Drive',
 'Sale Type']

In [286]:
def dummies_list_df(list_cols, df):
    for i in list_cols:
        pd.get_dummies(df, columns=[i], prefix=i, inplace=True)

In [287]:
df.shape

(2929, 81)

In [350]:
features_high_corr = ['Overall Qual', 'Gr Liv Area', 'Garage Area', 
                      'Total Bsmt SF', '1st Flr SF', 'SalePrice','Full Bath']

In [351]:
total_features = features_high_corr + obj_to_hot

In [352]:
df_first_iter = df[total_features]
df_first_iter.dtypes

Overall Qual       int64
Gr Liv Area        int64
Garage Area      float64
Total Bsmt SF    float64
1st Flr SF         int64
SalePrice        float64
Full Bath          int64
MS Zoning         object
Street            object
Lot Shape         object
Land Contour      object
Utilities         object
Lot Config        object
Land Slope        object
Neighborhood      object
Condition 1       object
Condition 2       object
Bldg Type         object
House Style       object
Roof Style        object
Roof Matl         object
Exterior 1st      object
Exterior 2nd      object
Exter Qual        object
Exter Cond        object
Foundation        object
Heating           object
Heating QC        object
Central Air       object
Kitchen Qual      object
Functional        object
Paved Drive       object
Sale Type         object
dtype: object

In [355]:
df_first_iter = pd.get_dummies(df_first_iter, columns=obj_to_hot)

In [356]:
df_first_iter.dtypes

Overall Qual         int64
Gr Liv Area          int64
Garage Area        float64
Total Bsmt SF      float64
1st Flr SF           int64
                    ...   
Sale Type_ConLw      uint8
Sale Type_New        uint8
Sale Type_Oth        uint8
Sale Type_VWD        uint8
Sale Type_WD         uint8
Length: 194, dtype: object

In [357]:
df_first_iter.isna().sum().sort_values()

Overall Qual              0
Exterior 2nd_AsbShng      0
Exterior 2nd_AsphShn      0
Exterior 2nd_Brk Cmn      0
Exterior 2nd_BrkFace      0
                       ... 
Neighborhood_Timber       0
Sale Type_WD              0
Total Bsmt SF             1
Garage Area               1
SalePrice               878
Length: 194, dtype: int64

### Data Cleaning and Feature Engineering

In [388]:
var_t = df_first_iter.corr()['SalePrice'].sort_values(ascending=False).to_numpy()
names_arr = df_first_iter.corr()['SalePrice'].sort_values(ascending=False).index.values

In [389]:
type(var_t)

numpy.ndarray

In [390]:
stacked_arr_corr = np.dstack((var_t,names_arr))

In [391]:
stacked_arr_corr

array([[[1.0, 'SalePrice'],
        [0.8002068702531417, 'Overall Qual'],
        [0.6970384443056628, 'Gr Liv Area'],
        [0.6502700352166217, 'Garage Area'],
        [0.6289247057344542, 'Total Bsmt SF'],
        [0.6184861410533946, '1st Flr SF'],
        [0.5512844949973916, 'Kitchen Qual_Ex'],
        [0.5379691021121198, 'Full Bath'],
        [0.5290468529844219, 'Foundation_PConc'],
        [0.4938610166826227, 'Exter Qual_Ex'],
        [0.45325535292868097, 'Heating QC_Ex'],
        [0.44864681340299645, 'Neighborhood_NridgHt'],
        [0.4466854983075194, 'Exter Qual_Gd'],
        [0.3581019691173542, 'Sale Type_New'],
        [0.34214555855939704, 'Exterior 1st_VinylSd'],
        [0.3375625324636504, 'Exterior 2nd_VinylSd'],
        [0.3062455312154595, 'Kitchen Qual_Gd'],
        [0.2892096014539416, 'Paved Drive_Y'],
        [0.2773778061451669, 'Central Air_Y'],
        [0.27357444442084283, 'Lot Shape_IR1'],
        [0.2652241868015608, 'Roof Style_Hip'],
        [0.

In [392]:
df_first_iter.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,Total Bsmt SF,1st Flr SF,SalePrice,Full Bath,MS Zoning_A (agr),MS Zoning_C (all),MS Zoning_FV,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,6,1479,475.0,725.0,725,130500.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,7,2122,559.0,913.0,913,220000.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,5,1057,246.0,1057.0,1057,109000.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,5,1444,400.0,384.0,744,174000.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,6,1445,484.0,676.0,831,138500.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [393]:
df_first_iter = df_first_iter.dropna(subset=['Garage Area', 'Total Bsmt SF'])

In [394]:
df_first_iter.isna().sum().sort_values()

Overall Qual              0
Exterior 2nd_AsbShng      0
Exterior 2nd_AsphShn      0
Exterior 2nd_Brk Cmn      0
Exterior 2nd_BrkFace      0
                       ... 
Condition 1_RRAn          0
Condition 1_RRNe          0
Neighborhood_StoneBr      0
Sale Type_WD              0
SalePrice               878
Length: 194, dtype: int64

In [395]:
df_first_iter.shape

(2927, 194)

In [396]:

features_high_corr = ['Overall Qual', 'Gr Liv Area', 'Garage Area', 
                      'Total Bsmt SF', '1st Flr SF', 'SalePrice','Full Bath',
                      'Kitchen Qual_Ex','Foundation_PConc','Kitchen Qual_TA',
                      'Exter Qual_TA']

df_second_iter = df_first_iter[features_high_corr]
df_second_iter.dtypes

Overall Qual          int64
Gr Liv Area           int64
Garage Area         float64
Total Bsmt SF       float64
1st Flr SF            int64
SalePrice           float64
Full Bath             int64
Kitchen Qual_Ex       uint8
Foundation_PConc      uint8
Kitchen Qual_TA       uint8
Exter Qual_TA         uint8
dtype: object

### Define feature Matrix and Target

In [397]:
proc_train = df_second_iter.loc[df['SalePrice'].notna()]
proc_test = df_second_iter.loc[df['SalePrice'].isna()]

In [398]:
proc_train.isna().sum()

Overall Qual        0
Gr Liv Area         0
Garage Area         0
Total Bsmt SF       0
1st Flr SF          0
SalePrice           0
Full Bath           0
Kitchen Qual_Ex     0
Foundation_PConc    0
Kitchen Qual_TA     0
Exter Qual_TA       0
dtype: int64

In [399]:
proc_test.isna().sum()

Overall Qual          0
Gr Liv Area           0
Garage Area           0
Total Bsmt SF         0
1st Flr SF            0
SalePrice           878
Full Bath             0
Kitchen Qual_Ex       0
Foundation_PConc      0
Kitchen Qual_TA       0
Exter Qual_TA         0
dtype: int64

In [400]:
X = proc_train.drop(columns=['SalePrice'])
y = proc_train['SalePrice']

In [401]:
poly = PolynomialFeatures(include_bias=False)

In [402]:
X_poly = poly.fit_transform(X)

### Instantiate, Fit and Model 

In [403]:
lr = LinearRegression()

In [404]:
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 30)

In [405]:
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [406]:
lr.fit(X_train, y_train)

LinearRegression()

In [407]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.890708659738612, 0.849815277481508)

In [408]:
cross_val_score(lr, X_poly, y, cv=5).mean()

0.8674151763138942

In [409]:
pred_temp = lr.predict(X_test)

In [410]:
np.sqrt(mean_squared_error(y_test, pred_temp))

29475.678811838647

### Ridge model

In [411]:
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))

In [412]:
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=3)
ridge_scores.mean()

0.7746116901567569

In [413]:
ridge_scores = cross_val_score(ridge, X_test, y_test, cv=3)
ridge_scores.mean()

0.6486337236684955

### Lasso 

In [414]:
#lasso = LassoCV(n_alphas=200)

In [415]:
#lasso_scores = cross_val_score(lasso, X_train, y_train, cv=3)
#lasso_scores.mean()

In [416]:
#lasso_scores = cross_val_score(lasso,  X_test, y_test, cv=3)
#lasso_scores.mean()

### Lasso Grid 

In [417]:
lasso = Lasso()

In [418]:
'''
parameters = {'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
lasso_regressor = GridSearchCV(lasso, parameters, scoring='r2', cv=5)
lasso_regressor.fit(X_train, y_train)
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)
'''


"\nparameters = {'alpha':[1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}\nlasso_regressor = GridSearchCV(lasso, parameters, scoring='r2', cv=5)\nlasso_regressor.fit(X_train, y_train)\nprint(lasso_regressor.best_params_)\nprint(lasso_regressor.best_score_)\n"

### Run model through test.csv

In [419]:
X_TEST = proc_test.drop(columns=['SalePrice'])

In [424]:
poly_TEST = PolynomialFeatures(include_bias=False)
X_TEST_poly  = poly_TEST.fit_transform(X_TEST)

ss_TEST = StandardScaler()
ss.fit(X_TEST_poly)
X_TEST_poly = ss.transform(X_TEST_poly)


In [425]:
preds = lr.predict(X_TEST_poly)

In [426]:
df_test['y_hat'] = preds

In [427]:
proc_test.head()

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,Total Bsmt SF,1st Flr SF,SalePrice,Full Bath,Kitchen Qual_Ex,Foundation_PConc,Kitchen Qual_TA,Exter Qual_TA
2051,6,1928,440.0,1020.0,908,,2,0,0,0,1
2052,5,1967,580.0,1967.0,1967,,2,0,0,1,1
2053,7,1496,426.0,654.0,664,,2,0,1,0,0
2054,5,968,480.0,968.0,968,,1,0,0,1,0
2055,6,1394,514.0,1394.0,1394,,1,0,0,1,1


In [428]:
df_test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,y_hat
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,194056.415877
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,153832.18467
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,,,,0,9,2006,New,169383.985674
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,,,,0,7,2007,WD,134334.000989
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,185,0,,,,0,7,2009,WD,183664.946074


In [429]:
df_submission = df_test[['Id','y_hat']]

In [430]:
df_submission = df_submission.rename(columns={'y_hat':'SalePrice'})

In [431]:
df_submission

Unnamed: 0,Id,SalePrice
0,2658,194056.415877
1,2718,153832.184670
2,2414,169383.985674
3,1989,134334.000989
4,625,183664.946074
...,...,...
873,1662,179169.929057
874,1234,214430.485871
875,1373,141016.063303
876,1672,129022.544211


### Export as CSV

In [432]:
df_submission.to_csv('my_submission_v7.csv', index=False)