### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

In [2]:
# Concatenating the training and testing datasets ensures that every
# feature-engineering and preprocessing step is applied identically to
# both of them. The two parts are told apart later by whether SalePrice
# is present.
df_train, df_test = map(pd.read_csv, ('./datasets/train.csv', './datasets/test.csv'))
df = pd.concat((df_train, df_test), ignore_index=True)
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500.0
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000.0
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000.0
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000.0
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500.0


In [3]:
# Row and column count of the test file on its own (its rows also form the tail of `df`).
df_test.shape

(878, 80)

In [4]:
# Per-column non-null counts and dtypes of the combined frame, to spot sparse columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2929 entries, 0 to 2928
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2929 non-null   int64  
 1   PID              2929 non-null   int64  
 2   MS SubClass      2929 non-null   int64  
 3   MS Zoning        2929 non-null   object 
 4   Lot Frontage     2439 non-null   float64
 5   Lot Area         2929 non-null   int64  
 6   Street           2929 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2929 non-null   object 
 9   Land Contour     2929 non-null   object 
 10  Utilities        2929 non-null   object 
 11  Lot Config       2929 non-null   object 
 12  Land Slope       2929 non-null   object 
 13  Neighborhood     2929 non-null   object 
 14  Condition 1      2929 non-null   object 
 15  Condition 2      2929 non-null   object 
 16  Bldg Type        2929 non-null   object 
 17  House Style   

### Data Cleaning and Feature Engineering

In [5]:
# Dropping every column that contains NaNs (currently disabled)
# print(f'Cols before droping nan\'s: {df.shape[1]}')
# temp = df.dropna(axis='columns')
# print(f'Cols after droping nan\'s: {temp.shape[1]}')
# df = temp

In [6]:
# List every column name available in the combined frame.
df.columns

Index(['Id', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'G

In [7]:
# Correlation of every numeric column with the target, sorted ascending.
# Only numeric columns are handed to `corr()`: recent pandas releases no
# longer drop text (object) columns silently and raise an error instead.
df.select_dtypes(include='number').corr()['SalePrice'].sort_values()

PID               -0.255052
Enclosed Porch    -0.135656
Kitchen AbvGr     -0.125444
Overall Cond      -0.097019
MS SubClass       -0.087335
Id                -0.051398
Bsmt Half Bath    -0.045328
Low Qual Fin SF   -0.041594
Yr Sold           -0.015203
Misc Val          -0.007375
BsmtFin SF 2       0.016255
Pool Area          0.023106
Mo Sold            0.032735
3Ssn Porch         0.048732
Screen Porch       0.134581
Bedroom AbvGr      0.137067
Bsmt Unf SF        0.190210
2nd Flr SF         0.248452
Half Bath          0.283001
Bsmt Full Bath     0.283662
Lot Area           0.296566
Wood Deck SF       0.326490
Open Porch SF      0.333476
Lot Frontage       0.341842
BsmtFin SF 1       0.423519
Fireplaces         0.471093
TotRms AbvGrd      0.504014
Mas Vnr Area       0.512230
Garage Yr Blt      0.533922
Full Bath          0.537969
Year Remod/Add     0.550370
Year Built         0.571849
1st Flr SF         0.618486
Total Bsmt SF      0.628925
Garage Cars        0.648220
Garage Area        0

In [8]:
# Boolean Series, indexed by numeric column name, that is True where the
# correlation with SalePrice reaches the cut-off. SalePrice correlates
# perfectly with itself, so it is flagged too.
# The cut-off is the rounded `Year Built` correlation printed above.
SIG_CORR_THRESHOLD = .571849
# Restrict to numeric columns first: recent pandas raises on object columns in corr().
series_sig_corr = (
    df.select_dtypes(include='number').corr()['SalePrice'] >= SIG_CORR_THRESHOLD
)

In [9]:
# Keep only the column labels whose flag is True, as a NumPy array of names.
# Boolean-mask idiom adapted from this Stack Overflow post:
# https://stackoverflow.com/questions/52173161/getting-a-list-of-indices-where-pandas-boolean-series-is-true
features_high_corr = series_sig_corr.index[series_sig_corr.values].values

In [10]:
# First-iteration modelling frame: only the highly correlated columns
# (SalePrice included), followed by a look at their dtypes.
df_first_iter = df[features_high_corr]
df_first_iter.dtypes

Overall Qual       int64
Total Bsmt SF    float64
1st Flr SF         int64
Gr Liv Area        int64
Garage Cars      float64
Garage Area      float64
SalePrice        float64
dtype: object

In [11]:
# Preview the first rows of the reduced frame.
df_first_iter.head()

Unnamed: 0,Overall Qual,Total Bsmt SF,1st Flr SF,Gr Liv Area,Garage Cars,Garage Area,SalePrice
0,6,725.0,725,1479,2.0,475.0,130500.0
1,7,913.0,913,2122,2.0,559.0,220000.0
2,5,1057.0,1057,1057,1.0,246.0,109000.0
3,5,384.0,744,1444,2.0,400.0,174000.0
4,6,676.0,831,1445,2.0,484.0,138500.0


Note: Decided to drop `Garage Yr Blt` due to the large number of NaNs and the fact that imputing them makes the model perform worse.

In [13]:
#df_first_iter = df_first_iter.drop(columns='Garage Yr Blt')

### Getting dummies of categorical features 

In [15]:
#set(df_first_iter['Full Bath'])

In [16]:
# df_first_iter = pd.get_dummies(df_first_iter, columns=['Full Bath'], prefix='Full_Bath')

In [18]:
# set(df_first_iter['TotRms AbvGrd'])

In [19]:
# df_first_iter = pd.get_dummies(df_first_iter, columns=['TotRms AbvGrd'], prefix='TotRms_AbvGrd')

In [21]:
# set(df_first_iter['Fireplaces'])

In [22]:
#df_first_iter = pd.get_dummies(df_first_iter, columns=['Fireplaces'], prefix='Fireplaces')

In [23]:
# set(df_first_iter['Garage Yr Blt'])

In [24]:
# Missing values per selected column. The rows without a SalePrice are the
# test rows, which carry no target.
df_first_iter.isna().sum()

Overall Qual       0
Total Bsmt SF      1
1st Flr SF         0
Gr Liv Area        0
Garage Cars        1
Garage Area        1
SalePrice        878
dtype: int64

In [25]:
# Confirm the reduced frame still holds every train + test row.
df_first_iter.shape

(2929, 7)

### Data imputation

In [26]:
# Split the combined frame back into its two parts: rows with a known
# SalePrice are the labelled (training) rows, rows without one are the
# unlabelled (test) rows.
# `.copy()` makes each part an independent DataFrame, so later edits to it
# are not treated as writes to a slice of `df_first_iter` (the situation
# pandas reports with SettingWithCopyWarning).
has_target = df_first_iter['SalePrice'].notna()
proc_train = df_first_iter.loc[has_target].copy()
proc_test = df_first_iter.loc[~has_target].copy()

In [27]:
# Mean-impute the columns that have only a negligible number of NaNs.
# The result of DataFrame.fillna is rebound to the name instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form raises
# SettingWithCopyWarning and has no effect when pandas Copy-on-Write is on.
# fillna with a Series fills only the columns named in that Series, so every
# other column is left exactly as it was.
impute_cols = ['Garage Area', 'Garage Cars', 'Total Bsmt SF']
proc_train = proc_train.fillna(proc_train[impute_cols].mean())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [28]:
# Mean-impute the same low-NaN columns in the test rows, using the test
# rows' own column means.
# The result of DataFrame.fillna is rebound to the name instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form raises
# SettingWithCopyWarning and has no effect when pandas Copy-on-Write is on.
# SalePrice is not listed, so it stays NaN for these unlabelled rows.
# NOTE(review): filling with the training means instead would mirror how the
# model is applied to unseen data; confirm which is intended.
impute_cols = ['Garage Area', 'Garage Cars', 'Total Bsmt SF']
proc_test = proc_test.fillna(proc_test[impute_cols].mean())


### Define feature Matrix and Target

In [30]:
# Verify that no missing values remain in the labelled rows after imputation.
proc_train.isna().sum()

Overall Qual     0
Total Bsmt SF    0
1st Flr SF       0
Gr Liv Area      0
Garage Cars      0
Garage Area      0
SalePrice        0
dtype: int64

In [31]:
# Verify the unlabelled rows: only SalePrice (the value to predict) should still be missing.
proc_test.isna().sum()

Overall Qual       0
Total Bsmt SF      0
1st Flr SF         0
Gr Liv Area        0
Garage Cars        0
Garage Area        0
SalePrice        878
dtype: int64

In [32]:
# Target vector: the sale price of every labelled row.
y = proc_train.loc[:, 'SalePrice']
# Feature matrix: every remaining selected column.
X = proc_train.drop(columns='SalePrice')

In [33]:
# Hold out a quarter of the labelled rows for validation (scikit-learn's
# default share, spelled out here); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=30
)

### Instantiate, Fit and Model 

In [34]:
# Linear regression with scikit-learn's default settings.
lr = LinearRegression()

In [35]:
# Fit on the training split only; X_test / y_test stay unseen for scoring.
lr.fit(X_train, y_train)

LinearRegression()

In [36]:
# R^2 on the training split and on the held-out split, in that order.
# NOTE(review): the held-out score printed below is far lower than the
# training score; possibly a few extreme rows in the held-out split. Worth
# checking with cross-validation before trusting a single split.
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.8265386931706129, 0.5858876568326628)

### Run model through test.csv

In [None]:
# Features of the unlabelled test.csv rows: the all-NaN SalePrice column is
# removed so the columns match those the model was fitted on.
X_TEST = proc_test.drop(columns='SalePrice')

In [None]:
# Predict a sale price for every unlabelled row, then show how many predictions came back.
preds = lr.predict(X_TEST)
preds.shape

In [None]:
# Sanity check: the row count here should equal the number of predictions above.
df_test.shape

In [None]:
# Attach the predictions to the raw test frame. `preds` is a plain array, so
# the assignment is positional: it relies on proc_test keeping the test rows
# in the same order as df_test.
df_test['y_hat'] = preds

In [None]:
# First test row inside the combined frame: the training rows come first, so
# it sits at position len(df_train). Deriving the position from the data keeps
# this check valid if the training file ever changes size.
df.iloc[len(df_train), :]

In [None]:
# First processed test rows, to compare against the raw test rows shown next.
proc_test.head()

In [None]:
# First raw test rows, now carrying the `y_hat` prediction column.
df_test.head()

In [None]:
# Submission frame: the house identifier next to its predicted price.
df_submission = df_test.loc[:, ['Id', 'y_hat']]

In [None]:
# Rename the prediction column to SalePrice (presumably the header the
# submission format expects; confirm against the competition rules).
df_submission = df_submission.rename(columns={'y_hat':'SalePrice'})

In [None]:
# Final look at the submission frame before exporting it.
df_submission

### Export as CSV

In [None]:
# df_submission.to_csv('my_submission.csv', index=False)