### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

### Load Data

In [10]:
df = pd.read_csv('./datasets/train.csv')
df.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


### Data Cleaning Initial Check

In [11]:
# Checking for null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               2051 non-null   int64  
 1   PID              2051 non-null   int64  
 2   MS SubClass      2051 non-null   int64  
 3   MS Zoning        2051 non-null   object 
 4   Lot Frontage     1721 non-null   float64
 5   Lot Area         2051 non-null   int64  
 6   Street           2051 non-null   object 
 7   Alley            140 non-null    object 
 8   Lot Shape        2051 non-null   object 
 9   Land Contour     2051 non-null   object 
 10  Utilities        2051 non-null   object 
 11  Lot Config       2051 non-null   object 
 12  Land Slope       2051 non-null   object 
 13  Neighborhood     2051 non-null   object 
 14  Condition 1      2051 non-null   object 
 15  Condition 2      2051 non-null   object 
 16  Bldg Type        2051 non-null   object 
 17  House Style   

In [12]:
# These features are contain a lot of loss of info

df.drop(columns=['Alley','Pool QC','Misc Feature', 'Fence'])

# The following features should be considered for
# imputation; next round check
imputable_cols = ['Lot Frontage', 'Bsmt Qual', 'Bsmt Cond', 
                  'Bsmt Exposure', 'BsmtFin Type 1','Garage Yr Blt',
                  'Garage Finish', 'Garage Qual', 'Garage Cond']

In [13]:
# Drop rows containing nan's
print(f'Rows before droping nan\'s: {df.shape[0]}')
temp = df.dropna()
print(f'Rows after droping nan\'s: {temp.shape[0]}')

Rows before droping nan's: 2051
Rows after droping nan's: 0


### 1st model iteration:

In [14]:
# droping columns based on missing nans
print(f'Cols before droping nan\'s: {df.shape[1]}')
temp = df.dropna(axis='columns')
print(f'Cols after droping nan\'s: {temp.shape[1]}')

Cols before droping nan's: 81
Cols after droping nan's: 55


In [15]:
df = temp

In [17]:
df.corr()['SalePrice'].sort_values()

PID               -0.255052
Enclosed Porch    -0.135656
Kitchen AbvGr     -0.125444
Overall Cond      -0.097019
MS SubClass       -0.087335
Id                -0.051398
Low Qual Fin SF   -0.041594
Yr Sold           -0.015203
Misc Val          -0.007375
Pool Area          0.023106
Mo Sold            0.032735
3Ssn Porch         0.048732
Screen Porch       0.134581
Bedroom AbvGr      0.137067
2nd Flr SF         0.248452
Half Bath          0.283001
Lot Area           0.296566
Wood Deck SF       0.326490
Open Porch SF      0.333476
Fireplaces         0.471093
TotRms AbvGrd      0.504014
Full Bath          0.537969
Year Remod/Add     0.550370
Year Built         0.571849
1st Flr SF         0.618486
Gr Liv Area        0.697038
Overall Qual       0.800207
SalePrice          1.000000
Name: SalePrice, dtype: float64

In [45]:
# ** Note ** : list contains predicted label of salePrice  
features_high_corr = ['Fireplaces', 'TotRms AbvGrd', 'Full Bath', 'Year Remod/Add', 
                      'Year Built', '1st Flr SF', 'Gr Liv Area', 'Overall Qual', 'SalePrice']

In [46]:
df_first_iter = df[features_high_corr]

In [47]:
df[features_high_corr].dtypes

Fireplaces        int64
TotRms AbvGrd     int64
Full Bath         int64
Year Remod/Add    int64
Year Built        int64
1st Flr SF        int64
Gr Liv Area       int64
Overall Qual      int64
SalePrice         int64
dtype: object

In [48]:
set(df['Full Bath'])

{0, 1, 2, 3, 4}

In [49]:
df_first_iter = pd.get_dummies(df_first_iter, columns=['Full Bath'], prefix='FullBath')

In [50]:
set(df['Overall Qual'])

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [51]:
df_first_iter = pd.get_dummies(df_first_iter, columns=['Overall Qual'], prefix='OverallQual')

In [52]:
set(df['Fireplaces'])

{0, 1, 2, 3, 4}

In [53]:
df_first_iter = pd.get_dummies(df_first_iter, columns=['Fireplaces'])

In [54]:
set(df['TotRms AbvGrd'])

{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

In [55]:
df_first_iter = pd.get_dummies(df_first_iter, columns=['TotRms AbvGrd'], prefix='TotRms_AbvGrd')

### Model Prep

In [57]:
final_features = list(df_first_iter.columns)
final_features.remove('SalePrice')

X = df_first_iter[final_features]
y = df_first_iter['SalePrice']

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)

In [59]:
lr = LinearRegression()

In [61]:
lr_scores = cross_val_score(lr, X_train, y_train, cv=3)
lr_scores.mean()

0.863003312624074