In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error,r2_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the dataset from the CSV file and preview its first five rows.
datafile = "IowaCleanData.csv"
df = pd.read_csv(filepath_or_buffer=datafile)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,Bsmt Unf SF,Total Bsmt SF,...,Full Bath,Half Bath,TotRms AbvGrd,Fireplaces,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,SalePrice,priceLbl
0,0,141.0,31770,6,1960,1960,112.0,639.0,441.0,1080.0,...,1,0,7,2,2.0,528.0,210,62,215000,"200,000 - 299,999"
1,1,80.0,11622,5,1961,1961,0.0,468.0,270.0,882.0,...,1,0,5,0,1.0,730.0,140,0,105000,"100,000 - 199,999"
2,2,81.0,14267,6,1958,1958,108.0,923.0,406.0,1329.0,...,1,1,6,0,1.0,312.0,393,36,172000,"100,000 - 199,999"
3,3,93.0,11160,7,1968,1968,0.0,1065.0,1045.0,2110.0,...,2,1,8,2,2.0,522.0,0,0,244000,"200,000 - 299,999"
4,4,74.0,13830,5,1997,1998,0.0,791.0,137.0,928.0,...,2,1,6,1,2.0,482.0,212,34,189900,"100,000 - 199,999"


In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Year Built',
       'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'Bsmt Unf SF',
       'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Full Bath',
       'Half Bath', 'TotRms AbvGrd', 'Fireplaces', 'Garage Cars',
       'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'SalePrice',
       'priceLbl'],
      dtype='object')

In [4]:
# Fill missing values with the sentinel -1 in EVERY column (numeric ones too),
# then label-encode each text column into integer codes, in place on `df`.
#
# NOTE(review): LabelEncoder numbers its classes in sorted string order, so
# price-bracket labels receive codes by lexicographic rank -- confirm that this
# matches the intended price ordering before treating the codes as ordinal.
for c in df.columns:
    col = df[c]
    if isinstance(col.dtype, pd.StringDtype):
        # pandas' dedicated string dtype (the default for text columns in
        # pandas 3) rejects the integer sentinel in fillna and does not compare
        # equal to 'object'; route it through object dtype so it is filled and
        # encoded exactly like a classic object column.
        col = col.astype(object)
    col = col.fillna(-1)
    if col.dtype == 'object':
        le = preprocessing.LabelEncoder()
        # Stringify first so the -1 sentinel becomes its own category "-1".
        col = le.fit_transform(col.astype('str'))
    df[c] = col

In [5]:
# Choose the predictor columns (X) and the target (y); the actual train/test
# split happens in a later step.
feature_cols = [
    'Lot Frontage', 'Lot Area', 'Overall Qual', 'Year Built',
    'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'Bsmt Unf SF',
    'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area',
    'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Fireplaces',
    'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF',
]
X = df[feature_cols]

# NOTE(review): the target is the price-bracket column 'priceLbl', not the raw
# 'SalePrice' column -- confirm that regressing on bracket codes is intended.
# reshape(-1, 1) turns the target into a single-column 2-D array.
y = df['priceLbl'].values.reshape(-1, 1)


In [6]:
# Hold out a quarter of the rows for testing (the library default, spelled
# out here); the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2197, 20), (733, 20), (2197, 1), (733, 1))

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so that nothing from the test
# rows influences the scaling parameters.
X_scaler = StandardScaler()
X_scaler.fit(X_train);  # trailing ';' keeps the cell output empty


In [8]:
# Apply the scaling learned from the training rows to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)


In [9]:
# Multiple Linear Regression model (unpenalized baseline on the scaled features)

from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)

# Evaluate on the held-out rows.
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.21500847242404858, R2: 0.7326876917339766


In [10]:
# LASSO model (linear regression with an L1 penalty of strength alpha)

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=.01)
lasso.fit(X_train_scaled, y_train)

predictions = lasso.predict(X_test_scaled)

# Evaluate on the held-out rows; .score() reports R^2 for regressors.
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = lasso.score(X=X_test_scaled, y=y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.2131765583078092, R2: 0.7349652447319408


In [11]:
# Ridge model (linear regression with an L2 penalty of strength alpha)

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=.01)
ridge.fit(X_train_scaled, y_train)

predictions = ridge.predict(X_test_scaled)

# Evaluate on the held-out rows; .score() reports R^2 for regressors.
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = ridge.score(X=X_test_scaled, y=y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.21500801301295203, R2: 0.7326882629032859


In [12]:
# ElasticNet model (linear regression with a blended L1/L2 penalty)

from sklearn.linear_model import ElasticNet

# l1_ratio=0.5 is the library default, written out so the L1/L2 mix is visible.
elasticnet = ElasticNet(alpha=.01, l1_ratio=0.5)
elasticnet.fit(X_train_scaled, y_train)

predictions = elasticnet.predict(X_test_scaled)

# Evaluate on the held-out rows; .score() reports R^2 for regressors.
MSE = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = elasticnet.score(X=X_test_scaled, y=y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.21268447441450591, R2: 0.7355770349553512
