# Housing Price Prediction - Regression

## Author: Era Ebhodaghe

In [5]:
#importing Libraries

import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Part 1: PCA and Variance Threshold in a Linear Regression

#### Step 1: Load Dataset

In [8]:
#loading dataset
# The data directory can be overridden with the HOUSING_DATA_DIR environment variable,
# so the notebook is not tied to a single machine. The default is the original location,
# which keeps existing runs working unchanged.
DATA_DIR = Path(os.environ.get(
    'HOUSING_DATA_DIR',
    '/Users/valuedcustomer/Downloads/house-prices-advanced-regression-techniques',
))
housing_df = pd.read_csv(DATA_DIR / 'train.csv')
housing_df.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


#### Step 2: Drop ID Column

In [10]:
# Remove the 'Id' column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: housing_df is reassigned here, so a second
# execution would otherwise raise KeyError because 'Id' has already been dropped.
housing_df = housing_df.drop(columns='Id', errors='ignore')
housing_df.head(2)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500


#### Step 3: Fill in missing data with the median value for numerical columns

In [12]:
# Impute numeric columns: every missing value is replaced by that column's median.
# NOTE(review): the medians are computed on the full dataset, before the train/test
# split performed later, so held-out rows influence the fill values.
numerical_cols = housing_df.select_dtypes(include=np.number).columns
numeric_medians = housing_df[numerical_cols].median()
housing_df[numerical_cols] = housing_df[numerical_cols].fillna(value=numeric_medians)

#### Step 4: Fill in missing data with the mode value for categorical columns

In [14]:
# Impute categorical (object-dtype) columns with each column's most frequent value.
categorical_cols = housing_df.select_dtypes(include=object).columns
# mode() may return several rows when values tie; only the first row is used.
most_frequent = housing_df[categorical_cols].mode().iloc[0]
housing_df[categorical_cols] = housing_df[categorical_cols].fillna(value=most_frequent)

#### Step 5: Convert categorical columns to dummy variables

In [16]:
# One-hot encode: each categorical column is replaced by indicator columns
# (one per category); numeric columns are carried over unchanged.
housing_df = pd.get_dummies(data=housing_df)
housing_df.head(3)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,False,False,False,True,False,False,False,False,True,False
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,False,False,False,True,False,False,False,False,True,False
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,False,False,False,True,False,False,False,False,True,False


#### Step 6: Split data into training and test sets

In [18]:
# Separate the label from the predictors, then hold out 20% of the rows for testing.
target = 'SalePrice'
features = housing_df.drop(columns=target)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    housing_df[target],
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

#### Step 7: Run a linear regression and evaluate on the test set

In [20]:
# Baseline: linear regression on all encoded features, scored on the held-out test set.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target

print("Linear Regression Results:")
for label, value in (("R^2 Score:", r2), ("RMSE:", rmse)):
    print(label, value)

Linear Regression Results:
R^2 Score: 0.886098953863544
RMSE: 29557.70930242208


#### Step 8 and 9: Fit and transform training features with PCA (retain 90% variance)

In [22]:
# Fit PCA on the training features only. A float n_components asks scikit-learn to keep
# as many components as are needed to explain that fraction of the variance.
# NOTE(review): X_train is not scaled at this point, so the explained variance is
# presumably dominated by the largest-magnitude columns (the recorded output keeps a
# single component) - consider scaling before PCA and confirm against the assignment.
pca = PCA(n_components=0.90)
X_train_pca = pca.fit_transform(X_train)
num_features_pca = X_train_pca.shape[-1]  # columns = retained components
print("Number of features in PCA-transformed matrix:", num_features_pca)

Number of features in PCA-transformed matrix: 1


#### Step 10: Transform test features with the same PCA

In [24]:
# Project the test features with the PCA that was fitted on the training data
# (transform only, no refit), so both sets are expressed in the same components.
X_test_pca = pca.transform(X_test)

#### Step 11: Repeat step 7 with your PCA-transformed data

In [26]:
# Same model and metrics as the baseline, but trained on the PCA-reduced features.
lr_pca = LinearRegression().fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)

r2_pca = r2_score(y_test, y_pred_pca)
mse_pca = mean_squared_error(y_test, y_pred_pca)
rmse_pca = np.sqrt(mse_pca)

print("\nLinear Regression with PCA Results:")
for label, value in (("R^2 Score:", r2_pca), ("RMSE:", rmse_pca)):
    print(label, value)


Linear Regression with PCA Results:
R^2 Score: 0.06348978229901636
RMSE: 84754.58020738687


#### Step 12: Apply MinMaxScaler to original training features

In [28]:
# Learn each feature's min and max from the training data, then rescale the
# training features with those statistics.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

#### Step 13: Find min-max scaled features with variance above 0.1

In [30]:
# Keep only the scaled training columns whose variance exceeds the threshold.
var_threshold = 0.1
train_feature_variances = X_train_scaled.var(axis=0)  # one variance per column
high_var_mask = train_feature_variances > var_threshold
high_var_features = X_train_scaled[:, high_var_mask]

#### Step 14 and 15: Transform test features with the same steps as in steps 12 and 13, then repeat step 7

In [32]:
# Step 14: scale the test features with the scaler fitted on the training data, then
# keep the columns whose *training* variance passed the threshold, so train and test
# matrices contain the same features.
X_test_scaled = scaler.transform(X_test)
selected_columns = np.var(X_train_scaled, axis=0) > var_threshold
X_test_high_var = X_test_scaled[:, selected_columns]

# Step 15: Repeat step 7 with high variance data
lr_high_var = LinearRegression().fit(high_var_features, y_train)
y_pred_high_var = lr_high_var.predict(X_test_high_var)

r2_high_var = r2_score(y_test, y_pred_high_var)
mse_high_var = mean_squared_error(y_test, y_pred_high_var)
rmse_high_var = np.sqrt(mse_high_var)

print("\nLinear Regression with High Variance Data Results:")
for label, value in (("R^2 Score:", r2_high_var), ("RMSE:", rmse_high_var)):
    print(label, value)



Linear Regression with High Variance Data Results:
R^2 Score: 0.6681096563409217
RMSE: 50455.00174294488


#### Step 16: Summarize findings

In [34]:
# Print the R^2 and RMSE of the three models one after another for comparison.
print("\nSummary of Findings:")
summary_rows = [
    ("Linear Regression Results:", r2, rmse),
    ("\nLinear Regression with PCA Results:", r2_pca, rmse_pca),
    ("\nLinear Regression with High Variance Data Results:", r2_high_var, rmse_high_var),
]
for heading, r2_value, rmse_value in summary_rows:
    print(heading)
    print("R^2 Score:", r2_value)
    print("RMSE:", rmse_value)


Summary of Findings:
Linear Regression Results:
R^2 Score: 0.886098953863544
RMSE: 29557.70930242208

Linear Regression with PCA Results:
R^2 Score: 0.06348978229901636
RMSE: 84754.58020738687

Linear Regression with High Variance Data Results:
R^2 Score: 0.6681096563409217
RMSE: 50455.00174294488
