# Importing libraries, loading Dataset, and defining x/y.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

raw_df = pd.read_excel("./Real estate valuation data set.xlsx")
del raw_df['No']
del raw_df['X1 transaction date']

X=raw_df.iloc[:,:-1].values
y=raw_df.iloc[:,-1].values

# Splitting the dataset into Training and Test sets.

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

#Creating a Standardized dataset and a Normalized dataset.

We will compare both of these against each other and the raw dataset as a baseline.

Standardized dataset:

In [4]:
from sklearn.preprocessing import StandardScaler
std_sc=StandardScaler()

std_X_train = std_sc.fit_transform(X_train)
std_X_test = std_sc.transform(X_test)

Normalized dataset:

In [5]:
from sklearn.preprocessing import Normalizer
norm = Normalizer()

norm_X_train = norm.fit_transform(X_train)
norm_X_test = norm.transform(X_test)

# Performing PCA.

In [10]:
from sklearn.decomposition import PCA

#Raw data
raw_pca = PCA(n_components=2, random_state=0)
raw_X_train = raw_pca.fit_transform(X_train)
raw_X_test = raw_pca.transform(X_test)
print('Variance with 2 columns:', sum(raw_pca.explained_variance_ratio_ * 100))

#Standardized data
std_pca = PCA(n_components=2)
std_X_train = std_pca.fit_transform(std_X_train)
std_X_test = std_pca.transform(std_X_test)
print('Variance with 2 columns and Standard Scaler:', sum(std_pca.explained_variance_ratio_ * 100))

#Normalized data
norm_pca = PCA(n_components=2)
norm_X_train = norm_pca.fit_transform(norm_X_train)
norm_X_test = norm_pca.transform(norm_X_test)
print('Variance with 2 columns and Normalized:', sum(norm_pca.explained_variance_ratio_ * 100))

Variance with 2 columns: 99.99966533440927
Variance with 2 columns and Standard Scaler: 100.0
Variance with 2 columns and Normalized: 100.00000000000001


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
  
print('Order:')
print('MAE: Lower Better')
print('MAE: Lower Better')
print('R2: Higher Better')
print()
print('Only 2 columns, raw data')
pca_regression = LinearRegression()
pca_regression.fit(raw_X_train, y_train)
y_pred_pca = pca_regression.predict(raw_X_test)
print(mean_absolute_error(y_test, y_pred_pca))
print(mean_squared_error(y_test, y_pred_pca))
print(r2_score(y_test, y_pred_pca))

print('\nOnly 2 columns, standard scaler')
pca_regression = LinearRegression()
pca_regression.fit(std_X_train, y_train)
y_pred_pca = pca_regression.predict(std_X_test)
print(mean_absolute_error(y_test, y_pred_pca))
print(mean_squared_error(y_test, y_pred_pca))
print(r2_score(y_test, y_pred_pca))

print('\nOnly 2 columns, normalized')
pca_regression = LinearRegression()
pca_regression.fit(norm_X_train, y_train)
y_pred_pca = pca_regression.predict(norm_X_test)
print(mean_absolute_error(y_test, y_pred_pca))
print(mean_squared_error(y_test, y_pred_pca))
print(r2_score(y_test, y_pred_pca))

print()
print('\nAll 5 columns')
full_regressor = LinearRegression()
full_regressor.fit(X_train, y_train)
y_pred = full_regressor.predict(X_test)
print(mean_absolute_error(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

Order:
MAE: Lower Better
MAE: Lower Better
R2: Higher Better

Only 2 columns, raw data
6.682875169367948
78.63145466226699
0.54731202359297

Only 2 columns, standard scaler
6.351839427574513
69.49283092992846
0.5999238581610362

Only 2 columns, normalized
6.690121730778055
71.57281297524209
0.5879492245096531


All 5 columns
5.7459274754626275
62.172235622415606
0.6420691483643861
