# MP3

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures

from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing as prep
from sklearn.decomposition import PCA

import pickle

In [None]:
# Load the house data CSV into a DataFrame; the literal string 'NA' is added to the missing-value markers.
df = pd.read_csv(
    './data/house-data.csv',
    index_col=None,
    na_values=['NA'],
)
df.shape

In [None]:
# Preview the first five rows to sanity-check the load.
df.head()

## Cleaning the dataframe

In [None]:
# All column labels as a plain Python list.
df.columns.tolist()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Display floats with three fixed decimals instead of scientific notation (e.g. e+04).
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Three randomly chosen rows; no random_state is set, so the selection differs between runs.
df.sample(3)

### Drop unnecessary columns

In [None]:
# Discard the columns that are left out of the analysis.
df = df.drop(
    columns=[
        'id', 'date', 'zipcode', 'yr_renovated',
        'sqft_living15', 'sqft_lot15', 'sqft_basement',
    ],
)

In [None]:
# Display the frame after the column drop.
df

### Turn weirdly continuous values into discrete values

In [None]:
# Round the bathroom and floor counts down to whole numbers and store them as integers.
df['bathrooms'] = np.floor(df['bathrooms']).astype(int)
df['floors'] = np.floor(df['floors']).astype(int)
df

### Dealing with outliers

In [None]:
# Box plot of every column to spot outliers.
df.plot(kind='box')

In [None]:
# Box plot of the price column alone.
df['price'].plot(kind='box')

In [None]:
# Kernel density estimate of the price distribution.
df['price'].plot(kind='density')

In [None]:
# Price outlier filter: keep rows strictly inside mean ± factor * std.
factor = 1.2
price_center = df['price'].mean()
price_band = df['price'].std() * factor
upper_lim = price_center + price_band
lower_lim = price_center - price_band

# Rows strictly inside the limits are kept; rows strictly outside are shown for inspection.
no_outliers = df[(df['price'] > lower_lim) & (df['price'] < upper_lim)]
outliers = df[(df['price'] < lower_lim) | (df['price'] > upper_lim)]

df = no_outliers
outliers

In [None]:
# Display the frame after the price filter.
df

In [None]:
# Price density again, now on the filtered rows.
df['price'].plot(kind='density')

In [None]:
# Box plot of the lot size column.
df['sqft_lot'].plot(kind='box')

In [None]:
# Lot-size outlier filter: keep rows strictly inside mean ± factor * std, with a tighter band.
factor = 0.5
lot_center = df['sqft_lot'].mean()
lot_band = df['sqft_lot'].std() * factor
upper_lim = lot_center + lot_band
lower_lim = lot_center - lot_band

no_outliers = df[(df['sqft_lot'] > lower_lim) & (df['sqft_lot'] < upper_lim)]
no_outliers['sqft_lot'].plot(kind='box')

In [None]:
# Adopt the filtered rows as the working frame. reset_index() (without drop=True)
# keeps the old row labels as a new 'index' column.
df = no_outliers.reset_index()
df

In [None]:
# Box plot of every column after both outlier filters.
df.plot(kind='box')

## Data exploration

### Check the distribution

In [None]:
# Drop the coordinate columns and the 'index' column, then draw one histogram per remaining column.
df = df.drop(columns=['lat', 'long', 'index'])
df.hist()

It seems that a lot of the features follow a roughly normal distribution.

Let's see if we can find some correlations.

In [None]:
# Pairwise correlation between all remaining columns (DataFrame.corr, Pearson by default).
corr_matrix = df.corr()
corr_matrix

In [None]:
# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, square=True, fmt=".2f", ax=ax)
plt.show()

When looking at price, we can see that the highest correlation is with grade at 0.59 and the smallest is with sqft_lot at only 0.02, which means there is almost no correlation between the two.

We can also see that the price correlates highly with the square footage of living space, the grade of the house, and to a lesser extent the number of bathrooms in the house.

In [None]:
# Correlation of every column with price.
corr_matrix['price']

Least informative attributes, with potential for removal if the model is inaccurate:

* sqft_lot (0.017)
* yr_built (0.032)
* condition (0.036)
* waterfront (0.048)

### Training a linear regression model

In [None]:
# Single-feature setup: grade as the only predictor, price as the target,
# both reshaped into (n_samples, 1) column arrays.
X = df['grade'].to_numpy().reshape(-1, 1)
y = df['price'].to_numpy().reshape(-1, 1)

In [None]:
# Scatter plot of price against grade.
plt.scatter(X, y, color='blue')
plt.xlabel('grade')
plt.ylabel('price')
plt.show()

In [None]:
# A fixed random_state (123) yields the same train/test split on every run,
# which keeps results consistent and makes debugging and testing easier.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [None]:
# Shapes of the four subsets, in the order X_train, y_train, X_test, y_test.
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

In [None]:
# Create an (as yet unfitted) linear regression estimator.
myreg = LinearRegression()

In [None]:
# Fit on the training subset only, then display the estimator.
myreg.fit(X_train, y_train)
myreg

In [None]:
# Slope (coef_) and intercept of the fitted line price = a * grade + b.
a, b = myreg.coef_, myreg.intercept_

print(f'coef: {a}')
print(f'intercept: {b}')

In [None]:
# Predict prices for the held-out test inputs.
y_predicted = myreg.predict(X_test)
y_predicted

In [None]:
# Actual test prices, for comparison with the predictions.
y_test

In [None]:
# Visualise the linear regression over the raw data.
plt.scatter(X, y, color='green')                   # all observations
plt.plot(X_train, a * X_train + b, color='blue')   # fitted line evaluated on the training inputs
plt.plot(X_test, y_predicted, color='orange')      # model predictions for the test inputs
plt.title('Linear Regression')
plt.xlabel('grade')
plt.ylabel('price')
plt.show()

In [None]:
# R² on the held-out test split only. Scoring on the full (X, y) would include the
# rows the model was trained on, and would not be comparable with the test-set
# MAE/MSE/RMSE computed for this same model.
R2 = myreg.score(X_test, y_test)
R2

In [None]:
# Test-set error metrics: MAE, MSE and RMSE (the square root of the MSE).
mae = metrics.mean_absolute_error(y_test, y_predicted)
mse = metrics.mean_squared_error(y_test, y_predicted)
rmse = np.sqrt(mse)

for error_value in (mae, mse, rmse):
    print(error_value)

In [None]:
import sys
# Add the local ./util directory (resolved against the current working directory)
# to the import path so that the project module testForReg can be found.
sys.path.append('./util')

from testForReg import bestLinReg


In [None]:
# Linear regression between 'x' (one column at a time) and 'y', the price.
# NOTE(review): bestLinReg is a project helper not shown here; the next cell reads an
# 'R2' column from its result — confirm the rest of its output against ./util/testForReg.
df_linReg = bestLinReg(df, 'price')
df_linReg

In [None]:
# Print every R2 entry on its own line with plain print() rather than the DataFrame display.
for label, r2_value in df_linReg['R2'].items():
    print(f"Index: {label}, Value: {r2_value}")

It seems that the data is not well suited to simple linear regression: when each column (x) is tried one at a time against price (y),
the highest R² is 0.3529 (35.29%), between grade and price.

### Training a multiple feature linear regression model

In [None]:
# Use every column except the target as a predictor. Selecting by name rather than
# by position (df.columns[1:]) keeps 'price' out of the features even if the column
# order changes; the result is still a pandas Index.
x_params = df.columns.drop('price')

In [None]:
# Feature matrix: the columns listed in x_params.
X = df[x_params]
X

In [None]:
# Target vector: the price column.
y = df['price']
y

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.25
)

In [None]:
# (rows, features) of the training split.
print(X_train.shape)

In [None]:
# Fit a multiple linear regression on the training split.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
# Pair each feature name with its fitted coefficient.
list(zip(x_params, linreg.coef_))

### Testing model

In [None]:
# Actual prices of the test split.
y_test

In [None]:
# Predicted prices for the test split.
y_predicted = linreg.predict(X_test)

In [None]:
# Display the predictions.
y_predicted

RMSE test

In [None]:
# Root mean squared error (square root of the MSE) on the test split.
print(np.sqrt(metrics.mean_squared_error(y_test, y_predicted)))

R-squared test

In [None]:
# Explained variance score on the test split, rounded to six decimals.
eV = round(metrics.explained_variance_score(y_test, y_predicted), 6)
print('Explained variance score', eV)

In [None]:
# Coefficient of determination (R²) on the test split.
metrics.r2_score(y_test, y_predicted)

In [None]:
# Predicted vs. actual prices on the test split. The axes are labelled so the
# figure shows which axis is the actual price and which is the prediction.
plt.title('Linear regression model')
plt.scatter(y_test, y_predicted, color='green')
plt.xlabel('Actual price')
plt.ylabel('Predicted price')
plt.show()

In [None]:
# Keep a reference to the all-features model: `linreg` is rebound by the later
# experiments, and `model_store` is what gets pickled at the end of the notebook.
model_store = linreg

### Let's try to make the model better
By focusing on the features that have the highest correlation with price

In [None]:
# Regression pair plots of price against each of the three chosen features.
x_params = ['sqft_living', 'sqft_above', 'grade']
sns.pairplot(
    df,
    x_vars=x_params,
    y_vars='price',
    height=7,
    kind='reg',
)

In [None]:
# Feature matrix restricted to the three selected columns.
X = df[x_params]
X

In [None]:
# Target vector: the price column.
y = df['price']
y

In [None]:
# Same split settings (25% test, random_state=1) as for the all-features model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
# Rebind `linreg` to a fresh model trained on the reduced feature set.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
# Predicted prices for the test split.
y_predicted = linreg.predict(X_test)

In [None]:
# Explained variance score on the test split, rounded to six decimals.
eV = round(metrics.explained_variance_score(y_test, y_predicted), 6)
print('Explained variance score', eV)

In [None]:
# Predicted vs. actual prices on the test split. The axes are labelled so the
# figure shows which axis is the actual price and which is the prediction.
plt.title('Linear regression model with reduced features')
plt.scatter(y_test, y_predicted, color='green')
plt.xlabel('Actual price')
plt.ylabel('Predicted price')
plt.show()

Hmm, that did not seem to help the accuracy of the model. It has been reduced by around 13% since the reduction of features.
We'll try removing only the features which have very low correlation with price.

In [None]:
# Seven-feature selection used for this experiment.
x_params = ['sqft_living', 'sqft_above', 'grade', 'bedrooms', 'bathrooms', 'floors', 'view']
X = df[x_params]
X

In [None]:
# Same split settings (25% test, random_state=1) as for the previous linear models.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
# Rebind `linreg` to a fresh model trained on the seven-feature set.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
# Predicted prices for the test split.
y_predicted = linreg.predict(X_test)

In [None]:
# Explained variance score on the test split, rounded to six decimals.
eV = round(metrics.explained_variance_score(y_test, y_predicted), 6)
print('Explained variance score', eV)

In [None]:
# Predicted vs. actual prices on the test split. The axes are labelled so the
# figure shows which axis is the actual price and which is the prediction.
plt.title('Linear regression model with least correlating features removed')
plt.scatter(y_test, y_predicted, color='green')
plt.xlabel('Actual price')
plt.ylabel('Predicted price')
plt.show()

Looks like a linear regression model works best with all features!

### Polynomial model

In [None]:
# Feature array built from the current `x_params` selection (not from all columns),
# converted to a NumPy array.
X = df[x_params].values
X

In [None]:
# Target as a NumPy array.
y = df['price'].values
y

In [None]:
# NOTE(review): this split uses test_size=0.2 / random_state=42, whereas the linear
# models above used test_size=0.25 / random_state=1, so the scores are not computed
# on identical test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Expand the features into polynomial terms up to degree 3. The expansion is
# fitted on the training split only and then applied unchanged to the test split.
poly_model = PolynomialFeatures(degree=3)
X_poly_train = poly_model.fit_transform(X_train)
X_poly_test = poly_model.transform(X_test)
# Ordinary linear regression on the expanded features.
pol_reg = LinearRegression()
pol_reg.fit(X_poly_train, y_train)

In [None]:
# Predicted prices for the expanded test features.
y_predict = pol_reg.predict(X_poly_test)

In [None]:
# Display the polynomial model's predictions.
y_predict

In [None]:
# Explained variance score of the polynomial model on the test split.
eV = round(metrics.explained_variance_score(y_test, y_predict), 6)
print('Explained variance score', eV)

In [None]:
# Overlay actual and predicted test prices, plotted against their position in the test arrays.
plt.figure(figsize=(10, 6))
plt.plot(y_test, color='green', label='Actual (Testing)')
plt.plot(y_predict, color='red', linestyle='--', label='Predicted (Testing)')
plt.title('Actual vs Predicted (Testing)')
plt.xlabel('Index')
plt.ylabel('Values')
plt.legend()
plt.show()

This is the closest any model has come to overtaking the full-feature linear regression, but no luck!

## Further improvement: PCA

Now we have built and selected our model, which can predict house prices based on the input data, but can we make a more accurate model by reducing the dimensionality of the data?
This will introduce a whole new set of problems, but let's see if we can make a more accurate prediction model.

In [None]:
# Start again from the current `x_params` feature selection.
X = df[x_params]
X

In [None]:
# Convert the DataFrame to a NumPy array.
X = X.values
X

In [None]:
# Scatter the first two feature columns of X as hollow black circles.
plt.figure()
plt.title('Input data')

first_col, second_col = X[:, 0], X[:, 1]

# Axis limits: the observed range of each column, padded by 1 on both sides.
x_min, x_max = first_col.min() - 1, first_col.max() + 1
y_min, y_max = second_col.min() - 1, second_col.max() + 1
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

plt.scatter(first_col, second_col, color='black', s=80, marker='o', facecolors='none')

plt.show()

In [None]:
# Normalize each row with the L1 norm (for inspection only).
# NOTE(review): despite the name `nl2`, this is the L1 norm, and the result is not
# used by any later cell shown in this notebook.
nl2 = prep.normalize(X, norm='l1')
nl2

In [None]:
# Use min-max scaling, since a lot of the features do not follow a Gaussian distribution.
# NOTE(review): the scaler is fitted on all rows, before the train/test split further down.
mmsc = prep.MinMaxScaler()
X = mmsc.fit_transform(X)  
X

In [None]:
# Project X onto `comp_amount` principal components.
comp_amount = 4
pca = PCA(n_components=comp_amount)
pca_X = pca.fit_transform(X)
pca_X

In [None]:
# Column labels "pc 1" ... "pc N", one per principal component.
cols = [f'pc {i + 1}' for i in range(comp_amount)]

# Wrap the PCA output in a labelled DataFrame. The transformed array is `pca_X`;
# the previously referenced name `pca_data` is never defined and raised a NameError.
pcadf = pd.DataFrame(data=pca_X, columns=cols)
pcadf

In [None]:
# Share of the total variance explained by each principal component.
explained_variance = pca.explained_variance_ratio_  
explained_variance

In [None]:
# Plot the explained variance ratio of each principal component.
# The colour is given once via `color=`: combining a colour code in the format
# string ('bx-') with c='red' makes matplotlib warn about a redundant definition.
plt.plot(explained_variance, 'x-', color='red')
plt.xlabel('component')
plt.ylabel('variance')
plt.title('The optimal number of components')
plt.show()


In [None]:
# Plot the cumulative explained variance over the components.
cumulative = np.cumsum(explained_variance)
# The colour is given once via `color=`: combining a colour code in the format
# string ('b*-') with c='green' makes matplotlib warn about a redundant definition.
plt.plot(cumulative, '*-', color='green')
plt.xlabel('components')
plt.ylabel('cumulative')
plt.title('The optimal number of components')
plt.show()

In [None]:
# Number of features before the PCA reduction.
len(x_params)

So it seems that we can cover around 97% of the dataset's variance with 4 principal components instead of the original 7 features. Let's see how the best-predicting model type so far handles this.

In [None]:
# Split the PCA-transformed features; `y` is still the price array defined for the polynomial model.
X_train, X_test, y_train, y_test = train_test_split(pca_X, y, test_size=0.2, random_state=42)

In [None]:
# Linear regression on the principal components.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Evaluate the PCA-based model on the test split.
# metrics.r2_score is the coefficient of determination (R²), not the explained
# variance score, so the printed label now says so.
y_pred = model.predict(X_test)
r2 = metrics.r2_score(y_test, y_pred)
print('R2 score: ', r2)

In [None]:
# Mean squared error of the PCA-based model on the test split.
mse = metrics.mean_squared_error(y_test, y_pred)
print(mse)


This is not good at all! The model actually became one of the worst performing!

## Conclusion

**R² Scoreboard ranked**

- Linear regression model with all slightly relevant features:
0.55

- Linear regression model with 3 degree polynomial features:
0.47

- Linear regression model with 4 lowest correlating features removed:
0.43

- Linear regression model with 3 highest correlating features:
0.42

- Linear regression model with reduced dimensionality by PCA:
0.39

- Linear regression model with highest correlation feature (grade):
0.35


It seems that the regression model that is best able to predict prices is the multiple parameter linear regression model with an r² score of 0.55. It's something, but not that accurate!

## Save the best fitted model for future use:

In [None]:
# Path of the pickle file, relative to the current working directory.
model_store_location = './deploy/regmodel.pkl'

In [None]:
# Serialise the model kept in `model_store` to disk.
# NOTE(review): open() does not create the target directory — it must already exist.
with open(model_store_location, 'wb') as f:
    pickle.dump(model_store, f)

In [None]:
# Load the model back from the file to verify the round trip.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open(model_store_location, 'rb') as f:
    mymodel = pickle.load(f)

### Test stored model

In [None]:
# One hand-written example row to sanity-check the reloaded model.
x_new = pd.DataFrame({
    'bedrooms': [3],
    'bathrooms': [2],
    'sqft_living': [1500],
    'sqft_lot': [6750],
    'floors': [2],
    'waterfront': [0],
    'view': [0],
    'condition': [3],
    'grade': [7],
    'sqft_above': [1500],
    'yr_built': [2007],
})
price = mymodel.predict(x_new)
print(price)