## Data modelling

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
#load data
df = pd.read_csv('../data/cleaned/data_without_outlier.csv')

df

Unnamed: 0,bedrooms,bathrooms,sqft_above,floors,sqft_lot15,sqft_living15,sqft_basement,waterfront,view,condition,grade,yr_built_cat,zip,price
0,3.0,1.00,3.712993,1.0,552.975638,5.283450,2.753298,0,0,3,7,1,1,221900
1,3.0,2.25,3.845177,2.0,680.808061,5.402655,2.602060,0,0,3,7,1,1,538000
2,2.0,1.00,3.609968,1.0,706.577328,5.639440,2.753298,0,0,3,6,1,0,180000
3,4.0,3.00,3.685715,1.0,508.270898,5.291135,2.959041,0,0,5,7,1,1,604000
4,3.0,2.00,3.791662,1.0,672.429043,5.434618,2.753298,0,0,3,8,1,0,510000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18010,3.0,2.50,3.771390,3.0,222.173922,5.351869,2.753298,0,0,3,9,1,0,735000
18011,4.0,2.50,3.857826,2.0,653.588512,5.442967,2.753298,0,1,3,8,1,1,536000
18012,2.0,0.75,3.678839,2.0,270.621885,5.140055,2.753298,0,0,3,7,1,1,575000
18013,3.0,2.50,3.781136,2.0,198.999077,5.309821,2.753298,0,0,3,6,1,1,500000


In [15]:
#define data
X = df.drop(['price'], axis=1)
y = df['price']

In [16]:
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_above,floors,sqft_lot15,sqft_living15,sqft_basement,waterfront,view,condition,grade,yr_built_cat,zip
0,3.0,1.0,3.712993,1.0,552.975638,5.28345,2.753298,0,0,3,7,1,1
1,3.0,2.25,3.845177,2.0,680.808061,5.402655,2.60206,0,0,3,7,1,1
2,2.0,1.0,3.609968,1.0,706.577328,5.63944,2.753298,0,0,3,6,1,0
3,4.0,3.0,3.685715,1.0,508.270898,5.291135,2.959041,0,0,5,7,1,1
4,3.0,2.0,3.791662,1.0,672.429043,5.434618,2.753298,0,0,3,8,1,0


Now we are going to split the data into train and test set using sklearn, with test_size =0.2, and a random_state=42

In [17]:
#split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    random_state=42)

After our split,we will first convert to a datafram, ensure to minmax scale to reduce range of our training and test set before we fit our model

In [18]:
#convert train and test data to dataframe
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

In [19]:
#now to scale the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(X_train_df)


In [20]:
#save the scaler
import pickle
path="../scalers/"
scaler_file_name= "min_max_scaler_without_outliers.pkl"

with open(path + scaler_file_name, 'wb') as file:
    pickle.dump(scaler, file)

In [21]:
#transform the data
X_train_scaled = scaler.transform(X_train_df)
X_test_scaled = scaler.transform(X_test_df)

#convert the scaled data to dataframe
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

In [22]:
X_train_scaled_df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_above,floors,sqft_lot15,sqft_living15,sqft_basement,waterfront,view,condition,grade,yr_built_cat,zip
0,0.333333,0.363636,0.526659,0.0,0.489276,0.628544,0.816959,0.0,0.5,0.5,0.4,0.0,1.0
1,0.333333,0.636364,0.904869,0.4,0.606296,0.85922,0.816959,0.0,0.0,0.75,0.4,0.0,0.0
2,0.666667,0.636364,0.880334,0.4,0.419217,0.89581,0.816959,0.0,0.0,0.75,0.3,0.0,0.0
3,0.333333,0.636364,0.698696,0.4,0.641971,0.55732,0.816959,0.0,0.0,0.5,0.4,0.0,0.0
4,0.666667,1.0,0.919853,0.4,0.743232,0.936358,0.98962,0.0,0.0,0.5,0.4,0.0,1.0


In [23]:
X_train_scaled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bedrooms,14412.0,0.42682,0.259328,0.0,0.333333,0.333333,0.666667,1.0
bathrooms,14412.0,0.457952,0.241979,0.0,0.272727,0.454545,0.636364,1.0
sqft_above,14412.0,0.606693,0.177644,0.0,0.477632,0.597093,0.738608,1.0
floors,14412.0,0.190966,0.217693,0.0,0.0,0.0,0.4,1.0
sqft_lot15,14412.0,0.470343,0.186322,0.0,0.353298,0.488698,0.587104,1.0
sqft_living15,14412.0,0.611469,0.161748,0.0,0.498499,0.606943,0.727031,1.0
sqft_basement,14412.0,0.816692,0.076821,0.0,0.816959,0.816959,0.816959,1.0
waterfront,14412.0,0.007979,0.088974,0.0,0.0,0.0,0.0,1.0
view,14412.0,0.060592,0.194407,0.0,0.0,0.0,0.0,1.0
condition,14412.0,0.611678,0.167564,0.0,0.5,0.5,0.75,1.0


Now to build the model, we will use Linear regression because we are meant to predict price of the real estate .
Linear regression is great for price prediction as it predicts numerical values

In [24]:
#now to build the model
from sklearn.linear_model import LinearRegression

In [25]:
linear_model = LinearRegression()
linear_model.fit(X_train_scaled_df, y_train)

In [26]:
#save model
path="../models/"
model_file_name= "linear_model_without_outliers.pkl"

with open(path + model_file_name, 'wb') as file:
    pickle.dump(linear_model, file)

Evaluate model
we will compare against real value and then subsequently we will check with metrics such as accuracy score etc

In [27]:
y_train_pred = linear_model.predict(X_train_scaled_df)
y_test_pred = linear_model.predict(X_test_scaled_df)

In [28]:
results = {"Set": ["Train"]*X_train.shape[0] + ["Test"]*X_test.shape[0], 
           "Real": list(y_train) + list(y_test),
           "Predicted": list(y_train_pred) + list(y_test_pred)}

results_df = pd.DataFrame(results)


In [31]:
results_df

Unnamed: 0,Set,Real,Predicted
0,Train,508000,564991.842954
1,Train,476000,383423.066795
2,Train,230000,182626.073616
3,Train,302000,321818.521923
4,Train,500000,423587.789842
...,...,...,...
18010,Test,315000,305562.624519
18011,Test,525000,593433.654941
18012,Test,413500,137396.465952
18013,Test,610000,629555.030720


In [32]:
#show error
results_df['Errors'] = results_df['Real'] - results_df['Predicted']
display(results_df)

Unnamed: 0,Set,Real,Predicted,Errors
0,Train,508000,564991.842954,-56991.842954
1,Train,476000,383423.066795,92576.933205
2,Train,230000,182626.073616,47373.926384
3,Train,302000,321818.521923,-19818.521923
4,Train,500000,423587.789842,76412.210158
...,...,...,...,...
18010,Test,315000,305562.624519,9437.375481
18011,Test,525000,593433.654941,-68433.654941
18012,Test,413500,137396.465952,276103.534048
18013,Test,610000,629555.030720,-19555.030720


In [33]:
#error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Mean absolute error
MAE_train = mean_absolute_error(results_df[results_df["Set"]=="Train"]["Real"], results_df[results_df["Set"]=="Train"]["Predicted"])
MAE_test  = mean_absolute_error(results_df[results_df["Set"]=="Test"]["Real"],  results_df[results_df["Set"]=="Test"]["Predicted"])

# Mean squared error
MSE_train = mean_squared_error(results_df[results_df["Set"]=="Train"]["Real"], results_df[results_df["Set"]=="Train"]["Predicted"])
MSE_test  = mean_squared_error(results_df[results_df["Set"]=="Test"]["Real"],  results_df[results_df["Set"]=="Test"]["Predicted"])

# Root mean squared error
RMSE_train = mean_squared_error(results_df[results_df["Set"]=="Train"]["Real"], 
                                results_df[results_df["Set"]=="Train"]["Predicted"],
                                squared=False)
RMSE_test  = mean_squared_error(results_df[results_df["Set"]=="Test"]["Real"],  
                                results_df[results_df["Set"]=="Test"]["Predicted"],
                                squared=False)

# R2
R2_train = r2_score(results_df[results_df["Set"]=="Train"]["Real"], results_df[results_df["Set"]=="Train"]["Predicted"])
R2_test  = r2_score(results_df[results_df["Set"]=="Test"]["Real"],  results_df[results_df["Set"]=="Test"]["Predicted"])

In [34]:
def error_metrics_report(y_real_train: list, y_real_test: list, y_pred_train: list, y_pred_test: list) -> pd.DataFrame:
    '''
   The error metrics report function calculates for regression.

    Parameters:
    - y_real_train (list): The actual target values for the training dataset.
    - y_real_test (list): The actual target values for the testing dataset.
    - y_pred_train (list): The predicted target values for the training dataset.
    - y_pred_test (list): The predicted target values for the testing dataset.

    Returns:
    - metrics_df (DataFrame): A Pandas DataFrame containing error metrics for both the training and testing datasets.
    - 'Metric' (str): The name of the error metric.
    - 'Training Set' (float): The error metric value for the training set.
    - 'Testing Set' (float): The error metric value for the testing set.
    '''

    MAE_train = mean_absolute_error(y_real_train, y_pred_train)
    MAE_test  = mean_absolute_error(y_real_test, y_pred_test)

    # Mean squared error
    MSE_train = mean_squared_error(y_real_train, y_pred_train)
    MSE_test  = mean_squared_error(y_real_test, y_pred_test)

    # Root mean squared error
    RMSE_train = mean_squared_error(y_real_train, y_pred_train)
    RMSE_test  = mean_squared_error(y_real_test, y_pred_test)

    # R2
    R2_train = r2_score(y_real_train, y_pred_train)
    R2_test  = r2_score(y_real_test, y_pred_test)

    results = {"Metric":['MAE', 'MSE', 'RMSE', 'R2'] , 
               "Train": [MAE_train, MSE_train, RMSE_train, R2_train],
               "Test":  [MAE_test, MSE_test, RMSE_test, R2_test]}

    results_df = pd.DataFrame(results).round(2)

    return results_df

In [35]:
error_metrics_report(list(results_df[results_df["Set"]=="Train"]["Real"]),
                     list(results_df[results_df["Set"]=="Test"]["Real"]),
                     list(results_df[results_df["Set"]=="Train"]["Predicted"]),
                     list(results_df[results_df["Set"]=="Test"]["Predicted"]))

Unnamed: 0,Metric,Train,Test
0,MAE,155023.7,160514.7
1,MSE,56638260000.0,75385300000.0
2,RMSE,56638260000.0,75385300000.0
3,R2,0.55,0.52
