## Data modelling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#load data
df = pd.read_csv('../data/cleaned/data_with_outliers.csv')

df

Unnamed: 0,bedrooms,bathrooms,sqft_above,floors,sqft_lot15,sqft_living15,sqft_basement,id,waterfront,view,condition,grade,yr_built_cat,zip,id.1,price
0,-0.350967,-1.583745,-0.737418,-0.975556,-0.341970,-1.043506,-0.802918,7129300520,0,0,3,7,1,1,7129300520,221900
1,-0.350967,0.253141,0.705741,1.029872,0.057869,-0.294277,1.176767,6414100192,0,0,3,7,1,1,6414100192,538000
2,-1.639211,-1.583745,-1.882200,-0.975556,0.126750,1.121919,-0.802918,5631500400,0,0,3,6,1,0,5631500400,180000
3,0.729637,1.136818,-1.038881,-0.975556,-0.511161,-0.994463,1.300691,2487200875,0,0,5,7,1,1,2487200875,604000
4,-0.350967,-0.071087,0.125114,-0.975556,0.034741,-0.097540,-0.802918,1954400510,0,0,3,8,1,0,1954400510,510000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,-0.350967,0.561351,-0.096146,1.877584,-2.414324,-0.610480,-0.802918,263000018,0,0,3,8,2,1,263000018,360000
21593,0.729637,0.561351,0.842224,1.029872,-0.018655,-0.046444,-0.802918,6600060120,0,0,3,8,3,1,6600060120,400000
21594,-1.639211,-2.037696,-1.115067,1.029872,-1.918077,-1.977481,-0.802918,1523300141,0,0,3,7,2,1,1523300141,402101
21595,-0.350967,0.561351,0.010309,1.029872,-2.704055,-0.875644,-0.802918,291310100,0,0,3,8,2,0,291310100,400000


In [5]:
#drop id and id.1
df = df.drop(['id', 'id.1'], axis=1)

In [6]:
#define data
X = df.drop(['price'], axis=1)
y = df['price']

In [7]:
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_above,floors,sqft_lot15,sqft_living15,sqft_basement,waterfront,view,condition,grade,yr_built_cat,zip
0,-0.350967,-1.583745,-0.737418,-0.975556,-0.34197,-1.043506,-0.802918,0,0,3,7,1,1
1,-0.350967,0.253141,0.705741,1.029872,0.057869,-0.294277,1.176767,0,0,3,7,1,1
2,-1.639211,-1.583745,-1.8822,-0.975556,0.12675,1.121919,-0.802918,0,0,3,6,1,0
3,0.729637,1.136818,-1.038881,-0.975556,-0.511161,-0.994463,1.300691,0,0,5,7,1,1
4,-0.350967,-0.071087,0.125114,-0.975556,0.034741,-0.09754,-0.802918,0,0,3,8,1,0


Now we are going to split the data into train and test set using sklearn, with test_size =0.3, and a random_state=42

In [8]:
#split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3,
                                                    random_state=42)

After our split,we will first convert to a datafram, ensure to minmax scale to reduce range of our training and test set before we fit our model

In [9]:
#convert train and test data to dataframe
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

In [10]:
#now to scale the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(X_train_df)


In [12]:
#save the scaler
import pickle
path="../scalers/"
scaler_file_name= "min_max_scaler_with_outliers.pkl"

with open(path + scaler_file_name, 'wb') as file:
    pickle.dump(scaler, file)

In [13]:
#transform the data
X_train_scaled = scaler.transform(X_train_df)
X_test_scaled = scaler.transform(X_test_df)

#convert the scaled data to dataframe
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

In [14]:
X_train_scaled_df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_above,floors,sqft_lot15,sqft_living15,sqft_basement,waterfront,view,condition,grade,yr_built_cat,zip
0,0.18075,0.39748,0.667315,0.0,0.489966,0.639177,0.0,0.0,0.0,0.5,0.5,0.5,0.0
1,0.442957,0.571441,0.747725,0.645038,0.599973,0.707338,0.0,1.0,1.0,0.5,0.7,0.5,1.0
2,0.323345,0.39748,0.573054,0.645038,0.497993,0.622095,0.0,0.0,0.0,0.5,0.5,0.0,0.0
3,0.442957,0.316335,0.596722,0.0,0.525203,0.620131,0.0,0.0,0.5,0.5,0.4,0.0,0.0
4,0.800207,0.471315,0.717457,0.0,0.440688,0.556414,0.0,0.0,0.0,0.5,0.4,0.0,0.0


In [15]:
X_train_scaled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bedrooms,15117.0,0.362554,0.110766,0.0,0.323345,0.323345,0.442957,1.0
bathrooms,15117.0,0.326115,0.128519,0.0,0.272387,0.357935,0.39748,1.0
sqft_above,15117.0,0.548148,0.132593,0.0,0.455666,0.542911,0.648662,1.0
floors,15117.0,0.314973,0.322032,0.0,0.0,0.401841,0.645038,1.0
sqft_lot15,15117.0,0.506887,0.125589,0.0,0.446259,0.513485,0.557151,1.0
sqft_living15,15117.0,0.637443,0.112406,0.0,0.55886,0.633598,0.717197,1.0
sqft_basement,15117.0,0.350335,0.43548,0.0,0.0,0.0,0.885863,1.0
waterfront,15117.0,0.00774,0.087637,0.0,0.0,0.0,0.0,1.0
view,15117.0,0.058345,0.192095,0.0,0.0,0.0,0.0,1.0
condition,15117.0,0.602302,0.162705,0.0,0.5,0.5,0.75,1.0


Now to build the model, we will use Linear regression because we are meant to predict price of the real estate .
Linear regression is great for price prediction as it predicts numerical values

In [16]:
#now to build the model
from sklearn.linear_model import LinearRegression

In [17]:
linear_model = LinearRegression()
linear_model.fit(X_train_scaled_df, y_train)

In [18]:
#save model
path="../models/"
model_file_name= "linear_model_with_outliers.pkl"

with open(path + model_file_name, 'wb') as file:
    pickle.dump(linear_model, file)

Evaluate model
we will compare against real value and then subsequently we will check with metrics such as accuracy score etc

In [19]:
y_train_pred = linear_model.predict(X_train_scaled_df)
y_test_pred = linear_model.predict(X_test_scaled_df)

In [20]:
results = {"Set": ["Train"]*X_train.shape[0] + ["Test"]*X_test.shape[0], 
           "Real": list(y_train) + list(y_test),
           "Predicted": list(y_train_pred) + list(y_test_pred)}

results_df = pd.DataFrame(results)


In [21]:
results_df

Unnamed: 0,Set,Real,Predicted
0,Train,699800,5.494907e+05
1,Train,1700000,1.903916e+06
2,Train,258000,4.909812e+05
3,Train,245000,4.708587e+05
4,Train,373000,4.038826e+05
...,...,...,...
21592,Test,1010000,8.208157e+05
21593,Test,675000,9.955956e+05
21594,Test,442500,3.144228e+05
21595,Test,424950,5.322691e+05


In [22]:
#show error
results_df['Errors'] = results_df['Real'] - results_df['Predicted']
display(results_df)

Unnamed: 0,Set,Real,Predicted,Errors
0,Train,699800,5.494907e+05,150309.272586
1,Train,1700000,1.903916e+06,-203916.485303
2,Train,258000,4.909812e+05,-232981.237058
3,Train,245000,4.708587e+05,-225858.706960
4,Train,373000,4.038826e+05,-30882.564082
...,...,...,...,...
21592,Test,1010000,8.208157e+05,189184.297054
21593,Test,675000,9.955956e+05,-320595.588771
21594,Test,442500,3.144228e+05,128077.170989
21595,Test,424950,5.322691e+05,-107319.086327


In [24]:
def error_metrics_report(y_real_train: list, y_real_test: list, y_pred_train: list, y_pred_test: list) -> pd.DataFrame:
    '''
   The error metrics report function calculates for regression.

    Parameters:
    - y_real_train (list): The actual target values for the training dataset.
    - y_real_test (list): The actual target values for the testing dataset.
    - y_pred_train (list): The predicted target values for the training dataset.
    - y_pred_test (list): The predicted target values for the testing dataset.

    Returns:
    - metrics_df (DataFrame): A Pandas DataFrame containing error metrics for both the training and testing datasets.
    - 'Metric' (str): The name of the error metric.
    - 'Training Set' (float): The error metric value for the training set.
    - 'Testing Set' (float): The error metric value for the testing set.
    '''

    MAE_train = mean_absolute_error(y_real_train, y_pred_train)
    MAE_test  = mean_absolute_error(y_real_test, y_pred_test)

    # Mean squared error
    MSE_train = mean_squared_error(y_real_train, y_pred_train)
    MSE_test  = mean_squared_error(y_real_test, y_pred_test)

    # Root mean squared error
    RMSE_train = mean_squared_error(y_real_train, y_pred_train)
    RMSE_test  = mean_squared_error(y_real_test, y_pred_test)

    # R2
    R2_train = r2_score(y_real_train, y_pred_train)
    R2_test  = r2_score(y_real_test, y_pred_test)

    results = {"Metric":['MAE', 'MSE', 'RMSE', 'R2'] , 
               "Train": [MAE_train, MSE_train, RMSE_train, R2_train],
               "Test":  [MAE_test, MSE_test, RMSE_test, R2_test]}

    results_df = pd.DataFrame(results).round(2)

    return results_df

In [25]:
error_metrics_report(list(results_df[results_df["Set"]=="Train"]["Real"]),
                     list(results_df[results_df["Set"]=="Test"]["Real"]),
                     list(results_df[results_df["Set"]=="Train"]["Predicted"]),
                     list(results_df[results_df["Set"]=="Test"]["Predicted"]))

Unnamed: 0,Metric,Train,Test
0,MAE,150920.2,151545.6
1,MSE,55512010000.0,59142510000.0
2,RMSE,55512010000.0,59142510000.0
3,R2,0.59,0.55
