## Data modelling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#load data
df = pd.read_csv('../data/cleaned/data_without_outlier.csv')

df

Unnamed: 0,sqft_above,sqft_lot15,sqft_living15,waterfront,view,condition,grade,bedrooms,bathrooms,sqft_basement_cat,floors,yr_built_cat,zip,price
0,1180.0,5650.0,1340.0,0,0,3,7,3,1.00,1,1.0,1,1,221900
1,2170.0,7639.0,1690.0,0,0,3,7,3,2.25,2,2.0,1,1,538000
2,770.0,8062.0,2720.0,0,0,3,6,2,1.00,1,1.0,1,0,180000
3,1050.0,5000.0,1360.0,0,0,5,7,4,3.00,3,1.0,1,1,604000
4,1680.0,7503.0,1800.0,0,0,3,8,3,2.00,1,1.0,1,0,510000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18853,1530.0,1509.0,1530.0,0,0,3,10,6,3.25,1,2.0,1,0,925850
18854,2310.0,7200.0,1830.0,0,0,4,7,2,1.00,2,1.0,1,0,255500
18855,1020.0,2007.0,1020.0,0,0,3,10,3,2.50,1,2.0,1,0,450000
18856,1600.0,1287.0,1410.0,0,0,3,7,3,1.00,1,1.0,1,0,280000


In [4]:
#define data
X = df.drop(['price'], axis=1)
y = df['price']

In [5]:
X.head()

Unnamed: 0,sqft_above,sqft_lot15,sqft_living15,waterfront,view,condition,grade,bedrooms,bathrooms,sqft_basement_cat,floors,yr_built_cat,zip
0,1180.0,5650.0,1340.0,0,0,3,7,3,1.0,1,1.0,1,1
1,2170.0,7639.0,1690.0,0,0,3,7,3,2.25,2,2.0,1,1
2,770.0,8062.0,2720.0,0,0,3,6,2,1.0,1,1.0,1,0
3,1050.0,5000.0,1360.0,0,0,5,7,4,3.0,3,1.0,1,1
4,1680.0,7503.0,1800.0,0,0,3,8,3,2.0,1,1.0,1,0


Now we are going to split the data into train and test set using sklearn, with test_size =0.2, and a random_state=42

In [6]:
#split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    random_state=42)

After our split,we will first convert to a datafram, ensure to minmax scale to reduce range of our training and test set before we fit our model

In [7]:
#convert train and test data to dataframe
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

In [8]:
#now to scale the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaler.fit(X_train_df)


In [9]:
#save the scaler
import pickle
path="../scalers/"
scaler_file_name= "min_max_scaler_without_outliers.pkl"

with open(path + scaler_file_name, 'wb') as file:
    pickle.dump(scaler, file)

In [10]:
#transform the data
X_train_scaled = scaler.transform(X_train_df)
X_test_scaled = scaler.transform(X_test_df)

#convert the scaled data to dataframe
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)

In [11]:
X_train_scaled_df.head()

Unnamed: 0,sqft_above,sqft_lot15,sqft_living15,waterfront,view,condition,grade,bedrooms,bathrooms,sqft_basement_cat,floors,yr_built_cat,zip
0,0.302671,0.423043,0.277228,0.0,0.0,0.75,0.5,0.03125,0.133333,0.333333,0.0,0.0,1.0
1,0.264095,0.038405,0.234323,0.0,0.0,0.5,0.7,0.09375,0.266667,0.0,0.4,0.5,0.0
2,0.397626,0.600568,0.419142,0.0,0.0,0.5,0.3,0.03125,0.066667,0.0,0.0,0.0,0.0
3,0.323442,0.459791,0.330033,0.0,0.0,0.75,0.3,0.0625,0.066667,0.666667,0.0,0.0,1.0
4,0.727003,0.298361,0.785479,0.0,0.0,0.5,0.5,0.0625,0.233333,0.0,0.4,0.0,1.0


In [12]:
X_train_scaled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sqft_above,15086.0,0.380031,0.195092,0.0,0.234421,0.329377,0.490504,1.0
sqft_lot15,15086.0,0.384239,0.187192,0.0,0.257353,0.388129,0.494053,1.0
sqft_living15,15086.0,0.41301,0.186458,0.0,0.273927,0.379538,0.524752,1.0
waterfront,15086.0,0.008087,0.089566,0.0,0.0,0.0,0.0,1.0
view,15086.0,0.059791,0.193093,0.0,0.0,0.0,0.0,1.0
condition,15086.0,0.611991,0.16662,0.0,0.5,0.5,0.75,1.0
grade,15086.0,0.459751,0.116564,0.0,0.4,0.4,0.5,1.0
bedrooms,15086.0,0.073978,0.029155,0.0,0.0625,0.0625,0.09375,1.0
bathrooms,15086.0,0.208339,0.100884,0.0,0.133333,0.2,0.266667,1.0
sqft_basement_cat,15086.0,0.262716,0.356502,0.0,0.0,0.0,0.666667,1.0


Now to build the model, we will use Linear regression because we are meant to predict price of the real estate .
Linear regression is great for price prediction as it predicts numerical values

In [13]:
#now to build the model
from sklearn.linear_model import LinearRegression

In [14]:
linear_model = LinearRegression()
linear_model.fit(X_train_scaled_df, y_train)

In [15]:
#save model
path="../models/"
model_file_name= "linear_model_without_outliers.pkl"

with open(path + model_file_name, 'wb') as file:
    pickle.dump(linear_model, file)

Evaluate model
we will compare against real value and then subsequently we will check with metrics such as accuracy score etc

In [16]:
y_train_pred = linear_model.predict(X_train_scaled_df)
y_test_pred = linear_model.predict(X_test_scaled_df)

In [19]:
y_train_pred = [round(value, 2) for value in y_train_pred]
y_test_pred= [round(value, 2) for value in y_test_pred]


results = {"Set": ["Train"]*X_train.shape[0] + ["Test"]*X_test.shape[0], 
           "Real": list(y_train) + list(y_test),
           "Predicted": list(y_train_pred) + list(y_test_pred)}

results_df = pd.DataFrame(results)


In [20]:
results_df

Unnamed: 0,Set,Real,Predicted
0,Train,530000,632903.51
1,Train,920000,856208.02
2,Train,365000,73627.36
3,Train,342000,287462.57
4,Train,500000,622557.06
...,...,...,...
18853,Test,440000,557890.29
18854,Test,184900,85191.63
18855,Test,528000,340272.29
18856,Test,930000,975100.37


In [21]:
#show error
results_df['Errors'] = results_df['Real'] - results_df['Predicted']
display(results_df)

Unnamed: 0,Set,Real,Predicted,Errors
0,Train,530000,632903.51,-102903.51
1,Train,920000,856208.02,63791.98
2,Train,365000,73627.36,291372.64
3,Train,342000,287462.57,54537.43
4,Train,500000,622557.06,-122557.06
...,...,...,...,...
18853,Test,440000,557890.29,-117890.29
18854,Test,184900,85191.63,99708.37
18855,Test,528000,340272.29,187727.71
18856,Test,930000,975100.37,-45100.37


In [22]:
#error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Mean absolute error
MAE_train = mean_absolute_error(results_df[results_df["Set"]=="Train"]["Real"], results_df[results_df["Set"]=="Train"]["Predicted"])
MAE_test  = mean_absolute_error(results_df[results_df["Set"]=="Test"]["Real"],  results_df[results_df["Set"]=="Test"]["Predicted"])

# Mean squared error
MSE_train = mean_squared_error(results_df[results_df["Set"]=="Train"]["Real"], results_df[results_df["Set"]=="Train"]["Predicted"])
MSE_test  = mean_squared_error(results_df[results_df["Set"]=="Test"]["Real"],  results_df[results_df["Set"]=="Test"]["Predicted"])

# Root mean squared error
RMSE_train = mean_squared_error(results_df[results_df["Set"]=="Train"]["Real"], 
                                results_df[results_df["Set"]=="Train"]["Predicted"],
                                squared=False)
RMSE_test  = mean_squared_error(results_df[results_df["Set"]=="Test"]["Real"],  
                                results_df[results_df["Set"]=="Test"]["Predicted"],
                                squared=False)

# R2
R2_train = r2_score(results_df[results_df["Set"]=="Train"]["Real"], results_df[results_df["Set"]=="Train"]["Predicted"])
R2_test  = r2_score(results_df[results_df["Set"]=="Test"]["Real"],  results_df[results_df["Set"]=="Test"]["Predicted"])

In [26]:
def error_metrics_report(y_real_train: list, y_real_test: list, y_pred_train: list, y_pred_test: list) -> pd.DataFrame:
    '''
   The error metrics report function calculates for regression.

    Parameters:
    - y_real_train (list): The actual target values for the training dataset.
    - y_real_test (list): The actual target values for the testing dataset.
    - y_pred_train (list): The predicted target values for the training dataset.
    - y_pred_test (list): The predicted target values for the testing dataset.

    Returns:
    - metrics_df (DataFrame): A Pandas DataFrame containing error metrics for both the training and testing datasets.
    - 'Metric' (str): The name of the error metric.
    - 'Training Set' (float): The error metric value for the training set.
    - 'Testing Set' (float): The error metric value for the testing set.
    '''

    MAE_train = mean_absolute_error(y_real_train, y_pred_train)
    MAE_test  = mean_absolute_error(y_real_test, y_pred_test)

    # Mean squared error
    MSE_train = mean_squared_error(y_real_train, y_pred_train)
    MSE_test  = mean_squared_error(y_real_test, y_pred_test)

    # Root mean squared error
    RMSE_train = mean_squared_error(y_real_train, y_pred_train)
    RMSE_test  = mean_squared_error(y_real_test, y_pred_test)

    # R2
    R2_train = r2_score(y_real_train, y_pred_train)
    R2_test  = r2_score(y_real_test, y_pred_test)

    results = {"Metric":['MAE', 'MSE', 'RMSE', 'R2'] , 
               "Train": [MAE_train, MSE_train, RMSE_train, R2_train],
               "Test":  [MAE_test, MSE_test, RMSE_test, R2_test]}

    results_df = pd.DataFrame(results).round(2)
    pd.set_option('display.float_format', lambda x: '{:.2f}'.format(x))

    return results_df

In [27]:
error_metrics_report(list(results_df[results_df["Set"]=="Train"]["Real"]),
                     list(results_df[results_df["Set"]=="Test"]["Real"]),
                     list(results_df[results_df["Set"]=="Train"]["Predicted"]),
                     list(results_df[results_df["Set"]=="Test"]["Predicted"]))

Unnamed: 0,Metric,Train,Test
0,MAE,152100.99,158579.91
1,MSE,54046336478.4,72699050597.01
2,RMSE,54046336478.4,72699050597.01
3,R2,0.57,0.56
