In [7]:
import pandas as pd
import numpy as np
import joblib

Importing the train and test datasets

In [8]:
# Read the train and test CSV files (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

In [9]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,Serviced,Newly Built,Furnished,Bedrooms,Bathrooms,Toilets,City_Lekki,City_Ajah,City_Yaba,City_Ikoyi,...,Neighborhood_Oworonshoki,Neighborhood_Awolowo Road,Neighborhood_1004,Property Type_Self-contained,Property Type_Fully Detached Duplex,Property Type_Flat,Property Type_Apartment,Property Type_Terraced Duplex,Property Type_Semi Detached Duplex,Price
0,0,0,0,1,1,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1000000.0
1,0,1,1,4,5,6,0,1,0,0,...,0,0,0,0,1,0,0,0,0,2500000.0
2,0,0,0,3,2,3,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1800000.0
3,1,0,0,3,3,4,0,0,0,1,...,0,0,0,0,0,1,0,0,0,5000000.0
4,0,0,0,2,2,3,1,0,0,0,...,0,0,0,0,0,1,0,0,0,2500000.0


In [10]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

((36716, 91), (9179, 10))

Let's split the training data into features (`X_train`) and the target (`y_train`, the `Price` column)

In [11]:
# Target: the `Price` column. Features: every other column of `train`.
# NOTE(review): only `Price` is dropped, so the `Unnamed: 0` column shown by
# `train.head()` stays in `X_train` as a feature -- it looks like a saved row
# index; confirm it is really meant to be a model input.
y_train = train["Price"]
X_train = train.drop(columns=["Price"])

Here are the categorical and numerical features that need to be preprocessed

Loading the previously saved encoder needed for preprocessing

In [12]:
# Load the saved encoder object from disk.
# NOTE(review): joblib files are pickle-based, so only load files from a
# trusted source.
encoder = joblib.load(filename="tools/encoder_joblib")

Splitting test data

In [13]:
# Target: the `Price` column. Features: every other column of `test`.
y_test = test["Price"]
X_test = test.drop(columns=["Price"])

Here we are preprocessing the test data so it can be used by the machine learning model

In [14]:
# Encode the categorical values of the raw test features.
# The input is rebuilt from `test` instead of reusing `X_test`, so this cell is
# idempotent: re-running it never feeds already-encoded data back into the
# encoder.
X_test = encoder.transform(test.drop(columns=["Price"]))

In [15]:
# Preview the first five rows of the encoded test features.
X_test.head(n=5)

Unnamed: 0,Serviced,Newly Built,Furnished,Bedrooms,Bathrooms,Toilets,City_Lekki,City_Ajah,City_Yaba,City_Ikoyi,...,Neighborhood_Ligali Ayorinde,Neighborhood_Oworonshoki,Neighborhood_Awolowo Road,Neighborhood_1004,Property Type_Self-contained,Property Type_Fully Detached Duplex,Property Type_Flat,Property Type_Apartment,Property Type_Terraced Duplex,Property Type_Semi Detached Duplex
0,0,1,0,3,3,4,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,2,2,3,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,1,1,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,4,4,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Model Building and Validation

Let's import the machine learning models that will be fitted to the data

In [16]:
from sklearn.ensemble import RandomForestRegressor # Random Forest
from xgboost import XGBRegressor # XGBoost
from sklearn.svm import SVR # SVM
from sklearn.neighbors import KNeighborsRegressor # KNN
from lightgbm import LGBMRegressor # LightGBM

### Random Forest

Here we are initializing the model, then fitting it on the training data and making predictions on the test data

In [17]:
# Build and fit the random forest in one step (`fit` returns the estimator),
# then predict on the encoded test features.
rf = RandomForestRegressor(
    n_estimators=20,
    max_depth=11,
    random_state=51,
).fit(X_train, y_train)
rf_predictions = rf.predict(X_test)

### XGBoost

In [18]:
# Build and fit the XGBoost regressor in one step (`fit` returns the
# estimator), then predict on the encoded test features.
xgb = XGBRegressor(
    n_estimators=60,
    learning_rate=0.202,
    random_state=51,
).fit(X_train, y_train)
xgb_predictions = xgb.predict(X_test)

### LightGBM

In [19]:
# LightGBM regressor with 117 estimators (other settings left at their
# defaults); fit on the training data, then predict on the encoded test
# features.
lgb = LGBMRegressor(
    n_estimators=117,
    random_state=51,
).fit(X_train, y_train)
lgb_predictions = lgb.predict(X_test)

### Model Performance on Test data

Here we are checking the performance of the machine learning models on the test data

In [20]:
# import MAE metric
from sklearn.metrics import mean_absolute_error

In [21]:
# Report each model's mean absolute error on the test set, divided by 1e6
# (i.e. expressed in millions).
for model_label, model_predictions in (
    ("Random Forest", rf_predictions),
    ("XGBoost", xgb_predictions),
    ("LightGBM", lgb_predictions),
):
    print(f"{model_label}: {mean_absolute_error(y_test, model_predictions) / 1e6}")

Random Forest: 1.2264270764515677
XGBoost: 1.1817461188480498
LightGBM: 1.1724231384132595


### Model Save for Deployment

In [22]:
# Persist the fitted LightGBM model to disk for deployment.
joblib.dump(value=lgb, filename="tools/model_joblib")

['tools/model_joblib']