### Imports

In [60]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn import linear_model
from sklearn import metrics 
import xgboost as xgb


The columns `ads_id`, `Unnamed: 0`, and `prop_name` are dropped since they wouldn't be useful for the prediction model as they are irrelevant to the monthly rental pricing. 

In [2]:
#Loading the data in
data = pd.read_csv("../data/Rental_Price_cleaned.csv")
data.drop(columns=['Unnamed: 0', 'ads_id', 'prop_name'], inplace=True)

  data = pd.read_csv("../data/Rental_Price_cleaned.csv")


In [3]:
data.head()

Unnamed: 0,completion_year,monthly_rent,location,property_type,rooms,parking,bathroom,size,furnished,facilities,...,Minimart,MP_Hall,No_Fac,Parking,Playground,Sauna,Security,Squash,Pool,Tennis
0,2022.0,4200.0,Kuala Lumpur - Taman Desa,Condominium,5,2.0,6.0,1842,Fully Furnished,"Minimart, Gymnasium, Security, Playground, Swi...",...,True,True,False,True,True,False,True,False,True,False
1,,2300.0,Kuala Lumpur - Cheras,Condominium,3,1.0,2.0,1170,Partially Furnished,"Playground, Parking, Barbeque area, Security, ...",...,False,False,False,True,True,True,True,False,True,False
2,,1000.0,Kuala Lumpur - Taman Desa,Apartment,3,0.0,2.0,650,Fully Furnished,"Minimart, Jogging Track, Lift, Swimming Pool",...,True,False,False,False,False,False,False,False,True,False
3,2020.0,1700.0,Kuala Lumpur - Sentul,Apartment,2,1.0,2.0,743,Partially Furnished,"Parking, Playground, Swimming Pool, Squash Cou...",...,True,False,False,True,True,False,True,True,True,False
4,,1299.0,Kuala Lumpur - Mont Kiara,Service Residence,1,1.0,1.0,494,Not Furnished,"Parking, Security, Lift, Swimming Pool, Playgr...",...,True,True,False,True,True,False,True,False,True,False


In [4]:
binary = data.select_dtypes(bool).columns.to_list()
for cols in binary:
    data[cols] = data[cols].astype('category').cat.codes

In [5]:
data.head()

Unnamed: 0,completion_year,monthly_rent,location,property_type,rooms,parking,bathroom,size,furnished,facilities,...,Minimart,MP_Hall,No_Fac,Parking,Playground,Sauna,Security,Squash,Pool,Tennis
0,2022.0,4200.0,Kuala Lumpur - Taman Desa,Condominium,5,2.0,6.0,1842,Fully Furnished,"Minimart, Gymnasium, Security, Playground, Swi...",...,1,1,0,1,1,0,1,0,1,0
1,,2300.0,Kuala Lumpur - Cheras,Condominium,3,1.0,2.0,1170,Partially Furnished,"Playground, Parking, Barbeque area, Security, ...",...,0,0,0,1,1,1,1,0,1,0
2,,1000.0,Kuala Lumpur - Taman Desa,Apartment,3,0.0,2.0,650,Fully Furnished,"Minimart, Jogging Track, Lift, Swimming Pool",...,1,0,0,0,0,0,0,0,1,0
3,2020.0,1700.0,Kuala Lumpur - Sentul,Apartment,2,1.0,2.0,743,Partially Furnished,"Parking, Playground, Swimming Pool, Squash Cou...",...,1,0,0,1,1,0,1,1,1,0
4,,1299.0,Kuala Lumpur - Mont Kiara,Service Residence,1,1.0,1.0,494,Not Furnished,"Parking, Security, Lift, Swimming Pool, Playgr...",...,1,1,0,1,1,0,1,0,1,0


In [6]:
data['property_type'].unique()

array(['Condominium', 'Apartment', 'Service Residence', 'Studio', 'Flat',
       'Duplex', 'Others', 'Townhouse Condo',
       'Condo / Services residence / Penthouse / Townhouse', 'Houses'],
      dtype=object)

In [8]:
pred_data = data[['monthly_rent', 'region', 'rooms', 'property_type', 'parking', 'bathroom', 'size', 'furnished']]

In [25]:
pred_data.dtypes

monthly_rent     float64
region            object
rooms             object
property_type     object
parking          float64
bathroom         float64
size               int64
furnished         object
dtype: object

Since some of the attributes are of type 'object', they cannot be used to fit the prediction models I will be implementing. Therefore, I converted them into categorical numerical variables.

In [28]:
obj = pred_data.select_dtypes(object).columns.to_list()
for cols in obj: 
    pred_data[cols] = pred_data[cols].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_data[cols] = pred_data[cols].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_data[cols] = pred_data[cols].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_data[cols] = pred_data[cols].astype('category').cat.codes
A value is tryin

##### Correlation Matrix

In [29]:
corr_matrix = pred_data.corr()

In [30]:
corr_matrix

Unnamed: 0,monthly_rent,region,rooms,property_type,parking,bathroom,size,furnished
monthly_rent,1.0,-0.273719,0.139196,0.202058,0.215729,0.409035,0.537059,-0.294201
region,-0.273719,1.0,-0.365083,-0.0189,0.02419,-0.111444,-0.134397,0.058747
rooms,0.139196,-0.365083,1.0,-0.119961,0.081827,0.305619,0.258563,0.069451
property_type,0.202058,-0.0189,-0.119961,1.0,0.054726,-0.155453,-0.093587,-0.182939
parking,0.215729,0.02419,0.081827,0.054726,1.0,0.264033,0.279568,0.010809
bathroom,0.409035,-0.111444,0.305619,-0.155453,0.264033,1.0,0.673156,0.096354
size,0.537059,-0.134397,0.258563,-0.093587,0.279568,0.673156,1.0,0.029393
furnished,-0.294201,0.058747,0.069451,-0.182939,0.010809,0.096354,0.029393,1.0


### Training Prediction models

In [31]:
y = pred_data['monthly_rent']
X = pred_data.drop(['monthly_rent'], axis=1)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    random_state = 808,
                                                    test_size=0.3, 
                                                    shuffle = True)

In [98]:
model_metrics = pd.DataFrame(columns=['Model', 'MAE', 'MSE', 'RMSE'])

#### Multivariate Linear Regression

In [33]:
lin_regr = linear_model.LinearRegression()
lin_regr.fit(X_train, y_train)

In [34]:
lr_pred = lin_regr.predict(X_test)

In [99]:
lr_metrics = pd.DataFrame({'Model': ['Multivariate Linear Regression'], 
                'MAE': [metrics.mean_absolute_error(y_test, lr_pred)], 
                'MSE': [metrics.mean_squared_error(y_test, lr_pred)], 
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, lr_pred))]})
model_metrics = pd.concat([model_metrics,lr_metrics], ignore_index = True)

#### Random Forest Regressor

In [62]:
rf_regr = RandomForestRegressor()
rf_regr.fit(X_train, y_train)

In [63]:
rf_regr_pred = rf_regr.predict(X_test)

In [100]:
rf_metrics = pd.DataFrame({'Model': ['Random Forest'], 
                'MAE': [metrics.mean_absolute_error(y_test, rf_regr_pred)], 
                'MSE': [metrics.mean_squared_error(y_test, rf_regr_pred)], 
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, rf_regr_pred))]})
model_metrics = pd.concat([model_metrics,rf_metrics], ignore_index = True)

In [45]:
dt_model = tree.DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

In [46]:
dt_pred = dt_model.predict(X_test)

In [101]:
dt_metrics = pd.DataFrame({'Model': ['Decision Tree'], 
                'MAE': [metrics.mean_absolute_error(y_test, dt_pred)], 
                'MSE': [metrics.mean_squared_error(y_test, dt_pred)], 
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, dt_pred))]})

model_metrics = pd.concat([model_metrics,dt_metrics], ignore_index = True)

In [102]:
model_metrics

Unnamed: 0,Model,MAE,MSE,RMSE
0,Multivariate Linear Regression,412.286029,513125.961777,716.32811
1,Random Forest,279.770697,280622.134722,529.737798
2,Decision Tree,307.225628,440518.758772,663.715872


From the `model_metrics` DataFrame, we can see that the Random Forest model has smallest values for all the metrics. Thus it is our best performing model.