In [41]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Import label encoder 
from sklearn import preprocessing 

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import pylab
import math

In [42]:
# importing dataset
data = pd.read_csv("Dataset.csv", encoding='cp1252')
data

Unnamed: 0,Name,Car Spaces,Bedrooms,Bathrooms,Floor area (m2),Land Size(m2),Waterfront,Location,Price
0,Bungalow House & Lot for sale in Lapu-Lapu Cit...,2.0,2.0,3.0,150.0,173.0,1.0,Lapu-Lapu,3800000.00
1,Ready for occupancy single detached Beach house,1.0,3.0,2.0,80.0,291.0,1.0,Argao,7078000.00
2,Two Storey House Villa with Pool - Camotes Isl...,5.0,3.0,2.0,385.4,1500.0,1.0,Camotes Island,11900000.00
3,A Luxury 5-Bedroom Resort-Style Home,4.0,5.0,5.0,360.0,462.0,0.0,Lapu-Lapu,52000000.00
4,"Townhouse for Sale in Banilad, Mandaue City",1.0,4.0,3.0,103.0,60.0,0.0,Mandaue,10500000.00
...,...,...,...,...,...,...,...,...,...
262,"House and Lot For Sale at Cordova, Cebu",,3.0,2.0,63.0,62.0,0.0,Cordova,2705711
263,A 5-bedroom Home for Sale in Maryville,,3.0,5.0,450.0,438.0,0.0,Cebu,30000000
264,3-BR House and Lot for Sale at Pacific Grand V...,,3.0,3.0,156.0,144.0,0.0,Lapu-Lapu,15000000
265,"House and Lot For Sale at Liloan, Cebu",2.0,3.0,3.0,150.0,170.0,0.0,Liloan,8000000


In [43]:
# renaming columns
data.columns = ['name', 'carspaces', 'bedrooms', 'bathrooms', 'floorarea', 'landsize', 'waterfront', 'location', 'price']

In [44]:
data[data['name'].duplicated() == True]

Unnamed: 0,name,carspaces,bedrooms,bathrooms,floorarea,landsize,waterfront,location,price
86,"House for sale at Tres de Abril St., Punta Pri...",4.0,5.0,5.0,328.0,200.5,0.0,Cebu,22850000
107,House and Lot for Sale in Maryville Subdivisio...,1.0,4.0,4.0,185.0,250.0,,Cebu,15000000
119,Briella (RFO) 2 storey single detached with ba...,2.0,5.0,4.0,210.0,180.0,,Minglanilla,9133966
139,4Br House and Lot For Sale in Banawa Cebu City,2.0,4.0,4.0,114.0,75.0,,Cebu,8652000
151,Furnished 3 Bedroom House for Sale near Cebu I...,1.0,3.0,3.0,200.0,140.0,,Cebu,18000000
156,2 BR-RFO TOWNHOUSE WITH 2 T&B AND 1 CARPORT/lo...,1.0,2.0,2.0,65.0,40.0,,Minglanilla,3500000
189,Two Storey House Villa with Pool - Camotes Isl...,,3.0,2.0,385.4,1500.0,1.0,Camotes,11900000
204,"10-Bedroom House and Lot for Sale, Cebu City",,10.0,10.0,527.0,267.0,0.0,Cebu,20000000
214,"1 Bedroom Townhouse in Agus, Lapu-Lapu City",,1.0,1.0,54.0,35.0,0.0,Lapu-Lapu,1500000
221,Beautiful exclusive pocket subdivision in Pobl...,1.0,3.0,2.0,61.0,59.0,0.0,Cebu,4200000


In [45]:
data.drop_duplicates(subset = ["name"], keep = 'first', inplace = True) 
data[data['name'].duplicated() == True]

Unnamed: 0,name,carspaces,bedrooms,bathrooms,floorarea,landsize,waterfront,location,price


In [46]:
data

Unnamed: 0,name,carspaces,bedrooms,bathrooms,floorarea,landsize,waterfront,location,price
0,Bungalow House & Lot for sale in Lapu-Lapu Cit...,2.0,2.0,3.0,150.0,173.0,1.0,Lapu-Lapu,3800000.00
1,Ready for occupancy single detached Beach house,1.0,3.0,2.0,80.0,291.0,1.0,Argao,7078000.00
2,Two Storey House Villa with Pool - Camotes Isl...,5.0,3.0,2.0,385.4,1500.0,1.0,Camotes Island,11900000.00
3,A Luxury 5-Bedroom Resort-Style Home,4.0,5.0,5.0,360.0,462.0,0.0,Lapu-Lapu,52000000.00
4,"Townhouse for Sale in Banilad, Mandaue City",1.0,4.0,3.0,103.0,60.0,0.0,Mandaue,10500000.00
...,...,...,...,...,...,...,...,...,...
262,"House and Lot For Sale at Cordova, Cebu",,3.0,2.0,63.0,62.0,0.0,Cordova,2705711
263,A 5-bedroom Home for Sale in Maryville,,3.0,5.0,450.0,438.0,0.0,Cebu,30000000
264,3-BR House and Lot for Sale at Pacific Grand V...,,3.0,3.0,156.0,144.0,0.0,Lapu-Lapu,15000000
265,"House and Lot For Sale at Liloan, Cebu",2.0,3.0,3.0,150.0,170.0,0.0,Liloan,8000000


In [47]:
# check for null values
display(data.isnull().any())

name          False
carspaces      True
bedrooms       True
bathrooms      True
floorarea      True
landsize       True
waterfront     True
location      False
price          True
dtype: bool

In [48]:
# identify the mean of carspaces column to fill in null columns
data['carspaces'].mean()

2.0122699386503067

In [49]:
# # filling null values
data[['carspaces']] = data[['carspaces']].fillna(data['carspaces'].mean())
display(data.isnull().any())

name          False
carspaces     False
bedrooms       True
bathrooms      True
floorarea      True
landsize       True
waterfront     True
location      False
price          True
dtype: bool

In [50]:
data[['bedrooms']] = data[['bedrooms']].fillna(data['bedrooms'].mean())
data[['bathrooms']] = data[['bathrooms']].fillna(data['bathrooms'].mean())
data[['floorarea']] = data[['floorarea']].fillna(data['floorarea'].mean())
data[['landsize']] = data[['landsize']].fillna(data['landsize'].mean())
data[['waterfront']] = data[['waterfront']].fillna('0')
display(data.isnull().any())

name          False
carspaces     False
bedrooms      False
bathrooms     False
floorarea     False
landsize      False
waterfront    False
location      False
price          True
dtype: bool

In [51]:
display(data.isnull().any())

name          False
carspaces     False
bedrooms      False
bathrooms     False
floorarea     False
landsize      False
waterfront    False
location      False
price          True
dtype: bool

In [53]:
# now, the price has the only column that contains null values
# we will be remove those rows with null prices since we are predicting the prices for house sales
data.dropna(axis = 0, how = 'any', inplace = True)
data

Unnamed: 0,name,carspaces,bedrooms,bathrooms,floorarea,landsize,waterfront,location,price
0,Bungalow House & Lot for sale in Lapu-Lapu Cit...,2.00000,2.0,3.0,150.0,173.0,1,Lapu-Lapu,3800000.00
1,Ready for occupancy single detached Beach house,1.00000,3.0,2.0,80.0,291.0,1,Argao,7078000.00
2,Two Storey House Villa with Pool - Camotes Isl...,5.00000,3.0,2.0,385.4,1500.0,1,Camotes Island,11900000.00
3,A Luxury 5-Bedroom Resort-Style Home,4.00000,5.0,5.0,360.0,462.0,0,Lapu-Lapu,52000000.00
4,"Townhouse for Sale in Banilad, Mandaue City",1.00000,4.0,3.0,103.0,60.0,0,Mandaue,10500000.00
...,...,...,...,...,...,...,...,...,...
262,"House and Lot For Sale at Cordova, Cebu",2.01227,3.0,2.0,63.0,62.0,0,Cordova,2705711
263,A 5-bedroom Home for Sale in Maryville,2.01227,3.0,5.0,450.0,438.0,0,Cebu,30000000
264,3-BR House and Lot for Sale at Pacific Grand V...,2.01227,3.0,3.0,156.0,144.0,0,Lapu-Lapu,15000000
265,"House and Lot For Sale at Liloan, Cebu",2.00000,3.0,3.0,150.0,170.0,0,Liloan,8000000


In [54]:
data.drop('name', axis = 1, inplace = True)
data

Unnamed: 0,carspaces,bedrooms,bathrooms,floorarea,landsize,waterfront,location,price
0,2.00000,2.0,3.0,150.0,173.0,1,Lapu-Lapu,3800000.00
1,1.00000,3.0,2.0,80.0,291.0,1,Argao,7078000.00
2,5.00000,3.0,2.0,385.4,1500.0,1,Camotes Island,11900000.00
3,4.00000,5.0,5.0,360.0,462.0,0,Lapu-Lapu,52000000.00
4,1.00000,4.0,3.0,103.0,60.0,0,Mandaue,10500000.00
...,...,...,...,...,...,...,...,...
262,2.01227,3.0,2.0,63.0,62.0,0,Cordova,2705711
263,2.01227,3.0,5.0,450.0,438.0,0,Cebu,30000000
264,2.01227,3.0,3.0,156.0,144.0,0,Lapu-Lapu,15000000
265,2.00000,3.0,3.0,150.0,170.0,0,Liloan,8000000


In [55]:
# identify if the location values are all provinces
location_list = data.location.unique()
location_list

array(['Lapu-Lapu', 'Argao', 'Camotes Island', 'Mandaue', 'Cebu',
       'Consolacion', 'Bogo', 'Oslob', 'Talisay', 'Liloan', 'Medellin',
       'Compostela', 'Minglanilla', 'Alcoy', 'Lapu-lapu', 'Cordova',
       'Toledo', 'Talamban', 'Borbon', 'Bantayan', 'Santander', 'Badian',
       'Banilad', 'Guadalupe', 'Naga', 'Lahug', 'Danao', 'Carcar',
       'Catmon', 'Guadalupe, Cebu', 'Carmen'], dtype=object)

In [56]:
data.loc[data['location'].str.contains('Guadalupe'), 'location'] = 'Cebu'
data.loc[data['location'].str.contains('Cordova'), 'location'] = 'Lapu-Lapu'
data.loc[data['location'].str.contains('Camotes'), 'location'] = 'Camotes Island'
data.loc[data['location'].str.contains('Bantayan'), 'location'] = 'Bantayan Island'

In [57]:
# sorting the list
location_list = data.location.unique()
location_list.sort()
location_list

array(['Alcoy', 'Argao', 'Badian', 'Banilad', 'Bantayan Island', 'Bogo',
       'Borbon', 'Camotes Island', 'Carcar', 'Carmen', 'Catmon', 'Cebu',
       'Compostela', 'Consolacion', 'Danao', 'Lahug', 'Lapu-Lapu',
       'Lapu-lapu', 'Liloan', 'Mandaue', 'Medellin', 'Minglanilla',
       'Naga', 'Oslob', 'Santander', 'Talamban', 'Talisay', 'Toledo'],
      dtype=object)

In [58]:
# identify the number of provinces in the dataset composed of 28 provinces
len(location_list)

28

In [59]:
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 

In [60]:
# Encode labels in column 'location'. 
# This converts the categorical into numerical
data['location']= label_encoder.fit_transform(data['location']) 
data

Unnamed: 0,carspaces,bedrooms,bathrooms,floorarea,landsize,waterfront,location,price
0,2.00000,2.0,3.0,150.0,173.0,1,16,3800000.00
1,1.00000,3.0,2.0,80.0,291.0,1,1,7078000.00
2,5.00000,3.0,2.0,385.4,1500.0,1,7,11900000.00
3,4.00000,5.0,5.0,360.0,462.0,0,16,52000000.00
4,1.00000,4.0,3.0,103.0,60.0,0,19,10500000.00
...,...,...,...,...,...,...,...,...
262,2.01227,3.0,2.0,63.0,62.0,0,16,2705711
263,2.01227,3.0,5.0,450.0,438.0,0,11,30000000
264,2.01227,3.0,3.0,156.0,144.0,0,16,15000000
265,2.00000,3.0,3.0,150.0,170.0,0,18,8000000


In [61]:
# TODO: check for duplicates