In [57]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib as plt
import matplotlib.pyplot as plotty
import missingno as msno

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

# Data Preparation

In [None]:
# price_paid               int64 [233500, 270000, 176000, 450000, 440000]
# money, possibly look into normalizing
# deed_date               object ['05/07/2017', '05/07/2017', '05/07/2017', '05/07/2017', '05/07/2017']
# date object convert to timestamp and then add days since value
# property_type           object ['F', 'T', 'O', 'S', 'D']
# categorical data perform label encoder
# new_build               object ['N', 'Y']
# categorical data perform label encoder
# estate_type             object ['L', 'F']
# categorical data perform label encoder
# district                object ['TOWER HAMLETS', 'CITY OF LONDON', 'HACKNEY', 'HARROW', 'WALTHAM FOREST']
# categorical data perform label encoder
# transaction_category    object ['A', 'B']
# categorical data perform label encoder

In [64]:
# Load the datasets
pricing_df = pd.read_csv('../data/raw/01_06_2014_until_04_06_2019.csv')
# We can load the 1gb master csv of postcode data into memory with pandas
postcode_df = pd.read_csv('../data/raw/NSPL_MAY_2019_UK.csv', low_memory=False)

In [67]:
left = pricing_df
right = postcode_df
merged = pd.merge(left, right, how='left', left_on='postcode', right_on='pcd')
# filename = '../data-science/summative-assignment/data/interim/left_merged.csv'
# merged.to_csv(filename, encoding='utf-8', index=False)
merged.shape

(345551, 57)

In [77]:
df = merged

False

In [None]:
# Check for duplicate values
df.duplicated().any()

In [74]:
# remove property type other
df[df['property_type'] != 'O']

(325379, 57)

In [None]:
# lat long
# Remove outliers
df = df.loc[(df['price_paid'] < (1000000)) & (df['price_paid'] > (10000))]


# Convert the date object type to... timestamp
df['time_stamp'] = pd.to_datetime(df['deed_date'])
# Create features for year and month
df['year'] = df['time_stamp'].dt.year
df['month'] = df['time_stamp'].dt.month
# create a lambda to give days since
days_since = lambda x: x - df.time_stamp.min()
# apply the lambda on time_stamp, writing to new column
df['days_since'] = df['time_stamp'].map(days_since)
# encode the 'days since' and write to new column
df['days_since_encoded'] = df['days_since'].dt.days
# Setting the index to timestamp for time series operations
df.index = df['time_stamp']
# Dropping unnecessary column
df.drop(['time_stamp'], axis=1)


# round to the nearest thousand £
df.price_paid.round(decimals=3)

df.dropna()

df.drop(['unique_id', 'paon', 'saon', 'linked_data_url', 'street', 'locality', 'county', 'town', 'postcode'], axis=1)

df['property_type_encoded'] = df[['property_type']].apply(preprocessing.LabelEncoder().fit_transform)
df['new_build_encoded'] = df[['new_build']].apply(preprocessing.LabelEncoder().fit_transform)
df['estate_type_encoded'] = df[['estate_type']].apply(preprocessing.LabelEncoder().fit_transform)
df['transaction_category_encoded'] = df[['transaction_category']].apply(preprocessing.LabelEncoder().fit_transform)

In [None]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [11]:
# subset - district days since time series latlon outliers removed 

Unnamed: 0_level_0,price_paid,deed_date,property_type,new_build,estate_type,district,transaction_category,price_log,property_type_encoded,new_build_encoded,estate_type_encoded,transaction_category_encoded,time_stamp,year,month,days_since,days_since_encoded
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-01-25,301000,25/01/2017,O,N,F,CROYDON,B,12.614866,2,0,0,1,2017-01-25,2017,1,1115 days,1115
2017-04-28,373500,28/04/2017,F,N,L,CROYDON,A,12.830673,1,0,1,0,2017-04-28,2017,4,1208 days,1208
2015-02-10,353000,02/10/2015,F,N,L,CROYDON,A,12.774223,1,0,1,0,2015-02-10,2015,2,400 days,400
2015-02-20,350000,20/02/2015,F,N,L,CROYDON,A,12.765688,1,0,1,0,2015-02-20,2015,2,410 days,410
2014-05-09,362500,05/09/2014,F,N,L,CROYDON,A,12.800780,1,0,1,0,2014-05-09,2014,5,123 days,123
2017-12-19,845000,19/12/2017,S,N,F,CROYDON,A,13.647092,3,0,0,0,2017-12-19,2017,12,1443 days,1443
2017-06-12,302000,06/12/2017,F,N,L,CROYDON,B,12.618182,1,0,1,1,2017-06-12,2017,6,1253 days,1253
2014-12-09,250000,12/09/2014,F,N,L,CROYDON,A,12.429216,1,0,1,0,2014-12-09,2014,12,337 days,337
2014-12-06,282500,12/06/2014,F,N,L,CROYDON,A,12.551434,1,0,1,0,2014-12-06,2014,12,334 days,334
2017-08-18,368000,18/08/2017,F,N,L,CROYDON,A,12.815838,1,0,1,0,2017-08-18,2017,8,1320 days,1320


# Comparing Various Models

In [55]:
# Split data into predictors X and output Y
predictors = ['days_since_encoded']
X = df[predictors]
y = df['price_paid']
# Split 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# prepare models
df = unpickled
models = []

models.append(('LR', LinearRegression()))
models.append(('PER', Perceptron()))
models.append(('LA', Lasso()))
models.append(('DTR', DecisionTreeRegressor()))
models.append(('RFR', RandomForestRegressor()))
models.append(('GPR', GaussianProcessRegressor()))
models.append(('SVR', SVR()))
models.append(('ABR', AdaBoostRegressor()))

# results = []
names = []

# evaluate each model in turn
for name, model in models:
    model.fit(X, y)
    y_pred = model.predict(X_test)
    score = model.score(X, y)
    # Evaluate the performance of the algorithm with metrics
    # Mean Absolute Error
    mae = metrics.mean_absolute_error(y_test, y_pred)
    # Mean Squared Error
    mse = metrics.mean_squared_error(y_test, y_pred)
    # Root Mean Squared Error
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    msg = "%s: %f, mae: %f, mse: %f, rmse: %f" % (name, score, mae, mse, rmse)
    print(msg)

# # boxplot algorithm comparison ##
# fig = plotty.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# plotty.boxplot(results)
# ax.set_xticklabels(names)
# plotty.show()

LR: 0.039963, mae: 113872.068191, mse: 22101559013.093288, rmse: 148665.930909
PER: 0.000371, mae: 290186.670065, mse: 101226717576.423538, rmse: 318161.464631
LA: 0.039963, mae: 113872.068291, mse: 22101559013.596973, rmse: 148665.930911
DTR: 0.274004, mae: 98452.641995, mse: 17419098714.415501, rmse: 131981.433219
RFR: 0.247033, mae: 101830.287645, mse: 18169989513.280945, rmse: 134796.103480
GPR: 0.274004, mae: 98452.645342, mse: 17419089520.866940, rmse: 131981.398390
SVR: -0.022830, mae: 116270.677055, mse: 23461384913.474060, rmse: 153171.096861
ABR: -0.068060, mae: 128213.113324, mse: 24889382707.543934, rmse: 157763.692615
