In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white

In [2]:
# Load the transactions CSV; later cells rebind `df` to the output of each
# preparation step, so this is the starting point of the pipeline.
df = pd.read_csv('transactions-2023-01-11.csv')

In [3]:
def drop_excess_columns(data):
    """Return ``data`` without the columns that are excluded from the analysis.

    The columns are removed in three groups (high cardinality, low
    cardinality, leaky), in that order. The input frame is not modified;
    ``DataFrame.drop`` returns a new frame each time.

    Raises
    ------
    KeyError
        If any of the listed columns is missing from ``data``.
    """
    column_groups = (
        # High cardinality columns
        ["Transaction Number", "Property ID", "Transaction Size (sq.m)", "Parking", "Project"],
        # Low-cardinality columns
        ["Registration type", "Is Free Hold?", "Master Project"],
        # Leaky columns
        ["Transaction sub type", "Property Type", "Room(s)", "No. of Buyer", "No. of Seller"],
    )
    trimmed = data
    for group in column_groups:
        trimmed = trimmed.drop(columns=group)
    return trimmed


In [4]:
# Remove the high-cardinality, low-cardinality and leaky columns listed in
# drop_excess_columns.
df = drop_excess_columns(df)

In [5]:
def get_oil_price(start='2021-03-01', end='2022-02-04', timeout=30):
    """Download the crude-oil price series and keep the rows inside a date window.

    The series is requested from the tradingeconomics chart endpoint
    (data from https://tradingeconomics.com/commodity/crude-oil); the ``y``
    field of each point is renamed to ``price``.

    Parameters
    ----------
    start, end : str
        Exclusive bounds, compared against the ``date`` column with ``>`` and
        ``<``. When ``date`` holds strings such as ``'2021-03-01T00:00:00'``
        the comparison is lexicographic, so a timestamp on the ``start`` day
        sorts after the bare date and is kept, while timestamps on the ``end``
        day are dropped.
    timeout : float
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, so callers can add columns
        to it without writing into a slice of the downloaded frame.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # NOTE(review): the AUTH token is hard-coded in the URL and therefore
    # stored with the notebook — consider moving it to configuration.
    url = 'https://markets.tradingeconomics.com/chart?s=cl1:com&interval=1d&span=5y&securify=new&url=/commodity/crude-oil&AUTH=Iyl9RVWfzjLi7c1HYd9NvPSbYw4QDhtHNC8UlBnpCaBG6anMsTTRMHyqxcJokUmS&ohlc=0'
    r = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    r.raise_for_status()

    oil_data = pd.DataFrame(r.json()['series'][0]['data']).rename(columns={"y": "price"})
    in_window = (oil_data['date'] > start) & (oil_data['date'] < end)

    return oil_data[in_window].copy()
    

In [6]:
# Fetch the oil price series (this performs a network request) and display it.
oil_data = get_oil_price()
oil_data

Unnamed: 0,date,x,price,percentChange,change
880,2021-03-01T00:00:00,1614556800000,60.64,-1.398374,-0.86
881,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89
882,2021-03-03T00:00:00,1614729600000,61.28,2.560669,1.53
883,2021-03-04T00:00:00,1614816000000,63.83,4.161227,2.55
884,2021-03-05T00:00:00,1614902400000,66.09,3.540655,2.26
...,...,...,...,...,...
1116,2022-01-28T00:00:00,1643328000000,85.43,0.164146,0.14
1117,2022-01-31T00:00:00,1643587200000,86.49,1.240782,1.06
1118,2022-02-01T00:00:00,1643673600000,86.46,-0.034686,-0.03
1119,2022-02-02T00:00:00,1643760000000,86.69,0.266019,0.23


In [7]:
def drop_period_after_war(data):
    """Keep only the rows whose 'Transaction Date' is before 2022-02-24.

    The comparison is strict, so rows dated on the cutoff itself are removed
    as well. The input frame is left untouched; a filtered frame is returned.
    """
    cutoff = '2022-02-24'
    is_before_cutoff = data['Transaction Date'].lt(cutoff)
    return data.loc[is_before_cutoff]

In [8]:
# Keep only the transactions dated before the 2022-02-24 cutoff.
df = drop_period_after_war(df)

In [9]:
def merge_oil_to_data(data, oil):
    """Attach the same-day oil columns to every transaction.

    Both frames get a ``date_without_time`` key (``YYYY-MM-DD`` text derived
    from ``data['Transaction Date']`` and ``oil['date']``) and are joined on
    it with pandas' default inner join, so transactions on days without an
    oil row are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions; must contain a 'Transaction Date' column.
    oil : pandas.DataFrame
        Oil series; must contain a 'date' column.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data``, then ``date_without_time``, then the columns
        of ``oil``.

    Notes
    -----
    The key is added with ``assign`` on copies, so neither input frame is
    modified. This also avoids chained-assignment warnings when the inputs
    are slices of other frames.
    """
    key = 'date_without_time'
    data_keyed = data.assign(
        **{key: pd.to_datetime(data['Transaction Date']).dt.strftime('%Y-%m-%d')}
    )
    oil_keyed = oil.assign(
        **{key: pd.to_datetime(oil['date']).dt.strftime('%Y-%m-%d')}
    )
    return data_keyed.merge(oil_keyed, on=key)

In [10]:
# Join the oil columns onto the transactions by calendar date.
df = merge_oil_to_data(df, oil_data)

In [11]:
def add_norm_amount(data):
    """Return ``data`` with a min-max scaled copy of 'Amount' in 'norm_amount'.

    ``norm_amount = (Amount - min) / (max - min)``, so the smallest amount
    maps to 0 and the largest to 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric 'Amount' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra 'norm_amount' column; the input frame is
        not modified.
    """
    amount = data['Amount']
    lowest = amount.min()
    span = amount.max() - lowest
    # A constant column has zero range; divide by 1 instead so every row
    # scales to 0.0 rather than 0/0 = NaN.
    if span == 0:
        span = 1
    return data.assign(norm_amount=(amount - lowest) / span)

In [12]:
# Add the min-max scaled copy of Amount as the norm_amount column.
df = add_norm_amount(df)

In [13]:
# Display the merged frame, including the new norm_amount column.
df

Unnamed: 0,Transaction Date,Transaction Type,Usage,Area,Property Sub Type,Amount,Property Size (sq.m),Nearest Metro,Nearest Mall,Nearest Landmark,date_without_time,date,x,price,percentChange,change,norm_amount
0,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,138.93,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
1,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,87.26,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
2,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,76.13,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
3,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,130.05,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
4,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,52.15,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69700,2021-07-23 03:41:08,Mortgage,Residential,JUMEIRAH VILLAGE CIRCLE,Flat,888655.00,106.55,Dubai Internet City,Mall of the Emirates,Sports City Swimming Academy,2021-07-23,2021-07-23T00:00:00,1626998400000,72.07,0.222500,0.16,0.000189
69701,2021-07-30 01:08:56,Mortgage,Residential,ARABIAN RANCHES II,Residential,3283133.00,836.75,,,Motor City,2021-07-30,2021-07-30T00:00:00,1627603200000,73.95,0.448248,0.33,0.000698
69702,2021-07-30 01:42:39,Mortgage,Residential,Nad Al Shiba Third,,2270000.00,468.74,Creek Metro Station,City Centre Mirdif,,2021-07-30,2021-07-30T00:00:00,1627603200000,73.95,0.448248,0.33,0.000483
69703,2021-07-30 01:44:58,Mortgage,Residential,Nad Al Shiba Third,,2155300.00,469.30,Creek Metro Station,City Centre Mirdif,,2021-07-30,2021-07-30T00:00:00,1627603200000,73.95,0.448248,0.33,0.000458


In [14]:
def clean_outliers_in_data(data, quantile=0.95):
    """Filter rows using per-column quantile thresholds.

    A row is kept when its 'Amount' is below the ``quantile`` of 'Amount'
    OR its 'Property Size (sq.m)' is below the ``quantile`` of that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain numeric 'Amount' and 'Property Size (sq.m)' columns.
    quantile : float
        Quantile level used for both thresholds (default 0.95).

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass the filter; ``data`` is not modified.
    """
    amount_col = 'Amount'
    size_col = 'Property Size (sq.m)'
    # Take the quantile of the two numeric columns only: calling
    # DataFrame.quantile on a frame that also holds text columns warns on
    # older pandas and raises TypeError on pandas 2.x.
    limits = data[[amount_col, size_col]].quantile(quantile)
    # NOTE(review): with `|` a row is removed only when it is at or above the
    # threshold in BOTH columns, so extreme values in a single column remain.
    # If the intent is to trim each column, this should be `&` — confirm.
    keep = (data[amount_col] < limits[amount_col]) | (data[size_col] < limits[size_col])
    return data[keep]

In [15]:
# Filter rows using the 95th-percentile thresholds of Amount and
# Property Size (sq.m).
df = clean_outliers_in_data(df)

  quantiles = df.quantile(0.95)


In [16]:
# Display the frame after the outlier filter.
df

Unnamed: 0,Transaction Date,Transaction Type,Usage,Area,Property Sub Type,Amount,Property Size (sq.m),Nearest Metro,Nearest Mall,Nearest Landmark,date_without_time,date,x,price,percentChange,change,norm_amount
0,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,138.93,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
1,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,87.26,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
2,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,76.13,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
3,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,130.05,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
4,2021-03-02 13:53:10,Mortgage,Residential,AL BARARI,Flat,1435909.09,52.15,,,IMG World Adventures,2021-03-02,2021-03-02T00:00:00,1614643200000,59.75,-1.467678,-0.89,0.000305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69700,2021-07-23 03:41:08,Mortgage,Residential,JUMEIRAH VILLAGE CIRCLE,Flat,888655.00,106.55,Dubai Internet City,Mall of the Emirates,Sports City Swimming Academy,2021-07-23,2021-07-23T00:00:00,1626998400000,72.07,0.222500,0.16,0.000189
69701,2021-07-30 01:08:56,Mortgage,Residential,ARABIAN RANCHES II,Residential,3283133.00,836.75,,,Motor City,2021-07-30,2021-07-30T00:00:00,1627603200000,73.95,0.448248,0.33,0.000698
69702,2021-07-30 01:42:39,Mortgage,Residential,Nad Al Shiba Third,,2270000.00,468.74,Creek Metro Station,City Centre Mirdif,,2021-07-30,2021-07-30T00:00:00,1627603200000,73.95,0.448248,0.33,0.000483
69703,2021-07-30 01:44:58,Mortgage,Residential,Nad Al Shiba Third,,2155300.00,469.30,Creek Metro Station,City Centre Mirdif,,2021-07-30,2021-07-30T00:00:00,1627603200000,73.95,0.448248,0.33,0.000458


In [17]:
from pandas.plotting import scatter_matrix

def plot_matrix(data):
    """Draw two scatter matrices for ``data`` and summarise its 'Amount' column.

    The first matrix plots 'norm_amount' against 'Property Size (sq.m)', the
    second 'norm_amount' against 'price'; both use a 12x8 figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'norm_amount', 'Property Size (sq.m)', 'price' and
        'Amount' columns.

    Returns
    -------
    pandas.Series
        ``data['Amount'].describe()``.
    """
    # Read from the `data` argument (not a notebook-level variable) so the
    # function plots whichever frame it is given.
    scatter_matrix(data[['norm_amount', 'Property Size (sq.m)']], figsize=(12, 8))
    scatter_matrix(data[['norm_amount', 'price']], figsize=(12, 8))
    return data['Amount'].describe()

In [None]:
# Draw the scatter matrices; the cell's text output is the describe() summary
# of Amount returned by plot_matrix.
plot_matrix(df)

count    6.791300e+04
mean     1.939577e+06
std      3.442441e+06
min      1.000000e+03
25%      7.222897e+05
50%      1.327500e+06
75%      2.298800e+06
max      3.200000e+08
Name: Amount, dtype: float64

In [None]:
def split(df, target="Amount"):
    """Separate the target column from the explanatory columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the ``target`` column.
    target : str
        Name of the column to predict (default "Amount").

    Returns
    -------
    tuple
        ``(y, X)``: the target column as a Series, followed by ``df`` without
        that column. Note the order, target first.
    """
    explanatory = df.drop(columns=[target])
    response = df[target]
    return response, explanatory

In [None]:
# Separate the Amount target (y) from the remaining columns (X).
y, X = split(df, target="Amount")

In [None]:
# Get numeric and categorical features.
# Both lists follow X's column order so they are identical on every run
# (a set difference has no stable order).
# NOTE(review): X is df without the target only, so it still holds
# norm_amount, which add_norm_amount derives from Amount — confirm it is
# excluded before fitting a model.
num_features = list(X.select_dtypes(include="number").columns)
cat_features = [column for column in X.columns if column not in num_features]