In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.diagnostic import het_white
# from category_encoders import OneHotEncoder

In [2]:
# Data set of Dubai Real Estate from 01/01/2021 till 11/01/2023
df = pd.read_csv('transactions-2023-01-11.csv')

In [3]:
def impute_data(df):
    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])
    # Replace NaN values with Propety Type
    df["Property Sub Type"] = df["Property Sub Type"].fillna("Land")
    # Replace Nan values where there is no closest Metro Station or Mall with "No metro around", "No mall around"
    df["Nearest Metro"] = df["Nearest Metro"].fillna("No metro around")
    df["Nearest Mall"] = df["Nearest Mall"].fillna("No mall around")
    df["Nearest Landmark"] = df["Nearest Landmark"].fillna("No landmark around")
    return df

In [4]:
def drop_excess_columns(data):
    # Drop high cardinality columns
    data = data.drop(columns=["Transaction Number", "Property ID", "Transaction Size (sq.m)", "Parking", "Project"])
    # Drop low-cardinality columns
    data = data.drop(columns=["Registration type", "Is Free Hold?", "Master Project"])
    # Drop leaky columns
    data = data.drop(columns=["Transaction sub type", "Property Type", "Room(s)", "No. of Buyer", "No. of Seller"])
    return data

In [5]:
def get_oil_price():
    # data from https://tradingeconomics.com/commodity/crude-oil
    r = requests.get('https://markets.tradingeconomics.com/chart?s=cl1:com&interval=1w&span=5y&securify=new&url=/commodity/crude-oil&AUTH=nCUl2XKce%2BoKz2Gux8jbnsBR9lI4I5ttOWajwJM4oCel63SvMd94HQyeKvWrZV4R&ohlc=0')
    oil_data = pd.DataFrame(r.json()['series'][0]['data']).rename(columns={"y": "price"})
    oil_data = oil_data[(oil_data['date'] > '2020-12-31') & (oil_data['date'] < '2022-01-01')]

    return oil_data

In [6]:
def drop_period_after_war(data):
    war_date = '2022-02-24'
    return data[data['Transaction Date'] < war_date]

In [7]:
# create empty dictionary with columns as keys by list comprehension
def quar_dict2(columns):  # takes as input list of column's names
    dict_keys = {column: None for column in columns}
    return dict_keys

In [8]:
def from_iterable(iterables):
    # chain.from_iterable(['ABC', 'DEF']) --> A B C D E F
    for it in iterables:
        for element in it:
            yield element

In [9]:
df = impute_data(df)
df = drop_excess_columns(df)
df = drop_period_after_war(df)
df = df.drop(columns="Transaction Date")

In [10]:
def split (df, target="Amount"):
    # subset target vector from explanatory matrix
    target = target
    y = df[target]  
    X = df.drop(columns=["Area", "Property Sub Type", "Nearest Metro", "Nearest Mall", "Nearest Landmark", "Amount"]) # [target]
    return y, X


In [11]:
y, X = split(df, target="Amount")
y.reset_index(drop=True, inplace=True)
X.head(2)

Unnamed: 0,Transaction Type,Usage,Property Size (sq.m)
0,Mortgage,Residential,138.93
1,Mortgage,Residential,87.26


In [12]:
# Get numeric and categorical features
num_features = list(X._get_numeric_data().columns)
cat_features = list(set(X.columns) - set(num_features))

In [13]:
ohe_for_sm = OneHotEncoder(handle_unknown="ignore")
X_for_sm_cat = ohe_for_sm.fit(X[cat_features]).transform(X[cat_features]).toarray()
# len(pd.DataFrame(X_for_sm_cat, columns=ohe_for_sm.get_feature_names_out()))
X_for_sm_num = sm.add_constant(X[num_features]).reset_index(drop=True)
X_for_sm = pd.concat([X_for_sm_num, pd.DataFrame(X_for_sm_cat, columns=ohe_for_sm.get_feature_names_out()).reset_index(drop=True)], axis=1)
pd.DataFrame(X_for_sm).head(2)

Unnamed: 0,const,Property Size (sq.m),Transaction Type_Gifts,Transaction Type_Mortgage,Transaction Type_Sales,Usage_Commercial,Usage_Residential
0,1.0,138.93,0.0,1.0,0.0,0.0,1.0
1,1.0,87.26,0.0,1.0,0.0,0.0,1.0


In [14]:
# Linear Regression Model
model = sm.OLS(y, X_for_sm).fit()

In [15]:
model.summary()

0,1,2,3
Dep. Variable:,Amount,R-squared:,0.304
Model:,OLS,Adj. R-squared:,0.304
Method:,Least Squares,F-statistic:,11670.0
Date:,"Mon, 13 Feb 2023",Prob (F-statistic):,0.0
Time:,01:59:01,Log-Likelihood:,-1985600.0
No. Observations:,106939,AIC:,3971000.0
Df Residuals:,106934,BIC:,3971000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.451e+06,1.25e+05,27.650,0.000,3.21e+06,3.7e+06
Property Size (sq.m),2454.8072,11.558,212.391,0.000,2432.154,2477.461
Transaction Type_Gifts,1.339e+06,3.22e+05,4.157,0.000,7.08e+05,1.97e+06
Transaction Type_Mortgage,1.437e+06,1.71e+05,8.409,0.000,1.1e+06,1.77e+06
Transaction Type_Sales,6.747e+05,1.53e+05,4.404,0.000,3.74e+05,9.75e+05
Usage_Commercial,6.248e+06,2.4e+05,25.988,0.000,5.78e+06,6.72e+06
Usage_Residential,-2.797e+06,1.53e+05,-18.293,0.000,-3.1e+06,-2.5e+06

0,1,2,3
Omnibus:,373996.989,Durbin-Watson:,1.958
Prob(Omnibus):,0.0,Jarque-Bera (JB):,525382086271.19
Skew:,67.19,Prob(JB):,0.0
Kurtosis:,10860.802,Cond. No.,6.36e+18


### Normality of the residuals

In [27]:
# Jarque-Bera test:
def jarque_bera_test (model):
    name = ["Jarque-Bera", "Chi^2 two-tail prob.", "Skew", "Kurtosis"]
    test = sms.jarque_bera(model.resid)
    output = dict(zip(name, test))
    return output

In [20]:
# Heteroskedasticity tests - не понятно почему не работает-_-
def white_test_het(model):
    # White test
    residuals_array = model.resid

    #perform White's test
    white_test = sm.stats.diagnostic.het_white(residuals_array,  model.model.exog)

    #define labels to use for output of White's test
    labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']
    output = dict(zip(labels, np.around(white_test, 2)))
    #print results of White's test
    print(dict(zip(labels, np.around(white_test, 2))))
    return output

In [None]:
sns.regplot(x=pd.DataFrame(X_for_sm)["Property Size (sq.m)"], y=y)
plt.xlabel("Property Size (sq.m)")
plt.ylabel("Price")
plt.title("Property Size (sq.m) vs Price");