In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.diagnostic import het_white
# from category_encoders import OneHotEncoder

In [2]:
# Data set of Dubai Real Estate from 01/01/2021 till 11/01/2023
df = pd.read_csv('transactions-2023-01-11.csv')

In [3]:
def impute_data(df):
    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])
    # Replace NaN values with Propety Type
    df["Property Sub Type"] = df["Property Sub Type"].fillna("Land")
    # Replace Nan values where there is no closest Metro Station or Mall with "No metro around", "No mall around"
    df["Nearest Metro"] = df["Nearest Metro"].fillna("No metro around")
    df["Nearest Mall"] = df["Nearest Mall"].fillna("No mall around")
    df["Nearest Landmark"] = df["Nearest Landmark"].fillna("No landmark around")
    return df

In [4]:
def drop_excess_columns(data):
    # Drop high cardinality columns
    data = data.drop(columns=["Transaction Number", "Property ID", "Transaction Size (sq.m)", "Parking", "Project"])
    # Drop low-cardinality columns
    data = data.drop(columns=["Registration type", "Is Free Hold?", "Master Project"])
    # Drop leaky columns
    data = data.drop(columns=["Transaction sub type", "Property Type", "Room(s)", "No. of Buyer", "No. of Seller"])
    return data

In [5]:
def get_oil_price():
    # data from https://tradingeconomics.com/commodity/crude-oil
    r = requests.get('https://markets.tradingeconomics.com/chart?s=cl1:com&interval=1d&span=5y&securify=new&url=/commodity/crude-oil&AUTH=Iyl9RVWfzjLi7c1HYd9NvPSbYw4QDhtHNC8UlBnpCaBG6anMsTTRMHyqxcJokUmS&ohlc=0')
    
    oil_data = pd.DataFrame(r.json()['series'][0]['data']).rename(columns={"y": "price"})
    oil_data = oil_data[(oil_data['date'] > '2021-03-01') & (oil_data['date'] < '2022-02-04')]

    return oil_data
    

In [None]:
oil_data = get_oil_price()
oil_data

In [6]:
def drop_period_after_war(data):
    war_date = '2022-02-24'
    return data[data['Transaction Date'] < war_date]

In [7]:
# create empty dictionary with columns as keys by list comprehension
def quar_dict2(columns):  # takes as input list of column's names
    dict_keys = {column: None for column in columns}
    return dict_keys

In [8]:
def from_iterable(iterables):
    # chain.from_iterable(['ABC', 'DEF']) --> A B C D E F
    for it in iterables:
        for element in it:
            yield element

In [9]:
df = impute_data(df)
df = drop_excess_columns(df)
df = drop_period_after_war(df)
df = df.drop(columns=["Transaction Date", "Area"])
df = df[df['Property Sub Type'].isin(
    ["Commercial", "Flat", 
     "Hotel Apartment", "Hotel Rooms", 
     "Office", "Residential", "Residential / Attached Villas",
     "Residential Flats", "Stacked Townhouses", "Villa"]
)]

In [10]:
def split (df, target="Amount"):
    # subset target vector from explanatory matrix
    target = target
    y = df[target]
    X = df.drop(columns=[target])
#    X = df.drop(columns=["Area", "Property Sub Type", "Nearest Metro", "Nearest Mall", "Nearest Landmark", "Amount"]) # [target]
    return y, X


In [11]:
y, X = split(df, target="Amount")
y.reset_index(drop=True, inplace=True)
X.head(2)

Unnamed: 0,Transaction Type,Usage,Property Sub Type,Property Size (sq.m),Nearest Metro,Nearest Mall,Nearest Landmark
0,Mortgage,Residential,Flat,138.93,No metro around,No mall around,IMG World Adventures
1,Mortgage,Residential,Flat,87.26,No metro around,No mall around,IMG World Adventures


In [12]:
# Get numeric and categorical features
num_features = list(X._get_numeric_data().columns)
cat_features = list(set(X.columns) - set(num_features))

In [13]:
ohe_for_sm = OneHotEncoder(handle_unknown="ignore")
X_for_sm_cat = ohe_for_sm.fit(X[cat_features]).transform(X[cat_features]).toarray()
# len(pd.DataFrame(X_for_sm_cat, columns=ohe_for_sm.get_feature_names_out()))
X_for_sm_num = sm.add_constant(X[num_features]).reset_index(drop=True)
X_for_sm = pd.concat([X_for_sm_num, pd.DataFrame(X_for_sm_cat, columns=ohe_for_sm.get_feature_names_out()).reset_index(drop=True)], axis=1)
pd.DataFrame(X_for_sm).head(2)

Unnamed: 0,const,Property Size (sq.m),Property Sub Type_Commercial,Property Sub Type_Flat,Property Sub Type_Hotel Apartment,Property Sub Type_Hotel Rooms,Property Sub Type_Office,Property Sub Type_Residential,Property Sub Type_Residential / Attached Villas,Property Sub Type_Residential Flats,...,Nearest Landmark_Dubai International Airport,Nearest Landmark_Dubai Parks and Resorts,Nearest Landmark_Expo 2020 Site,Nearest Landmark_Global Village,Nearest Landmark_Hamdan Sports Complex,Nearest Landmark_IMG World Adventures,Nearest Landmark_Jabel Ali,Nearest Landmark_Motor City,Nearest Landmark_No landmark around,Nearest Landmark_Sports City Swimming Academy
0,1.0,138.93,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,87.26,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
# Linear Regression Model
model = sm.OLS(y, X_for_sm).fit()

In [15]:
model.summary()

0,1,2,3
Dep. Variable:,Amount,R-squared:,0.308
Model:,OLS,Adj. R-squared:,0.308
Method:,Least Squares,F-statistic:,509.3
Date:,"Tue, 14 Feb 2023",Prob (F-statistic):,0.0
Time:,22:13:47,Log-Likelihood:,-1837000.0
No. Observations:,99566,AIC:,3674000.0
Df Residuals:,99478,BIC:,3675000.0
Df Model:,87,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.458e+04,9.38e+05,0.048,0.962,-1.79e+06,1.88e+06
Property Size (sq.m),2281.0013,11.357,200.851,0.000,2258.742,2303.260
Property Sub Type_Commercial,2.389e+06,6.63e+05,3.605,0.000,1.09e+06,3.69e+06
Property Sub Type_Flat,-2.15e+06,2.66e+06,-0.809,0.419,-7.36e+06,3.06e+06
Property Sub Type_Hotel Apartment,-2.462e+06,2.69e+06,-0.916,0.359,-7.73e+06,2.8e+06
Property Sub Type_Hotel Rooms,-3.106e+06,2.7e+06,-1.150,0.250,-8.4e+06,2.19e+06
Property Sub Type_Office,-2.592e+06,2.69e+06,-0.965,0.334,-7.86e+06,2.67e+06
Property Sub Type_Residential,6.675e+05,2.67e+06,0.250,0.802,-4.56e+06,5.89e+06
Property Sub Type_Residential / Attached Villas,1.669e+06,2.24e+07,0.074,0.941,-4.23e+07,4.57e+07

0,1,2,3
Omnibus:,375039.145,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1061452140813.08
Skew:,84.202,Prob(JB):,0.0
Kurtosis:,15997.705,Cond. No.,5.44e+16


### Normality of the residuals

In [None]:
# Jarque-Bera test:
def jarque_bera_test (model):
    name = ["Jarque-Bera", "Chi^2 two-tail prob.", "Skew", "Kurtosis"]
    test = sms.jarque_bera(model.resid)
    output = dict(zip(name, test))
    return output

In [None]:
# Heteroskedasticity tests - не понятно почему не работает-_-
def white_test_het(model):
    # White test
    residuals_array = model.resid

    #perform White's test
    white_test = sm.stats.diagnostic.het_white(residuals_array,  model.model.exog)

    #define labels to use for output of White's test
    labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']
    output = dict(zip(labels, np.around(white_test, 2)))
    #print results of White's test
    print(dict(zip(labels, np.around(white_test, 2))))
    return output

Part 1A - Classical Regression Analysis <br>
a) Choose a model and state the assumptions about the nature of
explanatory variables and disturbances. The model should include
categorical and continuous explanatory variables as discussed before; <br>
- We are using ordinary least squares method. <br>
1. Transaction Type - Whether transaction was conducted via direct sale, using mortgage or gifted (Sales 67%, Mort 29%, 4%) <br>
2. Usage - Residential or Commercial property (Res 95%, Comm 5%) <br>
3. Area - Names of the Dubai's areas <br>
4. Property Sub Type -  <br>
5. Property Size (sq.m) - Size of property <br>
6. Nearest Metro - Name of nearest metro or "No metro around" <br>
7. Nearest Mall - Name of nearest mall or "No mall around" <br>
8. Nearest Landmark - Name of nearest landmark or "No landmark around" <br>


In [19]:
df["Nearest Metro"].value_counts(normalize=True)

No metro around                         0.279583
Buj Khalifa Dubai Mall Metro Station    0.104102
Business Bay Metro Station              0.076452
Dubai Internet City                     0.060864
Nakheel Metro Station                   0.060251
Damac Properties                        0.053281
First Abu Dhabi Bank Metro Station      0.039421
Rashidiya Metro Station                 0.032119
Jumeirah Lakes Towers                   0.030703
Jumeirah Beach Residency                0.024416
Sharaf Dg Metro Station                 0.023161
Palm Jumeirah                           0.019836
Ibn Battuta Metro Station               0.019726
Harbour Tower                           0.019384
Mina Seyahi                             0.019023
Noor Bank Metro Station                 0.018751
Creek Metro Station                     0.016944
Dubai Marina                            0.015768
Jumeirah Beach Resdency                 0.013941
Marina Towers                           0.009973
DANUBE Metro Station

In [18]:
# X["Area"].unique()
sub = df[df["Nearest Metro"].str.contains("station")] 
# for i in sub["Property Sub Type"].unique():
#     print(i) 382
# sub["Property Sub Type"].unique()
sub

Unnamed: 0,Transaction Type,Usage,Property Sub Type,Amount,Property Size (sq.m),Nearest Metro,Nearest Mall,Nearest Landmark


In [None]:
mask = df[df['Property Sub Type'].isin(
    ["Commercial", "Flat", 
     "Hotel Apartment", "Hotel Rooms", 
     "Office", "Residential", "Residential / Attached Villas",
     "Residential Flats", "Stacked Townhouses", "Villa"]
)]
# df_sub = df[mask]

mask.info()

In [None]:
df_sub = (
    df
    .groupby(df["Property Sub Type"])
    .mean(["Amount", "Property Size (sq.m)"])
    .round(0)
)
df_sub

In [None]:
sns.regplot(x=pd.DataFrame(X_for_sm)["Property Size (sq.m)"], y=y)
plt.xlabel("Property Size (sq.m)")
plt.ylabel("Price")
plt.title("Property Size (sq.m) vs Price");

ModuleNotFoundError: No module named 'geojson'