In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.diagnostic import het_white
# from category_encoders import OneHotEncoder

In [10]:
# Dubai real-estate transactions covering 01/01/2021 through 11/01/2023
df = pd.read_csv(filepath_or_buffer='transactions-2023-01-11.csv')

In [12]:
def impute_data(df):
    """Parse the transaction date and fill missing categorical values.

    The work is done on a copy, so the frame passed in is left untouched and
    the cell that calls this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Transaction Date", "Property Sub Type",
        "Nearest Metro", "Nearest Mall" and "Nearest Landmark".

    Returns
    -------
    pandas.DataFrame
        A new frame with "Transaction Date" converted by ``pd.to_datetime``
        and the NaN values of the four categorical columns replaced by the
        placeholders listed below.
    """
    # Copy first: assigning columns on the argument itself would silently
    # rewrite the caller's raw data.
    df = df.copy()
    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])

    placeholders = {
        # Rows without a sub type are labelled "Land".
        "Property Sub Type": "Land",
        # No metro station / mall / landmark recorded for the property.
        "Nearest Metro": "No metro around",
        "Nearest Mall": "No mall around",
        "Nearest Landmark": "No landmark around",
    }
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)
    return df

In [None]:
def drop_excess_columns(data):
    """Return `data` without the columns excluded from the model.

    The columns are removed in three passes (high-cardinality, low-cardinality
    and leaky columns); a missing column raises ``KeyError`` as usual for
    ``DataFrame.drop``. The input frame is not modified.
    """
    column_groups = (
        # High-cardinality columns
        ["Transaction Number", "Property ID", "Transaction Size (sq.m)", "Parking", "Project"],
        # Low-cardinality columns
        ["Registration type", "Is Free Hold?", "Master Project"],
        # Leaky columns
        ["Transaction sub type", "Property Type", "Room(s)", "No. of Buyer", "No. of Seller"],
    )
    trimmed = data
    for group in column_groups:
        trimmed = trimmed.drop(columns=group)
    return trimmed

In [None]:
def get_oil_price(timeout=30):
    """Download the crude-oil price series and keep the analysis window.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout `requests` may block indefinitely, which would stall a full
        notebook run.

    Returns
    -------
    pandas.DataFrame
        Rows of the first series in the response whose ``date`` value lies
        strictly between '2021-03-01' and '2022-02-04'; the ``y`` column is
        renamed to ``price``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # data from https://tradingeconomics.com/commodity/crude-oil
    # NOTE(review): the AUTH token is hard-coded in the URL; consider moving it
    # to configuration / an environment variable before sharing the notebook.
    r = requests.get(
        'https://markets.tradingeconomics.com/chart?s=cl1:com&interval=1d&span=5y&securify=new&url=/commodity/crude-oil&AUTH=Iyl9RVWfzjLi7c1HYd9NvPSbYw4QDhtHNC8UlBnpCaBG6anMsTTRMHyqxcJokUmS&ohlc=0',
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()

    oil_data = pd.DataFrame(r.json()['series'][0]['data']).rename(columns={"y": "price"})
    # assumes `date` is comparable with ISO date strings - TODO confirm the
    # format returned by the endpoint
    oil_data = oil_data[(oil_data['date'] > '2021-03-01') & (oil_data['date'] < '2022-02-04')]

    return oil_data
    

In [None]:
# oil_data = get_oil_price()
# oil_data

In [None]:
def drop_period_after_war(data, war_date='2022-02-24'):
    """Keep only the transactions dated strictly before `war_date`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "Transaction Date" column that can be compared with a
        date string (the pipeline converts it with ``pd.to_datetime`` first).
    war_date : str, optional
        Exclusive upper bound for "Transaction Date". Defaults to
        '2022-02-24', the cut-off used throughout this analysis.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` with "Transaction Date" earlier than `war_date`.
    """
    return data[data['Transaction Date'] < war_date]

In [None]:
# build a dictionary whose keys are the given column names and whose values are all None
def quar_dict2(columns):  # takes as input list of column's names
    """Return ``{name: None}`` for every name in `columns`, in input order."""
    return dict.fromkeys(columns)

In [None]:
def from_iterable(iterables):
    """Lazily yield every element of every iterable in `iterables`, in order.

    Example: ``from_iterable(['ABC', 'DEF'])`` yields A B C D E F.
    """
    for inner in iterables:
        yield from inner

In [None]:
# Clean the raw frame, keep the pre-war period only and drop the columns that
# are not used as regressors.
df = (
    df
    .pipe(impute_data)
    .pipe(drop_excess_columns)
    .pipe(drop_period_after_war)
    .drop(columns=["Transaction Date", "Area"])
)
# Restrict the sample to the built property sub types listed here.
df = df.loc[
    df['Property Sub Type'].isin([
        "Commercial", "Flat",
        "Hotel Apartment", "Hotel Rooms",
        "Office", "Residential", "Residential / Attached Villas",
        "Residential Flats", "Stacked Townhouses", "Villa",
    ])
]

In [None]:
def split(df, target="Amount"):
    """Separate the target vector from the explanatory matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the target column and the explanatory columns.
    target : str, optional
        Name of the target column (default "Amount").

    Returns
    -------
    tuple
        ``(y, X)``: the target column as a Series, and `df` without that column.
    """
    response = df[target]
    features = df.drop(columns=[target])
    return response, features


In [None]:
y, X = split(df, target="Amount")
# Rebind instead of resetting in place: `y` comes straight from `df[...]`, and
# an in-place index change on it may also alter the column cached by `df`
# (pandas without copy-on-write).
y = y.reset_index(drop=True)
X.head(2)

In [None]:
# Get numeric and categorical features.
# `select_dtypes` is the public counterpart of the private `_get_numeric_data`
# (bool is listed explicitly so boolean columns stay on the numeric side).
num_features = list(X.select_dtypes(include=["number", "bool"]).columns)
# Keep the original column order: a set difference would give an arbitrary
# order that can change between kernel runs and reshuffle the encoded columns.
cat_features = [column for column in X.columns if column not in num_features]

In [None]:
# Build the statsmodels design matrix: constant + numeric columns + dummies.
# drop="first" removes one reference level per categorical feature; keeping
# every level together with the constant makes the columns perfectly
# collinear (dummy-variable trap), so the OLS coefficients are not identified.
ohe_for_sm = OneHotEncoder(drop="first", handle_unknown="ignore")
X_for_sm_cat = ohe_for_sm.fit_transform(X[cat_features]).toarray()
# Reset the index so the numeric part lines up row-by-row with the freshly
# built (0..n-1 indexed) dummy frame.
X_for_sm_num = sm.add_constant(X[num_features]).reset_index(drop=True)
X_for_sm = pd.concat(
    [
        X_for_sm_num,
        pd.DataFrame(X_for_sm_cat, columns=ohe_for_sm.get_feature_names_out()),
    ],
    axis=1,
)
X_for_sm.head(2)

In [None]:
# Fit the linear regression by ordinary least squares (statsmodels)
model = sm.OLS(endog=y, exog=X_for_sm).fit()

In [None]:
# Display the results table of the fitted OLS model.
model.summary()

### Normality of the residuals

In [None]:
# Jarque-Bera test:
def jarque_bera_test(model):
    """Run the Jarque-Bera test on the residuals of a fitted model.

    Parameters
    ----------
    model
        Fitted results object exposing a ``resid`` attribute.

    Returns
    -------
    dict
        The values returned by ``sms.jarque_bera`` keyed, in order, by
        "Jarque-Bera", "Chi^2 two-tail prob.", "Skew" and "Kurtosis".
    """
    labels = ("Jarque-Bera", "Chi^2 two-tail prob.", "Skew", "Kurtosis")
    statistics = sms.jarque_bera(model.resid)
    return dict(zip(labels, statistics))

In [None]:
# Heteroskedasticity tests - unclear why this does not work -_-
def white_test_het(model):
    """Run White's heteroskedasticity test on a fitted model.

    Parameters
    ----------
    model
        Fitted results object exposing ``resid`` and ``model.exog``.

    Returns
    -------
    dict
        The four values returned by ``het_white``, rounded to two decimals and
        keyed by "Test Statistic", "Test Statistic p-value", "F-Statistic" and
        "F-Test p-value". The same dictionary is also printed.
    """
    labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']

    # White's test: residuals against the model's own design matrix
    raw_result = sm.stats.diagnostic.het_white(model.resid, model.model.exog)

    output = dict(zip(labels, np.around(raw_result, 2)))
    print(output)
    return output

Part 1A - Classical Regression Analysis <br>
a) Choose a model and state the assumptions about the nature of
explanatory variables and disturbances. The model should include
categorical and continuous explanatory variables as discussed before; <br>
- We use the ordinary least squares (OLS) method. The variables considered are: <br>
1. Transaction Type - Whether the transaction was conducted via direct sale, using a mortgage, or gifted (Sales 67%, Mortgage 29%, Gifts 4%) <br>
2. Usage - Residential or Commercial property (Res 95%, Comm 5%) <br>
3. Area - Names of Dubai's areas (note: this column is dropped during preprocessing above, so it does not enter the fitted model) <br>
4. Property Sub Type - Sub-category of the property (e.g. Flat, Villa, Office, Hotel Apartment) <br>
5. Property Size (sq.m) - Size of property <br>
6. Nearest Metro - Name of nearest metro or "No metro around" <br>
7. Nearest Mall - Name of nearest mall or "No mall around" <br>
8. Nearest Landmark - Name of nearest landmark or "No landmark around" <br>


In [None]:
# Relative frequency of each "Nearest Metro" value (shares sum to 1).
df["Nearest Metro"].value_counts(normalize=True)

In [None]:
# Rows whose "Nearest Metro" value contains the (case-sensitive) text "station"
sub = df.loc[df["Nearest Metro"].str.contains("station")]
sub

In [None]:
# NOTE: despite its name, `mask` holds the filtered rows themselves, not a
# boolean mask.
mask = df.loc[
    df['Property Sub Type'].isin([
        "Commercial", "Flat",
        "Hotel Apartment", "Hotel Rooms",
        "Office", "Residential", "Residential / Attached Villas",
        "Residential Flats", "Stacked Townhouses", "Villa",
    ])
]

mask.info()

In [None]:
# Average price and size per property sub type, rounded to whole units.
# The columns are selected explicitly: the first positional argument of
# GroupBy.mean is `numeric_only`, not a list of columns, so passing the list
# there never restricted the result to these two columns.
df_sub = (
    df
    .groupby("Property Sub Type")[["Amount", "Property Size (sq.m)"]]
    .mean()
    .round(0)
)
df_sub

In [None]:
# Scatter plot with a fitted regression line: property size against price
fig, ax = plt.subplots()
sns.regplot(x=X_for_sm["Property Size (sq.m)"], y=y, ax=ax)
ax.set_xlabel("Property Size (sq.m)")
ax.set_ylabel("Price")
ax.set_title("Property Size (sq.m) vs Price");