In [1]:
import pandas as pd
import requests
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
from category_encoders import OneHotEncoder

In [2]:
# Dubai real-estate transactions from 2021-01-01 to 2023-01-11
# (the export date is also encoded in the file name).
df = pd.read_csv('transactions-2023-01-11.csv')

In [3]:
def impute_data(df):
    """Parse the transaction date and fill missing categorical values.

    The work is done on a copy, so the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Transaction Date", "Property Sub Type",
        "Nearest Metro", "Nearest Mall" and "Nearest Landmark".

    Returns
    -------
    pandas.DataFrame
        A new frame in which "Transaction Date" has been converted with
        ``pd.to_datetime`` and the missing values of the four text columns
        have been replaced by a placeholder label.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    placeholders = {
        # Missing sub types are filled with the literal "Land".
        # NOTE(review): presumably these rows are land plots - confirm.
        "Property Sub Type": "Land",
        # No metro station / mall / landmark recorded for the property.
        "Nearest Metro": "No metro around",
        "Nearest Mall": "No mall around",
        "Nearest Landmark": "No landmark around",
    }

    # Copy first: assigning columns on `df` directly would silently modify
    # the caller's frame as a side effect.
    imputed = df.copy()
    imputed["Transaction Date"] = pd.to_datetime(imputed["Transaction Date"])
    for column, placeholder in placeholders.items():
        imputed[column] = imputed[column].fillna(placeholder)
    return imputed

In [4]:
def drop_excess_columns(data):
    """Return ``data`` without the columns excluded from modelling.

    The removed columns are organised in three groups (high-cardinality,
    low-cardinality and leaky), dropped one group after the other. The input
    frame is not modified; a ``KeyError`` is raised if a listed column is
    missing.
    """
    high_cardinality = [
        "Transaction Number",
        "Property ID",
        "Transaction Size (sq.m)",
        "Parking",
        "Project",
    ]
    low_cardinality = ["Registration type", "Is Free Hold?", "Master Project"]
    leaky = [
        "Transaction sub type",
        "Property Type",
        "Room(s)",
        "No. of Buyer",
        "No. of Seller",
    ]

    trimmed = data
    for group in (high_cardinality, low_cardinality, leaky):
        trimmed = trimmed.drop(columns=group)
    return trimmed


In [5]:
# Fill in missing values first, then strip the columns that are not used as features.
df = df.pipe(impute_data).pipe(drop_excess_columns)

In [6]:
def get_oil_price(timeout=30):
    """Download crude-oil prices and keep the rows inside a fixed date window.

    Data source: https://tradingeconomics.com/commodity/crude-oil

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Built from ``series[0]['data']`` of the JSON response, with the ``y``
        column renamed to ``price``. Only rows whose ``date`` is strictly
        between '2020-12-31' and '2022-01-01' are kept.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    # FIXME(security): the AUTH token is hard-coded in the query string.
    # Move it to an environment variable or a secret store before sharing
    # this notebook.
    url = 'https://markets.tradingeconomics.com/chart?s=cl1:com&interval=1w&span=5y&securify=new&url=/commodity/crude-oil&AUTH=nCUl2XKce%2BoKz2Gux8jbnsBR9lI4I5ttOWajwJM4oCel63SvMd94HQyeKvWrZV4R&ohlc=0'

    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    oil_data = pd.DataFrame(response.json()['series'][0]['data']).rename(columns={"y": "price"})

    # NOTE(review): the bounds are compared with the raw `date` values as
    # returned by the API; this assumes ISO-formatted strings or
    # datetime-like values - confirm.
    in_window = (oil_data['date'] > '2020-12-31') & (oil_data['date'] < '2022-01-01')
    return oil_data[in_window]
    

In [None]:
# Fetch the oil-price table (performs an HTTP request, so network access is
# required) and preview its first rows.
oil_data = get_oil_price()
oil_data.head()

In [8]:
def drop_period_after_war(data):
    """Keep only the rows dated strictly before 2022-02-24.

    The comparison is made on the "Transaction Date" column; the input frame
    is not modified, a filtered selection is returned instead.
    """
    cutoff = '2022-02-24'
    is_before_cutoff = data['Transaction Date'] < cutoff
    return data[is_before_cutoff]

In [9]:
# Keep only the transactions dated before the cutoff used by drop_period_after_war.
df = df.pipe(drop_period_after_war)

In [None]:
# df['Usage'].hist()

In [None]:
# df['Amount'].hist()

In [None]:
# from pandas.plotting import scatter_matrix

# scatter_matrix(df[['Amount', 'Property Size (sq.m)']], figsize=(12, 8))

In [None]:
# df[['Amount', 'Property Size (sq.m)']]

In [10]:
def split(df, test_size=0.2, random_state=42):
    """Split the transactions into train and test feature/target sets.

    The target is the "Amount" column. The features are every other column
    except "Transaction Date", which is dropped together with the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Amount" and "Transaction Date".
    test_size : float, optional
        Passed to ``train_test_split`` (default 0.2).
    random_state : int, optional
        Passed to ``train_test_split`` for a reproducible shuffle (default 42).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    target = "Amount"
    y = df[target]
    X = df.drop(columns=[target, "Transaction Date"])

    # The four parts used to be computed and then thrown away (the function
    # returned `df` itself), so callers never received the split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [13]:
def baseline(y_train=None):
    """Print the mean target and the MSE of always predicting that mean.

    Parameters
    ----------
    y_train : pandas.Series, optional
        Training targets. When omitted, the notebook-level variable
        ``y_train`` is used, which keeps a bare ``baseline()`` call working.

    Returns
    -------
    None
        The two figures are printed, nothing is returned.

    Raises
    ------
    NameError
        If ``y_train`` is neither passed nor defined at notebook level.
    """
    if y_train is None:
        # Backward compatibility with the argument-less call, which relied on
        # a variable left in the notebook namespace by an earlier cell.
        try:
            y_train = globals()["y_train"]
        except KeyError:
            raise NameError(
                "name 'y_train' is not defined; pass it as baseline(y_train)"
            ) from None

    y_mean = y_train.mean()
    # Constant prediction: the mean repeated once per training row.
    y_pred_baseline = [y_mean] * len(y_train)

    print("Mean apt price:", round(y_mean, 2))

    # The MSE is divided by 1e9, so the printed figure is in billions (of
    # squared Amount units) - the label previously said "millions".
    baseline_mse = mean_squared_error(y_train, y_pred_baseline)
    print("Baseline MSE:", round(baseline_mse / 1_000_000_000, 2), "billions")

In [12]:
def regression(X_train=None, y_train=None):
    """Fit a one-hot-encoding + Ridge pipeline on the training data.

    Parameters
    ----------
    X_train, y_train : optional
        Training features and targets, passed straight to the pipeline's
        ``fit``. Any argument left out is taken from the notebook-level
        variable of the same name, which keeps a bare ``regression()`` call
        working.

    Returns
    -------
    The fitted pipeline (``OneHotEncoder(use_cat_names=True)`` followed by
    ``Ridge()``).

    Raises
    ------
    NameError
        If a training set is neither passed nor defined at notebook level.
    """
    if X_train is None or y_train is None:
        # Backward compatibility with the argument-less call, which relied on
        # variables left in the notebook namespace by an earlier cell.
        notebook_vars = globals()
        try:
            if X_train is None:
                X_train = notebook_vars["X_train"]
            if y_train is None:
                y_train = notebook_vars["y_train"]
        except KeyError as missing:
            raise NameError(
                f"name {missing.args[0]!r} is not defined; "
                "pass the training data as regression(X_train, y_train)"
            ) from None

    model = make_pipeline(
        OneHotEncoder(use_cat_names=True),
        Ridge(),
    )
    return model.fit(X_train, y_train)

In [None]:
# NOTE: this cell needs `model`, `X_train` and `y_train` in the notebook
# namespace; none of the cells visible above assigns them at top level, so
# they must be created before running it (e.g. `model = regression()`).
y_pred_training = model.predict(X_train)

# Dividing by 1e9 scales the MSE to billions (of squared Amount units); the
# label previously said "millions".
print("Training MSE:", round(mean_squared_error(y_train, y_pred_training) / 1_000_000_000, 2), "billions")