In [1]:
import json
import geojson
import gzip

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import requests

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


from sklearn.model_selection import train_test_split

import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.stats.diagnostic import het_white
# from category_encoders import OneHotEncoder

In [2]:
# Data set of Dubai Real Estate from 01/01/2021 till 11/01/2023
df = pd.read_csv('transactions-2023-01-11.csv')  # https://dubailand.gov.ae/en/open-data/real-estate-data/#/

In [3]:
def impute_data(df):
    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])
    # Replace NaN values with Propety Type
    df["Property Sub Type"] = df["Property Sub Type"].fillna("Land")
    # Replace Nan values where there is no closest Metro Station or Mall with "No metro around", "No mall around"
    df["Nearest Metro"] = df["Nearest Metro"].fillna("No metro around")
    df["Nearest Mall"] = df["Nearest Mall"].fillna("No mall around")
    df["Nearest Landmark"] = df["Nearest Landmark"].fillna("No landmark around")
    return df

In [4]:
def drop_excess_columns(data):
    # Drop high cardinality columns
    data = data.drop(columns=["Transaction Number", "Property ID", "Transaction Size (sq.m)", "Parking", "Project"])
    # Drop low-cardinality columns
    data = data.drop(columns=["Registration type", "Is Free Hold?", "Master Project"])
    # Drop leaky columns
    data = data.drop(columns=["Transaction sub type", "Property Type", "Room(s)", "No. of Buyer", "No. of Seller"])
    return data

In [5]:
def get_oil_price():
    # data from https://tradingeconomics.com/commodity/crude-oil
    r = requests.get('https://markets.tradingeconomics.com/chart?s=cl1:com&interval=1d&span=5y&securify=new&url=/commodity/crude-oil&AUTH=8pTHKtPlPegH%2F6HowPfl8zp75wwc%2FNa1S0o8Y7yWDYKHEaMj%2FIlYM%2F7osRfkTcfZ&ohlc=0')
    
    oil_data = pd.DataFrame(r.json()['series'][0]['data']).rename(columns={"y": "price"})
    oil_data = oil_data[(oil_data['date'] > '2021-03-01') & (oil_data['date'] < '2022-02-04')]

    return oil_data
    

In [6]:
# oil = get_oil_price()
# oil

In [7]:
def drop_period_after_war(data):
    war_date = '2022-02-24'
    return data[data['Transaction Date'] < war_date]

In [8]:
# create empty dictionary with columns as keys by list comprehension
def quar_dict2(columns):  # takes as input list of column's names
    dict_keys = {column: None for column in columns}
    return dict_keys

In [9]:
def from_iterable(iterables):
    # chain.from_iterable(['ABC', 'DEF']) --> A B C D E F
    for it in iterables:
        for element in it:
            yield element

In [10]:
df = impute_data(df)
df = drop_excess_columns(df)
df = drop_period_after_war(df)
df = df.drop(columns=["Transaction Date"])
df = df[df['Property Sub Type'].isin(
    ["Commercial", "Flat", 
     "Hotel Apartment", "Hotel Rooms", 
     "Office", "Residential", "Residential / Attached Villas",
     "Residential Flats", "Stacked Townhouses", "Villa"]
)]
df = df.merge(df["Area"].value_counts(), left_on="Area", right_index=True).drop(columns="Area_x")
df['Area'] = df['Area'].str.upper()

In [11]:
with open("dubai.geojson") as f:  # https://www.kaggle.com/datasets/subhanahsan/dubai-neighborhoods?select=dubai.geojson
    gj = geojson.load(f)
features = gj['features']
zones = pd.DataFrame.from_dict(features)["properties"].apply(pd.Series).drop(columns=["CNAME_A", "COMMUNITY_A", "COMMUNITY_E"])
zones['CNAME_E'] = zones['CNAME_E'].str.upper()

In [12]:
# import urllib.request
# from bs4 import BeautifulSoup

# #specify the url
# url = 'https://en.wikipedia.org/wiki/List_of_communities_in_Dubai'

# #query the website and return the html to the variable ‘page’
# page = urllib.request.urlopen(url)

# #parse the html using beautiful soup and store in variable `soup`
# soup = BeautifulSoup(page, 'html.parser')

# #find all table elements from the web page
# tables = soup.find_all('table')

# #find table with class="wikitable sortable"
# table = soup.find('table', {'class':'wikitable sortable'})

# #get all table rows
# table_rows = table.find_all('tr')

# #Iterate through table rows and get corresponding values
# out=[]
# for tr in table_rows:
#     td = tr.find_all('td')
#     row = [tr.text.strip() for tr in td if tr.text.strip()]
#     if row:
#         out.append(row)

# # Create dataframe
# codes = pd.DataFrame(out, columns=["Code", "Name", "3", "4", "5", "6"]).drop(columns=["3", "4", "5", "6"]).sort_values("Code").reset_index(drop=True)
# codes['Name'] = codes['Name'].str.upper()

In [13]:
print(df.shape)
df.head()

(99566, 10)


Unnamed: 0,Area,Transaction Type,Usage,Property Sub Type,Amount,Property Size (sq.m),Nearest Metro,Nearest Mall,Nearest Landmark,Area_y
0,AL BARARI,Mortgage,Residential,Flat,1435909.09,138.93,No metro around,No mall around,IMG World Adventures,691
1,AL BARARI,Mortgage,Residential,Flat,1435909.09,87.26,No metro around,No mall around,IMG World Adventures,691
2,AL BARARI,Mortgage,Residential,Flat,1435909.09,76.13,No metro around,No mall around,IMG World Adventures,691
3,AL BARARI,Mortgage,Residential,Flat,1435909.09,130.05,No metro around,No mall around,IMG World Adventures,691
4,AL BARARI,Mortgage,Residential,Flat,1435909.09,52.15,No metro around,No mall around,IMG World Adventures,691


In [14]:
zones[zones["COMM_NUM"].str.contains("645")]

Unnamed: 0,CNAME_E,COMM_NUM,SHAPE_AREA,SHAPE_LEN,Sector,Population 2018,Population 2019,Area Sq Km,Latitude,Longitude
199,WADI AL SAFA 3,645,30192055.8013,29186.4146517,6,9599,9764,30.192307,25.108115,55.315494


In [15]:
codes[codes["Code"].str.contains("645")]

NameError: name 'codes' is not defined

In [None]:
check = df.merge(zones, how='left', left_on="Area", right_on="CNAME_E")

In [None]:
zones

In [None]:
#   https://www.dsc.gov.ae/en-us/Themes/Pages/Population-and-Vital-Statistics.aspx?Theme=42

stat = pd.read_excel("DSC_SYB_2021_01_02.xlsx", sheet_name=0, header=6)
stat_cols = {
    stat.columns[5]: "Sector & Community",
    stat.columns[4]: "Population Density (km2)",
    stat.columns[3]: "Area (km2)",
    stat.columns[2]: "Total population",
    stat.columns[6]: "Community Code",
}
stat = stat.rename(columns=stat_cols).drop(columns=stat.columns[:2]).dropna().reset_index(drop=True)
stat.head()

In [None]:
aa = check[check["CNAME_E"].isna()]
aa["Area"].value_counts(normalize=True).head(50).sum()

In [None]:
aa["Area"].value_counts(normalize=True)