In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw Dubai real-estate transactions and preview the first rows.
raw_csv_path = "real_estate_dubai_original.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,transaction_id,trans_group_en,procedure_name_en,instance_date,property_type_en,property_sub_type_en,property_usage_en,reg_type_en,area_name_en,building_name_en,...,nearest_metro_en,nearest_mall_en,rooms_en,has_parking,procedure_area,actual_worth,meter_sale_price,no_of_parties_role_1,no_of_parties_role_2,no_of_parties_role_3
0,3-9-2006-163,Gifts,Grant,16-10-2006,Villa,,Residential,Existing Properties,Mankhool,,...,ADCB Metro Station,Dubai Mall,,0,3162.42,12000000.0,3794.56,3.0,1.0,0.0
1,3-9-2019-2944,Gifts,Grant,13-11-2019,Land,,Residential,Existing Properties,Mankhool,,...,ADCB Metro Station,Dubai Mall,,0,209.09,916659.0,4384.04,2.0,4.0,0.0
2,2-13-2001-690,Mortgages,Mortgage Registration,20-08-2001,Building,,Residential / Commercial,Existing Properties,Oud Metha,,...,Oud Metha Metro Station,Dubai Mall,,0,1337.8,4519342.0,3378.19,1.0,1.0,0.0
3,2-13-2020-9477,Mortgages,Mortgage Registration,30-11-2020,Building,,Residential,Existing Properties,Al Bada,,...,Trade Centre Metro Station,Dubai Mall,,0,278.71,2500000.0,8969.9,1.0,1.0,0.0
4,2-13-1999-532,Mortgages,Mortgage Registration,26-04-1999,Villa,,Residential,Existing Properties,Al Bada,,...,Trade Centre Metro Station,Dubai Mall,,0,3626.93,1900000.0,523.86,1.0,1.0,0.0


In [None]:
# Summarize the raw frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548772 entries, 0 to 1548771
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   transaction_id        1548772 non-null  object 
 1   trans_group_en        1548772 non-null  object 
 2   procedure_name_en     1548772 non-null  object 
 3   instance_date         1548772 non-null  object 
 4   property_type_en      1548772 non-null  object 
 5   property_sub_type_en  1227150 non-null  object 
 6   property_usage_en     1548772 non-null  object 
 7   reg_type_en           1548772 non-null  object 
 8   area_name_en          1548772 non-null  object 
 9   building_name_en      1092627 non-null  object 
 10  project_number        1106097 non-null  float64
 11  project_name_en       1106097 non-null  object 
 12  master_project_en     1319601 non-null  object 
 13  nearest_landmark_en   1265574 non-null  object 
 14  nearest_metro_en      1098916 non-

In [None]:
# Work on a separate copy from here on, so the raw frame `df` stays intact
# if a preprocessing step turns out to be wrong.

df_clean = df.copy(deep=True)

In [None]:
# instance_date contains a few Hijri dates (years such as 1417). With
# errors='coerce' those values come back as NaT, so the NaT mask identifies them.
# Both the mask and the lookup use the untouched raw frame `df`: the cleaned
# frame later loses these rows and has its column converted to datetime, so the
# mask would no longer match it if this cell were run again.

dates = pd.to_datetime(df['instance_date'], dayfirst=True, errors='coerce')
problematic_rows = df[dates.isna()]
# Show the distinct raw strings that could not be parsed.
print(problematic_rows['instance_date'].unique())

['04-02-1417' '30-01-1420' '02-07-1416' '23-11-1422']


In [None]:
# Only four distinct Hijri dates were found, so those records are simply removed:
# parse the column (unparseable values become NaT), then drop the NaT rows.

parsed_dates = pd.to_datetime(df_clean['instance_date'], dayfirst=True, errors='coerce')
df_clean['instance_date'] = parsed_dates
df_clean = df_clean.dropna(subset=['instance_date'])

In [None]:
# Either actual_worth or meter_sale_price could be the prediction target.
# Whichever one is chosen, the other must be removed to avoid data leakage.
# actual_worth is the target here, so meter_sale_price is dropped.

df_clean = df_clean.drop('meter_sale_price', axis='columns')

In [None]:
# Re-check columns, dtypes and non-null counts after the date parsing and column drop.
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
Index: 1548768 entries, 0 to 1548771
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   transaction_id        1548768 non-null  object        
 1   trans_group_en        1548768 non-null  object        
 2   procedure_name_en     1548768 non-null  object        
 3   instance_date         1548768 non-null  datetime64[ns]
 4   property_type_en      1548768 non-null  object        
 5   property_sub_type_en  1227150 non-null  object        
 6   property_usage_en     1548768 non-null  object        
 7   reg_type_en           1548768 non-null  object        
 8   area_name_en          1548768 non-null  object        
 9   building_name_en      1092627 non-null  object        
 10  project_number        1106097 non-null  float64       
 11  project_name_en       1106097 non-null  object        
 12  master_project_en     1319601 non-null  object 

In [None]:
# Distribution of the room labels; missing values are counted as their own bucket.

room_label_counts = df_clean['rooms_en'].value_counts(dropna=False)
room_label_counts

Unnamed: 0_level_0,count
rooms_en,Unnamed: 1_level_1
1 B/R,413331
,342271
2 B/R,293217
Studio,209930
3 B/R,165861
Office,57723
4 B/R,51478
Shop,6548
5 B/R,5266
PENTHOUSE,1626


In [None]:
# rooms_en has 342271 missing values and is stored as text. To use it in linear
# regression, each label is converted to a numeric room count and the missing
# counts are imputed afterwards.

def convert_rooms(value):
    """
    Convert a textual room label to a numeric room count.

    Mapping (case-insensitive, surrounding whitespace ignored):
    - missing value, or the 'Unknown' placeholder -> NaN
    - 'Studio', 'Single Room' -> 1
    - '1 B/R', '2 B/R', etc. -> the number in the label
    - 'PENTHOUSE', or a 'B/R' label without digits -> NaN
      (residential unit whose room count is not stated)
    - anything else, e.g. 'Office', 'Shop', 'GYM' -> 0 (non-residential)

    Returns an int for known counts and NaN where the count is unknown, so the
    caller can impute the NaN values.
    """
    if pd.isna(value):
        return np.nan
    label = str(value).strip().upper()
    if label in ('STUDIO', 'SINGLE ROOM'):
        return 1
    # 'UNKNOWN' is the placeholder this notebook writes into rooms_en for missing
    # labels; treating it as missing keeps the conversion correct on a re-run.
    # A penthouse is a residential unit, so 0 rooms would be wrong for it.
    if label in ('UNKNOWN', 'PENTHOUSE'):
        return np.nan
    if 'B/R' in label:
        digits = ''.join(ch for ch in label if ch.isdigit())
        # A bedroom label with no number is still residential: count unknown.
        return int(digits) if digits else np.nan
    # Non-residential types
    return 0

# Numeric room count for regression, derived from the textual label.
room_counts = df_clean['rooms_en'].apply(convert_rooms)

# Median room count of each property type, broadcast back to every row.
median_by_type = room_counts.groupby(df_clean['property_type_en']).transform('median')

# Fill missing counts with the property-type median; where a property type has
# no median at all, treat the property as having no rooms.
df_clean['rooms_en_num'] = room_counts.fillna(median_by_type).fillna(0)

# Keep the textual column usable for visualization by labelling the gaps.
df_clean['rooms_en'] = df_clean['rooms_en'].fillna('Unknown')

In [None]:
# Re-inspect the frame now that rooms_en_num has been added and rooms_en filled.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1548768 entries, 0 to 1548771
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   transaction_id        1548768 non-null  object        
 1   trans_group_en        1548768 non-null  object        
 2   procedure_name_en     1548768 non-null  object        
 3   instance_date         1548768 non-null  datetime64[ns]
 4   property_type_en      1548768 non-null  object        
 5   property_sub_type_en  1227150 non-null  object        
 6   property_usage_en     1548768 non-null  object        
 7   reg_type_en           1548768 non-null  object        
 8   area_name_en          1548768 non-null  object        
 9   building_name_en      1092627 non-null  object        
 10  project_number        1106097 non-null  float64       
 11  project_name_en       1106097 non-null  object        
 12  master_project_en     1319601 non-null  object 

In [None]:
# Categorical columns that still contain missing values. project_number is a
# numeric identifier in the raw data (float64); the others are text.
categorical_cols = [
    'property_sub_type_en', 'building_name_en', 'project_number',
    'project_name_en', 'master_project_en', 'nearest_landmark_en',
    'nearest_metro_en', 'nearest_mall_en'
]


def _numeric_id_to_label(value):
    """Render a numeric identifier as text, e.g. 1234.0 -> '1234'."""
    number = float(value)
    # Integral IDs lose the float suffix; anything else keeps its plain text form.
    return str(int(number)) if number.is_integer() else str(value)


# Fill missing values with 'Unknown' (standard for categorical features)
for col in categorical_cols:
    column = df_clean[col]
    if pd.api.types.is_numeric_dtype(column):
        # Filling a float column with a string would leave floats and 'Unknown'
        # mixed in one object column, so the IDs are turned into text labels
        # first. Mapping through a dict of the distinct IDs leaves missing
        # values as NaN, and the dtype guard makes a re-run a no-op.
        labels = {v: _numeric_id_to_label(v) for v in column.dropna().unique()}
        column = column.map(labels)
    df_clean[col] = column.fillna('Unknown')

In [None]:
# Re-inspect dtypes and non-null counts after filling the categorical columns.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1548768 entries, 0 to 1548771
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   transaction_id        1548768 non-null  object        
 1   trans_group_en        1548768 non-null  object        
 2   procedure_name_en     1548768 non-null  object        
 3   instance_date         1548768 non-null  datetime64[ns]
 4   property_type_en      1548768 non-null  object        
 5   property_sub_type_en  1548768 non-null  object        
 6   property_usage_en     1548768 non-null  object        
 7   reg_type_en           1548768 non-null  object        
 8   area_name_en          1548768 non-null  object        
 9   building_name_en      1548768 non-null  object        
 10  project_number        1548768 non-null  object        
 11  project_name_en       1548768 non-null  object        
 12  master_project_en     1548768 non-null  object 

In [None]:
# The three no_of_parties_role_* columns still have some missing values.
# As noted when this step was written, there are only a few, so they are filled with zero.

party_count_cols = ['no_of_parties_role_1', 'no_of_parties_role_2', 'no_of_parties_role_3']
df_clean[party_count_cols] = df_clean[party_count_cols].fillna(0)

# transaction_id is needed neither for linear regression nor for visualization.
df_clean = df_clean.drop('transaction_id', axis='columns')

In [None]:
# Schema check after filling the party counts and dropping transaction_id.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1548768 entries, 0 to 1548771
Data columns (total 23 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   trans_group_en        1548768 non-null  object        
 1   procedure_name_en     1548768 non-null  object        
 2   instance_date         1548768 non-null  datetime64[ns]
 3   property_type_en      1548768 non-null  object        
 4   property_sub_type_en  1548768 non-null  object        
 5   property_usage_en     1548768 non-null  object        
 6   reg_type_en           1548768 non-null  object        
 7   area_name_en          1548768 non-null  object        
 8   building_name_en      1548768 non-null  object        
 9   project_number        1548768 non-null  object        
 10  project_name_en       1548768 non-null  object        
 11  master_project_en     1548768 non-null  object        
 12  nearest_landmark_en   1548768 non-null  object 

In [None]:
# Save the cleaned data as a CSV file (without the index) for the visualization work.

clean_csv_path = "dubai_real_estate_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)