### Import Libraries ###

In [162]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)
import re

### Read in file and print top 5

In [163]:
df1 = pd.read_csv("../data/mas_housing 2.csv")
df1.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,"KLCC, Kuala Lumpur","RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,"Damansara Heights, Kuala Lumpur","RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,"Dutamas, Kuala Lumpur","RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,"Cheras, Kuala Lumpur",,,,,,,
4,"Bukit Jalil, Kuala Lumpur","RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished


### Determine dataset shape (# records vs # features)

In [164]:
df1.shape

(53883, 8)

### List features and determine null values

In [165]:
df1.isnull().sum()

Location             0
Price              248
Rooms             1706
Bathrooms         2013
Car Parks        17567
Property Type       25
Size              1063
Furnishing        6930
dtype: int64

### Since there is a high number of null records especially for the Car Parks feature imputations should be done after all data has been cleaned

### Clean 'Location' feature

In [166]:
# Determine counts for each location and look for duplicates due to variations in naming.

df1['Location'].value_counts().sort_index()

Location
ADIVA Desa ParkCity, Kuala Lumpur        2
Alam Damai, Kuala Lumpur                 1
Ampang Hilir, Kuala Lumpur             629
Ampang, Kuala Lumpur                  1234
Bandar Damai Perdana, Kuala Lumpur     153
                                      ... 
cyberjaya, Kuala Lumpur                  1
duta Nusantara, Kuala Lumpur             1
kepong, Kuala Lumpur                     1
taman cheras perdana, Kuala Lumpur       1
taman connaught, Kuala Lumpur            1
Name: count, Length: 112, dtype: int64

### Remove Kuala Lumpur from the location name

In [167]:
# For location only retain area name, remove KL and commas. KLCC should be standardized to KLCC.
df1['Location'] = df1['Location'].str.replace(', Kuala Lumpur', '', regex=False)
df1['Location'] = df1['Location'].str.replace(r'\bklcc\b', 'KLCC', case=False, regex=True)
df1.head()


Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,"RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,"RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,"RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Cheras,,,,,,,
4,Bukit Jalil,"RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished


In [168]:
# Some locations are named 'Other' which should be removed. Determine the count of 'Other' locations.
df1[df1['Location'] == 'Other'].value_counts()

Location  Price         Rooms  Bathrooms  Car Parks  Property Type                              Size                       Furnishing      
Other     RM 1,350,000  6      5.0        2.0        3.5-sty Terrace/Link House (Intermediate)  Land area : 20x70 sq. ft.  Partly Furnished    1
          RM 2,540,000  6      6.0        2.0        Bungalow                                   Land area : 10239 sq. ft.  Unfurnished         1
          RM 3,000,000  7+1    8.0        4.0        Bungalow (Corner)                          Land area : 5000 sq. ft.   Partly Furnished    1
          RM 3,300,000  7+1    8.0        4.0        Bungalow (Corner)                          Land area : 5000 sq. ft.   Fully Furnished     1
          RM 4,264,920  7+1    8.0        4.0        Bungalow (Corner)                          Built-up : 5,900 sq. ft.   Unfurnished         1
          RM 730,000    3+1    3.0        3.0        2-sty Terrace/Link House (Intermediate)    Land area : 22x75 sq. ft.  Partly Furni

In [169]:
# Remove 'Other' locations from the dataset.
df1 = df1[df1['Location'] != 'Other']
df1.reset_index(drop=True, inplace=True)
df1['Location'].value_counts()

Location
Mont Kiara                           5897
KLCC                                 5230
Cheras                               4533
Jalan Klang Lama (Old Klang Road)    2854
Setapak                              2710
                                     ... 
Santuari Park Pantai                    1
Bukit  Persekutuan                      1
Wangsa Melawati                         1
Taman Yarl OUG                          1
Taming Jaya                             1
Name: count, Length: 109, dtype: int64

In [170]:
# Standardize location names as camel case for consistency
df1['Location'] = df1['Location'].replace('SANTUARI PARK PANTAI', 'Santuari Park Pantai')
df1['Location'] = df1['Location'].replace('cyberjaya', 'Cyberjaya')
df1['Location'] = df1['Location'].replace('U-THANT', 'U-Thant')
df1['Location'] = df1['Location'].replace('TAMAN MELATI', 'Taman Melati')
df1['Location'] = df1['Location'].replace('SEMARAK', 'Semarak')
df1['Location'] = df1['Location'].replace('Off Gasing Indah,', 'Gasing Indah')
df1['Location'] = df1['Location'].replace('kepong', 'Kepong')
df1['Location'] = df1['Location'].replace('ADIVA Desa ParkCity', 'Adiva Desa Park City')
df1['Location'] = df1['Location'].replace('Sungai Long SL8', 'Sungai Long')
df1['Location'] = df1['Location'].replace('taman connaught', 'Taman Connaught')
df1['Location'] = df1['Location'].replace('duta Nusantara', 'Duta Nusantara')
df1['Location'] = df1['Location'].replace('taman cheras perdana', 'Taman Cheras Perdana')


In [171]:
# Remove specific unwanted locations
df1 = df1[~df1['Location'].str.contains('Landed Sd', na=False)]
df1.reset_index(drop=True, inplace=True)
df1 = df1[~df1['Location'].str.contains('Singapore', na=False)]
df1.reset_index(drop=True, inplace=True)

In [172]:
# Standardize location names for consistency
df1['Location'] = df1['Location'].str.replace('Taman Yarl, UOG', 'Taman Yarl', regex=False)
df1['Location'] = df1['Location'].str.replace('Taman Yarl OUG', 'Taman Yarl', regex=False)
df1['Location'] = df1['Location'].str.replace('TamanYarl', 'Taman Yarl', regex=False)
df1['Location'] = df1['Location'].str.replace('Bandar Sri damansara', 'Bandar Sri Damansara', regex=False)

In [173]:
# Check the final counts of locations after cleaning
df1['Location'].value_counts().sort_index()

Location
Adiva Desa Park City       2
Alam Damai                 1
Ampang                  1234
Ampang Hilir             629
Bandar Damai Perdana     153
                        ... 
Titiwangsa               178
U-Thant                    1
Ukay Heights               1
Wangsa Maju              971
Wangsa Melawati            1
Name: count, Length: 102, dtype: int64

In [174]:
# Add latitude and longitude for location to the dataset

# Read both CSVs
#df1 = pd.read_csv("../data/mas_housing 2.csv")
coords = pd.read_csv("../data/coordinates.csv", usecols=['Location', 'Latitude', 'Longitude'])

# Optional cleanup (recommended)
df1['Location_clean'] = df1['Location'].str.lower().str.strip().str.replace(", kuala lumpur", "", regex=False)
coords['Location_clean'] = coords['Location'].str.lower().str.strip()

# Create lookup dictionaries using the cleaned column
lat_map = dict(zip(coords['Location_clean'], coords['Latitude']))
lon_map = dict(zip(coords['Location_clean'], coords['Longitude']))

# Map coordinates to df1
df1['Latitude'] = df1['Location_clean'].map(lat_map)
df1['Longitude'] = df1['Location_clean'].map(lon_map)

# Drop helper column if not needed
df1.drop(columns='Location_clean', inplace=True)

unique_locations.to_csv("unique_locations.csv", index=False)
print("✅ Full unique list saved as unique_locations.csv")

# Check results
#print(df1[['Location', 'Latitude', 'Longitude']].head(50))

✅ Full unique list saved as unique_locations.csv


In [175]:
# Set display options to show all rows
pd.set_option('display.max_rows', None)

# Show complete counts for each column
print("### Location Counts:")
print(df1['Location'].value_counts().sort_index())
print(f"\nTotal unique locations: {df1['Location'].nunique()}")

print("\n### Latitude Counts:")
print(df1['Latitude'].value_counts().sort_index())
print(f"\nTotal unique latitudes: {df1['Latitude'].nunique()}")

print("\n### Longitude Counts:")
print(df1['Longitude'].value_counts().sort_index())
print(f"\nTotal unique longitudes: {df1['Longitude'].nunique()}")

# Reset display options back to default if needed
pd.reset_option('display.max_rows')

### Location Counts:
Location
Adiva Desa Park City                    2
Alam Damai                              1
Ampang                               1234
Ampang Hilir                          629
Bandar Damai Perdana                  153
Bandar Menjalara                      542
Bandar Sri Damansara                    7
Bandar Tasik Selatan                   93
Bangsar                              1897
Bangsar South                         487
Batu Caves                            308
Brickfields                           247
Bukit  Persekutuan                      1
Bukit Bintang                         848
Bukit Damansara                         1
Bukit Jalil                          2593
Bukit Kiara                             9
Bukit Ledang                           15
Bukit Tunku (Kenny Hills)             569
Canary Residence                        1
Casa Rimba                              1
Chan Sow Lin                           11
Cheras                               4533
City

In [176]:
df1

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,Latitude,Longitude
0,KLCC,"RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,3.159247,101.713366
1,Damansara Heights,"RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,3.152395,101.658445
2,Dutamas,"RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,3.032282,101.759884
3,Cheras,,,,,,,,3.099192,101.737423
4,Bukit Jalil,"RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,3.053346,101.680294
...,...,...,...,...,...,...,...,...,...,...
53853,Bangsar,"RM 5,100,000",5+1,4.0,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished,3.127560,101.679060
53854,Bangsar,"RM 5,000,000",5,4.0,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished,3.127560,101.679060
53855,Bangsar,"RM 5,500,000",5+1,4.0,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished,3.127560,101.679060
53856,Wangsa Maju,"RM 480,000",3,2.0,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished,3.205778,101.731861


In [177]:
# 'City Centre' in Locations provides innacurate coordinates from map API search and can be any place within KL. Therefore needs to be dropped.

# Remove records where Location is 'City Centre'
print(f"Records before removing City Centre: {len(df1)}")
df1 = df1[~df1['Location'].str.contains('City Centre', case=False, na=False)]
df1.reset_index(drop=True, inplace=True)
print(f"Records after removing City Centre: {len(df1)}")
df1

Records before removing City Centre: 53858
Records after removing City Centre: 53232


Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,Latitude,Longitude
0,KLCC,"RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,3.159247,101.713366
1,Damansara Heights,"RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,3.152395,101.658445
2,Dutamas,"RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,3.032282,101.759884
3,Cheras,,,,,,,,3.099192,101.737423
4,Bukit Jalil,"RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,3.053346,101.680294
...,...,...,...,...,...,...,...,...,...,...
53227,Bangsar,"RM 5,100,000",5+1,4.0,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished,3.127560,101.679060
53228,Bangsar,"RM 5,000,000",5,4.0,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished,3.127560,101.679060
53229,Bangsar,"RM 5,500,000",5+1,4.0,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished,3.127560,101.679060
53230,Wangsa Maju,"RM 480,000",3,2.0,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished,3.205778,101.731861


In [178]:
# Move lat and lon columns beside location

# Get current column list
cols = df1.columns.tolist()

# Remove Latitude and Longitude from their current positions
cols.remove('Latitude')
cols.remove('Longitude')

# Find the index of Location column
loc_idx = cols.index('Location')

# Insert Latitude and Longitude after Location
cols.insert(loc_idx + 1, 'Latitude')
cols.insert(loc_idx + 2, 'Longitude')

# Reorder the dataframe columns
df1 = df1[cols]

# Display the result
df1

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,3.159247,101.713366,"RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,3.152395,101.658445,"RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,3.032282,101.759884,"RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Cheras,3.099192,101.737423,,,,,,,
4,Bukit Jalil,3.053346,101.680294,"RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished
...,...,...,...,...,...,...,...,...,...,...
53227,Bangsar,3.127560,101.679060,"RM 5,100,000",5+1,4.0,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished
53228,Bangsar,3.127560,101.679060,"RM 5,000,000",5,4.0,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished
53229,Bangsar,3.127560,101.679060,"RM 5,500,000",5+1,4.0,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished
53230,Wangsa Maju,3.205778,101.731861,"RM 480,000",3,2.0,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished


In [179]:
# Replace proper coordinates for Keramat

# Replace Latitude/Longitude for 'Keramat'
mask = df1['Location'].astype(str).str.strip().str.lower() == 'keramat'
df1.loc[mask, 'Latitude'] = 3.1689
df1.loc[mask, 'Longitude'] = 101.7277

# Ensure numeric types and verify update
df1['Latitude'] = pd.to_numeric(df1['Latitude'], errors='coerce')
df1['Longitude'] = pd.to_numeric(df1['Longitude'], errors='coerce')
print(f"Keramat rows updated: {mask.sum()}")
df1.loc[mask, ['Location', 'Latitude', 'Longitude']].head(10)

# ...existing code...

Keramat rows updated: 204


Unnamed: 0,Location,Latitude,Longitude
1406,Keramat,3.1689,101.7277
2993,Keramat,3.1689,101.7277
3320,Keramat,3.1689,101.7277
3458,Keramat,3.1689,101.7277
3754,Keramat,3.1689,101.7277
4140,Keramat,3.1689,101.7277
4141,Keramat,3.1689,101.7277
4144,Keramat,3.1689,101.7277
4296,Keramat,3.1689,101.7277
4575,Keramat,3.1689,101.7277


In [180]:
# Replace proper coordinates for Pantai

# Replace Latitude/Longitude for 'Pantai'
mask = df1['Location'].astype(str).str.strip().str.lower() == 'pantai'
df1.loc[mask, 'Latitude'] = 3.1048
df1.loc[mask, 'Longitude'] = 101.6640

# Ensure numeric types and verify update
df1['Latitude'] = pd.to_numeric(df1['Latitude'], errors='coerce')
df1['Longitude'] = pd.to_numeric(df1['Longitude'], errors='coerce')
print(f"Pantai rows updated: {mask.sum()}")
df1.loc[mask, ['Location', 'Latitude', 'Longitude']].head(10)



Pantai rows updated: 344


Unnamed: 0,Location,Latitude,Longitude
561,Pantai,3.1048,101.664
1845,Pantai,3.1048,101.664
2473,Pantai,3.1048,101.664
2474,Pantai,3.1048,101.664
2726,Pantai,3.1048,101.664
2797,Pantai,3.1048,101.664
2798,Pantai,3.1048,101.664
2799,Pantai,3.1048,101.664
2856,Pantai,3.1048,101.664
2959,Pantai,3.1048,101.664


In [181]:
# Replace proper coordinates for Bukit Ledang

# Replace Latitude/Longitude for 'Bukit Ledang'
mask = df1['Location'].astype(str).str.strip().str.lower() == 'bukit ledang'
df1.loc[mask, 'Latitude'] = 3.1470
df1.loc[mask, 'Longitude'] = 101.6610

# Ensure numeric types and verify update
df1['Latitude'] = pd.to_numeric(df1['Latitude'], errors='coerce')
df1['Longitude'] = pd.to_numeric(df1['Longitude'], errors='coerce')
print(f"Bukit Ledang rows updated: {mask.sum()}")
df1.loc[mask, ['Location', 'Latitude', 'Longitude']].head(10)



Bukit Ledang rows updated: 15


Unnamed: 0,Location,Latitude,Longitude
4112,Bukit Ledang,3.147,101.661
11187,Bukit Ledang,3.147,101.661
16085,Bukit Ledang,3.147,101.661
18220,Bukit Ledang,3.147,101.661
19377,Bukit Ledang,3.147,101.661
19386,Bukit Ledang,3.147,101.661
21018,Bukit Ledang,3.147,101.661
21048,Bukit Ledang,3.147,101.661
23749,Bukit Ledang,3.147,101.661
23750,Bukit Ledang,3.147,101.661


In [182]:
# Replace proper coordinates for U-Thant

# Replace Latitude/Longitude for 'U-Thant'
mask = df1['Location'].astype(str).str.strip().str.lower() == 'u-thant'
df1.loc[mask, 'Latitude'] = 3.1575
df1.loc[mask, 'Longitude'] = 101.7380

# Ensure numeric types and verify update
df1['Latitude'] = pd.to_numeric(df1['Latitude'], errors='coerce')
df1['Longitude'] = pd.to_numeric(df1['Longitude'], errors='coerce')
print(f"U-Thant rows updated: {mask.sum()}")
df1.loc[mask, ['Location', 'Latitude', 'Longitude']].head(10)



U-Thant rows updated: 1


Unnamed: 0,Location,Latitude,Longitude
8541,U-Thant,3.1575,101.738


In [183]:
# Replace Latitude/Longitude for 'Happy Garden'
mask = df1['Location'].astype(str).str.strip().str.lower() == 'happy garden'
df1.loc[mask, 'Latitude'] = 3.0903
df1.loc[mask, 'Longitude'] = 101.6875

# Ensure numeric types and verify update
df1['Latitude'] = pd.to_numeric(df1['Latitude'], errors='coerce')
df1['Longitude'] = pd.to_numeric(df1['Longitude'], errors='coerce')
print(f"Happy Garden rows updated: {mask.sum()}")
df1.loc[mask, ['Location', 'Latitude', 'Longitude']].head(10)

Happy Garden rows updated: 2


Unnamed: 0,Location,Latitude,Longitude
7693,Happy Garden,3.0903,101.6875
7711,Happy Garden,3.0903,101.6875


In [184]:
df1


Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,3.159247,101.713366,"RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,3.152395,101.658445,"RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,3.032282,101.759884,"RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Cheras,3.099192,101.737423,,,,,,,
4,Bukit Jalil,3.053346,101.680294,"RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished
...,...,...,...,...,...,...,...,...,...,...
53227,Bangsar,3.127560,101.679060,"RM 5,100,000",5+1,4.0,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished
53228,Bangsar,3.127560,101.679060,"RM 5,000,000",5,4.0,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished
53229,Bangsar,3.127560,101.679060,"RM 5,500,000",5+1,4.0,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished
53230,Wangsa Maju,3.205778,101.731861,"RM 480,000",3,2.0,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished


### Clean Price Column

In [185]:
# Remove 'RM' and commas from Price and convert to numeric type

df1['Price'] = (
    df1['Price']
    .astype(str)
    .str.replace('RM', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .replace({'': np.nan, '<NA>': np.nan})
    .astype(float)
    .astype('Int64')  # Nullable integer
)

df1

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,3.159247,101.713366,1250000,2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,3.152395,101.658445,6800000,6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,3.032282,101.759884,1030000,3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Cheras,3.099192,101.737423,,,,,,,
4,Bukit Jalil,3.053346,101.680294,900000,4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished
...,...,...,...,...,...,...,...,...,...,...
53227,Bangsar,3.127560,101.679060,5100000,5+1,4.0,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished
53228,Bangsar,3.127560,101.679060,5000000,5,4.0,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished
53229,Bangsar,3.127560,101.679060,5500000,5+1,4.0,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished
53230,Wangsa Maju,3.205778,101.731861,480000,3,2.0,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished


In [186]:
# Verify Price column conversion

# 1. Summary statistics
print(df1['Price'].describe())

# 2. Show lowest and highest prices
print("Lowest prices:\n", df1['Price'].nsmallest(10))
print("Highest prices:\n", df1['Price'].nlargest(10))

df1.shape

count            52988.0
mean       2091531.63739
std      13920228.873494
min                 10.0
25%             580000.0
50%             985000.0
75%            1990000.0
max         1980000000.0
Name: Price, dtype: Float64
Lowest prices:
 39546     10
16169    308
19693    330
45964    330
21658    365
35422    380
21655    390
20099    400
4664     408
45080    493
Name: Price, dtype: Int64
Highest prices:
 20830    1980000000
11974    1600000000
4948     1123000000
34546     814572000
32234     415177610
17723     370416816
32235     370000000
29442     339768000
25612     262666800
7389      216000000
Name: Price, dtype: Int64


(53232, 10)

In [187]:
# Remove records where Price is below 100,000 or above 20,000,000
df1 = df1[(df1['Price'] >= 100000) & (df1['Price'] <= 50000000)]
df1.reset_index(drop=True, inplace=True)

df1.shape

(52506, 10)

In [188]:
# Verify Price column conversion
# 1. Summary statistics
print(df1['Price'].describe())

# 2. Show lowest and highest prices
print("Lowest prices:\n", df1['Price'].nsmallest(10))
print("Highest prices:\n", df1['Price'].nlargest(10))

df1.shape

count           52506.0
mean     1840847.406906
std      2650682.735846
min            105000.0
25%            588000.0
50%            995000.0
75%           1999999.0
max          50000000.0
Name: Price, dtype: Float64
Lowest prices:
 4255     105000
48903    110000
6521     118000
28589    120000
36975    120000
38859    120000
40113    120000
40364    120000
41448    120000
15187    125000
Name: Price, dtype: Int64
Highest prices:
 3732     50000000
8613     50000000
38988    50000000
43955    50000000
51169    49590000
52150    49590000
2914     49000000
24452    48787200
3681     48264480
2913     47916000
Name: Price, dtype: Int64


(52506, 10)

In [189]:
df1

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,3.159247,101.713366,1250000,2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,3.152395,101.658445,6800000,6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,3.032282,101.759884,1030000,3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Bukit Jalil,3.053346,101.680294,900000,4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished
4,Taman Tun Dr Ismail,3.136141,101.630737,5350000,4+2,5.0,4.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished
...,...,...,...,...,...,...,...,...,...,...
52501,Bangsar,3.127560,101.679060,5100000,5+1,4.0,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished
52502,Bangsar,3.127560,101.679060,5000000,5,4.0,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished
52503,Bangsar,3.127560,101.679060,5500000,5+1,4.0,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished
52504,Wangsa Maju,3.205778,101.731861,480000,3,2.0,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished


### Clean Rooms column

In [190]:
# Rooms column has mixed types, some are strings with '+' indicating multiple rooms. Sum the numbers in such cases, replace the output and convert to integer.

def sum_rooms(val):
    if isinstance(val, str) and '+' in val:
        parts = re.findall(r'\d+', val)
        if len(parts) == 2:
            return int(parts[0]) + int(parts[1])
    try:
        return int(val)
    except:
        return np.nan

df1.loc[:, 'Rooms'] = df1['Rooms'].apply(sum_rooms)
df1.loc[:, 'Rooms'] = df1['Rooms'].astype('Int64')

In [191]:
df1

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,3.159247,101.713366,1250000,3,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,3.152395,101.658445,6800000,6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,3.032282,101.759884,1030000,3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Bukit Jalil,3.053346,101.680294,900000,5,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished
4,Taman Tun Dr Ismail,3.136141,101.630737,5350000,6,5.0,4.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished
...,...,...,...,...,...,...,...,...,...,...
52501,Bangsar,3.127560,101.679060,5100000,6,4.0,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished
52502,Bangsar,3.127560,101.679060,5000000,5,4.0,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished
52503,Bangsar,3.127560,101.679060,5500000,6,4.0,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished
52504,Wangsa Maju,3.205778,101.731861,480000,3,2.0,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished


In [192]:
# Remove missing value records and convert column to integer
'''Print the number of records before cleaning
Remove any rows where Rooms column has missing values
Reset the index to maintain consecutive numbering
Convert the Rooms column to integer type
Print the number of remaining records
Show the new data type and distribution of values'''

# Remove records with missing values in Rooms column and convert to integer
print(f"Records before removing missing values: {len(df1)}")

# Remove missing values
df1 = df1.dropna(subset=['Rooms'])
df1.reset_index(drop=True, inplace=True)

# Convert to integer type
df1['Rooms'] = df1['Rooms'].astype(int)

print(f"Records after removing missing values: {len(df1)}")

# Verify the changes
print("\nRooms column data type:", df1['Rooms'].dtype)
print("\nUnique values in Rooms column:")
print(df1['Rooms'].value_counts().sort_index())

Records before removing missing values: 52506
Records after removing missing values: 49975

Rooms column data type: int64

Unique values in Rooms column:
Rooms
1      2202
2      6421
3     15818
4     12921
5      6832
6      3694
7      1562
8       355
9       103
10       39
11       10
12       11
13        1
14        3
16        2
18        1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Rooms'] = df1['Rooms'].astype(int)


### Bathrooms column

In [193]:
# Convert Bathrooms to numeric, then to Int64 (nullable integer)
# Ensure Bathrooms is numeric and then convert to nullable integer
df1.loc[:, 'Bathrooms'] = pd.to_numeric(df1['Bathrooms'], errors='coerce')
df1.loc[:, 'Bathrooms'] = df1['Bathrooms'].astype('Int64')


[3, 7, 4, 3, 5, 4, 4, 2, 3, 3,
 ...
 2, 2, 2, 6, 6, 4, 4, 4, 2, 2]
Length: 49975, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df1.loc[:, 'Bathrooms'] = df1['Bathrooms'].astype('Int64')


In [194]:
df1.dtypes


Location          object
Latitude         float64
Longitude        float64
Price              Int64
Rooms              int64
Bathrooms          Int64
Car Parks        float64
Property Type     object
Size              object
Furnishing        object
dtype: object

In [195]:
df1

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,3.159247,101.713366,1250000,3,3,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,3.152395,101.658445,6800000,6,7,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,3.032282,101.759884,1030000,3,4,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Bukit Jalil,3.053346,101.680294,900000,5,3,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished
4,Taman Tun Dr Ismail,3.136141,101.630737,5350000,6,5,4.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished
...,...,...,...,...,...,...,...,...,...,...
49970,Bangsar,3.127560,101.679060,5100000,6,4,,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished
49971,Bangsar,3.127560,101.679060,5000000,5,4,,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished
49972,Bangsar,3.127560,101.679060,5500000,6,4,,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished
49973,Wangsa Maju,3.205778,101.731861,480000,3,2,,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished


### Car Parks column

In [196]:
# Change car park values to integers
df1.loc[:, 'Car Parks'] = pd.to_numeric(df1['Car Parks'], errors='coerce')
df1.loc[:, 'Car Parks'] = df1['Car Parks'].astype('Int64')
print(df1['Car Parks'].dtype)

Int64


[   2, <NA>,    2,    2,    4,    4,    3,    1, <NA>, <NA>,
 ...
 <NA>, <NA>, <NA>,    3, <NA>, <NA>, <NA>, <NA>, <NA>,    3]
Length: 49975, dtype: Int64' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df1.loc[:, 'Car Parks'] = df1['Car Parks'].astype('Int64')


In [197]:
print(df1['Car Parks'].unique())

<IntegerArray>
[2, <NA>, 4, 3, 1, 5, 6, 7, 10, 9, 8, 13, 15, 28, 11, 20, 12, 18, 30, 24, 16,
 17]
Length: 22, dtype: Int64


In [198]:
# Fill missing values in Car Parks with 2 since 2 is median value
df1['Car Parks'] = df1['Car Parks'].fillna(2)

# Verify the changes
print("Car Parks unique values after filling NAs:")
print(df1['Car Parks'].unique())

Car Parks unique values after filling NAs:
<IntegerArray>
[2, 4, 3, 1, 5, 6, 7, 10, 9, 8, 13, 15, 28, 11, 20, 12, 18, 30, 24, 16, 17]
Length: 21, dtype: Int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Car Parks'] = df1['Car Parks'].fillna(2)


### Property type column

In [199]:
### Clean Property Type Column
print("### Unique Property Types:")
print(df1['Property Type'].unique())
print(f"\nTotal unique property types: {df1['Property Type'].nunique()}")
# Remove all records where 'Property Type' starts with 'Residential Land'
df1 = df1[~df1['Property Type'].str.startswith('Residential Land', na=False)]
df1.reset_index(drop=True, inplace=True)

df1

### Unique Property Types:
['Serviced Residence' 'Bungalow' 'Condominium (Corner)'
 'Semi-detached House' '2-sty Terrace/Link House (EndLot)'
 'Apartment (Intermediate)' '2-sty Terrace/Link House (Intermediate)'
 'Bungalow (Intermediate)' 'Semi-detached House (Intermediate)'
 'Bungalow (Corner)' 'Serviced Residence (Intermediate)' 'Condominium'
 'Condominium (Intermediate)' 'Condominium (EndLot)'
 'Serviced Residence (Corner)' '3-sty Terrace/Link House (Intermediate)'
 'Serviced Residence (Duplex)' '2-sty Terrace/Link House'
 '2-sty Terrace/Link House (Corner)'
 '2.5-sty Terrace/Link House (Intermediate)'
 '3-sty Terrace/Link House (Corner)' '3-sty Terrace/Link House (EndLot)'
 '3.5-sty Terrace/Link House (Intermediate)'
 'Serviced Residence (Penthouse)'
 '1-sty Terrace/Link House (Intermediate)'
 '1.5-sty Terrace/Link House (EndLot)' 'Apartment' 'Condominium (Duplex)'
 'Serviced Residence (EndLot)' '4-sty Terrace/Link House'
 '4-sty Terrace/Link House (Intermediate)' 'Townhouse'
 'Sem

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,3.159247,101.713366,1250000,3,3,2,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Damansara Heights,3.152395,101.658445,6800000,6,7,2,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,Dutamas,3.032282,101.759884,1030000,3,4,2,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,Bukit Jalil,3.053346,101.680294,900000,5,3,2,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished
4,Taman Tun Dr Ismail,3.136141,101.630737,5350000,6,5,4,Bungalow,Land area : 7200 sq. ft.,Partly Furnished
...,...,...,...,...,...,...,...,...,...,...
49934,Bangsar,3.127560,101.679060,5100000,6,4,2,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished
49935,Bangsar,3.127560,101.679060,5000000,5,4,2,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished
49936,Bangsar,3.127560,101.679060,5500000,6,4,2,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished
49937,Wangsa Maju,3.205778,101.731861,480000,3,2,2,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished


In [200]:
# Parse Property Type into multiple columns of Type, Storeys, Position, Layout, Land Status

# Function to parse property strings
def parse_property(text):
    # Extract storeys if any (e.g., 2-sty, 3.5-sty)
    storey_match = re.search(r'(\d+(\.\d+)?)-sty', text)
    storeys = float(storey_match.group(1)) if storey_match else None

    # Extract bracket content (e.g., Corner, EndLot, Penthouse)
    bracket_match = re.search(r'\((.*?)\)', text)
    detail = bracket_match.group(1).strip() if bracket_match else None

    # Remove storey and bracketed details from base type
    base = re.sub(r'(\d+(\.\d+)?)-sty\s*', '', text)  # remove "X-sty"
    base = re.sub(r'\s*\(.*\)', '', base)            # remove "(...)"
    base = base.strip()

    # Identify categories
    positions = ['Corner', 'EndLot', 'Intermediate']
    layouts = ['Penthouse', 'Duplex', 'Triplex', 'Studio', 'SOHO']
    land_status = 'Land' if 'Land' in base else None

    position = detail if detail in positions else None
    layout = detail if detail in layouts else None

    return pd.Series([base, storeys, position, layout, land_status])

# Apply parsing to df1["Property Type"]
df1[['MainType', 'Storeys', 'Position', 'Layout', 'LandStatus']] = df1['Property Type'].apply(parse_property)

# Optional: Standardise MainType (merge similar terms)
df1['MainType'] = df1['MainType'].replace({
    'Terrace/Link House': 'Terrace/Link House',
    'Cluster House': 'Cluster House',
    'Bungalow Land': 'Bungalow Land',
    'Bungalow': 'Bungalow',
    'Semi-detached House': 'Semi-detached House',
    'Condominium': 'Condominium',
    'Serviced Residence': 'Serviced Residence',
    'Apartment': 'Apartment',
    'Flat': 'Flat',
    'Townhouse': 'Townhouse'
})

# Preview
df1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[['MainType', 'Storeys', 'Position', 'Layout', 'LandStatus']] = df1['Property Type'].apply(parse_property)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[['MainType', 'Storeys', 'Position', 'Layout', 'LandStatus']] = df1['Property Type'].apply(parse_property)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,MainType,Storeys,Position,Layout,LandStatus
0,KLCC,3.159247,101.713366,1250000,3,3,2,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,Serviced Residence,,,,
1,Damansara Heights,3.152395,101.658445,6800000,6,7,2,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,Bungalow,,,,
2,Dutamas,3.032282,101.759884,1030000,3,4,2,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,Condominium,,Corner,,
3,Bukit Jalil,3.053346,101.680294,900000,5,3,2,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,Condominium,,Corner,,
4,Taman Tun Dr Ismail,3.136141,101.630737,5350000,6,5,4,Bungalow,Land area : 7200 sq. ft.,Partly Furnished,Bungalow,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49934,Bangsar,3.127560,101.679060,5100000,6,4,2,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished,Bungalow,,Corner,,
49935,Bangsar,3.127560,101.679060,5000000,5,4,2,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished,Bungalow,,Intermediate,,
49936,Bangsar,3.127560,101.679060,5500000,6,4,2,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished,Bungalow,,Intermediate,,
49937,Wangsa Maju,3.205778,101.731861,480000,3,2,2,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished,Condominium,,Intermediate,,


### Clean Size

In [201]:
# Remove records with missing values in 'Size' and reset index
print(f"Records before removing missing Size: {len(df1)}")
df1 = df1.dropna(subset=['Size'])
df1.reset_index(drop=True, inplace=True)
print(f"Records after removing missing Size: {len(df1)}")

# Quick verification
print("Missing Size count:", df1['Size'].isna().sum())
df1

Records before removing missing Size: 49939
Records after removing missing Size: 49044
Missing Size count: 0


Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,MainType,Storeys,Position,Layout,LandStatus
0,KLCC,3.159247,101.713366,1250000,3,3,2,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished,Serviced Residence,,,,
1,Damansara Heights,3.152395,101.658445,6800000,6,7,2,Bungalow,Land area : 6900 sq. ft.,Partly Furnished,Bungalow,,,,
2,Dutamas,3.032282,101.759884,1030000,3,4,2,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished,Condominium,,Corner,,
3,Bukit Jalil,3.053346,101.680294,900000,5,3,2,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished,Condominium,,Corner,,
4,Taman Tun Dr Ismail,3.136141,101.630737,5350000,6,5,4,Bungalow,Land area : 7200 sq. ft.,Partly Furnished,Bungalow,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49039,Bangsar,3.127560,101.679060,5100000,6,4,2,Bungalow (Corner),Land area : 7168 sq. ft.,Fully Furnished,Bungalow,,Corner,,
49040,Bangsar,3.127560,101.679060,5000000,5,4,2,Bungalow (Intermediate),Land area : 15000 sq. ft.,Unfurnished,Bungalow,,Intermediate,,
49041,Bangsar,3.127560,101.679060,5500000,6,4,2,Bungalow (Intermediate),Land area : 7168 sq. ft.,Partly Furnished,Bungalow,,Intermediate,,
49042,Wangsa Maju,3.205778,101.731861,480000,3,2,2,Condominium (Intermediate),"Built-up : 1,150 sq. ft.",Unfurnished,Condominium,,Intermediate,,


In [203]:
# ...existing code...

# Remove 'Built-up', 'Land area' (case-insensitive) and colons from Size column, trim whitespace
df1['Size'] = df1['Size'].astype(str)
df1['Size'] = df1['Size'].str.replace(r'(?i)\b(?:built-?up|land area)\b', '', regex=True)  # remove words
df1['Size'] = df1['Size'].str.replace(':', '', regex=False)                                    # remove colons
df1['Size'] = df1['Size'].str.replace(r'[\u00A0\s]+', ' ', regex=True).str.strip()             # normalize whitespace
df1.loc[df1['Size'].isin(['', 'nan', 'None']), 'Size'] = np.nan                                # empty -> NaN

# Quick check
print("Sample cleaned Size values:")
print(df1['Size'].dropna().head(20))

# ...existing code...

Sample cleaned Size values:
0       1,335 sq. ft.
1        6900 sq. ft.
2       1,875 sq. ft.
3       1,513 sq. ft.
4        7200 sq. ft.
5        3600 sq. ft.
6       25x75 sq. ft.
7         904 sq. ft.
8     22 x 80 sq. ft.
9        1900 sq. ft.
10       6000 sq. ft.
11       3600 sq. ft.
12       8500 sq. ft.
13      4,842 sq. ft.
14      1,830 sq. ft.
15      3,720 sq. ft.
16      1,798 sq. ft.
17        904 sq. ft.
18      2,163 sq. ft.
19      2,163 sq. ft.
Name: Size, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].str.replace(r'(?i)\b(?:built-?up|land area)\b', '', regex=True)  # remove words
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].str.replace(':', '', regex=False)  

In [225]:


# List distinct Size values that contain letters (text-only entries)
s = df1['Size'].dropna().astype(str).str.strip()
text_mask = s.str.contains(r'[A-Za-z]', regex=True)
unique_text_sizes = pd.Series(s[text_mask].unique()).sort_values()

print(f"Distinct text-only Size values ({len(unique_text_sizes)}):")
for v in unique_text_sizes:
    print(v)

# Optional: save list for review
unique_text_sizes.to_csv("unique_size_text_values.csv", index=False)

# ...existing code...

Distinct text-only Size values (0):


In [205]:
# Remove apostrophes and bracketed content from Size column
df1['Size'] = df1['Size'].astype(str)
df1['Size'] = df1['Size'].str.replace("'", "", regex=False)  # remove apostrophes
df1['Size'] = df1['Size'].str.replace(r'\([^)]*\)', '', regex=True)  # remove bracketed content
df1['Size'] = df1['Size'].str.strip()  # remove extra whitespace

# Quick check
print("Sample cleaned Size values:")
print(df1['Size'].dropna().head(20))

Sample cleaned Size values:
0       1,335 sq. ft.
1        6900 sq. ft.
2       1,875 sq. ft.
3       1,513 sq. ft.
4        7200 sq. ft.
5        3600 sq. ft.
6       25x75 sq. ft.
7         904 sq. ft.
8     22 x 80 sq. ft.
9        1900 sq. ft.
10       6000 sq. ft.
11       3600 sq. ft.
12       8500 sq. ft.
13      4,842 sq. ft.
14      1,830 sq. ft.
15      3,720 sq. ft.
16      1,798 sq. ft.
17        904 sq. ft.
18      2,163 sq. ft.
19      2,163 sq. ft.
Name: Size, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].str.replace("'", "", regex=False)  # remove apostrophes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].str.replace(r'\([^)]*\)', '', regex=True)  # remove bracket

In [207]:
# Function to multiply numbers connected by x/X
def multiply_dimensions(val):
    if pd.isna(val):
        return np.nan
    # Convert to string and lowercase
    s = str(val).lower()
    # Look for pattern: number x number
    match = re.search(r'(\d+(?:\.\d+)?)\s*[xX]\s*(\d+(?:\.\d+)?)', s)
    if match:
        try:
            num1 = float(match.group(1))
            num2 = float(match.group(2))
            return num1 * num2
        except ValueError:
            return val
    return val

# Apply the multiplication
df1['Size'] = df1['Size'].apply(multiply_dimensions)

# Quick check
print("Sample multiplied Size values:")
print(df1['Size'].dropna().head(20))

Sample multiplied Size values:
0     1,335 sq. ft.
1      6900 sq. ft.
2     1,875 sq. ft.
3     1,513 sq. ft.
4      7200 sq. ft.
5      3600 sq. ft.
6            1875.0
7       904 sq. ft.
8            1760.0
9      1900 sq. ft.
10     6000 sq. ft.
11     3600 sq. ft.
12     8500 sq. ft.
13    4,842 sq. ft.
14    1,830 sq. ft.
15    3,720 sq. ft.
16    1,798 sq. ft.
17      904 sq. ft.
18    2,163 sq. ft.
19    2,163 sq. ft.
Name: Size, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].apply(multiply_dimensions)


In [209]:
# Function to handle ranges (keep first number before dash)
def clean_ranges(val):
    if pd.isna(val):
        return np.nan
    # Convert to string
    s = str(val)
    # Look for pattern: number-number
    match = re.search(r'(\d+(?:\.\d+)?)\s*-\s*\d+(?:\.\d+)?', s)
    if match:
        try:
            return float(match.group(1))  # Return only the first number
        except ValueError:
            return val
    return val

# Apply the range cleaning
df1['Size'] = df1['Size'].apply(clean_ranges)

# Quick check
print("Sample cleaned range values:")
print(df1['Size'].dropna().head(20))

Sample cleaned range values:
0     1,335 sq. ft.
1      6900 sq. ft.
2     1,875 sq. ft.
3     1,513 sq. ft.
4      7200 sq. ft.
5      3600 sq. ft.
6            1875.0
7       904 sq. ft.
8            1760.0
9      1900 sq. ft.
10     6000 sq. ft.
11     3600 sq. ft.
12     8500 sq. ft.
13    4,842 sq. ft.
14    1,830 sq. ft.
15    3,720 sq. ft.
16    1,798 sq. ft.
17      904 sq. ft.
18    2,163 sq. ft.
19    2,163 sq. ft.
Name: Size, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].apply(clean_ranges)


In [211]:
# Remove commas from Size column
df1['Size'] = df1['Size'].astype(str).str.replace(',', '', regex=False)

# Quick check
print("Sample values after removing commas:")
print(df1['Size'].dropna().head(20))

Sample values after removing commas:
0     1335 sq. ft.
1     6900 sq. ft.
2     1875 sq. ft.
3     1513 sq. ft.
4     7200 sq. ft.
5     3600 sq. ft.
6           1875.0
7      904 sq. ft.
8           1760.0
9     1900 sq. ft.
10    6000 sq. ft.
11    3600 sq. ft.
12    8500 sq. ft.
13    4842 sq. ft.
14    1830 sq. ft.
15    3720 sq. ft.
16    1798 sq. ft.
17     904 sq. ft.
18    2163 sq. ft.
19    2163 sq. ft.
Name: Size, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].astype(str).str.replace(',', '', regex=False)


In [213]:
# Remove 'sq. ft.' from Size column
df1['Size'] = df1['Size'].astype(str).str.replace('sq. ft.', '', regex=False)
df1['Size'] = df1['Size'].str.strip()  # remove any trailing whitespace

# Quick check
print("Sample values after removing 'sq. ft.':")
print(df1['Size'].dropna().head(20))

Sample values after removing 'sq. ft.':
0       1335
1       6900
2       1875
3       1513
4       7200
5       3600
6     1875.0
7        904
8     1760.0
9       1900
10      6000
11      3600
12      8500
13      4842
14      1830
15      3720
16      1798
17       904
18      2163
19      2163
Name: Size, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].astype(str).str.replace('sq. ft.', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].str.strip()  # remove any trailing whitespace


In [216]:
# Remove all text from Size column 
df1['Size'] = df1['Size'].astype(str)
df1['Size'] = df1['Size'].str.replace(r'[A-Za-z]+', '', regex=True)  # remove all letters
df1['Size'] = df1['Size'].str.strip()  # remove trailing whitespace

# Convert to float and handle any remaining non-numeric values
df1['Size'] = pd.to_numeric(df1['Size'], errors='coerce')

# Quick check
print("Sample values after removing all text:")
print(df1['Size'].dropna().head(20))

Sample values after removing all text:
0     1335.0
1     6900.0
2     1875.0
3     1513.0
4     7200.0
5     3600.0
6     1875.0
7      904.0
8     1760.0
9     1900.0
10    6000.0
11    3600.0
12    8500.0
13    4842.0
14    1830.0
15    3720.0
16    1798.0
17     904.0
18    2163.0
19    2163.0
Name: Size, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].str.replace(r'[A-Za-z]+', '', regex=True)  # remove all letters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].str.strip()  # remove trailing whitespace
A value i

In [220]:
# Remove decimals by converting to integers
df1['Size'] = df1['Size'].astype(float).round().astype('Int64')

# Quick check
print("Sample values after removing decimals:")
print(df1['Size'].head(20))
print("\nColumn type:", df1['Size'].dtype)

Sample values after removing decimals:
0     1335
1     6900
2     1875
3     1513
4     7200
5     3600
6     1875
7      904
8     1760
9     1900
10    6000
11    3600
12    8500
13    4842
14    1830
15    3720
16    1798
17     904
18    2163
19    2163
Name: Size, dtype: Int64

Column type: Int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Size'] = df1['Size'].astype(float).round().astype('Int64')


In [223]:
# Remove records with Size values outside reasonable range
print(f"Records before filtering Size range: {len(df1)}")

df1 = df1[(df1['Size'] >= 400) & (df1['Size'] <= 10000)]
df1.reset_index(drop=True, inplace=True)

print(f"Records after filtering Size range: {len(df1)}")

# Verify the changes
print("\nSize statistics after filtering:")
print(df1['Size'].describe())
print("\nSmallest sizes:", df1['Size'].nsmallest(5))
print("Largest sizes:", df1['Size'].nlargest(5))

Records before filtering Size range: 49044
Records after filtering Size range: 47963

Size statistics after filtering:
count        47963.0
mean     1986.766049
std      1602.133501
min            400.0
25%           1023.0
50%           1410.0
75%           2208.0
max          10000.0
Name: Size, dtype: Float64

Smallest sizes: 6290     400
7998     405
41745    411
11671    413
15840    417
Name: Size, dtype: Int64
Largest sizes: 114     10000
1050    10000
1412    10000
1627    10000
2182    10000
Name: Size, dtype: Int64


In [224]:
df1

Unnamed: 0,Location,Latitude,Longitude,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing,MainType,Storeys,Position,Layout,LandStatus
0,KLCC,3.159247,101.713366,1250000,3,3,2,Serviced Residence,1335,Fully Furnished,Serviced Residence,,,,
1,Damansara Heights,3.152395,101.658445,6800000,6,7,2,Bungalow,6900,Partly Furnished,Bungalow,,,,
2,Dutamas,3.032282,101.759884,1030000,3,4,2,Condominium (Corner),1875,Partly Furnished,Condominium,,Corner,,
3,Bukit Jalil,3.053346,101.680294,900000,5,3,2,Condominium (Corner),1513,Partly Furnished,Condominium,,Corner,,
4,Taman Tun Dr Ismail,3.136141,101.630737,5350000,6,5,4,Bungalow,7200,Partly Furnished,Bungalow,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47958,Mont Kiara,3.169999,101.652147,6500000,6,6,2,Bungalow (Corner),6500,Partly Furnished,Bungalow,,Corner,,
47959,Bangsar,3.127560,101.679060,5100000,6,4,2,Bungalow (Corner),7168,Fully Furnished,Bungalow,,Corner,,
47960,Bangsar,3.127560,101.679060,5500000,6,4,2,Bungalow (Intermediate),7168,Partly Furnished,Bungalow,,Intermediate,,
47961,Wangsa Maju,3.205778,101.731861,480000,3,2,2,Condominium (Intermediate),1150,Unfurnished,Condominium,,Intermediate,,


In [226]:
# List values containing special symbols
special_chars = r'[#xX*\+\-\\/\(\)\[\]\{\}\^\$\&\%\@\!\?\,\.]'
symbol_mask = df1['Size'].astype(str).str.contains(special_chars, regex=True)
special_values = pd.Series(df1.loc[symbol_mask, 'Size'].unique()).sort_values()

print(f"Values containing special symbols ({len(special_values)}):")
for v in special_values:
    print(v)

Values containing special symbols (0):


### Clean Furnishing Column

In [227]:
# Remove records with missing values in Furnishing column
print(f"Records before removing missing Furnishing: {len(df1)}")
df1 = df1.dropna(subset=['Furnishing'])
df1.reset_index(drop=True, inplace=True)
print(f"Records after removing missing Furnishing: {len(df1)}")

# Quick verification
print("\nUnique Furnishing values:")
print(df1['Furnishing'].value_counts())

Records before removing missing Furnishing: 47963
Records after removing missing Furnishing: 42727

Unique Furnishing values:
Furnishing
Partly Furnished    24784
Fully Furnished     12281
Unfurnished          5228
Unknown               434
Name: count, dtype: int64


In [228]:
# Remove records with missing values and 'Unknown' in Furnishing column
print(f"Records before cleaning Furnishing: {len(df1)}")

# Remove missing values and 'Unknown'
df1 = df1.dropna(subset=['Furnishing'])
df1 = df1[~df1['Furnishing'].str.contains('Unknown', case=False, na=False)]
df1.reset_index(drop=True, inplace=True)

print(f"Records after cleaning Furnishing: {len(df1)}")

# Quick verification
print("\nUnique Furnishing values:")
print(df1['Furnishing'].value_counts())

Records before cleaning Furnishing: 42727
Records after cleaning Furnishing: 42293

Unique Furnishing values:
Furnishing
Partly Furnished    24784
Fully Furnished     12281
Unfurnished          5228
Name: count, dtype: int64


### Rearrange columns of the data frame and remove Layout and Land Status columns

In [None]:

# Ensure MainType exists (create if missing), then replace Property Type with MainType and rename to 'Type'
if 'MainType' not in df1.columns:
    # parse_property is defined earlier in this notebook; reuse it to create MainType
    df1[['MainType', 'Storeys', 'Position', 'Layout', 'LandStatus']] = df1['Property Type'].apply(parse_property)

# Create new column 'Type' from MainType, drop original columns
df1['Type'] = df1['MainType']
if 'Property Type' in df1.columns:
    df1.drop(columns=['Property Type'], inplace=True)
if 'MainType' in df1.columns:
    df1.drop(columns=['MainType'], inplace=True)

# Quick verification
print("Columns now include 'Type':", 'Type' in df1.columns)
print("Sample Type values:", pd.Series(df1['Type'].unique()).sort_values()[:20])


Columns now include 'Type': True
Sample Type values: 5              Apartment
1               Bungalow
8          Bungalow Land
9          Cluster House
2            Condominium
7                   Flat
3    Semi-detached House
0     Serviced Residence
4     Terrace/Link House
6              Townhouse
dtype: object


In [None]:

# Move 'Type' column to immediately after 'Longitude' (if both exist)
if 'Type' in df1.columns and 'Longitude' in df1.columns:
    cols = df1.columns.tolist()
    cols.remove('Type')
    lon_idx = cols.index('Longitude')
    cols.insert(lon_idx + 1, 'Type')
    df1 = df1[cols]
    print("Moved 'Type' after 'Longitude'.")
else:
    print("No change: 'Type' or 'Longitude' column not found.")
    
# Quick check
df1.head()

Moved 'Type' after 'Longitude'.


Unnamed: 0,Location,Latitude,Longitude,Type,Price,Rooms,Bathrooms,Car Parks,Size,Furnishing,Storeys,Position,Layout,LandStatus
0,KLCC,3.159247,101.713366,Serviced Residence,1250000,3,3,2,1335,Fully Furnished,,,,
1,Damansara Heights,3.152395,101.658445,Bungalow,6800000,6,7,2,6900,Partly Furnished,,,,
2,Dutamas,3.032282,101.759884,Condominium,1030000,3,4,2,1875,Partly Furnished,,Corner,,
3,Bukit Jalil,3.053346,101.680294,Condominium,900000,5,3,2,1513,Partly Furnished,,Corner,,
4,Taman Tun Dr Ismail,3.136141,101.630737,Bungalow,5350000,6,5,4,7200,Partly Furnished,,,,


In [235]:

# Drop 'Layout' and 'LandStatus' if they exist, then move 'Price' to the end
for col in ['Layout', 'LandStatus']:
    if col in df1.columns:
        df1.drop(columns=[col], inplace=True)
        print(f"Dropped column: {col}")

if 'Price' in df1.columns:
    cols = [c for c in df1.columns if c != 'Price'] + ['Price']
    df1 = df1[cols]
    print("Moved 'Price' to the end of the dataframe.")
else:
    print("Column 'Price' not found — no reordering performed.")

# Quick verification
print(df1.columns.tolist())
df1

Moved 'Price' to the end of the dataframe.
['Location', 'Latitude', 'Longitude', 'Type', 'Rooms', 'Bathrooms', 'Car Parks', 'Size', 'Furnishing', 'Storeys', 'Position', 'Price']


Unnamed: 0,Location,Latitude,Longitude,Type,Rooms,Bathrooms,Car Parks,Size,Furnishing,Storeys,Position,Price
0,KLCC,3.159247,101.713366,Serviced Residence,3,3,2,1335,Fully Furnished,,,1250000
1,Damansara Heights,3.152395,101.658445,Bungalow,6,7,2,6900,Partly Furnished,,,6800000
2,Dutamas,3.032282,101.759884,Condominium,3,4,2,1875,Partly Furnished,,Corner,1030000
3,Bukit Jalil,3.053346,101.680294,Condominium,5,3,2,1513,Partly Furnished,,Corner,900000
4,Taman Tun Dr Ismail,3.136141,101.630737,Bungalow,6,5,4,7200,Partly Furnished,,,5350000
...,...,...,...,...,...,...,...,...,...,...,...,...
42288,Mont Kiara,3.169999,101.652147,Bungalow,6,6,2,6500,Partly Furnished,,Corner,6500000
42289,Bangsar,3.127560,101.679060,Bungalow,6,4,2,7168,Fully Furnished,,Corner,5100000
42290,Bangsar,3.127560,101.679060,Bungalow,6,4,2,7168,Partly Furnished,,Intermediate,5500000
42291,Wangsa Maju,3.205778,101.731861,Condominium,3,2,2,1150,Unfurnished,,Intermediate,480000


### Clean Storeys Column

In [236]:
# List unique values of 'Type' column (sorted) and show counts
print(f"Unique Type count: {df1['Type'].nunique()}\n")
print("Sorted unique values:")
for v in sorted(df1['Type'].dropna().unique()):
    print(v)

print("\nValue counts:")
print(df1['Type'].value_counts())

Unique Type count: 10

Sorted unique values:
Apartment
Bungalow
Bungalow Land
Cluster House
Condominium
Flat
Semi-detached House
Serviced Residence
Terrace/Link House
Townhouse

Value counts:
Type
Condominium            19314
Serviced Residence     10547
Terrace/Link House      6501
Bungalow                2067
Semi-detached House     1816
Apartment               1313
Townhouse                466
Flat                     253
Bungalow Land             10
Cluster House              6
Name: count, dtype: int64


In [None]:

# Remove 'Bungalow Land' and 'Cluster House' from Type (case-insensitive)
print(f"Records before removing Types: {len(df1)}")
mask_remove = df1['Type'].astype(str).str.contains(r'^(?:Bungalow Land|Cluster House)$', case=False, na=False)
removed = mask_remove.sum()
df1 = df1[~mask_remove].reset_index(drop=True)
print(f"Removed {removed} records. Records after removal: {len(df1)}")

# Quick verification
print(f"\nUnique Type count after removal: {df1['Type'].nunique()}")
print(df1['Type'].value_counts())


Records before removing Types: 42293
Removed 16 records. Records after removal: 42277

Unique Type count after removal: 8
Type
Condominium            19314
Serviced Residence     10547
Terrace/Link House      6501
Bungalow                2067
Semi-detached House     1816
Apartment               1313
Townhouse                466
Flat                     253
Name: count, dtype: int64


In [None]:

# List unique Type values with sum of missing Storeys beside each
missing_storeys = (
    df1.groupby(df1['Type'].fillna('<MISSING TYPE>'))['Storeys']
       .apply(lambda s: s.isna().sum())
       .reset_index(name='Missing_Storeys')
       .sort_values('Missing_Storeys', ascending=False)
)
print(missing_storeys.to_string(index=False))


               Type  Missing_Storeys
          Apartment                0
           Bungalow                0
        Condominium                0
               Flat                0
Semi-detached House                0
 Serviced Residence                0
 Terrace/Link House                0
          Townhouse                0


In [None]:

# Robustly clean Storeys, fill missing with 1, then convert to numpy int64
print(f"Missing Storeys before clean: {df1['Storeys'].isna().sum()}")

# Extract leading numeric (handles '3-sty', '3.5-sty', '3', etc.)
df1['Storeys'] = df1['Storeys'].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)

# Convert to numeric (coerce non-numeric to NaN)
df1['Storeys'] = pd.to_numeric(df1['Storeys'], errors='coerce')

# Fill NaNs with 1, round any floats and convert to int64
df1['Storeys'] = df1['Storeys'].fillna(1).round().astype('int64')

print(f"Missing Storeys after fill: {df1['Storeys'].isna().sum()}")
print(f"Storeys dtype: {df1['Storeys'].dtype}")


Missing Storeys before clean: 0
Missing Storeys after fill: 0
Storeys dtype: int64


### Clean Position

In [None]:

# List unique Position values per Type and sum their occurrences
tmp = df1.copy()
tmp['Position'] = tmp['Position'].fillna('<MISSING>')

# Counts per Type x Position
pos_counts = (
    tmp.groupby(['Type', 'Position'])
       .size()
       .reset_index(name='Count')
       .sort_values(['Type', 'Count'], ascending=[True, False])
)
print("Counts by Type and Position:")
print(pos_counts.to_string(index=False))

# Summary per Type: unique positions (comma-separated), number of unique positions, and total occurrences
summary = (
    pos_counts.groupby('Type')
              .agg(UniquePositions=('Position', lambda s: ', '.join(sorted(s.unique()))),
                   NumUnique=('Position', 'nunique'),
                   TotalCount=('Count', 'sum'))
              .reset_index()
              .sort_values('TotalCount', ascending=False)
)
print("\nSummary per Type:")
print(summary.to_string(index=False))


Counts by Type and Position:
               Type     Position  Count
          Apartment Intermediate    596
          Apartment    <MISSING>    448
          Apartment       Corner    239
          Apartment       EndLot     30
           Bungalow    <MISSING>    898
           Bungalow Intermediate    710
           Bungalow       Corner    429
           Bungalow       EndLot     30
        Condominium    <MISSING>   9104
        Condominium       Corner   5198
        Condominium Intermediate   4674
        Condominium       EndLot    338
               Flat Intermediate    117
               Flat    <MISSING>     82
               Flat       Corner     44
               Flat       EndLot     10
Semi-detached House Intermediate    838
Semi-detached House    <MISSING>    616
Semi-detached House       Corner    298
Semi-detached House       EndLot     64
 Serviced Residence    <MISSING>   5460
 Serviced Residence Intermediate   2808
 Serviced Residence       Corner   2108
 Serviced R

In [None]:

# Fill missing Position values with 'Unknown' and normalize empty strings
print(f"Position missing before: {df1['Position'].isna().sum()}")
df1['Position'] = df1['Position'].fillna('Unknown')
df1['Position'] = df1['Position'].replace(r'^\s*$', 'Unknown', regex=True)  # catch empty/blank strings
df1['Position'] = df1['Position'].astype(str)
print(f"Position missing after: {df1['Position'].isna().sum()}")
print(df1['Position'].value_counts())


Position missing before: 18098
Position missing after: 0
Position
Unknown         18098
Intermediate    14015
Corner           9098
EndLot           1066
Name: count, dtype: int64


In [None]:

# List unique Position values and their counts
positions = df1['Position'].fillna('Unknown').astype(str).str.strip()
print("Unique Position values (sorted):")
print(sorted(positions.unique()))
print("\nPosition value counts (sorted by name):")
print(positions.value_counts().sort_index())


Unique Position values (sorted):
['Corner', 'EndLot', 'Intermediate', 'Unknown']

Position value counts (sorted by name):
Position
Corner           9098
EndLot           1066
Intermediate    14015
Unknown         18098
Name: count, dtype: int64


### Additional overlooked cleaning - Remove records with missing values in Bathrooms

In [None]:

# Remove records with missing values in 'Bathrooms' and ensure integer type
print(f"Records before removing missing Bathrooms: {len(df1)}")
df1['Bathrooms'] = pd.to_numeric(df1['Bathrooms'], errors='coerce')  # ensure numeric
df1 = df1.dropna(subset=['Bathrooms']).reset_index(drop=True)
df1['Bathrooms'] = df1['Bathrooms'].astype('Int64')  # nullable integer
print(f"Records after removing missing Bathrooms: {len(df1)}")


Records before removing missing Bathrooms: 42277
Records after removing missing Bathrooms: 42077


### Save cleaned data into csv file

In [None]:

# Save cleaned dataframe to CSV
df1.to_csv("../data/cleaned_data.csv", index=False)
print("Saved cleaned_data to ../data/cleaned_data.csv")


Saved cleaned_data to ../data/cleaned_data.csv
