In [None]:
import pandas as pd
import numpy as np
pd.set_option("display.max_colwidth", None)

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw listings, then discard the "Unnamed: 0" column stored in the CSV.
raw_listings = pd.read_csv("./gurgaon_properties.csv")
data = raw_listings.drop(columns="Unnamed: 0")
data.head()

## areaWithType

In [None]:
# Inspect the raw free-text area descriptions that the cells below parse.
data["areaWithType"]

In [None]:
import re
# Scratch check of the regex on one sample string: the capture group grabs
# the digits that directly follow "Super Built up area ".
pattern = r"Super Built up area (\d+)"
text = "Super Built up area 3905(362.79 sq.m.)Built Up area: 3305 sq.ft. (307.04 sq.m.)"
re.findall(pattern, text)

In [None]:
def get_super_builtup_area(value):
    """Extract the number that follows "Super Built up area " in an area description.

    Parameters
    ----------
    value : str or NaN
        Raw `areaWithType` text.

    Returns
    -------
    float
        The parsed number, or NaN when `value` is missing or has no
        "Super Built up area" entry.
    """
    if pd.isna(value):
        return np.nan
    # Accept an optional fractional part (same number pattern as get_area and
    # extract_plot_area) so a value such as "1234.5" is not truncated to 1234.
    match = re.search(r"Super Built up area (\d+\.?\d*)", value)
    if match:
        return float(match.group(1))
    return np.nan

In [None]:
def convert_to_sqft(text, area_value):
    """Return the area in sq.ft., using the sq.m. figure printed next to it when present.

    Looks in `text` for "<area_value> (<number> sq.m.)". When found, the
    bracketed sq.m. number is multiplied by 10.7639 and returned as a float.
    Otherwise `area_value` is returned unchanged (so it keeps whatever type
    the caller passed in).

    Parameters
    ----------
    text : str or NaN
        Raw `areaWithType` text.
    area_value : str, float, None or NaN
        The area previously extracted from `text`.

    Returns
    -------
    float, or `area_value` unchanged; None when `area_value` is None.
    """
    if area_value is None:
        return None
    # Missing text or missing area: there is nothing to search, so hand the
    # value back instead of letting re.search fail on a non-string.
    if not isinstance(text, str) or pd.isna(area_value):
        return area_value
    # Escape the interpolated value and the literal dots of "sq.m." so that
    # "." only matches a real dot (e.g. "12.5" must not match "1275").
    pattern = r"{} \((\d+\.?\d*) sq\.m\.\)".format(re.escape(str(area_value)))
    match = re.search(pattern, text)
    if match:
        sq_m_value = float(match.group(1))
        return sq_m_value * 10.7639  # conversion factor from sq.m. to sqft
    return area_value

In [None]:
# Step 1: pull the super built-up figure out of the free text.
data["super_builtup_area"] = data["areaWithType"].map(get_super_builtup_area)
# Step 2: row by row, let convert_to_sqft swap in the sq.m.-derived value when the text provides one.
data["super_builtup_area"] = data.apply(
    lambda row: convert_to_sqft(row["areaWithType"], row["super_builtup_area"]),
    axis=1,
)

In [None]:
# Scratch check of a carpet-area regex on one sample string. Note this demo
# pattern requires a space before the number, unlike the one built in get_area.
pattern = r"Carpet area\s*:\s* (\d+\.?\d*)"
text = "Carpet area: 3556 (330.36 sq.m.)"
re.findall(pattern, text)

In [None]:
def get_area(value, area_type):
    """Pull the number written after "<area_type> :" out of an area description.

    `area_type` is used as the leading part of the regex (e.g. "Built Up area"
    or "Carpet area"). The matched digits are returned as a string, not a
    float; NaN is returned when `value` is missing or nothing matches.
    """
    if pd.isna(value):
        return np.nan
    found = re.search(area_type + r"\s*:\s*(\d+\.?\d*)", value)
    return found.group(1) if found else np.nan

In [None]:
# Extract the "Built Up area" figure (still a string at this point) ...
data["builtup_area"] = data["areaWithType"].apply(get_area, args=("Built Up area",))
# ... then let convert_to_sqft replace it with the sq.m.-derived value where the text has one.
data["builtup_area"] = data.apply(
    lambda row: convert_to_sqft(row["areaWithType"], row["builtup_area"]),
    axis=1,
)

In [None]:
# Extract the "Carpet area" figure, then apply the sq.m. -> sq.ft. conversion where available.
data["carpet_area"] = data["areaWithType"].apply(lambda x: get_area(x, "Carpet area"))
data["carpet_area"] = data.apply(lambda x: convert_to_sqft(x["areaWithType"], x["carpet_area"]), axis=1)
# get_area returns strings while convert_to_sqft returns floats on a match, so
# the column is a str/float mix here; make it numeric like builtup_area.
data["carpet_area"] = pd.to_numeric(data["carpet_area"])

In [None]:
def extract_plot_area(value):
    """Extract the number that follows "Plot area" in an area description.

    Parameters
    ----------
    value : str or NaN
        Raw `areaWithType` text.

    Returns
    -------
    float
        The parsed number, or NaN when `value` is missing or has no
        "Plot area" entry.
    """
    if pd.isna(value):
        return np.nan
    # Raw string: \s, \d and \. are regex tokens, not Python string escapes.
    match = re.search(r"Plot area\s*(\d+\.?\d*)", value)
    if match:
        return float(match.group(1))
    return np.nan

In [None]:
# Parse the plot area (NaN where the description has no "Plot area" entry).
data["plot_area"] = data["areaWithType"].apply(extract_plot_area)

In [None]:
# Spot-check the extraction against the source text on a random sample.
data[["areaWithType", "plot_area"]].sample(7)

#### PROBLEM
Built up area values are recorded in mixed units: some in square feet, some in square yards and some in square metres. Converting them all to square feet.

In [None]:
# get_area returns the matched digits as strings, so coerce the column to
# numbers before it is used in arithmetic below.
data["builtup_area"] = pd.to_numeric(data["builtup_area"])

In [None]:
def fill_builtup_area(row):
    """Pick the row's plot area when it is present, otherwise its built-up area."""
    plot = row['plot_area']
    return row['builtup_area'] if pd.isna(plot) else plot
    
# Row-wise: a non-missing plot_area overrides builtup_area.
data["builtup_area"] = data.apply(fill_builtup_area, axis=1)

In [None]:
def convert_scale(row):
    """Rescale `builtup_area` to square feet when it appears to be in another unit.

    The unit is inferred from the rounded ratio `area / builtup_area`:
    a ratio of 9 is treated as square yards (x 9), a ratio of 11 as square
    metres (x 10.7639, the same factor convert_to_sqft uses). Any other ratio
    leaves the value unchanged.

    Parameters
    ----------
    row : mapping with "area" and "builtup_area" entries (e.g. a DataFrame row).

    Returns
    -------
    The (possibly rescaled) built-up area; returned as-is when either input
    is missing or the built-up area is zero.
    """
    area, builtup = row["area"], row["builtup_area"]
    # A zero built-up area would make the ratio undefined (division by zero /
    # rounding infinity), so treat it like a missing value.
    if pd.isna(area) or pd.isna(builtup) or builtup == 0:
        return builtup
    ratio = round(area / builtup)
    if ratio == 9:
        return builtup * 9  # square yards -> square feet
    if ratio == 11:
        return builtup * 10.7639  # square metres -> square feet
    return builtup

In [None]:
# Bring every built-up area onto the square-feet scale, row by row.
data["builtup_area"] = data.apply(convert_scale, axis=1)

## additionalRoom

In [None]:
room_types = ["Study Room", "Servant Room", "Pooja Room", "Others", "Store Room"]

# One 0/1 column per room type: 1 when the room name occurs in the
# additionalRoom text. Missing text is replaced by "Missing" for the check.
for room in room_types:
    data[room] = (
        data["additionalRoom"]
        .fillna("Missing")
        .apply(lambda listed, room=room: 1 if room in listed else 0)
    )

In [None]:
# Rows with no additionalRoom text get NaN (unknown) instead of 0 in the room
# flag columns. The label slice "Study Room":"Store Room" selects every column
# positioned between those two, so it relies on the five flag columns sitting
# next to each other in the frame.
nan_index = data[data["additionalRoom"].isna()].index
data.loc[nan_index, "Study Room" : "Store Room"] = np.nan

In [None]:
# Spot-check the room flags against the source text on a random sample.
data[["additionalRoom", "Study Room", "Servant Room", "Pooja Room", "Others", "Store Room"]].sample(7)

## Extract sector from address

In [None]:
import re

# Scratch check of the sector regex on one sample address.
# NOTE(review): `[^\d*]` is a character class meaning "one character that is
# neither a digit nor '*'", so exactly one separator is required after
# "Sector" — if `\s*` was intended, "Sector61" will not match. Confirm.
pattern = r"(Sector[^\d*]\d+\w*)"
text = "Sector 61 Gurgaon, Gurgaon, Haryana"
re.findall(pattern, text)

In [None]:
# Extract the "Sector <n>" token from each address using the pattern defined
# in the previous cell. The frame in this notebook is `data`; `flats` is not
# defined anywhere here and raised NameError on a fresh run.
data["sector"] = data["address"].str.extract(pattern)
data[["address", "sector"]].sample(7)

## Categories of agePossession
0-1 year old / within 3 months / within 6 months / under construction: new<br>
1-5 year old: relatively new<br>
5-10 year old: moderately old<br>
10+ year old: old<br>
any other value where possession is handed over in a future year: new<br>

In [None]:
# List the distinct agePossession labels that fix_age_possession has to handle.
data["agePossession"].value_counts()

In [None]:
def fix_age_possession(value):
    """Collapse a raw agePossession label into a coarse age category.

    Returns NaN for missing or "undefined" labels, one of "new",
    "relatively new", "moderately old", "old" for recognised labels, and
    None when the label matches none of the rules.
    """
    if pd.isna(value) or value == "undefined":
        return np.nan

    fixed_labels = {
        "0 to 1 Year Old": "new",
        "Within 6 months": "new",
        "Within 3 months": "new",
        "Under Construction": "new",
        "1 to 5 Year Old": "relatively new",
        "5 to 10 Year Old": "moderately old",
        "10+ Year Old": "old",
    }
    if value in fixed_labels:
        return fixed_labels[value]

    # Remaining labels: contains "-" or "By", or ends in a number -> "new".
    last_token = value.split(" ")[-1]
    if "-" in value or "By" in value or last_token.isnumeric():
        return "new"
    return None

In [None]:
# Map every raw agePossession label to its coarse category.
data["possession_category"] = data["agePossession"].apply(fix_age_possession)

In [None]:
# Spot-check the mapping on a random sample.
data[["agePossession", "possession_category"]].sample(7)

In [None]:
# Distinct categories produced; a None here means a label matched no rule in fix_age_possession.
data["possession_category"].unique()

## furnishDetails

In [None]:
# Look at the raw furnishDetails strings before parsing them.
data["furnishDetails"].value_counts()

In [None]:
# Collect every comma-separated entry from the distinct furnishDetails strings,
# after stripping the list punctuation ([, ], ').
all_furnishings = []
for detail in data["furnishDetails"].unique():
    if pd.isna(detail):
        continue
    stripped = detail.replace("[", "").replace("]", "").replace("'", "")
    all_furnishings.extend(stripped.split(","))

# Each entry looks like "<token> <name>"; keep only the name part
# (the second capture group) to get the set of distinct furnishing items.
pattern = r"(\S+) (\w+\s?\w*)"
furnishing_set = set()
for raw_item in all_furnishings:
    found = re.findall(pattern, raw_item.strip())
    if found:
        furnishing_set.add(found[0][-1])

print(len(furnishing_set))

In [None]:
# Row/column count before the per-furnishing count columns are added.
data.shape

In [None]:
def get_furnishing_count(details, furnishing):
    """Return how many of `furnishing` the `details` string lists.

    Rules, in order:
    * "No <furnishing>" appears in `details`  -> 0
    * "<digits> <furnishing>" appears         -> that number
    * `furnishing` appears without a count    -> 1
    * anything else, missing `details`, or a non-string `furnishing` -> 0

    Parameters
    ----------
    details : str or NaN
        Raw `furnishDetails` text.
    furnishing : str
        Item name to look for.

    Returns
    -------
    int
    """
    if isinstance(furnishing, str) and not pd.isna(details):
        if f"No {furnishing}" in details:
            return 0
        # Raw f-string keeps \d a regex token rather than a string escape;
        # re.escape makes the item name match literally.
        match = re.search(rf"(\d+) {re.escape(furnishing)}", details)
        if match:
            return int(match.group(1))
        elif furnishing in details:
            return 1
    return 0

# Sort so the furnishing columns are created in the same order on every run;
# plain list(set) order can change between kernel sessions.
furnishing_list = sorted(furnishing_set)

In [None]:
# One count column per furnishing item, parsed from the furnishDetails text.
for furnishing in furnishing_list:
    data[furnishing] = data["furnishDetails"].apply(get_furnishing_count, args=(furnishing,))

In [None]:
# Working frame for clustering: the raw text plus one count column per furnishing item.
furnishing_df = data[["furnishDetails"] + furnishing_list]

#### Applying KMeans to furnishing_df

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Keep only the numeric count columns; the raw text cannot be scaled or clustered.
furnishing_df2 = furnishing_df.drop(columns="furnishDetails")

In [None]:
# Standardise the count columns before running KMeans on them.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(furnishing_df2)

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record the inertia of each fit.
inertia = []

for i in range(1,10):
    # Fixed seed so the curve is identical on every Restart & Run All.
    kmeans = KMeans(n_clusters=i, init="k-means++", random_state=42)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

In [None]:
import matplotlib.pyplot as plt

# Inertia against number of clusters, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(range(1,10), inertia, marker="o")
ax.set_title("Number of clusters vs. Inertia")
ax.set_xlabel("num_clusters")
ax.set_ylabel("Inertia")
ax.grid()
plt.show()

In [None]:
# Final clustering with 3 clusters (presumably read off the elbow plot — TODO confirm).
# Fixed seed so each row gets the same furnishingType label on every run.
# NOTE(review): the labels 0/1/2 are cluster ids, not an ordered scale; which
# id corresponds to which furnishing level has to be checked by inspection.
kmeans = KMeans(n_clusters=3, init="k-means++", random_state=42)
kmeans.fit(scaled_data)
cluster_labels = kmeans.predict(scaled_data)
data["furnishingType"] = cluster_labels

In [None]:
# Spot-check which furnishing descriptions landed in which cluster.
data[["furnishDetails", "furnishingType"]].sample(7)

## Features

In [None]:
import ast
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
def _parse_feature_list(raw):
    """Turn the text form of a list into a real list; anything else becomes []."""
    if pd.isnull(raw) or not raw.startswith("["):
        return []
    return ast.literal_eval(raw)

# each `features` entry is the string representation of a list; parse it into an actual list
data["features_list"] = data["features"].apply(_parse_feature_list)

In [None]:
# use MultiLabelBinarizer to convert features list into a binary matrix
# (one 0/1 column per distinct feature label)
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit_transform(data["features_list"])

# Reuse data's index so values computed on features_df line up row-for-row
# when they are assigned back onto data.
features_df = pd.DataFrame(binary_matrix, columns=mlb.classes_, index=data.index)
features_df.head()

In [None]:
# Hand-assigned luxury weight for each feature label; a higher weight means a
# larger contribution to a listing's luxury score. The values below range
# from 4 to 10.
# get_luxury_score looks features up by exact key, so differently spelled
# labels (e.g. 'Piped Gas' and 'Piped-gas') are separate entries.
weights = {
    '24/7 Power Backup': 8,
    '24/7 Water Supply': 4,
    '24x7 Security': 7,
    'ATM': 4,
    'Aerobics Centre': 6,
    'Airy Rooms': 8,
    'Amphitheatre': 7,
    'Badminton Court': 7,
    'Banquet Hall': 8,
    'Bank Attached Property' : 7,
    'Bar/Chill-Out Lounge': 9,
    'Barbecue': 7,
    'Basketball Court': 7,
    'Billiards': 7,
    'Bowling Alley': 8,
    'Business Lounge': 9,
    'CCTV Camera Security': 8,
    'Cafeteria': 6,
    'Car Parking': 6,
    'Card Room': 6,
    'Centrally Air Conditioned': 9,
    'Changing Area': 6,
    "Children's Play Area": 7,
    'Cigar Lounge': 9,
    'Clinic': 5,
    'Club House': 9,
    'Club house / Community Center': 6,
    'Concierge Service': 9,
    'Conference room': 8,
    'Creche/Day care': 7,
    'Cricket Pitch': 7,
    'Doctor on Call': 6,
    'Earthquake Resistant': 5,
    'Entrance Lobby': 7,
    'False Ceiling Lighting': 6,
    'Feng Shui / Vaastu Compliant': 5,
    'Fire Fighting Systems': 8,
    'Fitness Centre / GYM': 8,
    'Flower Garden': 7,
    'Food Court': 6,
    'Foosball': 5,
    'Football': 7,
    'Fountain': 7,
    'Gated Community': 7,
    'Golf Course': 10,
    'Grocery Shop': 6,
    'Gymnasium': 8,
    'High Ceiling Height': 8,
    'High Speed Elevators': 8,
    'Infinity Pool': 9,
    'Intercom Facility': 7,
    'Internal Street Lights': 6,
    'Internet/wi-fi connectivity': 7,
    'Jacuzzi': 9,
    'Jogging Track': 7,
    'Landscape Garden': 8,
    'Laundry': 6,
    'Lawn Tennis Court': 8,
    'Library': 8,
    'Lounge': 8,
    'Low Density Society': 7,
    'Maintenance Staff': 6,
    'Manicured Garden': 7,
    'Medical Centre': 5,
    'Milk Booth': 4,
    'Mini Theatre': 9,
    'Multipurpose Court': 7,
    'Multipurpose Hall': 7,
    'Natural Light': 8,
    'Natural Pond': 7,
    'Park': 8,
    'Party Lawn': 8,
    'Piped Gas': 7,
    'Piped-gas' : 7,
    'Pool Table': 7,
    'Power Back up Lift': 8,
    'Private Garden / Terrace': 9,
    'Property Staff': 7,
    'RO System': 7,
    'Rain Water Harvesting': 7,
    'Reading Lounge': 8,
    'Restaurant': 8,
    'Salon': 8,
    'Sauna': 9,
    'Security / Fire Alarm': 9,
    'Security Personnel': 9,
    'Separate entry for servant room': 8,
    'Sewage Treatment Plant': 6,
    'Shopping Centre': 7,
    'Skating Rink': 7,
    'Solar Lighting': 6,
    'Solar Water Heating': 7,
    'Spa': 9,
    'Spacious Interiors': 9,
    'Squash Court': 8,
    'Steam Room': 9,
    'Sun Deck': 8,
    'Swimming Pool': 8,
    'Temple': 5,
    'Theatre': 9,
    'Toddler Pool': 7,
    'Valet Parking': 9,
    'Video Door Security': 9,
    'Visitor Parking': 7,
    'Water Softener Plant': 7,
    'Water Storage': 7,
    'Water purifier': 7,
    'Waste Disposal' : 9,
    'Yoga/Meditation Area': 7,
    'Recently Renovated' : 9,
    'No open drainage around' : 9,
    'Lift(s)' : 7,
    'Water softening plant' : 9,
    'Power Back-up' : 8
}

def get_luxury_score(row):
    """Add up the `weights` entries of every column whose value in `row` is 1.

    `row` is any mapping-like object with `.items()` (e.g. a DataFrame row).
    Raises KeyError if a column equal to 1 has no entry in `weights`.
    """
    total = 0
    for feature, value in row.items():
        if value == 1:
            total += weights[feature]
    return total

# Row-wise luxury score. NOTE: once this runs, features_df also holds the
# non-feature 'score' column, which get_luxury_score will iterate over on any
# later apply; a score equal to 1 would be looked up in `weights` and fail.
features_df['score'] = features_df.apply(get_luxury_score, axis=1)

In [None]:
# Reuse the score already computed on features_df. Re-applying
# get_luxury_score here would also scan the non-feature 'score' column and
# repeat the whole row-wise pass for the same result.
data["luxury_score"] = features_df["score"]

In [None]:
# Row/column count after all engineered columns have been added.
data.shape

In [None]:
# List the columns to decide which helper/intermediate ones to drop below.
data.columns

In [None]:
# Drop the per-item furnishing count columns; they are summarised by
# furnishingType. ("Stove" was listed twice; each label appears once here.)
# NOTE(review): this hard-coded list presumably mirrors furnishing_list —
# confirm they agree, otherwise count columns may be left behind or the drop
# raises KeyError.
data.drop(
    columns=[
        "Fan", "Geyser", "Microwave", "Modular Kitchen", "Bed", "Wardrobe",
        "Chimney", "AC", "Stove", "Fridge", "Washing Machine", "Water Purifier",
        "Sofa", "Dining Table", "Curtains", "Light", "Exhaust Fan", "TV",
    ],
    inplace=True,
)

In [None]:
# Drop raw text / helper columns. furnishDetails, features, features_list and
# additionalRoom have been turned into derived columns above; nearbyLocations
# is not used anywhere in this notebook.
data.drop(columns=["nearbyLocations", "furnishDetails", "features", "features_list", "additionalRoom"], inplace=True)

In [None]:
# agePossession is replaced by possession_category; plot_area was merged into builtup_area.
# NOTE(review): "builtup2" is not created anywhere in this notebook — presumably
# it comes from the input CSV; confirm, otherwise this drop raises KeyError.
data.drop(columns=["agePossession", "builtup2", "plot_area"], inplace=True)

In [None]:
# Final row/column count after dropping the intermediate columns.
data.shape

In [None]:
# Export the cleaned frame as CSV in the working directory; index=False keeps
# the row index out of the file.
data.to_csv("gurgaon_properties_cleaned_v1.csv", index=False)