# Data Exploration and Cleaning

## Loading Data:
- Used Python libraries (Pandas, NumPy) to load the data into Jupyter Notebook.

## Data Cleaning:
- Handling missing values (imputation or removal).
- Removing duplicates.
- Correcting data types.
- Outlier detection and treatment.



In [1]:
# loading the data from csv to dataframe
import pandas as pd
import numpy as np

houses_df = pd.read_csv("/Users/guliaharsh021/Downloads/DA Documents /Projects/Project 1/Data Collection/Property Data/Houses Data/houses_data.csv")

In [2]:
# viewing the first 10 rows of flats data

houses_df.head()

Unnamed: 0,Property Name,Property Title,Property Type,City/Locality,BHK,Property Size,Furnishing,Price Total,Price per Sqft
0,,1 BHK House for Sale in Narela New Delhi,"Residential-House,Villa",,1.0,25 sqyrd,Unfurnished,₹7 Lac,"₹3,111 per sqft"
1,,1 BHK House for Sale in Uttam Nagar West New ...,"Residential-House,Villa",,1.0,25 sqyrd,Unfurnished,₹11 Lac,"₹4,889 per sqft"
2,,1 BHK House for Sale in Najafgarh New Delhi,"Residential-House,Villa",,1.0,28 sqyrd,Semi-Furnished,₹7 Lac,"₹2,778 per sqft"
3,,1 BHK House for Sale in Jona Pur New Delhi,"Residential-House,Villa",,1.0,25 sqyrd,1,₹8 Lac,"₹3,556 per sqft"
4,Ashima Floors 5,"1 BHK House for Sale in Ashima Floors 5, Geet...","Residential-House,Villa","1 BHK House for Sale in Ashima Floors 5, Geet...",1.0,50 sqft,Ashima Floors 5,₹10 Lac,


In [3]:
# total number of rows 
houses_df.shape[0]

12023

In [4]:
# converting the "BHK" column to integer type
houses_df['BHK'] = houses_df['BHK'].astype(int)
houses_df.head(5)

Unnamed: 0,Property Name,Property Title,Property Type,City/Locality,BHK,Property Size,Furnishing,Price Total,Price per Sqft
0,,1 BHK House for Sale in Narela New Delhi,"Residential-House,Villa",,1,25 sqyrd,Unfurnished,₹7 Lac,"₹3,111 per sqft"
1,,1 BHK House for Sale in Uttam Nagar West New ...,"Residential-House,Villa",,1,25 sqyrd,Unfurnished,₹11 Lac,"₹4,889 per sqft"
2,,1 BHK House for Sale in Najafgarh New Delhi,"Residential-House,Villa",,1,28 sqyrd,Semi-Furnished,₹7 Lac,"₹2,778 per sqft"
3,,1 BHK House for Sale in Jona Pur New Delhi,"Residential-House,Villa",,1,25 sqyrd,1,₹8 Lac,"₹3,556 per sqft"
4,Ashima Floors 5,"1 BHK House for Sale in Ashima Floors 5, Geet...","Residential-House,Villa","1 BHK House for Sale in Ashima Floors 5, Geet...",1,50 sqft,Ashima Floors 5,₹10 Lac,


In [5]:
lst = houses_df['Property Size'].str[-4:]

In [6]:
print(lst.unique())

['qyrd' 'sqft' nan 'of 3' '2' 'ment' 'sale' '1' 'ound' ' sqm' 'ntha'
 'Move' 'arla' 'shed' '3' 'ears' 'of 2' '4' '5' 'acre']


In [7]:
df = pd.DataFrame(lst)

In [8]:
grouped_df = df.groupby('Property Size').size().reset_index(name='Count')
grouped_df

Unnamed: 0,Property Size,Count
0,sqm,1859
1,1,1
2,2,4
3,3,4
4,4,1
5,5,1
6,Move,14
7,acre,1
8,arla,3
9,ears,1


In [9]:
# Filtering rows where 'Property Size' contains 'sqft', 'qyrd' or 'sqm'
houses_df = houses_df[houses_df['Property Size'].str.contains('sqft|qyrd|sqm', regex=True, na=False)]
houses_df.shape[0]

11956

In [10]:
# Function to convert values to sqft
def convert_to_sqft(value):
    if pd.isna(value):
        return None
    elif 'sqft' in value:
        return float(value.split()[0].replace(',', '')) 
    elif 'qyrd' in value:
        return float(value.split()[0].replace(',', '')) * 9  
    elif 'sqm' in value:
        return float(value.split()[0].replace(',', '')) * 10.7639 
    else:
        return None 

houses_df['Property Size'] = houses_df['Property Size'].apply(convert_to_sqft)

In [12]:
houses_df.head()

Unnamed: 0,Property Name,Property Title,Property Type,City/Locality,BHK,Property Size,Furnishing,Price Total,Price per Sqft
0,,1 BHK House for Sale in Narela New Delhi,"Residential-House,Villa",,1,225.0,Unfurnished,₹7 Lac,"₹3,111 per sqft"
1,,1 BHK House for Sale in Uttam Nagar West New ...,"Residential-House,Villa",,1,225.0,Unfurnished,₹11 Lac,"₹4,889 per sqft"
2,,1 BHK House for Sale in Najafgarh New Delhi,"Residential-House,Villa",,1,252.0,Semi-Furnished,₹7 Lac,"₹2,778 per sqft"
3,,1 BHK House for Sale in Jona Pur New Delhi,"Residential-House,Villa",,1,225.0,1,₹8 Lac,"₹3,556 per sqft"
4,Ashima Floors 5,"1 BHK House for Sale in Ashima Floors 5, Geet...","Residential-House,Villa","1 BHK House for Sale in Ashima Floors 5, Geet...",1,50.0,Ashima Floors 5,₹10 Lac,


In [13]:
houses_df = houses_df.rename(columns={'Property Size': 'Property Size (sqft)'})
houses_df['Property Size (sqft)'] = houses_df['Property Size (sqft)'].astype(int)
houses_df['Property Size (sqft)']

0         225
1         225
2         252
3         225
4          50
         ... 
12018    7200
12019    4500
12020     492
12021    4500
12022    2565
Name: Property Size (sqft), Length: 11956, dtype: int64

In [14]:
# Filtering rows where 'Furnshing' contains 'Semi-Furnished', 'Unfurnished' or 'Furnished'
houses_df = houses_df[houses_df['Furnishing'].str.contains('Semi-Furnished|Unfurnished|Furnished', regex=True, na=False)]

In [15]:
# checking the data
lst = houses_df['Furnishing']
print(lst.unique())

['Unfurnished' 'Semi-Furnished' 'Furnished']


In [16]:
houses_df.shape[0]

6470

In [17]:
# Renaming the 'Price Total' column to 'Price (INR)'
houses_df = houses_df.rename(columns={'Price Total': 'Price (INR)'})

# Function to clean and convert 'Price (INR)' to numeric
def clean_and_convert_price(price):
    if pd.isna(price) or price == 'NA':  
        return None
    cleaned_price = ''.join(filter(str.isdigit, str(price)))
    if cleaned_price == '':
        return None
    return float(cleaned_price) * 100000 

# Clean and convert 'Price (INR)' column
houses_df['Price (INR)'] = houses_df['Price (INR)'].apply(clean_and_convert_price)

houses_df.head()

Unnamed: 0,Property Name,Property Title,Property Type,City/Locality,BHK,Property Size (sqft),Furnishing,Price (INR),Price per Sqft
0,,1 BHK House for Sale in Narela New Delhi,"Residential-House,Villa",,1,225,Unfurnished,700000.0,"₹3,111 per sqft"
1,,1 BHK House for Sale in Uttam Nagar West New ...,"Residential-House,Villa",,1,225,Unfurnished,1100000.0,"₹4,889 per sqft"
2,,1 BHK House for Sale in Najafgarh New Delhi,"Residential-House,Villa",,1,252,Semi-Furnished,700000.0,"₹2,778 per sqft"
6,,1 BHK House for Sale in Shiv Vihar West New D...,"Residential-House,Villa",,1,225,Unfurnished,700000.0,"₹3,111 per sqft"
7,,1 BHK House for Sale in Loni Ghaziabad,"Residential-House,Villa",,1,225,Unfurnished,,


In [19]:
# Drop rows with NaN values after conversion
houses_df = houses_df.dropna(subset=['Price (INR)', 'Property Size (sqft)'])

# Convert to integer type
houses_df['Price (INR)'] = houses_df['Price (INR)'].astype(int)

In [20]:
houses_df.shape[0]

6015

In [21]:
# Renaming the 'Price per Sqft' column to 'Price (per sqft)'
houses_df = houses_df.rename(columns={'Price per Sqft': 'Price (per sqft)'})

# Clean and convert 'Price (per sqft)' column
houses_df['Price (per sqft)'] = houses_df['Price (INR)'] / houses_df['Property Size (sqft)']

houses_df.head()

Unnamed: 0,Property Name,Property Title,Property Type,City/Locality,BHK,Property Size (sqft),Furnishing,Price (INR),Price (per sqft)
0,,1 BHK House for Sale in Narela New Delhi,"Residential-House,Villa",,1,225,Unfurnished,700000,3111.111111
1,,1 BHK House for Sale in Uttam Nagar West New ...,"Residential-House,Villa",,1,225,Unfurnished,1100000,4888.888889
2,,1 BHK House for Sale in Najafgarh New Delhi,"Residential-House,Villa",,1,252,Semi-Furnished,700000,2777.777778
6,,1 BHK House for Sale in Shiv Vihar West New D...,"Residential-House,Villa",,1,225,Unfurnished,700000,3111.111111
28,,"1 BHK House for Sale in Ballabhgarh, Mathura ...","Residential-House,Villa","1 BHK House for Sale in Ballabhgarh, Mathura ...",1,360,Furnished,1400000,3888.888889


In [23]:
# rounding off the values in column "Price (per sqft)"
houses_df['Price (per sqft)'] = houses_df['Price (per sqft)'].astype(int)
houses_df.head()

Unnamed: 0,Property Name,Property Title,Property Type,City/Locality,BHK,Property Size (sqft),Furnishing,Price (INR),Price (per sqft)
0,,1 BHK House for Sale in Narela New Delhi,"Residential-House,Villa",,1,225,Unfurnished,700000,3111
1,,1 BHK House for Sale in Uttam Nagar West New ...,"Residential-House,Villa",,1,225,Unfurnished,1100000,4888
2,,1 BHK House for Sale in Najafgarh New Delhi,"Residential-House,Villa",,1,252,Semi-Furnished,700000,2777
6,,1 BHK House for Sale in Shiv Vihar West New D...,"Residential-House,Villa",,1,225,Unfurnished,700000,3111
28,,"1 BHK House for Sale in Ballabhgarh, Mathura ...","Residential-House,Villa","1 BHK House for Sale in Ballabhgarh, Mathura ...",1,360,Furnished,1400000,3888


In [24]:
houses_df['Property Title'] = houses_df['Property Title'].str.replace(r'new delhi', 'Delhi')

In [25]:
houses_df['Property Title'] = houses_df['Property Title'].str.replace(r'delhi', 'Delhi')

In [26]:
houses_df['Property Title'] = houses_df['Property Title'].str.replace(r'New Delhi', 'Delhi')
houses_df['Property Title'] = houses_df['Property Title'].str.replace(r'-', ' ')

In [28]:
houses_df['Property Title'] = houses_df['Property Title'].str.replace(r'gurgaon', 'Gurugram')

In [29]:
# Dictionary of localities
Localities = {
    'Delhi': [
        'Adarsh Nagar', 'Ashok Vihar', 'Bawana', 'Begum Pur', 'Haqiqat Nagar', 'Karala', 'Keshav Puram', 'Narela', 'Pitam Pura', 'Rohini', 'Rani Bagh', 'Shalimar Bagh', 'Shastri Nagar', 'Azadpur', 'Civil Lines', 'Derawal Nagar', 'Gulabi Bagh', 'Kamla Nagar', 'Kashmiri Gate', 'Daryaganj', 'Model Town', 'Sadar Bazaar', 'Sarai Rohilla', 'Shakti Nagar', 'Tis Hazari', 'Timarpur', 'Wazirabad', 'GTB Nagar', 'Urdu Bazaar', 'Mukherjee Nagar', 'Majnu ka tilla', 'Babarpur', 'Bhajanpura', 'Dayal Pur', 'Dilshad Garden', 'Karawal Nagar', 'Naveen Shahdara', 'Nand Nagri', 'Shahdara', 'Shastri Park', 'Seelampur', 'Yamuna Vihar', 'Central Delhi', 'Ashok Nagar', 'Chandni Chowk', 'Civil Lines', 'Daryaganj', 'Dariba Kalan', 'Karol Bagh', 'Old Delhi', 'Shastri Nagar', 'South Patel Nagar', 'Sadar Bazaar', 'Paharganj', 'Rajender Nagar', 'Barakhamba Road', 'Chanakyapuri', 'Connaught Place', 'Gole Market', 'Golf Links, New Delhi', 'INA Colony', 'Inder Puri', 'Jaffrabad', 'Laxmibai Nagar', 'Lodhi Colony', "Lutyens' Delhi", 'Mahipalpur', 'New Delhi', 'Pragati Maidan', 'Raisina Hill', 'Rajendra Place', 'East Vinod Nagar', 'Krishna Nagar', 'Laxmi Nagar', 'Mayur Vihar', 'Pandav Nagar', 'Preet Vihar', 'Anand Vihar', 'Shreshtha Vihar', 'Vivek Vihar', 'Vasundhara Enclave', 'Geetanjali Enclave', 'Green Park', 'Gulmohar Park', 'Hauz Khas', 'Khanpur', 'Kailash Colony', 'Malviya Nagar', 'Maharani Bagh', 'Moti Bagh', 'New Moti Bagh', 'Mehrauli', 'Munirka', 'Netaji Nagar', 'Pamposh Enclave', 'Safdarjung Enclave', 'Sainik Farm', 'Saket', 'Sarojini Nagar', 'Sarvodaya Enclave', 'Shaheen Bagh', 'Siri Fort', 'South Extension', 'Shahpur Jat', 'Sriniwaspuri', 'Ashram Chowk', 'Lodhi Colony', 'Khan Market', 'Netaji Nagar', 'Nizamuddin East', 'Nizamuddin West', 'Sarai Kale Khan', 'Jangpura', 'Defence Colony', 'Lajpat Nagar', 'New Friends Colony', 'Nehru Place', 'Chittaranjan Park', 'Govindpuri', 'Greater Kailash', 'Okhla', 'Sarita Vihar', 'Sarai Kale Khan', 'Tughlaqabad', 'Badarpur', 'Pul Pehladpur', 'Ber Sarai', 'Dabri, New Delhi', 'Dashrath Puri', 'Dwarka Sub City', 'Delhi Cantonment', 'Dhaula Kuan', 'Ghitorni', 'Inderpuri', 'Janakpuri', 'Mahipalpur', 'Moti Bagh', 'Munirka', 'Najafgarh', 'Naraina', 'Palam', 'Rama Krishna Puram', 'Sagar Pur', 'Sarojini Nagar', 'Vasant Kunj', 'Vasant Vihar', 'Kalkaji', 'Ashok Nagar', 'Bali Nagar', 'Fateh Nagar', 'Kirti Nagar', 'Meera Bagh', 'Mayapuri', 'Moti Nagar', 'Nangloi Jat', 'Paschim Vihar', 'Patel Nagar', 'Punjabi Bagh', 'Rajouri Garden', 'Shivaji Place', 'Shadipur Depot', 'Shiv Ram Park', 'Tihar Village', 'Tilak Nagar', 'Tikri Kalan', 'Vikas Nagar', 'Vikaspuri', 'West Patel Nagar', 'Uttam Nagar', 'New Delhi'],
    'Faridabad': [
        'Ajronda Chowk', 'Ashoka Enclave Part 3', 'Agwanpur', 'Anangpur Dairy', 'Ajit Nagar', 'Ankhir', 'Ajronda', 'Ashoka Enclave', 'Badkhal Chowk', 'Ballabhgarh', 'BPTP Parkland', 'Basantpur', 'Charmwood Village', 'Chawla Colony', 'Dabuwa Colony', 'Dayal Bagh', 'Dabua Colony', 'Dayal Basti', 'Dav College', 'Friends Colony', 'Gandhi Colony', 'Ghazipur', 'Gurukul Basti', 'Gurukul Road', 'Greenfield Colony', 'Gopi Colony', 'Hardware Colony', 'Industrial Area', 'Independant Kothi', 'Indraprastha Colony', 'Ismailpur Road', 'Jawahar Colony', 'Jeevan Nagar', 'Katan Pahari', 'Kheri Road', 'Lakkarpur', 'Mewala Maharajpur', 'Mujesar', 'Mathura Road', 'Parvatiya Colony', 'Raveev Nagar', 'Sainik Colony', 'Sector 12', 'Sector 15', 'Sector 17', 'Sector 20', 'Sector 21C', 'Sector 23 A', 'Sector 27/A', 'Sector 30', 'Sector 35', 'Sector 4', 'Sector 46', 'Sector 55', 'Sector 6', 'Sector 7', 'Sector 54', 'Sector 75', 'Sector 78', 'Sector 84', 'Sector 88', 'SGM Nagar', 'Suraj Kund', 'Sector 28', 'Sector 15A', 'Sector 18', 'Sector 21 D', 'Sector 24', 'Sector 27', 'Sector 32', 'Sector 36', 'Sector 41', 'Sector 48', 'Sector 56', 'Sector 62', 'Sector 8', 'Sector 31', 'Sector 63', 'Sector 80', 'Sector 85', 'Sector 89', 'Suraj Kund Road', 'Sector 10', 'Sector 13', 'Sector 16', 'Sector 19', 'Sector 21A', 'Sector 22', 'Sector 29', 'Sector 33', 'Sector 37', 'Sector 42', 'Sector 5', 'Sector 58', 'Sector 64', 'Sector 9', 'Sector 43', 'Sector 70', 'Sector 76', 'Sector 81', 'Sector 86', 'Shastri Colony', 'Surya Nagar', 'Sector 11', 'Sector 14', 'Sector 16A', 'Sector 2', 'Sector 21B', 'Sector 23', 'Sector 25', 'Sector 3', 'Sector 34', 'Sector 39', 'Sector 45', 'Sector 52', 'Sector 59', 'Sector 65', 'Sector 91', 'Sector 49', 'Sector 72', 'Sector 77', 'Sector 82', 'Sector 87', 'Sehatpur', 'Spring Field Colony', 'Tikawali', 'Vinay Nagar', 'Yadav Colony'    ],
    'Ghaziabad': [
        'Abhay Khand', 'Ankur Vihar', 'Ahinsa Khand 1', 'Ansals Chiranjiv Vihar', 'Ahinsa Khand 2', 'Avantika', 'Ambedkar Road', 'Bhim Nagar', 'Bhram Puri', 'Bhuapur', 'Chander Nagar', 'Crossings Republik Road', 'National Highway 24, Crossings Republik', 'Dundahera', 'Dasna', 'Govindpuram', 'Gyan Khand 3', 'GT Road', 'Gyan Khand IV', 'Gyan Khand I', 'Gyan Khand II', 'Harbans Nagar', 'Harsaon', 'Indirapuram', 'Judges Enclave', 'Kamla Nehru Nagar', 'Kaushambi', 'Kavi Nagar', 'Lajpat Nagar', 'Lal Kuan', 'Lohia Nagar', 'Loni', 'Madhopura', 'Marium Nagar', 'Madhuban Bapudham', 'Model Town', 'Maliwara', 'Mohan Nagar', 'Nandgram', 'Nehru Nagar III', 'Niti Khand I', 'Nyay Khand 2', 'Naya Ganj', 'NH 24', 'Niti Khand II', 'Nyay Khand III', 'Neelmani Colony', 'NH 24 Bypass', 'Niti Khand 3', 'Nehru Nagar II', 'NH 58', 'Nyay Khand I', 'Panchsheel Enclave', 'Pandav Nagar Industrial Area', 'Patel Nagar', 'Pratap Vihar', 'Ramprastha', 'Raispur', 'Raj Nagar', 'Raj Nagar Extension', 'Ramprastha', 'Sadiqpur', 'Sewa Nagar', 'Shakti Khand III', 'Shalimar Garden Extension II', 'Surya Nagar', 'Sahibabad', 'Shahpur Bamheta', 'Shakti Khand IV', 'Shastri Nagar', 'Swaran Jyanti Puram', 'Sanjay Nagar', 'Shakti Khand I', 'Shalimar Garden', 'Shatabdipuram', 'Sehani Khurd', 'Shakti Khand II', 'Shalimar Garden Extention I', 'Vaishali', 'Vaishali Sector 1', 'Vaishali Sector 5', 'Vasundhara Sector 12', 'Vasundhara Sector 16', 'Vasundhara Sector 2C', 'Vasundhara Sector 6', 'Vijay Nagar', 'Vasundhara', 'Vaishali Sector 2', 'Vasundhara Sector 1', 'Vasundhara Sector 13', 'Vasundhara Sector 17', 'Vasundhara Sector 3', 'Vasundhara Sector 7', 'Vaibhav Khand', 'Vaishali Sector 3', 'Vasundhara Sector 10', 'Vasundhara Sector 14', 'Vasundhara Sector 4', 'Vasundhara Sector 8', 'Vaishali Extension', 'Vaishali Sector 4', 'Vasundhara Sector 11', 'Vasundhara Sector 15', 'Vasundhara Sector 19', 'Vasundhara Sector 5', 'Vasundhara Sector 9', 'Daulatpura', 'Nasirpur'    ],
    'Gurugram': [
        'Sector 56', 'Sector 50', 'Sector 48', 'Sohna Sector 35', 'Sector 53', 'Sector 42', 'Sector 31', 'Sector 81', 'Sector 83', 'Sector 65', 'Sector 86', 'Sector 92', 'Sector 58', 'Sector 59', 'Sushant Lok Phase 2', 'Sector 80', 'Sector 3', 'DLF Phase 2', 'Sector 99', 'Sector 85', 'Sector 9', 'Sector 4', 'Sohna Sector 14', 'Sector 95A', 'Sector 10', 'Sector 76', 'Sector 9 A', 'Dwarka Expressway', 'Palam Vihar Extension', 'Sohna Sector 11', 'Sector 54', 'Sector 49', 'Sohna Road', 'Sector 52', 'Golf Course Road', 'Palam Vihar', 'Sector 24', 'Sector 15', 'Sector 57', 'South City', 'Sector 46', 'DLF Phase 4', 'Sector 72', 'Sector 79', 'Sector 1', 'Sector 41', 'Sohna Sector 33', 'Sector 14', 'Sector 23', 'Sector 40', 'Sector 108', 'Sushant Lok Phase 3', 'DLF Phase 5', 'Sector 104', 'Sector 89', 'Sector 5', 'Sector 63', 'Sector 78', 'Sector 93', 'Sohna Sector 36', 'Sector 43', 'Sector 95', 'Sector 27', 'Sector 69', 'Sohna', 'Sector 66', 'Sector 22', 'Sector 67', 'Sector 37', 'Sector 110 A', 'Sector 26', 'Sector 99A', 'Golf Course Extension', 'Sector 62', 'Sector 7', 'Sector 90', 'Sector 71', 'Sector 55', 'Sector 17', 'Sector 70', 'Sector 39', 'Sector 3(New Palam Vihar) MG Road', 'Sector 91', 'Sector 77', 'Sector 10 A', 'Sector 105', 'Sector 32', 'Sector 25', 'Sector 47', 'DLF Phase 3', 'Sector 61', 'Sector 33', 'Sushant Lok I', 'Sector 51', 'Sector 82', 'Sector 45', 'Sector 23 A', 'Sector 103', 'Sector 28', 'Sector 70', 'Sector 38', 'Sector 30', 'Sector 109', 'Sector 21', 'Sector 68', 'Sector 111', 'Laxman Vihar', 'DLF Phase 1', 'Sector 107', 'Sector 102', 'Sector 84', 'Patel Nagar', 'Sector 112', 'Ashok Vihar Phase II NH8', 'Sector 82 A', 'Old Delhi Gurgaon Road', 'Sohna Sector 2'    ],
    'Noida': [
        'Noida Extension', 'Sector 62', 'Gaur City 1 Extension, Greater Noida', 'Sector 44', 'Sector 73', 'Sector 78', 'Sector 75', 'Noida Greater Noida Expressway', 'Sector 137', 'Sector 50', 'Sector 150', 'Sector 15', 'Sector 37', 'Sector 71', 'Sector 76', 'Sector 18', 'Sector 16', 'Sector 74', 'Sector 135', 'Sector 121', 'Sector 51', 'Sector 45', 'Sector 63', 'Sector 70', 'Sector 19', 'Sector 49', 'Sector 12', 'Sector 47', 'Sector 52', 'Sector 34', 'Sector 61', 'Sector 62A', 'Sector 79', 'Sector 128', 'Sector 29', 'Sector 27', 'Sector 41', 'Sector 22', 'Wave City Center, Sector 32', 'Sector 20', 'Sector 123', 'Lotus Boulevard, Sector 100', 'Sector 125', 'Sector 122', 'Sector 77', 'Sector 144', 'Sector 120', 'Sector 40', 'Sector 11', 'Sector 100', 'Sector 72', 'Sector 30', 'ATS Greens Village, Sector 93A', 'Sector 107', 'Sector 56', 'Sector 15A', 'Sector 115', 'Sector 82', 'Sector 132', 'Sector 99', 'Sector 93', 'Sector 55', 'Sector 127', 'Sector 26', 'Sector 119', 'Sector 143', 'Sector 1', 'Sector 53', 'Sector 105', 'Sector 39', 'Sector 59', 'Sector 126', 'Sector 54', 'Sector 63 A', 'Sector 108', 'Sector 81', 'Sector 33', 'Sector 134', 'Sector 104', 'Sector 36', 'Sector 28', 'Amrapali Leisure Valley Extension', 'Sector 25', 'Sector 46', 'Sector 92', 'Sector 110', 'Sector 23', 'Sector 142', 'Sector 66', 'Sector 145', 'Sector 48', 'Sector 58', 'Sector 5', '3C Lotus Zing, Sector 168', 'Sector 93A', 'Sector 168', 'Sector 102', 'Sector 43', 'Sector 35', 'Sector 14', 'Yamaha Vihar Colony, Sector 49', 'JalVayu Vihar', 'Block B, Sector 62', 'Sector 93B', 'Chotpur Colony, Chhajarsi Colony', 'Sector 96', 'Sector 117', 'Sector 124', 'Sector 101', 'BHEL Township, Sector 17', 'Sector 21', 'Sector 118', 'Hoshiyarpur, Sector 51', 'Sector 97', 'Sector 116', 'Sector 31', 'Sector 131', 'Chhalera, Sector 44', 'Sector 98', 'C Block, Sector 62', 'Sector 129', 'Sector 16A', 'Hosiery Complex Road', 'Sector 87', 'Phase 2', 'Parthala Khanjarpur, Sector 122', 'Sector 151', 'Sector 7', 'Sarfabad Village, Sector 73', 'Sector 17', 'Yakubpur', 'Sector 6', 'Sector 3', 'Sector 10', 'Sector 65', 'Ecotech 3, Greater Noida', 'Sector 68', 'Udyog Vihar, Sector 82', 'Sector 80', 'Sector 130', 'Sector   106', 'Sector 149', 'Sector 57', 'Sector 94', 'Sector 2', 'Sector 140', 'Sector 133', 'Sector 136', 'Sector 156', 'Sector 42', 'Nagla Charan Dass, Phase 2', 'Patwari Extension, Greater Noida', 'Sector 146', 'Sector 166', 'Sector 9', 'Sector 112', 'Sector 90', 'Rasoolpur Nawada, Sector 62', 'Devla, Greater Noida', 'Sector 138', 'Sector 32', 'Sector 88', 'Sector 4', 'Pocket A, Sector 105', 'Natthu Colony, Sector 49', 'Shahdara, Sector 141', 'Sector 113', 'Sorkha', 'Khairpur Gurjar Extension, Greater Noida', 'Khajur Colony, Sector 44', 'Sector 147', 'Chhajarsi Colony', 'Phase 2, Sector 62', 'Sector 152', 'Sector 16B', 'Garhi Chaukhandi, Chaukhandi', 'Sector 14A', 'Sector 143B', 'Sector 153', 'Sector 162', 'Chaukhandi', 'D Block, Sector 27', 'Naveen Nagar, Sector 19', 'Sector 94A', 'Sector 83', 'Nagla Nagli', 'Tusiana Village Extension, Greater Noida', 'Ambedkar City', 'Kakrala', 'Pragati Vihar, Sector 62A', 'Greater Noida', 'Yamuna Expressway'
    ]
}

In [31]:
import re

def populate_locality(df, localities_dict):
    for index, row in df.iterrows():
        text = row['Property Title']
        matched_key = None
        matched_value = None
        
        # Search for keys in the text
        for key in localities_dict:
            if re.search(r'\b{}\b'.format(re.escape(key)), text, flags=re.IGNORECASE):
                matched_key = key
                break
        
        # If a key is found, search for its values
        if matched_key:
            values = localities_dict[matched_key]
            for value in values:
                if re.search(r'\b{}\b'.format(re.escape(value)), text, flags=re.IGNORECASE):
                    matched_value = value
                    break
        
        # Assign the matched value to the City/Locality column
        if matched_key and matched_value:
            df.at[index, 'City/Locality'] = f"{matched_value}, {matched_key}"
        else:
            df.at[index, 'City/Locality'] = "No matching locality found."

# Apply the function to populate the City/Locality column
populate_locality(houses_df, Localities)

In [33]:
# Count rows with "No matching locality found."
no_match_count = (houses_df['City/Locality'] == "No matching locality found.").sum()
# Display the count of rows with no matching locality found
print(f"Number of rows with 'No matching locality found.': {no_match_count}")

Number of rows with 'No matching locality found.': 1636


In [37]:
houses_df.drop(houses_df[houses_df['City/Locality'] == 'No matching locality found.'].index, inplace=True)

In [38]:
houses_df.shape[0]

4379

In [40]:
# Dropping columns 'Property Name' and 'Property Title'
houses_df.drop(columns=['Property Name', 'Property Title'], inplace=True)

In [41]:
houses_df.head()

Unnamed: 0,Property Type,City/Locality,BHK,Property Size (sqft),Furnishing,Price (INR),Price (per sqft)
0,"Residential-House,Villa","Narela, Delhi",1,225,Unfurnished,700000,3111
1,"Residential-House,Villa","Uttam Nagar, Delhi",1,225,Unfurnished,1100000,4888
2,"Residential-House,Villa","Najafgarh, Delhi",1,252,Semi-Furnished,700000,2777
28,"Residential-House,Villa","Ballabhgarh, Faridabad",1,360,Furnished,1400000,3888
30,"Residential-House,Villa","Sagar Pur, Delhi",1,450,Unfurnished,2100000,4666


In [42]:
localities_delhi = {
        "North West Delhi": [
        "Adarsh Nagar", "Ashok Vihar", "Bawana", "Begum Pur", "Haqiqat Nagar",
        "Karala", "Keshav Puram", "Narela", "Pitam Pura", "Rohini",
        "Rani Bagh", "Shalimar Bagh", "Shastri Nagar"
    ],
    "North Delhi": [
        "Azadpur", "Civil Lines", "Derawal Nagar", "Gulabi Bagh", "Kamla Nagar",
        "Kashmiri Gate", "Daryaganj", "Model Town", "Narela", "Sadar Bazaar",
        "Sarai Rohilla", "Shakti Nagar", "Tis Hazari", "Timarpur", "Wazirabad",
        "GTB Nagar", "Urdu Bazaar", "Mukherjee Nagar", "Majnu ka tilla"
    ],
    "North East Delhi": [
        "Babarpur", "Bhajanpura", "Dayal Pur", "Dilshad Garden", "Karawal Nagar",
        "Naveen Shahdara", "Nand Nagri", "Shahdara", "Shastri Park", "Seelampur",
        "Yamuna Vihar"
    ],
    "Central Delhi": [
        "Ashok Nagar", "Chandni Chowk", "Civil Lines", "Daryaganj", "Dariba Kalan",
        "Karol Bagh", "Old Delhi", "Shastri Nagar", "Patel Nagar", "Sadar Bazaar",
        "Paharganj", "Rajender Nagar"
    ],
    "New Delhi": [
        "Barakhamba Road", "Chanakyapuri", "Connaught Place", "Gole Market",
        "Golf Links", "INA Colony", "Inder Puri", "Jaffrabad", "Laxmibai Nagar",
        "Lodhi Colony", "Lutyens' Delhi", "Mahipalpur", "New Delhi", "Pragati Maidan",
        "Raisina Hill", "Rajendra Place"
    ],
    "East Delhi": [
        "East Vinod Nagar", "Krishna Nagar", "Laxmi Nagar", "Mayur Vihar",
        "Pandav Nagar", "Preet Vihar", "Anand Vihar", "Shreshtha Vihar", "Vivek Vihar",
        "Vasundhara Enclave"
    ],
    "South Delhi": [
        "Geetanjali Enclave", "Green Park", "Gulmohar Park", "Hauz Khas", "Khanpur",
        "Kailash Colony", "Malviya Nagar", "Maharani Bagh", "Moti Bagh", "New Moti Bagh",
        "Mehrauli", "Munirka", "Netaji Nagar", "Pamposh Enclave", "Safdarjung Enclave",
        "Sainik Farm", "Saket", "Sarojini Nagar", "Sarvodaya Enclave", "Shaheen Bagh",
        "Siri Fort", "South Extension", "Shahpur Jat", "Sriniwaspuri", "Shahpur", "Sainik Farms"
    ],
    "South East Delhi": [
        "Ashram Chowk", "Lodhi Colony", "Khan Market", "Netaji Nagar", "Nizamuddin East",
        "Nizamuddin West", "Sarai Kale Khan", "Jangpura", "Defence Colony", "Lajpat Nagar",
        "New Friends Colony", "Nehru Place", "Chittaranjan Park", "Govindpuri",
        "Greater Kailash", "Okhla", "Sarita Vihar", "Sarai Kale Khan", "Tughlaqabad",
        "Badarpur", "Pul Pehladpur"
    ],
    "South West Delhi": [
        "Ber Sarai", "Dabri", "Dashrath Puri", "Dwarka", "Delhi Cantonment",
        "Dhaula Kuan", "Ghitorni", "Inderpuri", "Janakpuri", "Mahipalpur", "Moti Bagh",
        "Munirka", "Najafgarh", "Naraina", "Palam", "Rama Krishna Puram", "Sagar Pur",
        "Sarojini Nagar", "Vasant Kunj", "Vasant Vihar", "Kalkaji"
    ],
    "West Delhi": [
        "Ashok Nagar", "Bali Nagar", "Fateh Nagar", "Kirti Nagar", "Meera Bagh",
        "Mayapuri", "Moti Nagar", "Nangloi Jat", "Paschim Vihar", "Patel Nagar",
        "Punjabi Bagh", "Rajouri Garden", "Shivaji Place", "Shadipur Depot", "Shiv Ram Park",
        "Tihar Village", "Tilak Nagar", "Tikri Kalan", "Vikas Nagar", "Vikaspuri",
        "West Patel Nagar", "Uttam Nagar", "Nangloi", "Tikri"
    ]
}

In [45]:
houses_df['City/Locality'] = houses_df['City/Locality'].str.replace(r', Delhi', '')

# Function to update City/Locality column
def update_locality(row):
    for region, localities in localities_delhi.items():
        for locality in localities:
            if row['City/Locality'] == locality:
                return f"{locality}, {region}"
    return row['City/Locality']

# Apply the function to update the column
houses_df['City/Locality'] = houses_df.apply(update_locality, axis=1)

# Display the updated DataFrame
houses_df.head()

Unnamed: 0,Property Type,City/Locality,BHK,Property Size (sqft),Furnishing,Price (INR),Price (per sqft)
0,"Residential-House,Villa","Narela, North West Delhi",1,225,Unfurnished,700000,3111
1,"Residential-House,Villa","Uttam Nagar, West Delhi",1,225,Unfurnished,1100000,4888
2,"Residential-House,Villa","Najafgarh, South West Delhi",1,252,Semi-Furnished,700000,2777
28,"Residential-House,Villa","Ballabhgarh, Faridabad",1,360,Furnished,1400000,3888
30,"Residential-House,Villa","Sagar Pur, South West Delhi",1,450,Unfurnished,2100000,4666


In [47]:
houses_df.to_csv("houses_data_cleaned.csv")