In [155]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Read the raw flat listings from the CSV next to this notebook and
# preview the first five rows.
flats = pd.read_csv(filepath_or_buffer="./flats.csv")
flats.head(5)

In [None]:
# The 'link' column is not used anywhere in the analysis, so remove it.
flats = flats.drop('link', axis=1)

In [None]:
flats.shape

In [None]:
flats.info()

In [None]:
# Number of missing values in each column.
flats.isna().sum()

In [None]:
# Keep an independent (deep) copy of the frame as it is before cleaning.
backup = flats.copy(deep=True)

### society

In [None]:
flats['society'].value_counts().shape

There are 638 unique society names.

In [None]:
# Lower-case the society names, then remove every "<digits>.<digits> ★"
# fragment (this looks like a rating badge mixed into the name).
society_lower = flats['society'].str.lower()
flats['society'] = society_lower.str.replace(r'\d+\.\d+\s★', '', regex=True)

In [None]:
flats['society'].nunique()

Now we have 607 unique society names.

### price

In [None]:
flats['price'].value_counts()

In [None]:
flats[flats['price'] == 'Price on Request'].count()

- In 11 rows, `price` has the value 'Price on Request'.
- We drop these rows because 11 is very small compared to our 2500 rows of data.

In [None]:
# Drop listings whose price is the placeholder 'Price on Request'.
# Take an explicit copy: later cells assign new columns into `flats`, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning
# (silenced by the global warnings filter at the top of the notebook).
flats = flats[flats['price'] != 'Price on Request'].copy()

In [112]:
def handle_price(x):
    """Convert a scraped price value to a number expressed in crore.

    Parameters
    ----------
    x : str or any
        A price such as ``'45 Lac'`` or ``'1.47 Crore'``. The first
        whitespace-separated token is taken as the amount; if the text
        mentions "Lac" (any letter case) the amount is divided by 100,
        otherwise the amount is returned unchanged.

    Returns
    -------
    float or the original object
        The amount rounded to 2 decimals. Non-string inputs (e.g. NaN) are
        returned as-is. Strings whose leading token is not a number (for
        example ``'Price on Request'`` or an empty string) yield NaN instead
        of raising, so the function is safe to map over unfiltered data.
    """
    if not isinstance(x, str):
        return x  # NaN / already-numeric values pass through untouched

    text = x.strip()
    tokens = text.split()  # tolerant of leading/trailing/repeated spaces
    try:
        # Allow thousands separators such as '1,200'.
        amount = float(tokens[0].replace(',', ''))
    except (IndexError, ValueError):
        return float('nan')

    if 'lac' in text.lower():
        amount = amount / 100  # 100 lac == 1 crore
    return round(amount, 2)

# Convert every price with handle_price (defined in this cell).
flats['price'] = flats['price'].map(handle_price)

### area

In [None]:
# Keep the text before the first '/', strip the ',' and '₹' characters,
# and convert what is left to a number. (The next cells rename this column
# to "price_per_sqft".)
rate_text = flats['area'].str.split('/').str.get(0)
rate_text = rate_text.str.replace(',','').str.replace('₹','')
flats['area'] = pd.to_numeric(rate_text)

In [198]:
# The scraped "area" column was parsed as a price rate in the previous
# cell, so rename it to "price_per_sqft".
flats = flats.rename(columns={'area': "price_per_sqft"})

In [200]:
flats.columns

Index(['property_name', 'society', 'price', 'price_per_sqft', 'areaWithType',
       'bedRoom', 'bathroom', 'balcony', 'additionalRoom', 'address',
       'floorNum', 'facing', 'agePossession', 'nearbyLocations', 'description',
       'furnishDetails', 'features', 'rating', 'property_id'],
      dtype='object')

In [202]:
# Derive the flat's area from its price and its price per sq.ft., and store
# it as a new "area" column at position 4. `price` is multiplied by
# 10,000,000 before dividing (presumably crore -> rupees; confirm units).
implied_area = (flats['price'] * 10000000) / flats['price_per_sqft']
flats.insert(4, "area", implied_area.round())

### bedroom

In [137]:
flats['bedRoom'].value_counts()

3 Bedrooms    1437
2 Bedrooms     944
4 Bedrooms     478
1 Bedroom      104
5 Bedrooms      31
6 Bedrooms       3
Name: bedRoom, dtype: int64

In [158]:
# Rows with a missing bedroom count must be removed BEFORE the integer
# cast: `.astype(int)` raises on NaN, so on a fresh top-to-bottom run the
# cast would fail if the null-drop only happened in a later cell.
flats = flats[flats['bedRoom'].notna()].copy()

# "3 Bedrooms" -> 3 : keep the leading token and cast it to int.
flats['bedRoom'] = flats['bedRoom'].str.split(' ').str.get(0).astype(int)

In [141]:
flats['bedRoom'].value_counts()

3    1437
2     944
4     478
1     104
5      31
6       3
Name: bedRoom, dtype: int64

In [None]:
# Keep only rows that have a bedroom count. Per the original author's note,
# rows missing it also lacked bathroom, balcony and the columns after them.
flats = flats[flats['bedRoom'].notna()]

### bathroom

In [153]:
flats['bathroom'].value_counts()

2    1044
3     989
4     636
5     169
1     112
6      42
7       5
Name: bathroom, dtype: int64

In [157]:
# Keep the leading space-separated token of each bathroom value and cast
# it to int.
bathroom_token = flats['bathroom'].str.split(' ').str[0]
flats['bathroom'] = bathroom_token.astype(int)

In [146]:
flats['bathroom'].value_counts()

2    1044
3     989
4     636
5     169
1     112
6      42
7       5
Name: bathroom, dtype: int64

### balcony

In [148]:
flats['balcony'].value_counts()

3 Balconies     974
3+ Balconies    862
2 Balconies     749
1 Balcony       315
No Balcony       97
Name: balcony, dtype: int64

In [168]:
# Keep the leading token ("3", "3+", "No", ...) and map "No" to "0".
# The column stays textual because "3+" is not a plain number.
balcony_token = flats['balcony'].str.split(' ').str[0]
flats['balcony'] = balcony_token.str.replace('No','0')

### additional room

In [170]:
flats['additionalRoom'].value_counts()

Servant Room                                     629
Study Room                                       232
Others                                           179
Pooja Room                                       132
Study Room,Servant Room                           81
Store Room                                        76
Pooja Room,Servant Room                           60
Servant Room,Others                               52
Servant Room,Pooja Room                           30
Study Room,Others                                 27
Pooja Room,Study Room,Servant Room,Others         25
Pooja Room,Study Room,Servant Room                24
Servant Room,Store Room                           19
Pooja Room,Study Room                             13
Pooja Room,Study Room,Servant Room,Store Room     12
Study Room,Pooja Room                              8
Servant Room,Study Room                            8
Study Room,Servant Room,Store Room                 7
Pooja Room,Store Room                         

In [174]:
# Missing values become the label 'not available'; everything is then
# lower-cased.
extra_rooms = flats['additionalRoom'].fillna('not available')
flats['additionalRoom'] = extra_rooms.str.lower()

### floorNum

In [177]:
flats['floorNum'].value_counts()

2nd   of 4 Floors           74
3rd   of 4 Floors           71
4th   of 4 Floors           62
1st   of 4 Floors           61
12nd   of 14 Floors         49
14th   of 14 Floors         48
Ground of 14 Floors         40
10th   of 14 Floors         35
7th   of 14 Floors          35
8th   of 14 Floors          34
4th   of 14 Floors          28
6th   of 14 Floors          27
2nd   of 2 Floors           26
1st   of 14 Floors          26
3rd   of 3 Floors           26
3rd   of 14 Floors          24
5th   of 14 Floors          24
8th   of 19 Floors          24
11st   of 14 Floors         23
1st   of 1 Floors           23
9th   of 14 Floors          23
9th   of 9 Floors           23
5th   of 12 Floors          22
2nd   of 3 Floors           22
2nd   of 14 Floors          21
8th   of 18 Floors          20
10th   of 19 Floors         18
6th   of 18 Floors          18
10th   of 18 Floors         17
9th   of 13 Floors          17
7th   of 15 Floors          17
12nd   of 12 Floors         17
Ground o

In [191]:
# Reduce "12nd   of 14 Floors" to the floor number of the flat itself.
#   1. keep the first space-separated token ("12nd", "Ground", ...)
#   2. map the named floors to numbers: Ground/Lower -> 0, Basement -> -1
#   3. pull out the (optionally negative) integer part
# The pattern must allow a leading '-': with a bare (\d+) the '-1' produced
# for basements was extracted as '1', i.e. recorded as the first floor.
# expand=False makes extract return a Series rather than a one-column
# DataFrame. Values remain strings, as before.
flats['floorNum'] = (
    flats['floorNum']
    .str.split(' ').str.get(0)
    .str.replace('Ground','0')
    .str.replace('Basement','-1')
    .str.replace('Lower','0')
    .str.extract(r'(-?\d+)', expand=False)
)

### adding an additional column "property_type"

In [204]:
# Add a constant "property_type" column, placed first, with the value
# "flat" for every row in this frame.
flats.insert(0, "property_type", "flat")

### exporting "flats" df to a csv file

In [207]:
# Write the cleaned frame to disk. The row index is written too (pandas'
# default, made explicit here).
# NOTE(review): after the row filters above the index is no longer
# contiguous and will come back as an unnamed first column on read_csv;
# consider index=False if no downstream reader relies on it.
flats.to_csv("flats_cleaned_v1.csv", index=True)

In [209]:
flats.head()

Unnamed: 0,property_type,property_name,society,price,price_per_sqft,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,property_id
0,flat,2 BHK Flat in Krishna Colony,maa bhagwati residency,0.45,5000.0,900.0,Carpet area: 900 (83.61 sq.m.),2,2,1,not available,"Krishna Colony, Gurgaon, Haryana",4,West,1 to 5 Year Old,"['Chintapurni Mandir', 'State bank ATM', 'Pear...",So with lift.Maa bhagwati residency is one of ...,"['3 Fan', '4 Light', '1 Wardrobe', 'No AC', 'N...","['Feng Shui / Vaastu Compliant', 'Security / F...","['Environment4 out of 5', 'Safety4 out of 5', ...",C68850746
1,flat,2 BHK Flat in Ashok Vihar,apna enclave,0.5,7692.0,650.0,Carpet area: 650 (60.39 sq.m.),2,2,1,not available,"46b, Ashok Vihar, Gurgaon, Haryana",1,West,10+ Year Old,"['Chintapurni Mandir', 'Sheetla Mata Mandir', ...","Property situated on main road, railway statio...","['3 Wardrobe', '4 Fan', '1 Exhaust Fan', '1 Ge...","['Security / Fire Alarm', 'Maintenance Staff',...","['Environment4 out of 5', 'Safety4 out of 5', ...",H68850564
2,flat,2 BHK Flat in Sohna,tulsiani easy in homes,0.4,6722.0,595.0,Carpet area: 595 (55.28 sq.m.),2,2,3,not available,"Sohna, Gurgaon, Haryana",12,,0 to 1 Year Old,"['Huda City Metro', 'Golf Course extn road', '...","This property is 15 km away from badshapur, gu...",,"['Power Back-up', 'Feng Shui / Vaastu Complian...","['Environment4 out of 5', 'Safety4 out of 5', ...",J68850120
3,flat,2 BHK Flat in Sector 61 Gurgaon,smart world orchard,1.47,12250.0,1200.0,Carpet area: 1200 (111.48 sq.m.),2,2,2,study room,"Sector 61 Gurgaon, Gurgaon, Haryana",2,,Dec 2023,"['Sector 55-56 Metro station', 'Bestech Centra...",Near to metro station of sector 56 and opposit...,,"['Security / Fire Alarm', 'Private Garden / Te...",,S68849476
4,flat,2 BHK Flat in Sector 92 Gurgaon,parkwood westend,0.7,5204.0,1345.0,Super Built up area 1345(124.95 sq.m.),2,2,3,study room,"Sector 92 Gurgaon, Gurgaon, Haryana",5,,Under Construction,"['Yadav Clinic', 'Bangali Clinic', 'Dr. J. S. ...",We are the proud owners of this 2 bhk alongwit...,[],,"['Environment5 out of 5', 'Safety3 out of 5', ...",L47956793
