In [31]:
import pandas as pd

# 1. Import the dataset and clean column names
df = pd.read_csv("Mumbai_Property.csv")

# Clean column names by removing spaces and special characters, and renaming for clarity
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces
df.columns = df.columns.str.replace(' ', '_')  # Replace spaces with underscores
df.columns = df.columns.str.replace('[^a-zA-Z0-9_]', '', regex=True)  # Remove special characters


In [32]:
df.head()

Unnamed: 0,Property_Name,Location,Region,Property_Age,Availability,Area_Tpye,Area_SqFt,Rate_SqFt,Floor_No,Bedroom,Bathroom,Price_Lakh
0,Omkar Alta Monte,W E Highway Malad East Mumbai,Malad Mumbai,0 to 1 Year,Ready To Move,Super Built Up Area,2900.0,17241,14,3,4,500.0
1,T Bhimjyani Neelkanth Woods,Manpada Thane Mumbai,Manpada Thane,1 to 5 Year,Ready To Move,Super Built Up Area,1900.0,12631,8,3,3,240.0
2,Legend 1 Pramila Nagar,Dahisar West Mumbai,Dahisar Mumbai,10+ Year,Ready To Move,Super Built Up Area,595.0,15966,3,1,2,95.0
3,Unnamed Property,Vidyavihar West Vidyavihar West Central Mumbai...,Central Mumbai,5 to 10 Year,Ready To Move,Built Up Area,1450.0,25862,1,3,3,375.0
4,Unnamed Property,176 Cst Road Kalina Mumbai 400098 Santacruz Ea...,Santacruz Mumbai,5 to 10 Year,Ready To Move,Carpet Area,876.0,39954,5,2,2,350.0


In [33]:
df.shape

(2580, 12)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580 entries, 0 to 2579
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Property_Name  2580 non-null   object 
 1   Location       2580 non-null   object 
 2   Region         2580 non-null   object 
 3   Property_Age   2580 non-null   object 
 4   Availability   2580 non-null   object 
 5   Area_Tpye      2580 non-null   object 
 6   Area_SqFt      2580 non-null   float64
 7   Rate_SqFt      2580 non-null   int64  
 8   Floor_No       2580 non-null   int64  
 9   Bedroom        2580 non-null   int64  
 10  Bathroom       2580 non-null   int64  
 11  Price_Lakh     2580 non-null   float64
dtypes: float64(2), int64(4), object(6)
memory usage: 242.0+ KB


In [35]:

# 2. Filter based on a specific property age:
filtered_by_property_Age = df[df['Property_Age'] == '0 to 1 Year']

# 3. Filter based on a specific location (e.g., "Manpada Thane Mumbai"):
filtered_by_location = df[df['Location'] == 'Manpada Thane Mumbai']

# 4. Combine multiple filters 
filtered_combined = df[(df['Property_Age'] == '0 to 1 Year') & (df['Location'] == 'Manpada Thane Mumbai')]

# Print or use the filtered data as needed

print(filtered_by_property_Age)
print(filtered_by_location)
print(filtered_combined)

                 Property_Name  \
0            Omkar Alta Monte    
9          Sunteck West World    
20      Lodha Palava Downtown    
26         Puraniks Tokyo Bay    
35          Indiabulls Greens    
...                        ...   
2568           Satyam Harmony    
2569           Satyam Harmony    
2571  Lodha Codename Bullseye    
2576               Guru Anant    
2579  Gurukrupa Tulsi Heights    

                                              Location  \
0                        W E Highway Malad East Mumbai   
9     701 702 Naigaon East Mira Road And Beyond Mumbai   
20                          Dombivli East Thane Mumbai   
26                         Kasar vadavali Thane Mumbai   
35                           Panvel Navi Mumbai Mumbai   
...                                                ...   
2568                  Koparkhairane Navi Mumbai Mumbai   
2569                  Koparkhairane Navi Mumbai Mumbai   
2571   1503 Mira Road East Mira Road And Beyond Mumbai   
2576       

In [38]:
# 2. Handle missing values:
# You have a few options to deal with missing values, depending on your dataset:
# a. Imputation (filling missing values with a suitable strategy)
# Example: Fill missing numerical values with the mean, and missing categorical values with a mode
df['Bedroom'].fillna(df['Bedroom'].mean(), inplace=True)
# df['Categorical_Column'].fillna(df['Categorical_Column'].mode()[0], inplace=True)

# b. Removing rows with missing values
# Example: Remove rows with any missing value in any column
df.dropna(axis=0, how='any', inplace=True)

# c. Removing columns with missing values
# Example: Remove columns with more than 30% missing values
threshold = len(df) * 0.3
df.dropna(thresh=threshold, axis=1, inplace=True)


In [39]:
df.head()

Unnamed: 0,Property_Name,Location,Region,Property_Age,Availability,Area_Tpye,Area_SqFt,Rate_SqFt,Floor_No,Bedroom,Bathroom,Price_Lakh
0,Omkar Alta Monte,W E Highway Malad East Mumbai,Malad Mumbai,0 to 1 Year,Ready To Move,Super Built Up Area,2900.0,17241,14,3,4,500.0
1,T Bhimjyani Neelkanth Woods,Manpada Thane Mumbai,Manpada Thane,1 to 5 Year,Ready To Move,Super Built Up Area,1900.0,12631,8,3,3,240.0
2,Legend 1 Pramila Nagar,Dahisar West Mumbai,Dahisar Mumbai,10+ Year,Ready To Move,Super Built Up Area,595.0,15966,3,1,2,95.0
3,Unnamed Property,Vidyavihar West Vidyavihar West Central Mumbai...,Central Mumbai,5 to 10 Year,Ready To Move,Built Up Area,1450.0,25862,1,3,3,375.0
4,Unnamed Property,176 Cst Road Kalina Mumbai 400098 Santacruz Ea...,Santacruz Mumbai,5 to 10 Year,Ready To Move,Carpet Area,876.0,39954,5,2,2,350.0


In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Sample DataFrame with a categorical variable 'Location'
data = {'Location': ['W E Highway Malad East Mumbai', 'Manpada Thane Mumbai', 'W E Highway Malad East Mumbai', 'Dahisar West Mumbai	', 'Manpada Thane Mumbai']}
df = pd.DataFrame(data)

# One-Hot Encoding
one_hot_encoded = pd.get_dummies(df, columns=['Location'], prefix=['Location'])
print("One-Hot Encoding:")
print(one_hot_encoded)

# Label Encoding
label_encoder = LabelEncoder()
df['Location_LabelEncoded'] = label_encoder.fit_transform(df['Location'])
print("\nLabel Encoding:")
print(df[['Location', 'Location_LabelEncoded']])


# One-Hot Encoding: pd.get_dummies() is used to create binary columns for each category in the 'Location' column. This method is useful when you don't want to impose any ordinal relationship between categories.

# Label Encoding: The LabelEncoder from scikit-learn is used to encode categorical values into numerical labels. Each unique category is assigned a unique integer label. This method can be suitable when there's an ordinal relationship between categories.


# The purpose of encoding categorical variables in data analysis and machine learning is to represent categorical data in a numerical format that can be used for modeling or analysis. Categorical variables are variables that represent categories or labels, such as colors, cities, property types, or any non-numeric values.


One-Hot Encoding:
   Location_Dahisar West Mumbai\t  Location_Manpada Thane Mumbai  \
0                               0                              0   
1                               0                              1   
2                               0                              0   
3                               1                              0   
4                               0                              1   

   Location_W E Highway Malad East Mumbai  
0                                       1  
1                                       0  
2                                       1  
3                                       0  
4                                       0  

Label Encoding:
                        Location  Location_LabelEncoded
0  W E Highway Malad East Mumbai                      2
1           Manpada Thane Mumbai                      1
2  W E Highway Malad East Mumbai                      2
3          Dahisar West Mumbai\t                      0
4           