In [None]:
import pandas as pd
import numpy as np

: 

In [None]:
# Load the raw listings and preview the first rows
csv_path = 'Mumbai House Prices.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

In [None]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

In [None]:
# Summary statistics; describe() reports on numeric columns by default
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
#Dropping the duplicate rows (the first occurrence of each row is kept)
df = df.drop_duplicates(keep='first')
# Summary statistics again, now on the de-duplicated frame
df.describe()

In [None]:
# Distinct unit labels that accompany the 'price' column
df['price_unit'].unique()

In [None]:
#Converting all the prices to lakhs
def cnvrt_price_to_lakhs(row):
    """Return the price of one record expressed in lakhs.

    `row` is any record indexable by 'price' and 'price_unit'.
    A 'Cr' unit is scaled by 100; any other unit returns the price unchanged.
    """
    amount = row['price']
    if row['price_unit'] == 'Cr':
        return amount * 100
    return amount
# Vectorised form of cnvrt_price_to_lakhs: prices quoted in 'Cr' are scaled
# by 100, every other unit keeps its price as-is. This avoids one Python-level
# function call per row that df.apply(..., axis=1) would make.
is_crore = df['price_unit'] == 'Cr'
df['price_lakhs'] = df['price'].mask(is_crore, df['price'] * 100)

In [None]:
# Preview the frame with the new 'price_lakhs' column
df.head(n=5)

In [None]:
# 'price_lakhs' now carries the price, so the raw 'price' and 'price_unit' columns are dropped
df1 = df.drop(columns=['price', 'price_unit'])

In [None]:
# Preview the frame after dropping the raw price columns
df1.head(n=5)

In [None]:
# Number of missing values in each column of df1
df1.isnull().sum()

In [None]:
# Number of distinct locality values (a missing value, if any, counts as one)
df1['locality'].nunique(dropna=False)

In [None]:
#Handling the localities
# Listings per locality, most frequent first; chart the 20 largest
locality_counts = df1.locality.value_counts()
locality_counts.iloc[:20].plot.barh()

In [None]:
#Grouping localities with fewer than 40 listings into a single 'Other' bucket
# (localities with 40 or more listings keep their own name)
MIN_LOCALITY_LISTINGS = 40
rare_localities = locality_counts[locality_counts < MIN_LOCALITY_LISTINGS].index

# isin() builds the boolean mask in one vectorised pass instead of running a
# Python lambda per row; mask() then swaps the flagged values for 'Other'.
df1['locality_simplified'] = df1['locality'].mask(
    df1['locality'].isin(rare_localities), 'Other'
)

len(df1.locality_simplified.unique())

In [None]:
#Dropped 'locality' column
# Reassign rather than drop in place, and tolerate an already-missing column,
# so that re-running this cell does not raise KeyError.
df1 = df1.drop(columns=['locality'], errors='ignore')
df1.head()

In [None]:
#Set dummies for 'locality_simplified'
# Distinct categories, in order of first appearance
locality_list = df1['locality_simplified'].unique().tolist()
# One indicator column per category replaces the original column
df2 = pd.get_dummies(data=df1, columns=['locality_simplified'])
df2.head(n=5)

In [None]:
# Number of distinct region values (a missing value, if any, counts as one)
df2['region'].nunique(dropna=False)

In [None]:
#Group rare regions: regions with fewer than 10 listings become 'Other'
MIN_REGION_LISTINGS = 10
region_counts = df2['region'].value_counts()
rare_regions = region_counts[region_counts < MIN_REGION_LISTINGS].index

# isin() builds the boolean mask in one vectorised pass instead of running a
# Python lambda per row; mask() then swaps the flagged values for 'Other'.
df2['region_simplified'] = df2['region'].mask(
    df2['region'].isin(rare_regions), 'Other'
)
len(df2.region_simplified.unique())


In [None]:
# Drop the raw 'region' column now that 'region_simplified' exists.
# Reassign rather than drop in place, and tolerate an already-missing column,
# so that re-running this cell does not raise KeyError.
df2 = df2.drop(columns=['region'], errors='ignore')
df2.head()

In [None]:
#Set dummies for 'region_simplified'
# Distinct categories, in order of first appearance
region_list = df2['region_simplified'].unique().tolist()
# One indicator column per category replaces the original column
df3 = pd.get_dummies(data=df2, columns=['region_simplified'])
df3.head(n=5)

In [None]:
# Distinct values of the 'type' column
df3['type'].unique()

In [None]:
#Set dummies for 'type'
# Distinct categories, in order of first appearance
type_list = df3['type'].unique().tolist()
# One indicator column per category replaces the original column
df4 = pd.get_dummies(data=df3, columns=['type'])
df4.head(n=5)

In [None]:
# Distinct values of the 'status' column
df4['status'].unique()

In [None]:
#Set dummies for 'status'
# Distinct categories, in order of first appearance
status_list = df4['status'].unique().tolist()
# One indicator column per category replaces the original column
df5 = pd.get_dummies(data=df4, columns=['status'])
df5.head(n=5)

In [None]:
# Distinct values of the 'age' column
df5['age'].unique()

In [None]:
#Set dummies for 'age'
# Distinct categories, in order of first appearance
age_list = df5['age'].unique().tolist()
# One indicator column per category replaces the original column
df6 = pd.get_dummies(data=df5, columns=['age'])
df6.head(n=5)

In [None]:
# Number of missing values in each column of the encoded frame
df6.isnull().sum()

In [None]:
# (rows, columns) after all categorical columns were one-hot encoded
df6.shape

In [None]:
# Column names, dtypes and non-null counts of the encoded frame
df6.info()

In [None]:
# Summary statistics of the encoded frame
df6.describe()

In [None]:
# Target is the price in lakhs; every remaining column is used as a feature
y = df6['price_lakhs']
x = df6.drop(columns='price_lakhs')

In [None]:
#Split the data
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
splits = train_test_split(x, y, test_size=0.2, random_state=11)
x_train, x_test, y_train, y_test = splits

In [None]:
from sklearn.ensemble import RandomForestRegressor as RF

# max_features equals the number of columns, so each split may consider every feature
n_features = x_train.shape[1]
model = RF(max_features=n_features, random_state=1)
model.fit(x_train, y_train)

In [None]:
# Predictions of the fitted forest on the held-out rows
y_pred = model.predict(X=x_test)

In [None]:
import matplotlib.pyplot as plt

# Actual vs predicted prices on the test set. Points on the dashed diagonal
# are perfect predictions; distance from it is the prediction error.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(y_test, y_pred)

# y = x reference line spanning the combined range of both axes
low = min(y_test.min(), y_pred.min())
high = max(y_test.max(), y_pred.max())
ax.plot([low, high], [low, high], linestyle='--', color='red', label='Perfect prediction')

ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Price")
ax.legend()
# Render explicitly so the cell does not end on a stray Text(...) repr
plt.show()

In [None]:
# Residuals
import seaborn as sns

# Prediction error per test row: actual minus predicted
residuals = y_test.sub(model.predict(x_test))
# Histogram of the errors with a kernel density estimate overlaid
sns.histplot(data=residuals, kde=True)
plt.title(label="Residual Distribution")
plt.show()

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

In [None]:
#changing number of trees grown from the default of 100
# NOTE(review): random_state here is 0, not the 1 used for `model`, so the
# two scores differ by seed as well as by tree count - confirm this is intended.
model_ = RF(n_estimators=500, max_features=x_train.shape[1], random_state=0)
model_.fit(x_train, y_train)
y_hat = model_.predict(x_test)
r2_score(y_test, y_hat)

Since the R² score of `model` is slightly higher than that of `model_`, we will use `model`.

In [None]:
import joblib

# Column order of the training matrix
feature_names = x_train.columns.tolist()

# Output filename -> object to persist, in write order:
# feature names, the fitted model, then the category lists
artifacts = {
    'feature_names.joblib': feature_names,
    'model_filename.joblib': model,
    'locality_list.joblib': locality_list,
    'region_list.joblib': region_list,
    'status_list.joblib': status_list,
    'age_list.joblib': age_list,
    'type_list.joblib': type_list,
}
written = [joblib.dump(obj, filename) for filename, obj in artifacts.items()]
# Show the result of the final dump, as the cell did before
written[-1]
