### Importing libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

### reading the data 

In [7]:
df = pd.read_csv('mysuru_fin.csv')
avg = pd.read_csv('avg.csv')

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,price,area,bhk,locations,abs_price,sqft_val,price_per_sqft,type,num_bhk
0,0,Residential land / Plot in Bogadi,₹17 - 41.25 Lac,"581 - 1,453 sqft",Plot/Land,Bogadi,2912500.0,1017,2863.8,Plot,
1,1,Residential land / Plot in Jayapura,₹12.85 - 43.61 Lac,"680 - 2,560 sqft",Plot/Land,Jayapura,2823000.0,1620,1742.6,Plot,
2,2,Residential land / Plot in Lalithadripura,₹74 Lac,"1,200 sqft",Plot/Land,Lalithadripura,7400000.0,1200,6166.7,Plot,
3,3,3 BHK Flat in Vishweshwara Nagar,₹90 Lac,"1,420 sqft",3 BHK,Vishweshwara Nagar,9000000.0,1420,6338.0,House,3.0
4,4,4 Bedroom House in Vidyaranyapura,₹2.59 Cr,"2,587 sqft",4 BHK,Vidyaranyapura,25900000.0,2587,10011.6,House,4.0


In [11]:
avg.head()

Unnamed: 0.1,Unnamed: 0,locations,type,price_per_sqft
0,0,1st Stage Vijayanagar,House,12083
1,1,1st Stage Vijayanagar,Plot,10802
2,2,1st stage Kuvempunagar,House,12720
3,3,2nd Stage Gokulam,House,11562
4,4,2nd Stage Gokulam,Plot,10769


### Merging data

In [13]:
df = df.merge(avg, on=['locations', 'type'], how='left', suffixes=('', '_avg'))

In [15]:
df

Unnamed: 0.1,Unnamed: 0,location,price,area,bhk,locations,abs_price,sqft_val,price_per_sqft,type,num_bhk,Unnamed: 0_avg,price_per_sqft_avg
0,0,Residential land / Plot in Bogadi,₹17 - 41.25 Lac,"581 - 1,453 sqft",Plot/Land,Bogadi,2912500.0,1017,2863.8,Plot,,53,4207
1,1,Residential land / Plot in Jayapura,₹12.85 - 43.61 Lac,"680 - 2,560 sqft",Plot/Land,Jayapura,2823000.0,1620,1742.6,Plot,,159,2158
2,2,Residential land / Plot in Lalithadripura,₹74 Lac,"1,200 sqft",Plot/Land,Lalithadripura,7400000.0,1200,6166.7,Plot,,205,4672
3,3,3 BHK Flat in Vishweshwara Nagar,₹90 Lac,"1,420 sqft",3 BHK,Vishweshwara Nagar,9000000.0,1420,6338.0,House,3.0,380,6108
4,4,4 Bedroom House in Vidyaranyapura,₹2.59 Cr,"2,587 sqft",4 BHK,Vidyaranyapura,25900000.0,2587,10011.6,House,4.0,367,10434
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1490,1745,Residential land / Plot in Bannur Road,₹3 Cr,"10,000 sqft",Plot/Land,Bannur Road,30000000.0,10000,3000.0,Plot,,37,3726
1491,1746,Residential land / Plot in Yelwala,₹25 Lac,800 sqft,Plot/Land,Yelwala,2500000.0,800,3125.0,Plot,,392,2453
1492,1747,3 BHK Flat in Bogadi,₹85 Lac,"1,710 sqft",3 BHK,Bogadi,8500000.0,1710,4970.8,House,3.0,52,16581
1493,1748,6 Bedroom House in Vidyaranyapura,₹1.2 Cr,"1,100 sqft",6 BHK,Vidyaranyapura,12000000.0,1100,10909.1,House,6.0,367,10434


### Applying Feature engineering

In [17]:
df['num_bhk'].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['num_bhk'].fillna(0,inplace=True)


In [19]:
df.dropna()

Unnamed: 0.1,Unnamed: 0,location,price,area,bhk,locations,abs_price,sqft_val,price_per_sqft,type,num_bhk,Unnamed: 0_avg,price_per_sqft_avg
0,0,Residential land / Plot in Bogadi,₹17 - 41.25 Lac,"581 - 1,453 sqft",Plot/Land,Bogadi,2912500.0,1017,2863.8,Plot,0.0,53,4207
1,1,Residential land / Plot in Jayapura,₹12.85 - 43.61 Lac,"680 - 2,560 sqft",Plot/Land,Jayapura,2823000.0,1620,1742.6,Plot,0.0,159,2158
2,2,Residential land / Plot in Lalithadripura,₹74 Lac,"1,200 sqft",Plot/Land,Lalithadripura,7400000.0,1200,6166.7,Plot,0.0,205,4672
3,3,3 BHK Flat in Vishweshwara Nagar,₹90 Lac,"1,420 sqft",3 BHK,Vishweshwara Nagar,9000000.0,1420,6338.0,House,3.0,380,6108
4,4,4 Bedroom House in Vidyaranyapura,₹2.59 Cr,"2,587 sqft",4 BHK,Vidyaranyapura,25900000.0,2587,10011.6,House,4.0,367,10434
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1490,1745,Residential land / Plot in Bannur Road,₹3 Cr,"10,000 sqft",Plot/Land,Bannur Road,30000000.0,10000,3000.0,Plot,0.0,37,3726
1491,1746,Residential land / Plot in Yelwala,₹25 Lac,800 sqft,Plot/Land,Yelwala,2500000.0,800,3125.0,Plot,0.0,392,2453
1492,1747,3 BHK Flat in Bogadi,₹85 Lac,"1,710 sqft",3 BHK,Bogadi,8500000.0,1710,4970.8,House,3.0,52,16581
1493,1748,6 Bedroom House in Vidyaranyapura,₹1.2 Cr,"1,100 sqft",6 BHK,Vidyaranyapura,12000000.0,1100,10909.1,House,6.0,367,10434


### Selecting relevant features and target variable price

In [21]:
X = df[['locations', 'type', 'sqft_val', 'num_bhk', 'price_per_sqft_avg']]
y = df['abs_price']

### Applying One - Hot encoding to categorical variables

In [25]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = encoder.fit_transform(df[['locations', 'type']])
X_encoded = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(['locations', 'type']))

##### combining the encoded data with final X data

In [27]:
X_final = pd.concat([X_encoded, df[['sqft_val', 'num_bhk', 'price_per_sqft_avg']]], axis=1)

### Splitting the data into Train and Test data 

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

### Model Training

In [51]:
model = RandomForestRegressor(n_estimators=100, random_state=38)
model.fit(X_train, y_train)

### Predictions

In [53]:
y_pred = model.predict(X_test)

### Model performance evaluation

In [55]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"R-Squared Score: {r2}")

Mean Absolute Error: 3231696.158766875
R-Squared Score: 0.5905201091211539
