In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('Data/Mumbai House Prices.csv')
df.shape

(76038, 9)

In [3]:
df.head()

Unnamed: 0,bhk,type,locality,area,price,price_unit,region,status,age
0,3,Apartment,Lak And Hanware The Residency Tower,685,2.5,Cr,Andheri West,Ready to move,New
1,2,Apartment,Radheya Sai Enclave Building No 2,640,52.51,L,Naigaon East,Under Construction,New
2,2,Apartment,Romell Serene,610,1.73,Cr,Borivali West,Under Construction,New
3,2,Apartment,Soundlines Codename Urban Rainforest,876,59.98,L,Panvel,Under Construction,New
4,2,Apartment,Origin Oriana,659,94.11,L,Mira Road East,Under Construction,New


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76038 entries, 0 to 76037
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bhk         76038 non-null  int64  
 1   type        76038 non-null  object 
 2   locality    76038 non-null  object 
 3   area        76038 non-null  int64  
 4   price       76038 non-null  float64
 5   price_unit  76038 non-null  object 
 6   region      76038 non-null  object 
 7   status      76038 non-null  object 
 8   age         76038 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 5.2+ MB


In [6]:
# Encode categorical variables using LabelEncoder
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])
df['locality'] = le.fit_transform(df['locality'])
df['region'] = le.fit_transform(df['region'])
df['status'] = le.fit_transform(df['status'])


In [7]:
# Select features (independent variables) and target variable (dependent variable)
X = df[['bhk', 'type', 'locality', 'region', 'area']]
Y = df['price']

In [8]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [9]:
# Intialize the Linear Regression model
model = LinearRegression()

In [10]:
# Train the model
model.fit(X_train, Y_train)

In [11]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [13]:
# Evaluate the model
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

In [15]:
# Print the coefficients and evaluation metrics
print(f"Coefficients: {model.coef_}")


Coefficients: [-1.73222789e+01 -2.50595991e+00  4.46297355e-04  2.66981745e-02
  3.41019719e-03]


In [16]:
print(f"Intercept: {model.intercept_}")

Intercept: 55.75837118708124


In [17]:
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 874.9313806806164


In [18]:
print(f"R-squared: {r2}")

R-squared: 0.1949346148411848


In [19]:
model.intercept_

55.75837118708124

In [20]:
model.score(X_test, Y_test)

0.1949346148411848

In [21]:
model.score(X_train, Y_train)

0.18499412496147283