In [69]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [19]:
df = pd.read_csv("../datasets/gurgaon_properties_post_feature_selection.csv")

In [21]:
df.head()

Unnamed: 0,sector,property_type,price,bedRoom,bathroom,builtUpArea,servant room,study room,luxury_category
0,sector 49,flat,2.45,3.0,3.0,1865.0,0,1,Low
1,sector 109,house,6.1,5.0,6.0,2430.0,1,0,Low
2,sector 1,flat,1.65,4.0,3.0,3111.0,1,0,Low
3,sector 7,house,0.66,3.0,1.0,550.0,0,0,Low
4,sector 37d,flat,1.4,3.0,3.0,1711.0,0,0,Medium


In [23]:
df['property_type'] = df['property_type'].replace({'flat':0, 'house':1})
df['luxury_category'] = df['luxury_category'].replace({'Low':0, 'Medium':1, 'High':3})

  df['property_type'] = df['property_type'].replace({'flat':0, 'house':1})
  df['luxury_category'] = df['luxury_category'].replace({'Low':0, 'Medium':1, 'High':3})


In [33]:
new_df = pd.get_dummies(df, columns=['sector'], drop_first=True, dtype='int')

In [35]:
new_df.head()

Unnamed: 0,property_type,price,bedRoom,bathroom,builtUpArea,servant room,study room,luxury_category,sector_b block,sector_c block,...,sector_sector 99,sector_sector 99a,sector_sector 9a,sector_sheetla colony,sector_sohna road,sector_sohna road road,sector_south city,sector_surat nagar,sector_sushant lok,sector_valley view
0,0,2.45,3.0,3.0,1865.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,6.1,5.0,6.0,2430.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1.65,4.0,3.0,3111.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0.66,3.0,1.0,550.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1.4,3.0,3.0,1711.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
X = new_df.drop(columns=['price'])

In [39]:
y = new_df['price']

In [45]:
y_transformed = np.log1p(y)

In [59]:
scaler = StandardScaler()

scaled_df = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

In [61]:
scaled_df.head()

Unnamed: 0,property_type,bedRoom,bathroom,builtUpArea,servant room,study room,luxury_category,sector_b block,sector_c block,sector_dlf phase,...,sector_sector 99,sector_sector 99a,sector_sector 9a,sector_sheetla colony,sector_sohna road,sector_sohna road road,sector_south city,sector_surat nagar,sector_sushant lok,sector_valley view
0,-0.516077,-0.049915,-0.172102,0.017139,-0.742822,2.094447,-0.836741,-0.029017,-0.044349,-0.188212,...,-0.058107,-0.09211,-0.05303,-0.016748,-0.211004,-0.05303,-0.047418,-0.050301,-0.113051,-0.033511
1,1.937696,1.645285,1.971258,0.485453,1.346218,-0.477453,-0.836741,-0.029017,-0.044349,-0.188212,...,-0.058107,-0.09211,-0.05303,-0.016748,-0.211004,-0.05303,-0.047418,-0.050301,-0.113051,-0.033511
2,-0.516077,0.797685,-0.172102,1.049917,1.346218,-0.477453,-0.836741,-0.029017,-0.044349,-0.188212,...,-0.058107,-0.09211,-0.05303,-0.016748,-0.211004,-0.05303,-0.047418,-0.050301,-0.113051,-0.033511
3,1.937696,-0.049915,-1.601009,-1.072832,-0.742822,-0.477453,-0.836741,-0.029017,-0.044349,-0.188212,...,-0.058107,-0.09211,-0.05303,-0.016748,-0.211004,-0.05303,-0.047418,-0.050301,-0.113051,-0.033511
4,-0.516077,-0.049915,-0.172102,-0.110508,-0.742822,-0.477453,0.175409,-0.029017,-0.044349,-0.188212,...,-0.058107,-0.09211,-0.05303,-0.016748,-0.211004,-0.05303,-0.047418,-0.050301,-0.113051,-0.033511


In [73]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(LinearRegression(), scaled_df, y_transformed, cv=kfold, scoring='r2')

In [85]:
scores.mean(), scores.std()

(0.8552918088387905, 0.019140212444266936)

In [87]:
lr = LinearRegression()

In [89]:
lr.fit(scaled_df, y_transformed)

In [121]:
scaled_df.columns

Index(['property_type', 'bedRoom', 'bathroom', 'builtUpArea', 'servant room',
       'study room', 'luxury_category', 'sector_b block', 'sector_c block',
       'sector_dlf phase',
       ...
       'sector_sector 99', 'sector_sector 99a', 'sector_sector 9a',
       'sector_sheetla colony', 'sector_sohna road', 'sector_sohna road road',
       'sector_south city', 'sector_surat nagar', 'sector_sushant lok',
       'sector_valley view'],
      dtype='object', length=127)

In [151]:
features_coef_df = pd.DataFrame(lr.coef_.reshape(1, 127), columns=scaled_df.columns) \
                        .stack().reset_index().drop(columns=['level_0']).rename(columns={'level_1':'features', 0:'coef'})
features_coef_df

Unnamed: 0,features,coef
0,property_type,0.121328
1,bedRoom,0.069587
2,bathroom,0.061134
3,builtUpArea,0.204692
4,servant room,0.041321
...,...,...
122,sector_sohna road road,-0.022852
123,sector_south city,0.005671
124,sector_surat nagar,-0.037704
125,sector_sushant lok,0.004491


In [77]:
import statsmodels.api as sm

In [153]:
xscaled_with_constant = sm.add_constant(scaled_df)

model = sm.OLS(y_transformed, xscaled_with_constant).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.870
Model:                            OLS   Adj. R-squared:                  0.865
Method:                 Least Squares   F-statistic:                     181.3
Date:                Mon, 21 Jul 2025   Prob (F-statistic):               0.00
Time:                        01:13:07   Log-Likelihood:                 661.99
No. Observations:                3566   AIC:                            -1068.
Df Residuals:                    3438   BIC:                            -277.0
Df Model:                         127                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [163]:
unstandardized_std = X.apply(lambda x: x.std(), axis=0)

In [169]:
unstandardized_std

property_type                0.407593
bedRoom                      1.179968
bathroom                     1.399868
builtUpArea               1206.623747
servant room                 0.478756
                             ...     
sector_sohna road road       0.052888
sector_south city            0.047318
sector_surat nagar           0.050181
sector_sushant lok           0.111640
sector_valley view           0.033478
Length: 127, dtype: float64

In [189]:
y_transformed_std = y_transformed.std()

In [269]:
values = np.array(features_coef_df['coef']) * (np.array(y_transformed_std) / np.array(unstandardized_std.tolist()))

In [271]:
actual_coefs = pd.DataFrame(values.reshape(1, 127), columns=scaled_df.columns).stack() \
                        .reset_index().drop(columns=['level_0']).rename(columns={'level_1':'features', 0: 'coefs'})

In [273]:
actual_coefs.head()

Unnamed: 0,features,coefs
0,property_type,0.165997
1,bedRoom,0.032887
2,bathroom,0.024354
3,builtUpArea,9.5e-05
4,servant room,0.048131


In [275]:
actual_coefs['coefs'] =  actual_coefs['coefs'].apply(lambda x: np.expm1(x))

In [277]:
actual_coefs.head()

Unnamed: 0,features,coefs
0,property_type,0.180569
1,bedRoom,0.033434
2,bathroom,0.024653
3,builtUpArea,9.5e-05
4,servant room,0.049308
