## Baseline Model
### @cipher499
### 27/12/23

In [1]:
import pandas as pd
import numpy as np

In [43]:
# Read the post-feature-selection Gurgaon property table from disk and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer='gurgaon_properties_post_feature_selection.csv')
df.head(n=5)

Unnamed: 0,property_type,sector,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,73.0,3.0,3.0,3.0,3.0,1654.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
1,0.0,54.0,4.0,4.0,4.0,0.0,2134.0,1.0,1.0,0.0,1.0,1.0,0.0,2.65
2,0.0,97.0,2.0,2.0,3.0,3.0,1300.0,0.0,0.0,0.0,1.0,1.0,2.0,1.2
3,0.0,9.0,3.0,2.0,2.0,3.0,717.0,0.0,0.0,0.0,0.0,0.0,2.0,0.52
4,0.0,111.0,2.0,1.0,3.0,1.0,828.0,0.0,0.0,0.0,0.0,2.0,1.0,0.54


For Linear Regression:
1. One-Hot Encode -> sector, balconies, age_possession, furnishing_type, luxury_category, floor_category
2. Standard Scale -> property_type, bedrooms, bathrooms, built_up_area, servant room, store room
3. Log transform (log1p) -> price (its distribution is right-skewed)

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

In [4]:
# Categorical columns that are one-hot encoded before modelling:
# the location column first, then the remaining category-coded columns.
columns_to_encode = ['sector'] + [
    'balconies',
    'age_possession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]

In [44]:
# Separate the predictors from the target: every column except 'price'
# becomes a feature, and 'price' itself is the value to predict.
X = df.drop(columns='price')
y = df.loc[:, 'price']

In [45]:
# Model the target on the log1p scale, i.e. log(1 + price).
# Predictions made on this scale are mapped back with np.expm1 before the
# error is computed in later cells.
y_transformed = np.log1p(y)

In [46]:
# Display the feature matrix (all columns of df except 'price').
X

Unnamed: 0,property_type,sector,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,furnishing_type,luxury_category,floor_category
0,0.0,73.0,3.0,3.0,3.0,3.0,1654.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,54.0,4.0,4.0,4.0,0.0,2134.0,1.0,1.0,0.0,1.0,1.0,0.0
2,0.0,97.0,2.0,2.0,3.0,3.0,1300.0,0.0,0.0,0.0,1.0,1.0,2.0
3,0.0,9.0,3.0,2.0,2.0,3.0,717.0,0.0,0.0,0.0,0.0,0.0,2.0
4,0.0,111.0,2.0,1.0,3.0,1.0,828.0,0.0,0.0,0.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550,0.0,97.0,2.0,2.0,3.0,3.0,1484.0,0.0,0.0,0.0,1.0,1.0,0.0
3551,0.0,91.0,3.0,4.0,2.0,0.0,1854.0,0.0,0.0,1.0,0.0,2.0,2.0
3552,0.0,75.0,4.0,4.0,4.0,0.0,2127.0,0.0,1.0,0.0,1.0,2.0,2.0
3553,1.0,86.0,2.0,2.0,2.0,1.0,745.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Display the log1p-transformed target used for training and cross-validation.
y_transformed

0       1.252763
1       1.294727
2       0.788457
3       0.418710
4       0.431782
          ...   
3550    0.788457
3551    0.875469
3552    1.308333
3553    0.559616
3554    0.916291
Name: price, Length: 3555, dtype: float64

In [47]:
# Preprocessing applied inside every pipeline:
#   * 'num' - standardise the listed columns,
#   * 'cat' - one-hot encode `columns_to_encode`, dropping the first level
#             of each column,
#   * any column not listed in either group is passed through unchanged.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category unseen during fit is encoded as all zeros, which is the same
# encoding as the dropped first level - confirm this is acceptable for
# sectors that are missing from a training fold.
columns_to_scale = [
    'property_type',
    'bedrooms',
    'bathrooms',
    'built_up_area',
    'servant room',
    'store room',
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), columns_to_encode),
    ],
    remainder='passthrough',
)

In [48]:
# Baseline model: the shared preprocessing step followed by an ordinary
# least-squares linear regression.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [28]:
# 10-fold cross-validation with shuffled, seeded folds.
# R2 is scored against the log1p-transformed target; the cell shows the
# mean R2 over the ten folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)
np.mean(scores)



0.8586768680469824

In [29]:
# Spread of the per-fold R2 scores (population standard deviation).
np.std(scores)

0.024024871513118374

In [49]:
# Hold out 20% of the rows as a test set (seeded so the split is repeatable)
# and fit the current pipeline on the remaining 80%.
# The target passed here is on the log1p scale, so predictions from the
# fitted pipeline must be mapped back with np.expm1 before being compared
# with prices.
# `train_test_split` is imported in the notebook's sklearn import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_transformed, test_size=0.2, random_state=42
)
pipeline.fit(X_train, y_train)

In [50]:
# Predict on the held-out rows and undo the log1p transform in one step,
# so y_pred is expressed on the original price scale.
y_pred = np.expm1(pipeline.predict(X_test))

In [33]:
# Mean absolute error on the held-out set, on the original price scale:
# y_test is still on the log1p scale, so it is inverted with np.expm1 to
# match y_pred (which has already been inverted).
# `mean_absolute_error` is imported in the notebook's sklearn import cell.
mean_absolute_error(np.expm1(y_test), y_pred)

0.5856218198604966

#### Linear Regression:
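- R2 score (mean over 10-fold CV, on log1p price) = 0.86
- MAE (20% hold-out test set, original price scale) = 0.59 Cr
- S.D. of R2 across the 10 folds = 0.024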

### SVR

In [51]:
# Swap the regressor for a support-vector regressor with an RBF kernel;
# the preprocessing step is the same one used for the linear baseline.
# Note that this rebinds `pipeline`, so later cells operate on the SVR model.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR(kernel='rbf')),
    ]
)

# Same evaluation protocol as the baseline: 10 shuffled, seeded folds,
# R2 scored against the log1p-transformed target, mean over folds shown.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)
np.mean(scores)



0.8862583194647611

In [35]:
# Spread of the per-fold R2 scores for the SVR pipeline
# (population standard deviation).
np.std(scores)

0.016736977301472788

In [36]:
# Fit the SVR pipeline on the training split, predict the held-out rows,
# and report the mean absolute error on the original price scale
# (both predictions and y_test are inverted from log1p with np.expm1).
pipeline.fit(X_train, y_train)
y_pred = np.expm1(pipeline.predict(X_test))
mean_absolute_error(np.expm1(y_test), y_pred)

0.5269143631651948

#### SVR:
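- R2 score (mean over 10-fold CV, on log1p price) = 0.89
- MAE (20% hold-out test set, original price scale) = 0.53 Cr
- S.D. of R2 across the 10 folds = 0.017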