In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed, whatever its category.
# NOTE(review): this presumably also hides warnings that matter (e.g. failed
# fits reported by scikit-learn) — consider narrowing it to specific categories.
warnings.simplefilter("ignore")

In [4]:
# Read the post-feature-selection property dataset and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was saved into the CSV — confirm before using it downstream.
csv_path = "./gurgaon_properties_post_feature_selection_v2 (1).csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


# Changing values of furnishing type column
* 0 -> unfurnished
* 1 -> semifurnished
* 2 -> furnished

In [9]:
# Translate the numeric furnishing codes into readable labels
# (0 -> unfurnished, 1 -> semifurnished, 2 -> furnished).
furnishing_labels = {
    0.0: "unfurnished",
    1.0: "semifurnished",
    2.0: "furnished",
}
data["furnishing_type"] = data["furnishing_type"].replace(furnishing_labels)

In [10]:
# Split the frame into the regression target (y) and the feature matrix (X).
#
# "Unnamed: 0" appears to be the row index that was written into the CSV when
# it was saved. It is numeric, so if it stayed in X it would be scaled and fed
# to the model as a feature even though it says nothing about a property.
# It is therefore dropped together with the target.
#
# errors="ignore" keeps this cell working when that column is absent (for
# example if the file is loaded with index_col=0). A missing "price" column
# still fails loudly, because y is extracted first.
target_col = "price"
y = data.loc[:, target_col]
X = data.drop(columns=[target_col, "Unnamed: 0"], errors="ignore")

In [12]:
# applying log1p transformation to the target variable
# log1p(y) = log(1 + y). Anything fitted on y_transformed predicts on this log
# scale, so predictions must be mapped back with np.expm1 to get prices in the
# original units.
y_transformed = np.log1p(y)

In [14]:
# Show the first feature row to check which columns X contains.
X.head(1)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor


# Ordinal Encoding

In [34]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

In [41]:
# Partition the feature columns by dtype for the ColumnTransformer below:
# numeric columns are scaled, every other column is ordinal-encoded.
#
# The categorical set is taken as "everything that is not numeric" rather than
# only dtype == object. A column stored as `category` or pandas `string` dtype
# would otherwise land in neither list and be passed through un-encoded.
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cols_to_encode = X.select_dtypes(exclude=np.number).columns

In [42]:
# Column-wise preprocessing:
#   * numeric columns     -> standardised with StandardScaler
#   * categorical columns -> integer codes via OrdinalEncoder
#
# Inside cross-validation the encoder is fitted on the training fold only. With
# the default handle_unknown="error", a category that is missing from that fold
# makes transform() raise and the fold cannot be scored. Mapping unseen
# categories to -1 keeps every fold scorable; categories seen during fit are
# encoded exactly as before.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        (
            "cat",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            cols_to_encode,
        ),
    ],
    remainder="passthrough",  # columns in neither list are forwarded unchanged
)

In [43]:
# Chain the preprocessing step and the linear model into a single estimator,
# then display it.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)
pipeline

In [44]:
# Shuffled 10-fold cross-validation of the pipeline with a fixed seed.
# R^2 is computed against the log1p-transformed target.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring="r2",
    cv=kfold,
)

In [45]:
# Report the mean and standard deviation of the per-fold R^2 scores.
mean_r2, std_r2 = scores.mean(), scores.std()
print(mean_r2, std_r2)

0.7363096633436828 0.03238005754429935
