load the feature selection data
convert the furnishing type column to str
extract the features and the target
log transform the target
ordinal encoding:
	columns to encode
	create a column transformer object for num and cat cols
create a pipeline object
	ct
	regressor
kfold
scores (cv score)
split the data
train the pipeline model
get the predictions
transform the predictions back to the original scale with expm1 (the inverse of the log1p transform)
calculate the mae

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [4]:
# Load the post-feature-selection dataset and preview the first rows.
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')
df.head()

Unnamed: 0,property_type,sector,price,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_category,floor_category
0,flat,sector 65,2.5,3.0,3.0,3,Relatively New,1654.0,0.0,0.0,0.0,0.0,0.0,0.0,budget,high floor
1,flat,sector 48,2.65,4.0,4.0,3+,Moderately Old,2134.0,1.0,1.0,0.0,0.0,0.0,1.0,high,high floor
2,flat,sector 85,1.2,2.0,2.0,3,Relatively New,1300.0,0.0,0.0,0.0,0.0,0.0,1.0,high,medium floor
3,flat,sector 107,0.52,3.0,2.0,2,Relatively New,717.0,0.0,0.0,0.0,0.0,1.0,0.0,budget,medium floor
4,flat,sohna road,0.54,2.0,1.0,3,New Property,828.0,0.0,0.0,0.0,0.0,0.0,0.0,medium,low floor


In [5]:
# Remove the 'others' and 'pooja room' columns in place; they are not used as features below.
df.drop(columns=['others', 'pooja room'], inplace=True)
df.head()

Unnamed: 0,property_type,sector,price,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 65,2.5,3.0,3.0,3,Relatively New,1654.0,0.0,0.0,0.0,0.0,budget,high floor
1,flat,sector 48,2.65,4.0,4.0,3+,Moderately Old,2134.0,1.0,1.0,0.0,1.0,high,high floor
2,flat,sector 85,1.2,2.0,2.0,3,Relatively New,1300.0,0.0,0.0,0.0,1.0,high,medium floor
3,flat,sector 107,0.52,3.0,2.0,2,Relatively New,717.0,0.0,0.0,0.0,0.0,budget,medium floor
4,flat,sohna road,0.54,2.0,1.0,3,New Property,828.0,0.0,0.0,0.0,0.0,medium,low floor


In [7]:
# Map the numeric furnishing codes to readable labels so the column is handled as a
# categorical (string) feature by the encoders further down.
# The result is assigned back rather than calling replace(..., inplace=True) on the
# selected column: that chained in-place form raises a FutureWarning in pandas 2.2 and
# no longer updates `df` once copy-on-write is the default (pandas 3.0).
furnishing_labels = {0.0: 'unfurnished', 1.0: 'semifurnished', 2.0: 'furnished'}
df['furnishing_type'] = df['furnishing_type'].replace(furnishing_labels)
df.head()

Unnamed: 0,property_type,sector,price,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 65,2.5,3.0,3.0,3,Relatively New,1654.0,0.0,0.0,0.0,unfurnished,budget,high floor
1,flat,sector 48,2.65,4.0,4.0,3+,Moderately Old,2134.0,1.0,1.0,0.0,semifurnished,high,high floor
2,flat,sector 85,1.2,2.0,2.0,3,Relatively New,1300.0,0.0,0.0,0.0,semifurnished,high,medium floor
3,flat,sector 107,0.52,3.0,2.0,2,Relatively New,717.0,0.0,0.0,0.0,unfurnished,budget,medium floor
4,flat,sohna road,0.54,2.0,1.0,3,New Property,828.0,0.0,0.0,0.0,unfurnished,medium,low floor


In [33]:
# Split the frame into features and target.
X = df.drop(columns='price')
y = df['price']
# The models are trained on log1p(price); predictions are mapped back with np.expm1 later on.
y_transformed = np.log1p(y)

### Ordinal Encoding

In [19]:
# List the feature columns available for encoding.
X.columns

Index(['property_type', 'sector', 'bedrooms', 'bathrooms', 'balconies',
       'age_possession', 'built_up_area', 'study room', 'servant room',
       'store room', 'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [36]:
# Categorical columns given to the OrdinalEncoder step of the column transformer.
# 'sector' is not in this list; it is ordinal-encoded separately, directly on X, in a later cell.
columns_to_encode = ['property_type', 'balconies', 'age_possession', 'furnishing_type', 'luxury_category', 'floor_category']

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# Standard-scale the listed numeric columns and ordinal-encode the categorical ones.
# Columns named in neither list ('sector' and 'study room') are passed through
# unchanged, so 'sector' must already be numeric when the pipeline is fitted.
preprocessor = ColumnTransformer(
                   transformers=[
                       ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
                       ('cat', OrdinalEncoder(), columns_to_encode)
                       ],
                    remainder='passthrough'
                    )

In [38]:
# Bundle preprocessing and a linear regression into one estimator so the
# transformers are refit together with the model on whatever data it is fitted on.
pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('regressor', LinearRegression())
            ])

In [30]:
# Row count per sector, to see how many distinct sectors there are and how unevenly they are populated.
X['sector'].value_counts()

sohna road    163
sector 85     108
sector 102    107
sector 92     100
sector 69      93
             ... 
sector 88b      3
sector 73       3
sector 27       2
sector 37       1
sector 17a      1
Name: sector, Length: 112, dtype: int64

In [42]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 10 shuffled folds with a fixed seed so the fold assignment is the same on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [56]:
# Per-fold R² of the ordinal-encoding pipeline on the log1p target.
# NOTE(review): the execution counters show this cell (In [56]) was run after the
# sector-encoding cell that follows it in the file (In [45]). Run top to bottom,
# 'sector' would still hold strings here and reach the regressor through
# remainder='passthrough' — run the sector-encoding cell first.
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [45]:
# Ordinal-encode 'sector' directly on the full feature frame.
# The column is overwritten with numeric codes, so every later cell that uses X
# (including the one-hot section) sees the codes rather than the sector names.
oe = OrdinalEncoder()
X['sector'] = oe.fit_transform(X[['sector']])

In [57]:
# Average R² over the folds and its standard deviation across folds.
scores.mean(), scores.std()

(0.7417367295878756, 0.025026201446221523)

In [52]:
# Hold out 20% of the rows with a fixed seed; the target stays on the log1p scale.
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [53]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X_train, y_train)

In [54]:
# The model predicts log1p(price); expm1 inverts that to get prices back.
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)

In [55]:
# y_test is still log1p-transformed, so it is inverted too: the MAE is on the original price scale.
mae = mean_absolute_error(np.expm1(y_test), y_pred)
mae

0.8170658219435964

In [71]:
def scorer(model_name, model):
    """Score one regressor behind the notebook's current ``preprocessor``.

    Reads the notebook-level ``preprocessor``, ``X`` and ``y_transformed``.

    Args:
        model_name: Label stored as the first element of the result.
        model: Estimator used as the pipeline's ``'regressor'`` step; it is
            fitted in place on the training split.

    Returns:
        ``[model_name, mean_r2, mae]``: ``mean_r2`` is the mean R² of a
        10-fold shuffled cross-validation on the log1p target, ``mae`` the
        mean absolute error on a 20% hold-out after both predictions and
        true values are mapped back with ``np.expm1``.
    """
    steps = [('preprocessor', preprocessor), ('regressor', model)]
    model_pipeline = Pipeline(steps)

    # Cross-validated R² on the transformed target.
    folds = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_r2 = cross_val_score(model_pipeline, X, y_transformed, cv=folds, scoring='r2')

    # Single hold-out fit to get an error figure on the original price scale.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    model_pipeline.fit(X_tr, y_tr)
    predicted_price = np.expm1(model_pipeline.predict(X_te))
    actual_price = np.expm1(y_te)

    return [model_name, cv_r2.mean(), mean_absolute_error(actual_price, predicted_price)]

In [72]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

# Candidate regressors keyed by display name, all created with library-default
# hyperparameters (no random_state is set on any of them).
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [73]:
# Run every candidate through scorer() and collect one result row per model.
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [74]:
# Each row is [model name, mean CV R², hold-out MAE].
model_output

[['linear_reg', 0.7417367295878756, 0.8170658219435964],
 ['svr', 0.761121038928748, 0.8238475641279636],
 ['ridge', 0.7417393380299888, 0.8169963234760353],
 ['LASSO', 0.05765833827019403, 1.4843625217979457],
 ['decision tree', 0.7804939200509582, 0.6761796174877927],
 ['random forest', 0.8853831477125483, 0.48689380977276236],
 ['extra trees', 0.8699100546661187, 0.519293275258561],
 ['gradient boosting', 0.878290453038624, 0.5470470596577626],
 ['adaboost', 0.7618625879552506, 0.8272045260789997],
 ['mlp', 0.8041531003395466, 0.723734661233513],
 ['xgboost', 0.8942772148474276, 0.47488026221760876]]

In [76]:
# Tabulate the results and rank the models by hold-out MAE, lowest error first.
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae'])
model_df.sort_values(by='mae')

Unnamed: 0,name,r2,mae
10,xgboost,0.894277,0.47488
5,random forest,0.885383,0.486894
6,extra trees,0.86991,0.519293
7,gradient boosting,0.87829,0.547047
4,decision tree,0.780494,0.67618
9,mlp,0.804153,0.723735
2,ridge,0.741739,0.816996
0,linear_reg,0.741737,0.817066
1,svr,0.761121,0.823848
8,adaboost,0.761863,0.827205


### One-Hot Encoding

In [100]:
# Creating a column transformer for preprocessing.
# Both encoders tolerate categories not seen during fit: the ordinal step maps them
# to -1 and the one-hot step encodes them as all zeros.
# NOTE(review): 'age_possession' and 'furnishing_type' are listed in both
# columns_to_encode and the one-hot step, so each reaches the model twice
# (ordinal code + dummy columns) — confirm this duplication is intended.
# NOTE(review): 'sector' was overwritten with ordinal codes in an earlier cell, so
# the one-hot step sees numeric codes rather than sector names — TODO confirm.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore'),['sector','age_possession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [101]:
# Creating a pipeline: the one-hot preprocessor followed by a linear regression baseline.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [102]:
# K-fold cross-validation: same 10 shuffled folds and seed as the ordinal section,
# scored with R² on the log1p target.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [103]:
# Average R² over the 10 folds.
scores.mean()

0.8575265290208158

In [104]:
# Standard deviation of the fold R² values.
scores.std()

0.02409869992740901

In [105]:
# Same 80/20 split settings (test_size and seed) as in the ordinal section; fit on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train, y_train)

In [106]:
# Predictions and y_test are both on the log1p scale; expm1 maps them back so the
# MAE is reported on the original price scale.
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
mean_absolute_error(np.expm1(y_test), y_pred)

0.5909286591952617

In [107]:
def scorer(model_name, model):
    """Evaluate ``model`` with the notebook's current ``preprocessor``.

    Uses the notebook-level ``preprocessor``, ``X`` and ``y_transformed``.

    Args:
        model_name: Label placed first in the returned row.
        model: Estimator plugged in as the ``'regressor'`` step; it is fitted
            in place on the training split.

    Returns:
        A list ``[model_name, mean_r2, holdout_mae]``. ``mean_r2`` is the mean
        R² over 10 shuffled folds on the log1p target; ``holdout_mae`` is the
        mean absolute error on a 20% hold-out, computed after inverting the
        log1p transform with ``np.expm1``.
    """
    estimator = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])

    # K-fold cross-validation on the transformed target.
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    mean_r2 = cross_val_score(estimator, X, y_transformed, cv=cv, scoring='r2').mean()

    train_X, test_X, train_y, test_y = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )
    estimator.fit(train_X, train_y)

    # Undo the log1p transform on both sides before measuring the error.
    price_pred = np.expm1(estimator.predict(test_X))
    holdout_mae = mean_absolute_error(np.expm1(test_y), price_pred)

    return [model_name, mean_r2, holdout_mae]

In [108]:
# Re-score every candidate model, this time with the one-hot preprocessor defined above.
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [109]:
# Tabulate the one-hot results and rank the models by hold-out MAE, lowest error first.
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.887128,0.494881
10,xgboost,0.891008,0.513371
5,random forest,0.875794,0.527851
9,mlp,0.87852,0.528868
1,svr,0.879721,0.534634
0,linear_reg,0.857527,0.590929
7,gradient boosting,0.860693,0.591235
2,ridge,0.857835,0.597118
4,decision tree,0.780952,0.706933
8,adaboost,0.732223,0.843082


OHE -> high dimensionality -> PCA
Target encoding -> group by the feature column and replace each category with the mean of the target column;
it can lead to data leakage, so always fit it on the training data only, after splitting the data;
it gives better results with tree-based algorithms than with regression algorithms.
XGBoost -> Bayesian hyperparameter tuning (Hyperopt); a common Kaggle-competition trick