In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from xgboost import XGBRegressor 
import category_encoders as ce
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("gurgaon_properties_post_feature_selection.csv")

In [3]:
df.head()

Unnamed: 0,sector,property_type,price,bedRoom,bathroom,builtUpArea,servant room,study room,luxury_category
0,sector 49,flat,2.45,3.0,3.0,1865.0,0,1,Low
1,sector 109,house,6.1,5.0,6.0,2430.0,1,0,Low
2,sector 1,flat,1.65,4.0,3.0,3111.0,1,0,Low
3,sector 7,house,0.66,3.0,1.0,550.0,0,0,Low
4,sector 37d,flat,1.4,3.0,3.0,1711.0,0,0,Medium


In [4]:
df['servant room'] = df['servant room'].replace({0:'No', 1:'Yes'})
df['study room'] = df['study room'].replace({0:'No', 1:'Yes'})

In [5]:
X = df.drop(columns=['price'])
y = df['price']

In [6]:
y_transformed = np.log1p(y)

## Ordinal Encoding

In [8]:
columns_to_encode = ['sector', 'property_type', 'luxury_category', 'servant room', 'study room']

## creating a column transformer for preprocessing

In [10]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ],
    remainder='passthrough'
)

## Creating a pipeline

In [12]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# k-fold cross validation

In [14]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')

In [15]:
r2_score, standard_deviation = scores.mean(), scores.std()

In [16]:
r2_score

0.731477113308337

In [17]:
standard_deviation

0.029313978928780106

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [20]:
pipeline.fit(X_train, y_train)

In [21]:
y_pred = pipeline.predict(X_test)

In [22]:
from sklearn.metrics import mean_absolute_error

In [23]:
y_pred = np.expm1(y_pred)

In [24]:
mae = mean_absolute_error(np.expm1(y_test), y_pred)

In [25]:
mae

0.9779014138976233

In [26]:
def scorer(model_name, model):

    output = {}
    
    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model)
    ])

    k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')
    r2_score, standard_deviation = scores.mean(), scores.std()

    output[model_name] = {'r2_score':r2_score}
    output[model_name].update(std=standard_deviation)

    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)
    
    mae = mean_absolute_error(np.expm1(y_test), y_pred)

    output[model_name].update(mae_pred=mae)

    return output

In [27]:
scorer('LinearRegression', LinearRegression())

{'LinearRegression': {'r2_score': 0.731477113308337,
  'std': 0.029313978928780106,
  'mae_pred': 0.9779014138976233}}

In [28]:
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor 
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [29]:
model_dict = {
    'lin_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'lasso':Lasso(),
    'decision Tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreeRegressor(),
    'gradientBoosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [30]:
def try_all_models(model_dict):
    model_output = []
    for model_name, model in model_dict.items():
        model_output.append(scorer(model_name, model))
    df = pd.DataFrame({
        'models': list(model_dict.keys()),
        'r2_train': list(map(lambda x: list(x.values())[0]['r2_score'], model_output)),
        'mae_pred': list(map(lambda x: list(x.values())[0]['mae_pred'], model_output)),
        'std_train': list(map(lambda x: list(x.values())[0]['std'], model_output)),
        }).sort_values(by='mae_pred', ascending=True)
    print(df)

In [31]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
5      random forest  0.877645  0.495946   0.028737
10           xgboost  0.891389  0.512031   0.021700
7   gradientBoosting  0.873360  0.577891   0.019694
4      decision Tree  0.786373  0.662034   0.036705
9                mlp  0.774642  0.762506   0.046902
6        extra trees  0.735607  0.781435   0.042867
8           adaboost  0.756361  0.853835   0.018482
2              ridge  0.731482  0.977895   0.029285
0            lin_reg  0.731477  0.977901   0.029314
1                svr  0.723545  0.979895   0.029877
3              lasso  0.070839  1.639397   0.019488


# OneHotEncoding

In [33]:
columns_to_encode

['sector', 'property_type', 'luxury_category', 'servant room', 'study room']

In [34]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat1', OneHotEncoder(handle_unknown='ignore', drop='first'), ['sector', 'property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category'])
    ],
    remainder='passthrough'
)

In [35]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
9                mlp  0.895321  0.485545   0.017747
10           xgboost  0.893772  0.517147   0.020206
5      random forest  0.881253  0.517641   0.025456
1                svr  0.892784  0.520650   0.018451
7   gradientBoosting  0.860809  0.598972   0.020703
6        extra trees  0.814721  0.611834   0.037278
4      decision Tree  0.815784  0.661059   0.041743
0            lin_reg  0.855245  0.677911   0.019053
2              ridge  0.855903  0.691937   0.017828
8           adaboost  0.727617  0.911575   0.025785
3              lasso -0.002128  1.675962   0.002720


# Using PCA

In [37]:
from sklearn.decomposition import PCA

In [38]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat1', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), ['sector', 'property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category'])
    ],
    remainder='passthrough'
)

In [39]:
def scorer(model_name, model):

    output = {}
    
    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', model)
    ])

    k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')
    r2_score, standard_deviation = scores.mean(), scores.std()

    output[model_name] = {'r2_score':r2_score}
    output[model_name].update(std=standard_deviation)

    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)
    
    mae = mean_absolute_error(np.expm1(y_test), y_pred)

    output[model_name].update(mae_pred=mae)

    return output

In [40]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
9                mlp  0.860585  0.598068   0.024203
1                svr  0.855484  0.601614   0.031048
5      random forest  0.856754  0.620005   0.019638
10           xgboost  0.851233  0.621268   0.023046
7   gradientBoosting  0.844215  0.657784   0.021621
4      decision Tree  0.711912  0.800376   0.046222
0            lin_reg  0.799658  0.864299   0.024576
2              ridge  0.799701  0.864451   0.024571
6        extra trees  0.719772  0.881161   0.052242
8           adaboost  0.732046  0.883801   0.024328
3              lasso -0.002128  1.675962   0.002720


# Target Encoding

In [42]:
import category_encoders as ce

In [139]:
columns_to_encode

['sector', 'property_type', 'luxury_category', 'servant room', 'study room']

In [141]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('target_enc', ce.TargetEncoder(), ['sector', 'property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category']),
    ],
    remainder='passthrough'
)

In [143]:
def scorer(model_name, model):

    output = {}
    
    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model)
    ])

    k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')
    r2_score, standard_deviation = scores.mean(), scores.std()

    output[model_name] = {'r2_score':r2_score}
    output[model_name].update(std=standard_deviation)

    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)
    
    mae = mean_absolute_error(np.expm1(y_test), y_pred)

    output[model_name].update(mae_pred=mae)

    return output

In [145]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
10           xgboost  0.902202  0.485076   0.016995
5      random forest  0.891921  0.503431   0.018017
7   gradientBoosting  0.883153  0.570740   0.018551
4      decision Tree  0.805920  0.630514   0.037348
1                svr  0.856844  0.638120   0.023854
9                mlp  0.851019  0.690468   0.021965
6        extra trees  0.775141  0.724971   0.032068
8           adaboost  0.817036  0.770225   0.021539
0            lin_reg  0.815099  0.828875   0.018589
2              ridge  0.815113  0.829242   0.018577
3              lasso -0.002128  1.675962   0.002720


In [147]:
xgb = XGBRegressor()

In [149]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('target_enc', ce.TargetEncoder(), ['sector', 'property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category']),
    ],
    remainder='passthrough'
)

In [151]:
 pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb)
    ])

In [153]:
pipeline.fit(X, y_transformed)

In [155]:
import joblib
joblib.dump(pipeline, 'model.joblib')

['model.joblib']

In [157]:
X.to_csv('df.csv', index=False)