In [314]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from xgboost import XGBRegressor 
import category_encoders as ce
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings("ignore")

In [7]:
df = pd.read_csv("gurgaon_properties_post_feature_selection.csv")

In [9]:
df.head()

Unnamed: 0,sector,property_type,price,bedRoom,bathroom,builtUpArea,servant room,study room,luxury_category
0,sector 49,flat,2.45,3.0,3.0,1865.0,0,1,Low
1,sector 109,house,6.1,5.0,6.0,2430.0,1,0,Low
2,sector 1,flat,1.65,4.0,3.0,3111.0,1,0,Low
3,sector 7,house,0.66,3.0,1.0,550.0,0,0,Low
4,sector 37d,flat,1.4,3.0,3.0,1711.0,0,0,Medium


In [471]:
df['servant room'] = df['servant room'].replace({0:'No', 1:'Yes'})
df['study room'] = df['study room'].replace({0:'No', 1:'Yes'})

In [475]:
X = df.drop(columns=['price'])
y = df['price']

In [477]:
y_transformed = np.log1p(y)

## Ordinal Encoding

In [493]:
columns_to_encode = ['sector', 'property_type', 'luxury_category', 'servant room', 'study room']

## creating a column transformer for preprocessing

In [496]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
    ],
    remainder='passthrough'
)

## Creating a pipeline

In [499]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# k-fold cross validation

In [502]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')

In [504]:
r2_score, standard_deviation = scores.mean(), scores.std()

In [506]:
r2_score

0.7314771133083371

In [508]:
standard_deviation

0.029313978928780196

In [510]:
from sklearn.model_selection import train_test_split

In [512]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [514]:
pipeline.fit(X_train, y_train)

In [516]:
y_pred = pipeline.predict(X_test)

In [518]:
from sklearn.metrics import mean_absolute_error

In [520]:
y_pred = np.expm1(y_pred)

In [522]:
mae = mean_absolute_error(np.expm1(y_test), y_pred)

In [524]:
mae

0.9779014138976233

In [526]:
def scorer(model_name, model):

    output = {}
    
    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model)
    ])

    k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')
    r2_score, standard_deviation = scores.mean(), scores.std()

    output[model_name] = {'r2_score':r2_score}
    output[model_name].update(std=standard_deviation)

    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)
    
    mae = mean_absolute_error(np.expm1(y_test), y_pred)

    output[model_name].update(mae_pred=mae)

    return output

In [528]:
scorer('LinearRegression', LinearRegression())

{'LinearRegression': {'r2_score': 0.7314771133083371,
  'std': 0.029313978928780196,
  'mae_pred': 0.9779014138976233}}

In [530]:
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor 
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [532]:
model_dict = {
    'lin_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'lasso':Lasso(),
    'decision Tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreeRegressor(),
    'gradientBoosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [534]:
def try_all_models(model_dict):
    model_output = []
    for model_name, model in model_dict.items():
        model_output.append(scorer(model_name, model))
    df = pd.DataFrame({
        'models': list(model_dict.keys()),
        'r2_train': list(map(lambda x: list(x.values())[0]['r2_score'], model_output)),
        'mae_pred': list(map(lambda x: list(x.values())[0]['mae_pred'], model_output)),
        'std_train': list(map(lambda x: list(x.values())[0]['std'], model_output)),
        }).sort_values(by='mae_pred', ascending=True)
    print(df)

In [536]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
5      random forest  0.877277  0.501836   0.027178
10           xgboost  0.891389  0.512031   0.021700
7   gradientBoosting  0.873259  0.578358   0.019643
4      decision Tree  0.784844  0.659486   0.037379
6        extra trees  0.745585  0.766547   0.034477
9                mlp  0.766382  0.798228   0.027099
8           adaboost  0.755072  0.876456   0.028157
2              ridge  0.731482  0.977895   0.029285
0            lin_reg  0.731477  0.977901   0.029314
1                svr  0.723545  0.979895   0.029877
3              lasso  0.070839  1.639397   0.019488


# OneHotEncoding

In [538]:
columns_to_encode

['sector', 'property_type', 'luxury_category', 'servant room', 'study room']

In [545]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat1', OneHotEncoder(handle_unknown='ignore', drop='first'), ['sector', 'property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category'])
    ],
    remainder='passthrough'
)

In [547]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
9                mlp  0.891944  0.493997   0.019324
5      random forest  0.879789  0.509138   0.026281
10           xgboost  0.893772  0.517147   0.020206
1                svr  0.892784  0.520650   0.018451
7   gradientBoosting  0.861001  0.599005   0.020568
4      decision Tree  0.814823  0.633194   0.042523
6        extra trees  0.811410  0.647525   0.038615
0            lin_reg  0.855245  0.677911   0.019053
2              ridge  0.855917  0.691812   0.017834
8           adaboost  0.728790  0.948791   0.029384
3              lasso -0.002128  1.675962   0.002720


# Using PCA

In [549]:
from sklearn.decomposition import PCA

In [550]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat1', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), ['sector', 'property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category'])
    ],
    remainder='passthrough'
)

In [554]:
def scorer(model_name, model):

    output = {}
    
    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', model)
    ])

    k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')
    r2_score, standard_deviation = scores.mean(), scores.std()

    output[model_name] = {'r2_score':r2_score}
    output[model_name].update(std=standard_deviation)

    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)
    
    mae = mean_absolute_error(np.expm1(y_test), y_pred)

    output[model_name].update(mae_pred=mae)

    return output

In [556]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
9                mlp  0.861089  0.591966   0.025442
1                svr  0.855484  0.601614   0.031048
5      random forest  0.857414  0.610175   0.019221
10           xgboost  0.851233  0.621268   0.023046
7   gradientBoosting  0.844454  0.657442   0.021665
6        extra trees  0.714664  0.769006   0.028114
4      decision Tree  0.705436  0.831387   0.051117
0            lin_reg  0.799658  0.864299   0.024576
2              ridge  0.799701  0.864451   0.024571
8           adaboost  0.732452  0.905625   0.023325
3              lasso -0.002128  1.675962   0.002720


# Target Encoding

In [558]:
import category_encoders as ce

In [559]:
columns_to_encode

['sector', 'property_type', 'luxury_category', 'servant room', 'study room']

In [563]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat1', OneHotEncoder(handle_unknown='ignore', drop='first'), ['property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [565]:
def scorer(model_name, model):

    output = {}
    
    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', model)
    ])

    k_fold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')
    r2_score, standard_deviation = scores.mean(), scores.std()

    output[model_name] = {'r2_score':r2_score}
    output[model_name].update(std=standard_deviation)

    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)
    
    mae = mean_absolute_error(np.expm1(y_test), y_pred)

    output[model_name].update(mae_pred=mae)

    return output

In [567]:
try_all_models(model_dict)

              models  r2_train  mae_pred  std_train
10           xgboost  0.902030  0.486663   0.017190
5      random forest  0.892755  0.501955   0.017669
7   gradientBoosting  0.883126  0.571852   0.018744
1                svr  0.855775  0.629863   0.024731
4      decision Tree  0.806009  0.637920   0.035322
6        extra trees  0.794624  0.668495   0.031405
9                mlp  0.850919  0.668864   0.020027
8           adaboost  0.815525  0.787677   0.023930
0            lin_reg  0.815099  0.828875   0.018589
2              ridge  0.815115  0.829005   0.018587
3              lasso -0.002128  1.675962   0.002720


In [569]:
xgb = XGBRegressor()

In [573]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'builtUpArea']),
        ('cat1', OneHotEncoder(handle_unknown='ignore', drop='first'), ['sector', 'property_type', 'servant room', 'study room']),
        ('cat2', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['luxury_category']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [575]:
 pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb)
    ])

In [577]:
pipeline.fit(X, y_transformed)

In [579]:
import pickle

In [581]:
with open('model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

In [583]:
with open('df.pkl', 'wb') as f:
    pickle.dump(X, f)

In [593]:
%conda list 

# packages in environment at /opt/anaconda3:
#
# Name                    Version                   Build  Channel
_anaconda_depends         2024.06         py312_openblas_2  
abseil-cpp                20230802.0           h313beb8_2  
absl-py                   2.1.0                    pypi_0    pypi
aext-assistant            4.0.15          py312hca03da5_jl4_0  
aext-assistant-server     4.0.15          py312hca03da5_0  
aext-core                 4.0.15          py312hca03da5_jl4_0  
aext-core-server          4.0.15          py312hca03da5_1  
aext-panels               4.0.15          py312hca03da5_0  
aext-panels-server        4.0.15          py312hca03da5_0  
aext-share-notebook       4.0.15          py312hca03da5_0  
aext-share-notebook-server 4.0.15          py312hca03da5_0  
aext-shared               4.0.15          py312hca03da5_0  
aiobotocore               2.12.3          py312hca03da5_0  
aiohttp                   3.9.5           py312h80987f9_0  
aioitertools              0.7.

In [595]:
%pip show scikit-learn

Name: scikit-learn
Version: 1.6.1
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License

 Copyright (c) 2007-2024 The scikit-learn developers.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYR

In [599]:
%pip list

Package                           Version
--------------------------------- ------------------
absl-py                           2.1.0
aext-assistant                    4.0.15
aext-assistant-server             4.0.15
aext-core                         4.0.15
aext-core-server                  4.0.15
aext-panels                       4.0.15
aext-panels-server                4.0.15
aext-share-notebook               4.0.15
aext-share-notebook-server        4.0.15
aext-shared                       4.0.15
aiobotocore                       2.12.3
aiohttp                           3.9.5
aioitertools                      0.7.1
aiosignal                         1.2.0
alabaster                         0.7.16
altair                            5.0.1
anaconda-anon-usage               0.4.4
anaconda-catalogs                 0.2.0
anaconda-client                   1.12.3
anaconda-cloud-auth               0.5.1
anaconda-navigator                2.6.0
anaconda-project                  0.11.1
annotated-ty