In [1]:
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge, RidgeCV
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.inspection import permutation_importance
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [9]:
DATA_PATH = '../Users/edwin/OneDrive/Documents/Github/Build Week Project #2/Data\housing data'
train= pd.read_csv(DATA_PATH+'Build Week train.csv')
test = pd.read_csv(DATA_PATH+'Build Week test.csv')


FileNotFoundError: [Errno 2] No such file or directory: '/Users/edwin/OneDrive/Documents/Github/Build Week Project #2/Data\\housing dataBuild Week train.csv'

In [None]:
train.info()

# I. Wrangling Data

In [None]:
# Creating a simple function that rounds up the SQFT and removes the high-cardinality column ('ADDRESS')
def wrangle(df):
    cols = ['POSTED_BY', 'UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK',
       'SQUARE_FT', 'READY_TO_MOVE', 'RESALE', 'ADDRESS', 'LONGITUDE',
       'LATITUDE']
    df[cols[5]] = df[cols[5]].apply(np.ceil)
    
    df.drop(columns=cols[8], inplace=True)
    
    return df

In [None]:
wrangle(train)

In [None]:
wrangle(test)

# II. Splitting Data into Feature Matrix & Target vector

In [None]:
target = 'TARGET(PRICE_IN_LACS)'
y = train[target]
X = train.drop(columns=target)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state = 42)

assert len(X_train) + len(X_val) == len(X)

# III. Establishing a Baseline 

In [None]:
y_pred = [y_train.mean()] * len(y_train)
print('Baseline MAE:', mean_absolute_error(y_train, y_pred))

# IV. Building a Ridge Regresion Model with built-in Cross Validation

In [None]:
model_rcv = make_pipeline(
OneHotEncoder(use_cat_names=True),
StandardScaler(),
RidgeCV(alphas=(0.5, 5.0, 50.0),
        cv=5)
)

model_rcv.fit(X_train, y_train)

## Checking Metrics

In [None]:
print('Training MAE:', mean_absolute_error(y_train, model_rcv.predict(X_train)))
print('Validation MAE:', mean_absolute_error(y_val, model_rcv.predict(X_val)))
print('Training R^2:', model_rcv.score(X_train, y_train))
print('Validation R^2:', model_rcv.score(X_val, y_val))

## Feature Importance for RidgeRegression CV Model

In [None]:
coeffs_ = model_rcv.named_steps['ridgecv'].coef_
feature_names = model_rcv.named_steps['onehotencoder'].get_feature_names()

feature_imp = pd.Series(coeffs_, index=feature_names)
feature_imp.sort_values(ascending=True, key=lambda x: abs(x), inplace=True)
feature_imp.tail(10).plot(kind='barh')
plt.xlabel('Importance [₹]')
plt.ylabel('Feature')
plt.title('Feature Importance for Ridge Regression CV Model')
plt.show()

# V. Building Random Forest Model

In [None]:
model_rf = make_pipeline(
OrdinalEncoder(),
StandardScaler(),
RandomForestRegressor(n_estimators=200,
                      max_depth=30,
                      max_features='sqrt',
                      criterion='mae',
                      bootstrap=True,
                      n_jobs=-1,
                      random_state=42,
                      max_samples=0.8)
)

model_rf.fit(X_train, y_train);

## Checking Metrics

In [None]:
print('Training MAE:', mean_absolute_error(y_train, model_rf.predict(X_train)))
print('Test MAE:', mean_absolute_error(y_val, model_rf.predict(X_val)))
print('Training accuracy score:', model_rf.score(X_train, y_train))
print('Validation accuracy score:', model_rf.score(X_val, y_val))

## Permutation Importances

In [None]:
perm_imp = permutation_importance(model_rf,
                                  X_val,
                                  y_val,
                                  n_jobs=-1,
                                  random_state=42
                                 )

In [None]:
data = {'importances_mean': perm_imp['importances_mean'],
       'importances_std': perm_imp['importances_mean']}

In [None]:
df = pd.DataFrame(data, index=X_val.columns)
df.sort_values(by='importances_mean', inplace=True)

In [None]:
df['importances_mean'].tail(10).plot(kind='barh')
plt.xlabel('Importances (drop in accuracy)')
plt.ylabel('Feature')
plt.title('Permutation Importance for RandomForestRegressor model')
plt.show()

# VI. Building GradientBoosterRegressor Model

In [None]:
model_gbr = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    GradientBoostingRegressor(loss='huber',
                         n_estimators=200,
                         max_features='auto',
                         random_state=42)
)

model_gbr.fit(X_train, y_train)

## Checking Metrics

In [None]:
print('Training MAE:', mean_absolute_error(y_train, model_gbr.predict(X_train)))
print('Test MAE:', mean_absolute_error(y_val, model_gbr.predict(X_val)))
print('Training accuracy score:', model_gbr.score(X_train, y_train))
print('Validation accuracy score:', model_gbr.score(X_val, y_val))

# VII. Building XGBoost Regressor Model

In [None]:
eval_set = [(X_val, y_val)]

model_xgb = make_pipeline(
    OrdinalEncoder(),
    StandardScaler(),
    XGBRegressor(n_estimators=60,
                  max_depth=10,
                 min_child_weight=9,
                 gamma=12,
                  learning_rate=0.5,
                  objective='reg:squarederror',
                 tree_method='hist',
                  random_state=42, 
                  eval_set=eval_set,
                  eval_metric='rmse',
                  early_stopping_rounds=10,
                  verbose=True,
                  n_jobs=-1)
)

model_xgb.fit(X_train, y_train);

In [None]:
print('Training MAE:', mean_absolute_error(y_train, model_xgb.predict(X_train)))
print('Test MAE:', mean_absolute_error(y_val, model_xgb.predict(X_val)))
print('Training accuracy score:', model_xgb.score(X_train, y_train))
print('Validation accuracy score:', model_xgb.score(X_val, y_val))