In [1]:
import pandas as pd

housing_data = pd.read_csv("housing_data_test_train.csv")

In [2]:
housing_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,0,0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,1,1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,2,2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,3,3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,5,5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25,2.697


In [3]:
# Checking for missing values and data types
missing_values = housing_data.isnull().sum()
data_types = housing_data.dtypes

# Statistical summary of the dataset
statistical_summary = housing_data.describe()

missing_values, data_types, statistical_summary


(Unnamed: 0.1    0
 Unnamed: 0      0
 MedInc          0
 HouseAge        0
 AveRooms        0
 AveBedrms       0
 Population      0
 AveOccup        0
 Latitude        0
 Longitude       0
 Price           0
 dtype: int64,
 Unnamed: 0.1      int64
 Unnamed: 0        int64
 MedInc          float64
 HouseAge        float64
 AveRooms        float64
 AveBedrms       float64
 Population      float64
 AveOccup        float64
 Latitude        float64
 Longitude       float64
 Price           float64
 dtype: object,
        Unnamed: 0.1    Unnamed: 0        MedInc      HouseAge      AveRooms  \
 count  16512.000000  16512.000000  16512.000000  16512.000000  16512.000000   
 mean   10344.188590  10344.188590      3.876149     28.604469      5.441114   
 std     5958.933738   5958.933738      1.891584     12.586046      2.613727   
 min        0.000000      0.000000      0.499900      1.000000      0.846154   
 25%     5168.750000   5168.750000      2.572050     18.000000      4.439906   
 50% 

In [4]:
from sklearn.preprocessing import StandardScaler

# 1. Removal of Unnecessary Columns
housing_data_fe = housing_data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

# 2. Outlier Detection and Handling
# Using IQR for columns AveBedrms and AveOccup
Q1 = housing_data_fe[['AveBedrms', 'AveOccup']].quantile(0.25)
Q3 = housing_data_fe[['AveBedrms', 'AveOccup']].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Correcting the outlier removal step

# Aligning the lower and upper bounds with the DataFrame
lower_bound = lower_bound.reindex(housing_data_fe.columns, axis=0, fill_value=float('-inf'))
upper_bound = upper_bound.reindex(housing_data_fe.columns, axis=0, fill_value=float('inf'))

# Applying the outlier filtering only to relevant columns
housing_data_no_outliers = housing_data_fe[~((housing_data_fe < lower_bound) | (housing_data_fe > upper_bound)).any(axis=1)]

# Proceeding with feature transformation and creation of new features
# Standardizing the numerical features
scaler = StandardScaler()
numerical_columns = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
housing_data_no_outliers[numerical_columns] = scaler.fit_transform(housing_data_no_outliers[numerical_columns])

# Creating new features
housing_data_no_outliers['RoomsPerHousehold'] = housing_data_no_outliers['AveRooms'] / housing_data_no_outliers['AveOccup']
housing_data_no_outliers['BedroomProportion'] = housing_data_no_outliers['AveBedrms'] / housing_data_no_outliers['AveRooms']
housing_data_no_outliers['PopulationPerHousehold'] = housing_data_no_outliers['Population'] / housing_data_no_outliers['AveOccup']

housing_data_no_outliers.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_data_no_outliers[numerical_columns] = scaler.fit_transform(housing_data_no_outliers[numerical_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  housing_data_no_outliers['RoomsPerHousehold'] = housing_data_no_outliers['AveRooms'] / housing_data_no_outliers['AveOccup']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price,RoomsPerHousehold,BedroomProportion,PopulationPerHousehold
0,2.319603,0.966815,1.466385,-0.369971,-1.049897,-0.477898,37.88,-122.23,4.526,-3.068406,-0.252301,2.196905
1,2.306987,-0.62919,0.833014,-1.160662,0.881764,-1.193694,37.86,-122.22,3.585,-0.697846,-1.393327,-0.738685
2,1.753615,1.844617,2.573533,0.385817,-0.888228,-0.081702,37.85,-122.24,3.521,-31.499037,0.149917,10.871565
3,0.897954,1.844617,0.475753,0.379925,-0.830622,-0.49012,37.85,-122.25,3.413,-0.970687,0.798577,1.694732
4,0.046534,1.844617,-0.420567,0.845359,-0.965346,-1.145428,37.85,-122.25,2.697,0.36717,-2.010047,0.842782


In [5]:
from sklearn.model_selection import train_test_split

# 1. Handling Missing Values
# Rechecking for missing values
missing_values_check = housing_data_no_outliers.isnull().sum()

# 2. The dataset contains only numerical features, so no encoding of categorical variables is required

# 3. Splitting the Data into Features and Target
X = housing_data_no_outliers.drop('Price', axis=1)  # Features
y = housing_data_no_outliers['Price']               # Target

# 4. Train-Test Split
# Splitting the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

missing_values_check, X_train.shape, X_test.shape, y_train.shape, y_test.shape


(MedInc                    0
 HouseAge                  0
 AveRooms                  0
 AveBedrms                 0
 Population                0
 AveOccup                  0
 Latitude                  0
 Longitude                 0
 Price                     0
 RoomsPerHousehold         0
 BedroomProportion         0
 PopulationPerHousehold    0
 dtype: int64,
 (11885, 11),
 (2972, 11),
 (11885,),
 (2972,))

In [13]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Initializing models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Elastic Net": ElasticNet(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor()
}

# Dictionary to store the MSE for each model
model_performance = {}

# Training and evaluating each model
for model_name, model in models.items():
    # Training the model
    model.fit(X_train, y_train)

    # Making predictions on the test set
    y_pred = model.predict(X_test)

    # Calculating MSE
    mse = mean_squared_error(y_test, y_pred)
    model_performance[model_name] = mse

model_performance_sorted = dict(sorted(model_performance.items(), key=lambda item: item[1]))

model_performance_sorted

{'Random Forest Regressor': 0.25289940334489736,
 'Gradient Boosting Regressor': 0.2766483622880805,
 'Ridge Regression': 0.4209713870587075,
 'Linear Regression': 0.4209737602934081,
 'Elastic Net': 1.038206603863539,
 'Lasso Regression': 1.3223497435049176}

In [7]:
from sklearn.model_selection import GridSearchCV

# Defining the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Initializing the Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor()

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Performing the grid search
grid_search.fit(X_train, y_train)

# Best parameters and best score (MSE)
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

#best_params, best_score

In [8]:
gb_regressor = GradientBoostingRegressor(
    n_estimators=200,  
    learning_rate=0.1, 
    max_depth=5,      
)

In [9]:
gb_regressor.fit(X, y)

GradientBoostingRegressor(max_depth=5, n_estimators=200)

In [29]:
def evaluate_model_performance(model, new_dataset):
    """
    Evaluate the performance of the trained regression model on a new dataset.
    The new dataset is preprocessed within the function to match the training data format.
    
    Parameters:
    model (Regressor): A trained regression model.
    new_dataset (DataFrame): A new dataset with the same structure as the original training dataset.
    
    Returns:
    dict: A dictionary with the R-squared value and Mean Squared Error indicating the performance of the model.
    """
    
    # Preprocessing the new dataset
    
    # Remove unnecessary index columns
    new_data = new_dataset.copy()
    if 'Unnamed: 0' in new_data.columns:
        new_data.drop(columns=['Unnamed: 0'], inplace=True)
    if 'Unnamed: 0.1' in new_data.columns:
        new_data.drop(columns=['Unnamed: 0.1'], inplace=True)
    
    # Feature engineering
    new_data['RoomsPerHousehold'] = new_data['AveRooms'] / new_data['AveOccup']
    new_data['BedroomProportion'] = new_data['AveBedrms'] / new_data['AveRooms']
    new_data['PopulationPerHousehold'] = new_data['Population'] / new_data['AveOccup']
    
    # Standardizing numerical features
    scaler = StandardScaler()
    numerical_features = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
    new_data[numerical_features] = scaler.fit_transform(new_data[numerical_features])
    
    # Splitting the preprocessed dataset into features and target
    X_new = new_data.drop('Price', axis=1)
    y_new = new_data['Price']
    
    # Making predictions using the model
    predictions = model.predict(X_new)
    
    # Calculating R-squared value and Mean Squared Error
    r_squared = r2_score(y_new, predictions)
    mse = mean_squared_error(y_new, predictions)
    
    return {"R-squared": r_squared, "MSE": mse}

# Example usage:
# performance_metrics = evaluate_model_performance(trained_model, new_data)
# print("Performance metrics:", performance_metrics)



In [27]:
eval_data = pd.read_csv("housing_data_holdout_set.csv")

In [26]:
evaluate_model_performance(gb_regressor, eval_data)

{'R-squared': 0.8078367937518194, 'MSE': 0.2520587181052135}

In [22]:
#import pickle
#with open('MODEL_A.pkl', 'wb') as file:
#    pickle.dump(gb_regressor, file)

In [28]:
filename = 'MODEL_A.pkl'

with open(filename, 'rb') as file:
    model_a = pickle.load(file)

In [30]:
evaluate_model_performance(model_a, eval_data)

{'R-squared': 0.8078367937518194, 'MSE': 0.2520587181052135}

In [None]:
'''
FINAL MODEL A PERFORMANCE

{'R-squared': 0.8078367937518194, 'MSE': 0.2520587181052135}

'''
