In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


In [4]:
from sklearn.impute import SimpleImputer

# Create an imputer object for numerical columns using mean
num_imputer = SimpleImputer(strategy='mean')

# Apply the imputer to each numerical column in train_df and test_df
for df in [train_df, test_df]:
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    df[numerical_cols] = num_imputer.fit_transform(df[numerical_cols])

In [5]:
# Replace NaN values in categorical columns with 'None'
for df in [train_df, test_df]:
    categorical_cols = df.select_dtypes(include=['object']).columns
    df[categorical_cols] = df[categorical_cols].fillna('None')

In [14]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the column transformer to apply OneHotEncoder to categorical variables
categorical_features = train_df.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
], remainder='passthrough')

# Apply preprocessing to the train dataset
X_train_transformed = preprocessor.fit_transform(train_df.drop('SalePrice', axis=1))

# Define the model
model = LinearRegression()

# Create the RFE model
rfe = RFE(estimator=model, n_features_to_select=250)  # Adjust n_features_to_select as needed

# Fit the RFE model
rfe.fit(X_train_transformed, train_df['SalePrice'])

# Get the feature names after transformation
all_feature_names = preprocessor.get_feature_names_out()

# Print the features selected by RFE
print("Features selected by RFE:", all_feature_names[rfe.support_])

Features selected by RFE: ['cat__Street_Grvl' 'cat__Street_Pave' 'cat__Alley_Grvl' 'cat__Alley_None'
 'cat__Alley_Pave' 'cat__LotShape_IR1' 'cat__LotShape_IR2'
 'cat__LotShape_IR3' 'cat__LotShape_Reg' 'cat__LandContour_Bnk'
 'cat__LandContour_HLS' 'cat__LandContour_Low' 'cat__LandContour_Lvl'
 'cat__Utilities_AllPub' 'cat__Utilities_NoSeWa' 'cat__LotConfig_Corner'
 'cat__LotConfig_CulDSac' 'cat__LotConfig_FR2' 'cat__LotConfig_FR3'
 'cat__LotConfig_Inside' 'cat__LandSlope_Gtl' 'cat__LandSlope_Mod'
 'cat__LandSlope_Sev' 'cat__Neighborhood_Blmngtn'
 'cat__Neighborhood_Blueste' 'cat__Neighborhood_BrDale'
 'cat__Neighborhood_BrkSide' 'cat__Neighborhood_ClearCr'
 'cat__Neighborhood_CollgCr' 'cat__Neighborhood_Crawfor'
 'cat__Neighborhood_Edwards' 'cat__Neighborhood_Gilbert'
 'cat__Neighborhood_IDOTRR' 'cat__Neighborhood_MeadowV'
 'cat__Neighborhood_Mitchel' 'cat__Neighborhood_NAmes'
 'cat__Neighborhood_NPkVill' 'cat__Neighborhood_NWAmes'
 'cat__Neighborhood_NoRidge' 'cat__Neighborhood_NridgH

In [15]:
# Transforming the train and test datasets
X_train_transformed = preprocessor.transform(train_df.drop('SalePrice', axis=1))
X_test_transformed = preprocessor.transform(test_df)

# Selecting the features identified by RFE
X_train_selected = X_train_transformed[:, rfe.support_]
X_test_selected = X_test_transformed[:, rfe.support_]

# Convert these arrays back to DataFrames
selected_features = all_feature_names[rfe.support_]
train_df_selected = pd.DataFrame(X_train_selected, columns=selected_features)
test_df_selected = pd.DataFrame(X_test_selected, columns=selected_features)

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Testing each model
for name, model in models.items():
    model.fit(train_df_selected, train_df['SalePrice'])
    scores = cross_val_score(model, train_df_selected, train_df['SalePrice'], scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scores.mean())
    print(f"{name}: RMSE = {rmse}")

Linear Regression: RMSE = 385809071841.3081
Ridge Regression: RMSE = 34869.97423739327


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso Regression: RMSE = 36634.07037562389
Decision Tree: RMSE = 43495.63771191226
Random Forest: RMSE = 31330.297108990442
Gradient Boosting: RMSE = 28078.522666969788


In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2]
}

# Create a base model
gb = GradientBoostingRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, 
                           cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)

# Fit the grid search to the data
grid_search.fit(train_df_selected, train_df['SalePrice'])

# Best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Create a new model with the best parameters
best_gb = GradientBoostingRegressor(**best_params)

# Fit the model and make predictions
best_gb.fit(train_df_selected, train_df['SalePrice'])
train_predictions = best_gb.predict(train_df_selected)
test_predictions = best_gb.predict(test_df_selected)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 300}


In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate metrics for the training set
train_mae = mean_absolute_error(train_df['SalePrice'], train_predictions)
train_mse = mean_squared_error(train_df['SalePrice'], train_predictions)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(train_df['SalePrice'], train_predictions)

print("Training Metrics:")
print("Mean Absolute Error:", train_mae)
print("Mean Squared Error:", train_mse)
print("Root Mean Squared Error:", train_rmse)
print("R-squared:", train_r2)

Training Metrics:
Mean Absolute Error: 9409.398613664373
Mean Squared Error: 153870004.74637923
Root Mean Squared Error: 12404.434882185453
R-squared: 0.9756024793499786
