In [25]:
import pickle
import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score




In [26]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'
with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

In [27]:
data.head()

Unnamed: 0,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Lot.Shape,Land.Contour,Lot.Config,Land.Slope,Neighborhood,Bldg.Type,...,Sale.Type,Sale.Condition,SalePrice,Condition,HasShed,HasAlley,Exterior,Garage.Age,Remod.Age,House.Age
0,20,RL,141.0,31770.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.332438,Norm,False,False,BrkFace,50.0,50.0,50.0
1,20,RH,80.0,11622.0,Reg,Lvl,Inside,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.021189,Roads,False,False,VinylSd,49.0,49.0,49.0
2,20,RL,81.0,14267.0,IR1,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.235528,Norm,False,False,Wd Sdng,52.0,52.0,52.0
3,20,RL,93.0,11160.0,Reg,Lvl,Corner,Gtl,NAmes,1Fam,...,GroupedWD,Normal,5.38739,Norm,False,False,BrkFace,42.0,42.0,42.0
4,60,RL,74.0,13830.0,IR1,Lvl,Inside,Gtl,Gilbert,1Fam,...,GroupedWD,Normal,5.278525,Norm,False,False,VinylSd,13.0,12.0,13.0


In [28]:
X = data.drop('SalePrice', axis=1)
y = data['SalePrice']

In [29]:
X_model = pd.get_dummies(X, drop_first=True).copy()

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X_model,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [32]:
scoring = make_scorer(mean_squared_error, greater_is_better=False)
cv_scores = cross_val_score(linear_model, X_model, y, cv=5, scoring=scoring)
mean_mse = -cv_scores.mean()
print(f"Cross-validated MSE: {mean_mse}")

Cross-validated MSE: 0.0031637562127567386


In [33]:
# Initialize the Gradient Boosting Regressor model
boosting_model = GradientBoostingRegressor()

# Define a parameter grid for tuning the model
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting stages
    'learning_rate': [0.01, 0.1, 0.2],  # Step size shrinkage
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'subsample': [0.8, 1.0],  # Fraction of samples used for fitting individual base learners
}

# Initialize GridSearchCV with the boosting model
grid_search = GridSearchCV(estimator=boosting_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Retrieve the best model and its parameters
best_boosting_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)

# Test performance on the test set
y_pred = best_boosting_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
Test MSE: 0.0026286159810718444
