In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Load the training and test sets from the CSV files in the working directory
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Report (rows, columns) for each frame, training set first
for frame in (train_data, test_data):
    print(frame.shape)

(1460, 81)
(1459, 80)


In [8]:
# Drop columns with too many missing values in the train and test data.
# The list of columns is derived from train_data only and the same list is then
# removed from both frames.
threshold = 0.4  # maximum tolerated fraction of missing values per column (40%)
missing_fraction = train_data.isna().mean()  # per-column share of NaN values
dropped_columns = train_data.columns[missing_fraction > threshold]  # names of the columns to drop
train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

# Confirm that the desired columns have been dropped
print(train_data.shape)
print(test_data.shape)

(1460, 76)
(1459, 75)


In [9]:
# Impute median values into the missing data in the numeric columns of the train and test data.
# The medians are computed on the training data only and reused for the test data, so
# both frames are filled with the same values and no test-set statistics are used.
num_cols = test_data.select_dtypes(include=['float64', 'int64']).columns
train_medians = train_data[num_cols].median()  # one median per numeric column
train_data[num_cols] = train_data[num_cols].fillna(train_medians)
test_data[num_cols] = test_data[num_cols].fillna(train_medians)  # fillna matches the medians to columns by name

# Confirm that missing values have been imputed
print(train_data.isnull().sum().sort_values(ascending=False))
print(test_data.isnull().sum().sort_values(ascending=False))

GarageType      81
GarageFinish    81
GarageQual      81
GarageCond      81
BsmtExposure    38
                ..
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
MSSubClass       0
SalePrice        0
Length: 76, dtype: int64
GarageFinish     78
GarageQual       78
GarageCond       78
GarageType       76
BsmtCond         45
                 ..
MSSubClass        0
Heating           0
HeatingQC         0
CentralAir        0
SaleCondition     0
Length: 75, dtype: int64


In [10]:
# Remove every row that still contains at least one missing value.
# NOTE(review): this also removes rows from test_data (1459 -> 1319 in the shapes
# printed below) - confirm that losing test rows is acceptable.
train_data = train_data.dropna()
test_data = test_data.dropna()

In [11]:
# Show (rows, columns) of both frames after the incomplete rows were removed
for frame in (train_data, test_data):
    print(frame.shape)

(1338, 76)
(1319, 75)


In [12]:
from sklearn.model_selection import train_test_split

# Split train_data into features and target, then hold out 20% of the rows for validation
target_column = 'SalePrice'
y = train_data[target_column]  # target variable
X = train_data.drop(columns=[target_column])  # features
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)



In [13]:
# Print the feature-matrix and target shapes for the training and validation subsets
for features, target in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, target.shape)

(1070, 75) (1070,)
(268, 75) (268,)


### Machine Learning Pipeline

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

#### Linear Regression

In [14]:
# Build the preprocessing step: standardise the numeric columns and one-hot encode
# the text (object) columns of X_train.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# NOTE(review): the fitted pipeline's repr lists 'Id' among the numeric features -
# confirm that an identifier column is meant to be used as a predictor.
scale_numeric = ('num', StandardScaler(), numeric_features)
# handle_unknown='ignore' lets the encoder transform categories it did not see during fit
encode_categorical = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

In [36]:
# Assemble the baseline model: shared preprocessing followed by a linear regression
linear_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=linear_steps)

In [37]:
# Fit the pipeline to the training data.
# Both the preprocessing step and the regressor are fitted on X_train / y_train only.
# The call is the last expression of the cell, so the notebook displays the fitted
# Pipeline's repr as the cell output.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'Ha...
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object'))])),
                ('regressor', LinearRegression())])

In [39]:
# Predict on the validation data.
# X_val goes through the already-fitted preprocessing step before the regressor;
# the predictions are kept in y_val_pred for the evaluation cell.
y_val_pred = pipeline.predict(X_val)

In [41]:
# Score the linear-regression predictions on the validation set (MSE and R^2)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)
for label, score in (("MSE:", mse), ("R2:", r2)):
    print(label, score)

MSE: 636248400.175131
R2: 0.851312774417597


#### Random Forest

In [43]:
from sklearn.ensemble import RandomForestRegressor

In [44]:
# Define the Pipeline: shared preprocessing followed by a random forest.
# random_state is fixed (same seed as the train/validation split) so that the forest,
# and therefore the grid-search scores, are reproducible between runs.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Define the hyperparameter grid to search over.
# The 'regressor__' prefix routes each parameter to the pipeline step named 'regressor'.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 20, 30]
}

In [45]:
# Tune the random-forest pipeline with an exhaustive grid search, scoring every
# parameter combination by 5-fold cross-validation on the training subset
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Take the best pipeline found by the search and predict the validation set with it
best_pipeline = grid_search.best_estimator_
y_val_pred = best_pipeline.predict(X_val)

In [46]:
# Score the random-forest predictions on the validation set (MSE and R^2)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)
for label, score in (("MSE:", mse), ("R2:", r2)):
    print(label, score)

MSE: 711596588.0487593
R2: 0.8337043796389103


#### Gradient Boosting

In [1]:
from sklearn.ensemble import GradientBoostingRegressor

In [15]:
# Define the pipeline: shared preprocessing followed by gradient boosting.
# random_state is fixed (same seed as the train/validation split) so that the boosted
# trees, and therefore the grid-search scores, are reproducible between runs.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Define hyperparameters for the regressor.
# The 'regressor__' prefix routes each parameter to the pipeline step named 'regressor'.
param_grid = {
    'regressor__learning_rate': [0.1, 0.01, 0.001],
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [2, 3, 4]
}

In [17]:
# Tune the gradient-boosting pipeline with an exhaustive grid search, scoring every
# parameter combination by 5-fold cross-validation on the training subset
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Take the best pipeline found by the search and predict the validation set with it
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_val)

In [19]:
# Score the gradient-boosting predictions on the validation set (MSE and R^2)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
r2 = r2_score(y_true=y_val, y_pred=y_pred)
for label, score in (("MSE:", mse), ("R2:", r2)):
    print(label, score)

MSE: 612019954.7508689
R2: 0.856974808820067


#### Support Vector Regression

In [20]:
from sklearn.svm import SVR

In [21]:
# Assemble the model: shared preprocessing followed by support vector regression
svr_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
]
pipeline = Pipeline(steps=svr_steps)

# Candidate settings for the 'regressor' step (kernel type, regularisation C, gamma)
param_grid = dict(
    regressor__kernel=['linear', 'rbf', 'poly'],
    regressor__C=[0.1, 1, 10],
    regressor__gamma=['scale', 'auto'],
)

In [22]:
# Tune the SVR pipeline with an exhaustive grid search, scoring every parameter
# combination by 5-fold cross-validation on the training subset
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Take the best pipeline found by the search and predict the validation set with it
best_pipeline = grid_search.best_estimator_
y_val_pred = best_pipeline.predict(X_val)

In [23]:
# Score the SVR predictions on the validation set (MSE and R^2)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)
for label, score in (("MSE:", mse), ("R2:", r2)):
    print(label, score)

MSE: 1388168369.1683474
R2: 0.6755938350554955


#### Elastic Net Regression

In [24]:
from sklearn.linear_model import ElasticNet

In [25]:
# Assemble the model: shared preprocessing followed by an elastic-net regression
enet_steps = [
    ('preprocessor', preprocessor),
    ('regressor', ElasticNet()),
]
pipeline = Pipeline(steps=enet_steps)

# Candidate settings for the 'regressor' step (penalty strength and L1/L2 mix)
param_grid = dict(
    regressor__alpha=[0.1, 1, 10],
    regressor__l1_ratio=[0.1, 0.5, 0.9],
)

In [27]:
# Tune the elastic-net pipeline with an exhaustive grid search, scoring every
# parameter combination by 5-fold cross-validation on the training subset
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Take the best pipeline found by the search and predict the validation set with it
best_pipeline = grid_search.best_estimator_
y_val_pred = best_pipeline.predict(X_val)

In [28]:
# Score the elastic-net predictions on the validation set (MSE and R^2)
mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)
for label, score in (("MSE:", mse), ("R2:", r2)):
    print(label, score)

MSE: 745704485.9976237
R2: 0.8257335796886576


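Gradient Boosting gave the best validation score of the models tried so far (R² ≈ 0.857). Next, let's see whether adding a PCA-style dimensionality-reduction step (implemented with `TruncatedSVD`) before the gradient boosting regressor improves it.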

### PCA & Gradient Boosting

In [32]:
from sklearn.decomposition import TruncatedSVD

In [33]:
# Define the pipeline: shared preprocessing, then dimensionality reduction with
# TruncatedSVD (the step is named 'pca'), then gradient boosting.
# random_state is fixed on both the randomized SVD solver and the regressor (same seed
# as the train/validation split) so that the grid-search scores are reproducible.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', TruncatedSVD(random_state=42)),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# Define hyperparameters for the pipeline.
# The 'pca__' / 'regressor__' prefixes route each parameter to the step of that name.
param_grid = {
    'pca__n_components': [10, 20, 30],
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 4],
    'regressor__min_samples_leaf': [1, 2]
}

In [34]:
# Tune the SVD + gradient-boosting pipeline with an exhaustive grid search, scoring
# every parameter combination by 5-fold cross-validation on the training subset
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Take the best pipeline found by the search and predict the validation set with it
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_val)

In [36]:
# Score the SVD + gradient-boosting predictions on the validation set (MSE and R^2)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
r2 = r2_score(y_true=y_val, y_pred=y_pred)
for label, score in (("MSE:", mse), ("R2:", r2)):
    print(label, score)

MSE: 714874404.1331942
R2: 0.832938374758694
