# XGBoost Model Exploration

## Training

### Import Modules

In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Load Data

In [71]:
# Load the dataset
train_df = pd.read_csv('./../../data/processed/train_dropped.csv')

# NOTE: Logistic Regression is a classification model.
# Here we binarize the continuous 'Price' using its median value.
median_price = train_df['Price'].median()
train_df['Price_bin'] = (train_df['Price'] > median_price).astype(int)

# Separate features and target
X = train_df.drop(columns=['Price', 'Price_bin'])
y = train_df['Price_bin']

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

### Preprocessing Pipelines

In [72]:
from sklearn.linear_model import Ridge

# Create preprocessing pipelines for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create a pipeline that attaches the preprocessor with a logistic regression model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('reg', Ridge())
])

### Grid Search

In [73]:
# Define parameter grid for grid search
param_grid = {
'reg__alpha': [10.0, 100.0, 1000.0, 5000.0, 10000.0]
}

# Setup GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

## Model Evaluation

### Best Model Score
Logistic regression is performing very poorly.

In [74]:
import numpy as np
from sklearn.metrics import root_mean_squared_error

# Print out each parameter with its best value and the corresponding choices
print("Best parameters and their choices:")
for param, choices in param_grid.items():
    best_value = grid_search.best_params_.get(param, None)
    print(f"\t{param:<25} | Best: {str(best_value):<10} | Choices: {choices}")

# Predict on the test set using the best estimator from grid search
y_pred = grid_search.predict(X_test)

# Calculate the Root Mean Squared Error (RMSE) using the new function
test_rmse = root_mean_squared_error(y_test, y_pred)
print("Test set RMSE: {:.2f}".format(test_rmse))


Best parameters and their choices:
	reg__alpha                | Best: 5000.0     | Choices: [10.0, 100.0, 1000.0, 5000.0, 10000.0]
Test set RMSE: 0.50


### Generate Submission
Logistic regression cannot account for missing data so we must also impute on rows with missing features before predicting. 

In [75]:
from sklearn.impute import SimpleImputer
import os

# Load your test data
test_df = pd.read_csv('./../../data/raw/test.csv')

# Identify numerical and categorical columns in test_df
numerical_features = test_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = test_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Create imputers for numerical and categorical features
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Impute missing values for numerical features
test_df[numerical_features] = num_imputer.fit_transform(test_df[numerical_features])
# Impute missing values for categorical features
test_df[categorical_features] = cat_imputer.fit_transform(test_df[categorical_features])

# Now predict on the imputed test data using the trained grid_search model
y_pred_test = grid_search.predict(test_df)

# Create a submission DataFrame with only the 'id' and predicted 'Price'
submission = pd.DataFrame({
    'id': test_df['id'],
    'Price': y_pred_test
})

# Define the base file path for the submission file
file_path = os.path.join('..', '..', 'submissions', 'linear_regression_submission.csv')

# Check if the file exists; if so, append a number similar to Windows behavior
if os.path.exists(file_path):
    base, ext = os.path.splitext(file_path)
    i = 1
    new_file_path = f"{base} ({i}){ext}"
    # Increment the number until a non-existing file name is found
    while os.path.exists(new_file_path):
        i += 1
        new_file_path = f"{base} ({i}){ext}"
    file_path = new_file_path

# Save the submission DataFrame to CSV without the index
submission.to_csv(file_path, index=False)
print(f"Submission file saved as {file_path}")

Submission file saved as ..\..\submissions\linear_regression_submission.csv
