In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

**Data loading & Train/Validation split**

In [76]:
# Load the development set, separate the target column from the predictors,
# and hold out 20% of the rows as a validation split (fixed seed).
train_data = pd.read_csv('./Final_Output_Reg_dev.csv')

features = train_data.drop(columns=['Offer To 1st Close'])
target = train_data['Offer To 1st Close']

X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

**Model Setup**

In [77]:
# Candidate regressors, keyed by the display name used by the training and
# evaluation loops. Every estimator that draws random numbers (bootstrap
# samples, feature permutation at splits) gets a fixed seed so that a
# "Restart & Run All" reproduces the same fitted models and scores.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "SVM": SVR(),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Hyper-parameter search spaces, keyed by the same names as `models`.
# "Linear Regression" has no entry because it is fit without tuning.
param_grid = {
    # NOTE(review): with dense inputs Ridge's default solver is presumably
    # closed-form, in which case max_iter has no effect and only triples the
    # number of fits -- confirm before trimming.
    "Ridge Regression": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 2000, 5000]},
    "Lasso Regression": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 2000, 5000]},
    "ElasticNet Regression": {'alpha': [0.01, 0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9], 'max_iter': [1000, 2000, 5000]},

    "Random Forest": {
        'n_estimators': [50, 100, 200, 500, 1000],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "Decision Tree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    # A list of grids: `gamma` is ignored by the linear kernel, so it is only
    # searched for the RBF kernel. This covers the same distinct models as a
    # single crossed grid but skips the duplicated linear-kernel fits.
    "SVM": [
        {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
        {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.01, 0.1]},
    ],

    "Gradient Boosting": {
        'n_estimators': [50, 100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 10, 20]
    }
}

**Regression Model Training**

In [78]:
# Fit every candidate on the training split and keep one fitted estimator per
# model name in `best_models`.
#   * Linear Regression is fit directly, with no tuning.
#   * Models with an entry in `param_grid` are tuned by 5-fold grid search.
#     The scorer is negative MSE, so the printed "best score" is a negated
#     error: values closer to 0 are better.
#   * Any other model is fit with its default settings instead of being
#     silently left out of `best_models` (and thus out of every evaluation).
best_models = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")

    if model_name == "Linear Regression":
        model.fit(X_train, y_train)
        best_models[model_name] = model
        print("Using default Linear Regression with no parameter tuning.")
    elif model_name in param_grid:
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid[model_name],
            cv=5,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        grid_search.fit(X_train, y_train)
        # best_estimator_ is already refit on the whole training split.
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best score for {model_name}: {grid_search.best_score_}")
    else:
        model.fit(X_train, y_train)
        best_models[model_name] = model
        print(f"No parameter grid for {model_name}; using default settings.")

Training Linear Regression...
Using default Linear Regression with no parameter tuning.
Training Ridge Regression...
Best parameters for Ridge Regression: {'alpha': 10, 'max_iter': 1000}
Best score for Ridge Regression: -142.8974383680772
Training Lasso Regression...
Best parameters for Lasso Regression: {'alpha': 0.1, 'max_iter': 1000}
Best score for Lasso Regression: -130.26267660799377
Training ElasticNet Regression...
Best parameters for ElasticNet Regression: {'alpha': 0.1, 'l1_ratio': 0.9, 'max_iter': 1000}
Best score for ElasticNet Regression: -130.35570127767406
Training Random Forest...
Best parameters for Random Forest: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best score for Random Forest: -118.48466592150291
Training Decision Tree...
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best score for Decision Tree: -161.3756324227868
Training SVM...
Best parameters for SVM: {'C': 10

In [79]:
# Report MSE and R^2 on the validation split for each fitted estimator.
for label, estimator in best_models.items():
    print(f"\nEvaluating {label} on validation set:")
    y_pred = estimator.predict(X_val)
    mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)
    print(f"Mean Squared Error: {mse}\nR2 Score: {r2}")


Evaluating Linear Regression on validation set:
Mean Squared Error: 142.07265083106634
R2 Score: -0.10471379419688653

Evaluating Ridge Regression on validation set:
Mean Squared Error: 118.66406369536806
R2 Score: 0.0773042716320862

Evaluating Lasso Regression on validation set:
Mean Squared Error: 113.60598722031932
R2 Score: 0.11663433847748672

Evaluating ElasticNet Regression on validation set:
Mean Squared Error: 114.21246858510655
R2 Score: 0.11191852353573228

Evaluating Random Forest on validation set:
Mean Squared Error: 103.07808178426497
R2 Score: 0.19849613447535852

Evaluating Decision Tree on validation set:
Mean Squared Error: 162.23086653859195
R2 Score: -0.2614579587368955

Evaluating SVM on validation set:
Mean Squared Error: 114.7227067633869
R2 Score: 0.10795106638915997

Evaluating Gradient Boosting on validation set:
Mean Squared Error: 105.298708195533
R2 Score: 0.1812292177680559


**Test Set Evaluation**

In [80]:
# Read the test-set CSV into a DataFrame.
test_data = pd.read_csv(filepath_or_buffer='./Final_Output_Reg_test.csv')

In [81]:
# Separate the regression target from the predictors of the test set.
y_test = test_data['Offer To 1st Close']
X_test = test_data.drop(columns=['Offer To 1st Close'])

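**Test Result**

Note: the test-set MSE values below are roughly ten times the validation MSE while R² stays close to zero. The reported MSE and R² pairs imply a target variance of about 1150 on the test set versus about 129 on the validation split, so the two sets are not directly comparable on raw MSE.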

In [82]:
# Score each fitted estimator on the test set and print MSE and R^2.
for label, estimator in best_models.items():
    print(f"\nEvaluating {label} on test set:")
    y_pred = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}", f"R2 Score: {r2}", sep="\n")


Evaluating Linear Regression on test set:
Mean Squared Error: 1230.1863513040537
R2 Score: -0.06813817487034113

Evaluating Ridge Regression on test set:
Mean Squared Error: 1169.1898778125824
R2 Score: -0.01517655503149129

Evaluating Lasso Regression on test set:
Mean Squared Error: 1133.521937970702
R2 Score: 0.015792970946183038

Evaluating ElasticNet Regression on test set:
Mean Squared Error: 1135.0427730422534
R2 Score: 0.014472470197752774

Evaluating Random Forest on test set:
Mean Squared Error: 1085.6001872362467
R2 Score: 0.0574021558569342

Evaluating Decision Tree on test set:
Mean Squared Error: 1100.5133264715942
R2 Score: 0.04445347266958988

Evaluating SVM on test set:
Mean Squared Error: 1178.9751507737785
R2 Score: -0.023672847963290167

Evaluating Gradient Boosting on test set:
Mean Squared Error: 1088.9496239179634
R2 Score: 0.05449392883892168


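**PCA Version**

The same experiment repeated on standardized features projected onto the principal components that retain 95% of the variance.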

In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

In [84]:
# Reload the development data and recreate the 80/20 train/validation split
# (same test_size and seed as the split in the non-PCA section).
train_data = pd.read_csv('./Final_Output_Reg_dev.csv')

features = train_data.drop(columns=['Offer To 1st Close'])
target = train_data['Offer To 1st Close']

X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Standardize using statistics learned from the training split only; the
# validation split is transformed with those same statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# A fractional n_components keeps as many components as are needed to explain
# 95% of the variance of the scaled training data.
# NOTE: the scaler and PCA are fit once on the whole training split, so the
# 5-fold grid search run later on X_train_pca sees components that were learned
# from its own held-out folds. Wrap scaler + PCA + model in a Pipeline if
# unbiased cross-validation scores matter.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

In [85]:
# Candidate regressors for the PCA-transformed features, keyed by the display
# name used by the training and evaluation loops. Tree-based estimators get a
# fixed seed so that a "Restart & Run All" reproduces the same fitted models.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet Regression": ElasticNet(),
    # NOTE(review): Random Forest is disabled in this section, yet its grid is
    # still defined below and is therefore unused. Re-enable or remove both.
    #"Random Forest": RandomForestRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Hyper-parameter search spaces, keyed by the same names as `models`.
# "Linear Regression" has no entry because it is fit without tuning.
param_grid = {
    "Ridge Regression": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 2000, 5000]},
    "Lasso Regression": {'alpha': [0.01, 0.1, 1, 10], 'max_iter': [1000, 2000, 5000]},
    "ElasticNet Regression": {'alpha': [0.01, 0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9], 'max_iter': [1000, 2000, 5000]},

    # Unused while "Random Forest" is commented out of `models` above.
    "Random Forest": {
        'n_estimators': [50, 100, 200, 500, 1000],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "Decision Tree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },

    "Gradient Boosting": {
        'n_estimators': [50, 100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 10, 20]
    }
}

In [86]:
# Fit every candidate on the PCA-transformed training split and keep one
# fitted estimator per model name in `best_models`.
#   * Linear Regression is fit directly, with no tuning.
#   * Models with an entry in `param_grid` are tuned by 5-fold grid search.
#     The scorer is negative MSE, so the printed "best score" is a negated
#     error: values closer to 0 are better.
#   * Any other model is fit with its default settings instead of being
#     silently left out of `best_models` (and thus out of every evaluation).
best_models = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")

    if model_name == "Linear Regression":
        model.fit(X_train_pca, y_train)
        best_models[model_name] = model
        print("Using default Linear Regression with no parameter tuning.")
    elif model_name in param_grid:
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid[model_name],
            cv=5,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
        )
        grid_search.fit(X_train_pca, y_train)
        # best_estimator_ is already refit on the whole training split.
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best score for {model_name}: {grid_search.best_score_}")
    else:
        model.fit(X_train_pca, y_train)
        best_models[model_name] = model
        print(f"No parameter grid for {model_name}; using default settings.")


Training Linear Regression...
Using default Linear Regression with no parameter tuning.
Training Ridge Regression...
Best parameters for Ridge Regression: {'alpha': 10, 'max_iter': 1000}
Best score for Ridge Regression: -531.5157175381744
Training Lasso Regression...
Best parameters for Lasso Regression: {'alpha': 1, 'max_iter': 1000}
Best score for Lasso Regression: -135.97083199947127
Training ElasticNet Regression...
Best parameters for ElasticNet Regression: {'alpha': 1, 'l1_ratio': 0.1, 'max_iter': 1000}
Best score for ElasticNet Regression: -130.19091754411363
Training Decision Tree...
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best score for Decision Tree: -201.83311879424298
Training Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}
Best score for Gradient Boosting: -127.67293431747498


In [87]:
# Report MSE and R^2 on the PCA-transformed validation split for each model.
for label, estimator in best_models.items():
    print(f"\nEvaluating {label} on validation set:")
    y_pred = estimator.predict(X_val_pca)
    mse, r2 = mean_squared_error(y_val, y_pred), r2_score(y_val, y_pred)
    print(f"Mean Squared Error: {mse}\nR2 Score: {r2}")


Evaluating Linear Regression on validation set:
Mean Squared Error: 124.26166612978415
R2 Score: 0.03377901474726597

Evaluating Ridge Regression on validation set:
Mean Squared Error: 124.10772144786795
R2 Score: 0.03497604185038017

Evaluating Lasso Regression on validation set:
Mean Squared Error: 121.96365583691923
R2 Score: 0.05164764502117347

Evaluating ElasticNet Regression on validation set:
Mean Squared Error: 113.42141728626969
R2 Score: 0.11806949824219826

Evaluating Decision Tree on validation set:
Mean Squared Error: 145.36039258094132
R2 Score: -0.1302782757603549

Evaluating Gradient Boosting on validation set:
Mean Squared Error: 111.4449908212601
R2 Score: 0.1334375903157955


In [88]:
# Read the test set and separate the target from the predictors.
test_data = pd.read_csv('./Final_Output_Reg_test.csv')

y_test = test_data['Offer To 1st Close']
X_test = test_data.drop(columns=['Offer To 1st Close'])

# Apply the scaler and PCA that were fitted on the training split; nothing is
# re-fitted on the test data.
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)

In [89]:
# Score each fitted estimator on the PCA-transformed test set.
for label, estimator in best_models.items():
    print(f"\nEvaluating {label} on test set:")
    y_pred = estimator.predict(X_test_pca)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse}", f"R2 Score: {r2}", sep="\n")


Evaluating Linear Regression on test set:
Mean Squared Error: 1164.6720993192396
R2 Score: -0.011253887811819796

Evaluating Ridge Regression on test set:
Mean Squared Error: 1164.598059681806
R2 Score: -0.011189601158734686

Evaluating Lasso Regression on test set:
Mean Squared Error: 1194.2435438236519
R2 Score: -0.03692998861373442

Evaluating ElasticNet Regression on test set:
Mean Squared Error: 1173.9165082868567
R2 Score: -0.019280562885850783

Evaluating Decision Tree on test set:
Mean Squared Error: 1156.0839328042357
R2 Score: -0.003797010650815169

Evaluating Gradient Boosting on test set:
Mean Squared Error: 1145.240573888928
R2 Score: 0.005617990246392979
