# Coursework
## Programming in Python for Business Analytics
#### Group 19

#### Machine Learning Models

In [96]:
# Libraries:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import copy
import random

In [98]:
dataset = pd.read_csv('/Users/alexander/Documents/MSc Data Science/Python/Coursework/Copia de PythonCoursework/data/data_py.csv') #This dataset is before correlation matrix
tasks = pd.read_csv('/Users/alexander/Documents/MSc Data Science/Python/Coursework/tasks.csv')
suppliers = pd.read_csv('/Users/alexander/Documents/MSc Data Science/Python/Coursework/suppliers.csv')
cost = pd.read_csv('/Users/alexander/Documents/MSc Data Science/Python/Coursework/cost.csv.zip')

#### 1. Training and testing sets

In [100]:
# Splitting into test and train datasets, manually to preserve the groups
X = dataset.drop(['Task ID','Supplier ID','Cost'], axis='columns')
y = dataset['Cost']
Groups = dataset['Task ID']

all_tasks = Groups.unique()
random.seed(42)
TestGroup = np.random.choice(all_tasks, size=20, replace=False)

test_loc = dataset['Task ID'].isin(list(TestGroup))

X_test = X[test_loc]
y_test = y[test_loc]
X_train = X[~test_loc]
y_train = y[~test_loc]

#### 2. Ridge Regression:

In [102]:
# 3.3 Fitting a ridge regression model
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.5)
ridge.fit(X_train, y_train)
ridge.score(X_test,y_test)
y_pred = ridge.predict(X_test)

In [114]:
# 3.4 Grouping by tasks and selecting a supplier for each test task
# using the error formula provided calculate difference between true supplier and predicted supplier cost

# Getting Task IDs for our predictions
ids = dataset['Task ID'][test_loc].reset_index(drop=True)
y_pred = pd.Series(y_pred, name="Predicted Cost")

y_pred_ids = pd.merge(ids, y_pred, right_index = True, left_index = True)
pred_best_suppliers = y_pred_ids.groupby('Task ID')['Predicted Cost'].min()

In [122]:
pred_best_suppliers

Task ID
T102    0.418969
T107    0.397930
T108    0.391683
T111    0.397508
T12     0.450436
T126    0.420693
T127    0.410633
T20     0.355949
T23     0.321155
T25     0.454071
T27     0.445902
T33     0.446751
T54     0.381206
T57     0.393498
T58     0.388032
T70     0.420737
T79     0.420581
T91     0.453063
T92     0.456403
T99     0.424343
Name: Predicted Cost, dtype: float64

In [124]:
# Getting the true cheapest suppliers for our test set
true_best_suppliers = cost.groupby('Task ID')['Cost'].min()

test_best_suppliers = true_best_suppliers[true_best_suppliers.index.isin(TestGroup)]

In [126]:
test_best_suppliers

Task ID
T102    0.389170
T107    0.404848
T108    0.375901
T111    0.380689
T12     0.439592
T126    0.387038
T127    0.394986
T20     0.309255
T23     0.305738
T25     0.398989
T27     0.403649
T33     0.388173
T54     0.350626
T57     0.391735
T58     0.357363
T70     0.365667
T79     0.396764
T91     0.444910
T92     0.416431
T99     0.419289
Name: Cost, dtype: float64

In [128]:
# Defining functions to calculate errors and rmse
def error_calc(true_min_costs, predicted_min_cost):
   error = true_min_costs - predicted_min_cost
   return error

def rmse_calc(error_array):
    squared_errors = error_array*error_array
    rss = np.sum(squared_errors)
    value = np.sqrt(rss/len(error_array))
    return value

ridge_error = error_calc(test_best_suppliers, pred_best_suppliers)
ridge_rmse = rmse_calc(ridge_error)

In [130]:
# Results for an initial ridge model
print(ridge_error)

Task ID
T102   -0.029799
T107    0.006918
T108   -0.015782
T111   -0.016819
T12    -0.010844
T126   -0.033656
T127   -0.015647
T20    -0.046694
T23    -0.015417
T25    -0.055082
T27    -0.042253
T33    -0.058577
T54    -0.030579
T57    -0.001763
T58    -0.030670
T70    -0.055070
T79    -0.023817
T91    -0.008153
T92    -0.039972
T99    -0.005054
dtype: float64


In [132]:
print(ridge_rmse)

0.03224905462971227


#### 3. Cross-validation

In [156]:
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import make_scorer

# Define the custom scoring function
def custom_error_score(y_true, y_pred, groups):
    # Combine predictions and true values with groups (Task ID)
    results = pd.DataFrame({'Task ID': groups, 'Actual Cost': y_true, 'Predicted Cost': y_pred})
    
    errors = []
    for task_id, group in results.groupby('Task ID'):
        # Get actual and predicted costs for this task
        actual_costs = group['Actual Cost'].values
        predicted_costs = group['Predicted Cost'].values
        
        # Identify supplier with the lowest predicted cost
        selected_supplier_index = np.argmin(predicted_costs)
        
        # Calculate the error for this task
        min_actual_cost = np.min(actual_costs)
        actual_cost_of_selected_supplier = actual_costs[selected_supplier_index]
        error_t = min_actual_cost - actual_cost_of_selected_supplier
        errors.append(error_t)
    
    # Return the mean error across all tasks in this fold
    return np.mean(errors)

In [158]:
# Create the scorer
custom_scorer = make_scorer(custom_error_score, greater_is_better=False, needs_proba=False, groups=Groups)



In [160]:
# Define the Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

In [162]:
# Perform Leave-One-Group-Out cross-validation
scores = []
for train_idx, test_idx in logo.split(X, y, groups=Groups):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_test = Groups.iloc[test_idx]
    
    # Fit the model on the training set
    ridge.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = ridge.predict(X_test)
    
    # Calculate the score for this fold
    fold_score = custom_error_score(y_test, y_pred, groups_test)
    scores.append(fold_score)

# Compute RMSE of the scores
rmse_scores = np.sqrt(np.mean(np.array(scores)**2))

In [164]:
# Output the results
print(f"Cross-Validation Scores (Errors): {scores}")

Cross-Validation Scores (Errors): [-0.0006968233468558904, 0.0, -0.0333538663567397, -0.027587412713692472, 0.0, -0.05914839017400547, -0.013923368324799734, -0.10176826978047893, -0.02527894027607369, -0.008516451896807331, -0.020098615668314668, -0.026162844294415732, -0.0157307720523448, -0.022616915102923185, -0.018320281261390403, -0.021200359395771307, -0.01589934066381954, -0.019587667953720522, -0.004074396091798493, -0.027533479518293158, -0.007138839421870102, -0.053848974085956125, -0.00990789484916299, -0.010170413261649969, -0.028250282704583518, -0.034003932973600604, -0.01362023763449266, -0.052322285247574596, -0.015682717160928394, -0.03051918766921141, -0.03744146163967027, -0.044209801174477426, -0.028386253884491897, -0.033259263796944605, -0.040530052607745504, -0.010408508583130804, -0.0010206484836585705, 0.0, -0.0015756142650044103, -0.022599907898082527, -0.007005854650330601, -0.005351169440213632, -0.012896911130141686, -0.025453823409779752, 0.0, -0.01527123

In [166]:
print(f"RMSE of Cross-Validation Scores: {rmse_scores:.4f}")

RMSE of Cross-Validation Scores: 0.0255


#### 4. Hyper-parameter optimization

In [201]:
from sklearn.model_selection import GridSearchCV, LeaveOneGroupOut

# Custom scoring function
def custom_error_score(estimator, X_test, y_test, groups):
    # Predict on the test set
    y_pred = estimator.predict(X_test)
    
    # Combine predictions and true values with groups (Task ID)
    results = pd.DataFrame({
        'Task ID': groups,
        'Actual Cost': y_test,
        'Predicted Cost': y_pred
    })
    
    errors = []
    for task_id, group in results.groupby('Task ID'):
        actual_costs = group['Actual Cost'].values
        predicted_costs = group['Predicted Cost'].values
        selected_supplier_index = np.argmin(predicted_costs)
        min_actual_cost = np.min(actual_costs)
        actual_cost_of_selected_supplier = actual_costs[selected_supplier_index]
        error_t = min_actual_cost - actual_cost_of_selected_supplier
        errors.append(error_t)
    
    # Return the mean error for this fold
    return np.mean(errors)

In [207]:
# Custom scorer wrapper to include `groups`
class GroupScorer:
    def __init__(self, groups):
        self.groups = groups

    def __call__(self, estimator, X_test, y_test):
        return custom_error_score(estimator, X_test, y_test, self.groups)

# Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# Hyperparameter grid for Ridge regression
param_grid = {'alpha': [0.1, 0.5, 1, 10, 100]}

# Ridge regression model
ridge = Ridge()

# GridSearch with manual scoring
best_params = None
best_score = None
all_scores = []

In [209]:
for train_idx, test_idx in logo.split(X, y, groups=Groups):
    # Get train-test split
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_test = Groups.iloc[test_idx]
    
    # Perform grid search manually for each fold
    fold_scores = []
    for alpha in param_grid['alpha']:
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_train, y_train)
        scorer = GroupScorer(groups_test)
        score = scorer(ridge, X_test, y_test)
        fold_scores.append(score)
    
    all_scores.append(fold_scores)

In [211]:
# Aggregate results
all_scores = np.array(all_scores)
mean_scores = np.mean(all_scores, axis=0)
best_alpha_idx = np.argmin(mean_scores)
best_params = {'alpha': param_grid['alpha'][best_alpha_idx]}
best_score = mean_scores[best_alpha_idx]

# Calculate RMSE
rmse = np.sqrt(np.mean(best_score**2))

In [213]:
# Output results
print(f"Best Hyperparameters: {best_params}")
print(f"Best RMSE from GridSearch: {rmse:.4f}")

Best Hyperparameters: {'alpha': 0.1}
Best RMSE from GridSearch: 0.0203
