## Logistic regression model in this file

In [None]:
# Third party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns

# Local imports
from utils import scroll_df

#### Load the train, validation and test data for the two datasets

In [None]:
# Load the training split of each dataset and separate features from the target.
# "vehicle_id" and "time_step" are dropped together with the "class" target so
# that only the remaining columns are used as model inputs.
dataset_1_train = pd.read_csv("datasets/train_dataset_1.csv")
X_train_1 = dataset_1_train.drop(columns=["vehicle_id", "time_step", "class"])
y_train_1 = dataset_1_train["class"]

# Dataset 2 must be read from its own file; reading train_dataset_1.csv here
# would silently train the "dataset 2" models on dataset 1.
dataset_2_train = pd.read_csv("datasets/train_dataset_2.csv")
X_train_2 = dataset_2_train.drop(columns=["vehicle_id", "time_step", "class"])
y_train_2 = dataset_2_train["class"]

# Sanity-check the shapes of both splits
print("Dataset 1")
print("X: ", X_train_1.shape)
print("y: ", y_train_1.shape)

print("Dataset 2")
print("X: ", X_train_2.shape)
print("y: ", y_train_2.shape)

In [None]:
# Validation split, dataset 1: everything except the id/time/target columns is a feature.
dataset_1_val = pd.read_csv("datasets/val_dataset_1.csv")
y_val_1 = dataset_1_val.loc[:, "class"]
X_val_1 = dataset_1_val.drop(["vehicle_id", "time_step", "class"], axis=1)

# Validation split, dataset 2: same column layout as dataset 1.
dataset_2_val = pd.read_csv("datasets/val_dataset_2.csv")
y_val_2 = dataset_2_val.loc[:, "class"]
X_val_2 = dataset_2_val.drop(["vehicle_id", "time_step", "class"], axis=1)

# Report the shapes of both validation splits
print("Dataset 1", f"X:  {X_val_1.shape}", f"y:  {y_val_1.shape}", sep="\n")
print("Dataset 2", f"X:  {X_val_2.shape}", f"y:  {y_val_2.shape}", sep="\n")

In [None]:
# Test split, dataset 1: everything except the id/time/target columns is a feature.
dataset_1_test = pd.read_csv("datasets/test_dataset_1.csv")
y_test_1 = dataset_1_test.loc[:, "class"]
X_test_1 = dataset_1_test.drop(["vehicle_id", "time_step", "class"], axis=1)

# Test split, dataset 2: same column layout as dataset 1.
dataset_2_test = pd.read_csv("datasets/test_dataset_2.csv")
y_test_2 = dataset_2_test.loc[:, "class"]
X_test_2 = dataset_2_test.drop(["vehicle_id", "time_step", "class"], axis=1)

# Report the shapes of both test splits
print("Dataset 1", f"X:  {X_test_1.shape}", f"y:  {y_test_1.shape}", sep="\n")
print("Dataset 2", f"X:  {X_test_2.shape}", f"y:  {y_test_2.shape}", sep="\n")

#### Train and evaluate the model with different hyperparameters (solvers) on a dataset

In [None]:
def train_and_evaluate_logisctic_regression(X_train, y_train, X_val, y_val, solvers):
    """Fit one LogisticRegression per solver and record train/validation metrics.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit each model.
    X_val, y_val
        Features and target used to compute the validation metrics.
    solvers
        Iterable of solver names passed to ``LogisticRegression(solver=...)``.

    Returns
    -------
    dict
        Parallel lists keyed by ``"solver"``, ``"train_accuracy"``,
        ``"val_accuracy"``, ``"train_loss"``, ``"val_loss"`` and ``"models"``
        (the fitted estimators), with one entry per solver in input order.
        All lists are empty when ``solvers`` is empty.
    """
    results = {
        "solver": [],
        "train_accuracy": [],
        "val_accuracy": [],
        "train_loss": [],
        "val_loss": [],
        "models": [],
    }

    for solver in solvers:
        # Initialize and train the model (fixed seed for the stochastic solvers)
        model = LogisticRegression(solver=solver, max_iter=1000, random_state=50)
        model.fit(X_train, y_train)

        # Hard predictions for accuracy
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Class probabilities for log loss
        y_train_proba = model.predict_proba(X_train)
        y_val_proba = model.predict_proba(X_val)

        # Calculate accuracy and log loss.
        # The probability columns follow model.classes_; passing them as
        # `labels` keeps log_loss working when a split does not contain
        # every class seen during training.
        train_acc = accuracy_score(y_train, y_train_pred)
        val_acc = accuracy_score(y_val, y_val_pred)
        train_loss = log_loss(y_train, y_train_proba, labels=model.classes_)
        val_loss = log_loss(y_val, y_val_proba, labels=model.classes_)

        # Store results
        results["solver"].append(solver)
        results["train_accuracy"].append(train_acc)
        results["val_accuracy"].append(val_acc)
        results["train_loss"].append(train_loss)
        results["val_loss"].append(val_loss)
        results["models"].append(model)

        print(f"Solver={solver}: Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}, Train Loss={train_loss:.4f}, Val Loss={val_loss:.4f}")

    return results


In [None]:
# Solvers to compare; the order fixes the index of each model in the results lists.
solvers = ["lbfgs", "newton-cg", "sag", "saga"]

# Solver sweep on dataset 1
results_dataset_1 = train_and_evaluate_logisctic_regression(
    X_train=X_train_1,
    y_train=y_train_1,
    X_val=X_val_1,
    y_val=y_val_1,
    solvers=solvers,
)

In [None]:
# Display the per-solver metrics and fitted models collected for dataset 1
results_dataset_1

In [None]:
# Solver sweep on dataset 2 (same solver list as dataset 1)
results_dataset_2 = train_and_evaluate_logisctic_regression(
    X_train=X_train_2,
    y_train=y_train_2,
    X_val=X_val_2,
    y_val=y_val_2,
    solvers=solvers,
)

In [None]:
# Display the per-solver metrics and fitted models collected for dataset 2
results_dataset_2

#### Test the models

In [None]:
def eval_on_test_set(model, X, y_true):
    """Evaluate a fitted classifier on a held-out set and plot its confusion matrix.

    Prints the accuracy and a classification report, then shows a heatmap of
    the confusion matrix.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X
        Features to predict on.
    y_true
        Ground-truth labels aligned with ``X``.

    Returns
    -------
    None
    """
    # Make predictions
    y_test_pred = model.predict(X)

    # Compute metrics
    accuracy = accuracy_score(y_true, y_test_pred)

    # Labels that occur in the truth or in the predictions. Passing them
    # explicitly keeps the matrix rows/columns and the heatmap ticks in sync
    # instead of assuming the classes are exactly 0..4.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_test_pred)]))

    # Confusion matrix against the labels passed in, not a notebook global,
    # so the function is correct for any dataset.
    conf_matrix = confusion_matrix(y_true, y_test_pred, labels=labels)

    # Print Metrics
    print(f"Accuracy: {accuracy:.4f}")

    # Detailed Classification Report
    print("\nClassification Report:\n", classification_report(y_true, y_test_pred))

    # Plot Confusion Matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

In [None]:
# Dataset 1: evaluate the newton-cg model (position 1 in the models list) on the test split

model_1 = results_dataset_1["models"][1]
print(model_1)

eval_on_test_set(model=model_1, X=X_test_1, y_true=y_test_1)


In [None]:
# Eval the model at index 0 of the results for dataset 2.
# NOTE(review): with solvers = ['lbfgs', 'newton-cg', 'sag', 'saga'], index 0
# is the lbfgs model, not newton-cg (which is index 1) - confirm which solver
# was intended here.

model_2 = results_dataset_2["models"][0]
print(model_2)

eval_on_test_set(model_2, X_test_2, y_test_2)