In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# ==========================================
# PART 0: LOAD DATA
# ==========================================
print("--- Loading Data ---")
try:
    # Each CSV is read into a DataFrame and converted to a NumPy array.
    # Feature files keep their 2-D shape; label files are flattened to 1-D
    # because sklearn estimators prefer a flat target vector.
    X_train = pd.read_csv("X_train.csv").to_numpy()
    y_train = np.ravel(pd.read_csv("y_train.csv").to_numpy())
    X_test = pd.read_csv("X_test.csv").to_numpy()
    y_test = np.ravel(pd.read_csv("y_test.csv").to_numpy())
except FileNotFoundError:
    print("ERROR: Files not found. Please make sure you uploaded:")
    print("X_train.csv, y_train.csv, X_test.csv, y_test.csv")
    # Nothing below can run without the data, so propagate the error.
    raise
else:
    # Only reached when all four files were read successfully.
    print(f"Success! Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# ==========================================
# TASK 1.1: CLASS FREQUENCIES
# ==========================================
print("\n--- Task 1.1: Class Frequencies ---")

# Tally the occurrences of every distinct label in the training targets.
total_samples = len(y_train)
unique_classes, counts = np.unique(y_train, return_counts=True)

# Share of the training set held by each class, as a percentage.
class_percentages = (counts / total_samples) * 100

print(f"{'Class':<10} | {'Count':<10} | {'Frequency (%)':<15}")
print("-" * 45)

# One table row per class: label, absolute count, relative frequency.
for label, n_samples, pct in zip(unique_classes, counts, class_percentages):
    print(f"{label:<10} | {n_samples:<10} | {pct:<15.2f}")


# ==========================================
# PREPROCESSING (SCALING)
# ==========================================
# Logistic Regression and KNN are trained on standardized ("scaled") features.
# The scaler is fitted on the training data only; that same fitted scaler is
# then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

def get_error_rate(model, X, y):
    """Return the 0-1 loss (error rate) of ``model`` on the data ``(X, y)``.

    Args:
        model: Fitted estimator; its ``predict`` method is called on ``X``.
        X: Feature matrix handed to ``model.predict``.
        y: True labels the predictions are compared against.

    Returns:
        ``1.0`` minus the accuracy of the predictions.
    """
    accuracy = accuracy_score(y, model.predict(X))
    return 1.0 - accuracy


# ==========================================
# TASK 1.2.1: LOGISTIC REGRESSION
# ==========================================
print("\n--- Task 1.2.1: Logistic Regression ---")

# Initialize model
# 'C=1.0' = Default L2 regularization strength
# NOTE: `multi_class` is deliberately not passed. scikit-learn 1.5 deprecated
# that argument (passing it emits a FutureWarning and it is scheduled for
# removal). With the 'lbfgs' solver and more than two classes, the default
# behaviour (scikit-learn >= 0.22) already fits a multinomial model, so the
# fitted classifier is the same.
log_reg = LogisticRegression(solver='lbfgs', max_iter=2000, C=1.0, random_state=42)

# Train (Fit) the model on the standardized training features
log_reg.fit(X_train_scaled, y_train)

# Calculate Errors (0-1 loss) on the training and test sets
train_err = get_error_rate(log_reg, X_train_scaled, y_train)
test_err = get_error_rate(log_reg, X_test_scaled, y_test)

print(f"Training Error: {train_err:.4f}")
print(f"Test Error:     {test_err:.4f}")


# ==========================================
# TASK 1.2.2: RANDOM FORESTS
# ==========================================
print("\n--- Task 1.2.2: Random Forests ---")

# Forest sizes (number of trees) to compare
n_trees_list = [50, 100, 200]

for n in n_trees_list:
    # Build and fit the forest in one step. The unscaled X_train is used
    # because trees don't strictly need scaled features.
    rf = RandomForestClassifier(n_estimators=n, random_state=42, n_jobs=-1).fit(X_train, y_train)

    # 0-1 loss on the training set and on the held-out test set
    train_err, test_err = (
        get_error_rate(rf, X_train, y_train),
        get_error_rate(rf, X_test, y_test),
    )

    print(f"RF ({n} trees) -> Train Error: {train_err:.4f} | Test Error: {test_err:.4f}")


# ==========================================
# TASK 1.2.3: K-NEAREST NEIGHBORS (CV)
# ==========================================
print("\n--- Task 1.2.3: KNN with Cross-Validation ---")

# We want to test k values from 1 to 20
k_candidates = range(1, 21)
cv_accuracy_scores = []

print("Running Cross-Validation (this might take a minute)...")

for k in k_candidates:
    # Initialize KNN
    knn = KNeighborsClassifier(n_neighbors=k)

    # Scale INSIDE the cross-validation loop: the pipeline re-fits a fresh
    # StandardScaler on the training folds of every split. Cross-validating
    # on the pre-scaled X_train_scaled would let each validation fold
    # contribute to the scaling statistics (data leakage), which biases the
    # CV accuracy used to pick k.
    knn_pipeline = make_pipeline(StandardScaler(), knn)

    # Run 5-fold Cross-Validation on the raw training features
    scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring='accuracy')

    # Store the average accuracy for this k
    cv_accuracy_scores.append(scores.mean())

# Find the K that had the highest accuracy.
# np.argmax returns the first maximum, so ties resolve to the smallest k.
# If the winner is the largest candidate, the search range may be too narrow.
best_index = np.argmax(cv_accuracy_scores)
best_k = k_candidates[best_index]

print(f"Optimal k found: {best_k}")

# Train the FINAL model using the best k.
# The scaler fitted on the full training set is appropriate here because the
# test set played no part in fitting it.
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X_train_scaled, y_train)

# Calculate Errors
train_err = get_error_rate(best_knn, X_train_scaled, y_train)
test_err = get_error_rate(best_knn, X_test_scaled, y_test)

print(f"KNN (k={best_k})   -> Train Error: {train_err:.4f} | Test Error: {test_err:.4f}")

--- Loading Data ---
Success! Train shape: (33723, 16), Test shape: (13900, 16)

--- Task 1.1: Class Frequencies ---
Class      | Count      | Frequency (%)  
---------------------------------------------
0.0        | 17565      | 52.09          
1.0        | 3221       | 9.55           
2.0        | 8523       | 25.27          
3.0        | 1583       | 4.69           
4.0        | 2831       | 8.39           

--- Task 1.2.1: Logistic Regression ---




Training Error: 0.1495
Test Error:     0.0995

--- Task 1.2.2: Random Forests ---
RF (50 trees) -> Train Error: 0.0001 | Test Error: 0.1122
RF (100 trees) -> Train Error: 0.0000 | Test Error: 0.1111
RF (200 trees) -> Train Error: 0.0000 | Test Error: 0.1109

--- Task 1.2.3: KNN with Cross-Validation ---
Running Cross-Validation (this might take a minute)...
Optimal k found: 20
KNN (k=20)   -> Train Error: 0.1405 | Test Error: 0.1024
