 Load training and test data, separate features (X) and target (y).
 Identify numeric columns and apply StandardScaler via ColumnTransformer
 to normalize inputs before model training.


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Read the labelled training set and the test set from disk.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# The "category" column is the target; the remaining columns form X.
y = train["category"]
X = train.drop(columns=["category"])

# Preprocessing: the two numeric feature columns are standardized.
num_cols = ["signal_strength", "response_level"]
cat_cols = []  # none in features, only target

# Only the columns listed in num_cols are handed to the scaler.
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), num_cols)],
)

Define a helper function 'evaluate_model' to train and test models, printing key metrics.
Build an SVM pipeline with preprocessing and tune hyperparameters using GridSearchCV.
Split the data into train/validation sets and evaluate the best SVM model.


In [None]:
def evaluate_model(model, X_train, y_train, X_valid, y_valid, name="Model"):
    """Fit ``model`` on the training split and print validation metrics.

    The model is (re)fitted in place on ``X_train``/``y_train``; its
    predictions for ``X_valid`` are then summarised with a classification
    report followed by a confusion matrix. Nothing is returned.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features.
        y_valid: Validation labels the predictions are compared against.
        name: Title printed in the section header.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_valid)

    header = f"\n=== {name} ==="
    print(header)
    # Report first, confusion matrix second.
    for metric in (classification_report, confusion_matrix):
        print(metric(y_valid, predicted))


# SVM with GridSearchCV

# Hold out 20% of the labelled data, stratified by class, for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing followed by a support vector classifier.
svm_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", SVC(probability=True)),
])

# NOTE(review): gamma is a kernel coefficient that the linear kernel does not
# use, so the linear candidates are fitted once per gamma value.
param_grid = dict(
    clf__kernel=["linear", "rbf"],
    clf__C=[0.1, 1, 10],
    clf__gamma=["scale", "auto"],
)

# Exhaustive search over param_grid, scored by accuracy on 5 shuffled
# stratified folds of the training split.
svm_cv = GridSearchCV(
    estimator=svm_pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
)
svm_cv.fit(X_train, y_train)
print("Best params:", svm_cv.best_params_)

# Report the selected pipeline on the held-out validation split.
best_svm = svm_cv.best_estimator_
evaluate_model(best_svm, X_train, y_train, X_valid, y_valid, "SVM")

Best params: {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}

=== SVM ===
              precision    recall  f1-score   support

     Group_A       0.94      0.98      0.96        51
     Group_B       1.00      0.99      0.99       142
     Group_C       0.99      0.99      0.99        96

    accuracy                           0.99       289
   macro avg       0.98      0.99      0.98       289
weighted avg       0.99      0.99      0.99       289

[[ 50   0   1]
 [  2 140   0]
 [  1   0  95]]


Define a custom PyTorch-based MLP classifier:
- __init__ sets model hyperparameters (hidden size, layers, dropout, lr, etc.).
- _build_model builds a feedforward network with ReLU and Dropout.
- fit trains the model using the Adam optimizer and CrossEntropyLoss on integer class indices derived from the labels.
- predict returns class indices from the trained network.
- score computes accuracy by comparing predictions with the true labels.


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

class TorchMLPClassifier:
    """Feed-forward (MLP) multi-class classifier trained with PyTorch.

    The network is a stack of ``hidden_layers`` blocks of
    ``Linear -> ReLU -> Dropout`` followed by a final ``Linear`` layer that
    emits one logit per class. ``predict`` returns class *indices*; index
    ``i`` corresponds to ``classes_[i]``, the i-th label in sorted order.

    Attributes set by ``fit``:
        model: The trained ``nn.Sequential`` network.
        classes_: Sorted array of the distinct labels seen during ``fit``.
    """

    def __init__(self, hidden_dim=64, hidden_layers=2, dropout=0.2, lr=1e-3, batch_size=64, epochs=20):
        """Store the hyperparameters; the network itself is built in ``fit``.

        Args:
            hidden_dim: Width of every hidden layer.
            hidden_layers: Number of hidden ``Linear/ReLU/Dropout`` blocks.
            dropout: Dropout probability applied after each hidden layer.
            lr: Learning rate for the Adam optimizer.
            batch_size: Mini-batch size used by the training ``DataLoader``.
            epochs: Number of full passes over the training data.
        """
        self.hidden_dim = hidden_dim
        self.hidden_layers = hidden_layers
        self.dropout = dropout
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs

    def _build_model(self, input_dim, output_dim):
        """Return a new ``nn.Sequential`` MLP mapping input_dim -> output_dim."""
        layers = []
        in_dim = input_dim
        for _ in range(self.hidden_layers):
            layers += [nn.Linear(in_dim, self.hidden_dim), nn.ReLU(), nn.Dropout(self.dropout)]
            in_dim = self.hidden_dim
        layers += [nn.Linear(in_dim, output_dim)]
        return nn.Sequential(*layers)

    def fit(self, X, y):
        """Train a fresh network on ``X`` (2-D, numeric) and labels ``y``.

        Args:
            X: Array-like of shape (n_samples, n_features); cast to float32.
            y: Array-like of n_samples labels (any sortable type).

        Returns:
            self, to allow call chaining.
        """
        X = np.asarray(X, dtype=np.float32)
        # Encode labels straight to class indices (sorted label order).
        # CrossEntropyLoss takes indices, so a one-hot detour is unnecessary.
        self.classes_, y_idx = np.unique(np.asarray(y).ravel(), return_inverse=True)
        targets = torch.from_numpy(y_idx.reshape(-1).astype(np.int64))
        dataset = TensorDataset(torch.from_numpy(X), targets)
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model = self._build_model(X.shape[1], len(self.classes_))
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.CrossEntropyLoss()

        # Training mode enables Dropout; predict() switches it back off.
        self.model.train()
        for _ in range(self.epochs):
            for xb, yb in loader:
                optimizer.zero_grad()
                loss = criterion(self.model(xb), yb)
                loss.backward()
                optimizer.step()
        return self

    def predict(self, X):
        """Return the predicted class index (int array) for each row of ``X``."""
        X = np.asarray(X, dtype=np.float32)
        # Evaluation mode disables Dropout so predictions are deterministic.
        self.model.eval()
        with torch.no_grad():
            logits = self.model(torch.from_numpy(X))
        return torch.argmax(logits, dim=1).numpy()

    def score(self, X, y):
        """Return accuracy of ``predict(X)`` against the true labels ``y``.

        Labels are encoded with the mapping learned in ``fit`` rather than
        re-derived from ``y``, so the result stays correct when ``y`` holds
        only a subset of the classes. Labels never seen in ``fit`` always
        count as misclassified.
        """
        preds = self.predict(X)
        y_arr = np.asarray(y).ravel()
        idx = np.clip(np.searchsorted(self.classes_, y_arr), 0, len(self.classes_) - 1)
        # -1 can never equal a predicted index, so unseen labels never match.
        encoded = np.where(self.classes_[idx] == y_arr, idx, -1)
        return (preds == encoded).mean()

Wrap the custom PyTorch MLP inside a scikit-learn compatible class (TorchWrapper).
LabelEncoder ensures string labels are converted to integers for training and back to strings for predictions.
Build a pipeline with preprocessing + TorchWrapper, then tune hyperparameters using GridSearchCV.
Finally, evaluate the best PyTorch MLP model on the validation set.


In [None]:
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelEncoder

class TorchWrapper(BaseEstimator):
    """scikit-learn compatible wrapper around ``TorchMLPClassifier``.

    Labels of any type are encoded to integers with a ``LabelEncoder`` for
    training, and predictions are decoded back to the original labels.

    The inner classifier is rebuilt from the current hyperparameters at the
    start of every ``fit``. Hyperparameter search applies each candidate via
    ``set_params``, which only sets attributes on this wrapper, so a
    classifier built once in ``__init__`` would never see the tuned values.
    """

    def __init__(self, hidden_dim=64, hidden_layers=2, dropout=0.2,
                 lr=1e-3, batch_size=64, epochs=20):
        """Store hyperparameters (forwarded to ``TorchMLPClassifier``).

        Args:
            hidden_dim: Width of every hidden layer.
            hidden_layers: Number of hidden layers.
            dropout: Dropout probability.
            lr: Learning rate.
            batch_size: Mini-batch size.
            epochs: Number of training epochs.
        """
        self.hidden_dim = hidden_dim
        self.hidden_layers = hidden_layers
        self.dropout = dropout
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        # Kept so ``clf`` exists before fitting; fit() replaces it.
        self.clf = self._make_clf()
        self.le_ = None  # label encoder, set in fit()

    def _make_clf(self):
        """Build an inner classifier from the current hyperparameters."""
        return TorchMLPClassifier(
            hidden_dim=self.hidden_dim,
            hidden_layers=self.hidden_layers,
            dropout=self.dropout,
            lr=self.lr,
            batch_size=self.batch_size,
            epochs=self.epochs,
        )

    def fit(self, X, y):
        """Encode ``y`` to integers and train a fresh inner classifier.

        Returns:
            self, to allow call chaining.
        """
        # Rebuild so values applied through set_params() take effect.
        self.clf = self._make_clf()
        # Encode string labels to integers
        self.le_ = LabelEncoder()
        y_enc = self.le_.fit_transform(y)
        self.clf.fit(X, y_enc)
        return self

    def predict(self, X):
        """Return predictions for ``X`` as the original (decoded) labels."""
        preds = self.clf.predict(X)
        # Map back to original string labels
        return self.le_.inverse_transform(preds)

    def score(self, X, y):
        """Return accuracy of the predictions for ``X`` against labels ``y``."""
        y_enc = self.le_.transform(y)
        preds = self.clf.predict(X)
        return (preds == y_enc).mean()

# Preprocessing followed by the scikit-learn compatible PyTorch MLP.
torch_pipe = Pipeline([
    ("prep", preprocessor),
    ("clf", TorchWrapper())
])

# Candidate hyperparameters for the MLP (keys address the "clf" step).
torch_param_grid = {
    "clf__hidden_dim": [64, 128],
    "clf__hidden_layers": [1, 2],
    "clf__dropout": [0.1, 0.3],
    "clf__lr": [1e-3, 3e-4],
    "clf__epochs": [20, 40]
}

torch_cv = GridSearchCV(
    torch_pipe,
    param_grid=torch_param_grid,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring="accuracy",
    n_jobs=1
)

torch_cv.fit(X_train, y_train)
print("Best params:", torch_cv.best_params_)
# Use a dedicated name so the tuned SVM in `best_svm` is not overwritten.
best_torch = torch_cv.best_estimator_

# Now evaluation works with string labels consistently.
# Pass the selected pipeline, not `torch_cv`: evaluate_model() calls fit(),
# which on the GridSearchCV object would re-run the entire grid search.
evaluate_model(best_torch, X_train, y_train, X_valid, y_valid, "PyTorch MLP")


Best params: {'clf__dropout': 0.3, 'clf__epochs': 20, 'clf__hidden_dim': 64, 'clf__hidden_layers': 2, 'clf__lr': 0.0003}

=== PyTorch MLP ===
              precision    recall  f1-score   support

     Group_A       0.88      0.90      0.89        51
     Group_B       0.99      0.96      0.98       142
     Group_C       0.96      0.99      0.97        96

    accuracy                           0.96       289
   macro avg       0.95      0.95      0.95       289
weighted avg       0.96      0.96      0.96       289

[[ 46   1   4]
 [  5 137   0]
 [  1   0  95]]
