# Knowledge Distillation with PyTorch

This notebook implements knowledge distillation using PyTorch. Knowledge distillation is a technique where a smaller model (student) learns from both the ground truth labels and the predictions of a larger model (teacher).

## Overview
1. Import required libraries
2. Define the student model architecture
3. Implement custom loss functions
4. Create data loading and preprocessing pipeline
5. Train the model with k-fold cross validation
6. Make predictions and visualize results

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import gc
import os
import warnings
warnings.filterwarnings('ignore')

## Model Architecture

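We define a simple neural network as our student model. It flattens the input features, applies one hidden layer of 16 ReLU units, and produces two sigmoid outputs: the first is trained against the ground-truth label and the second against the teacher's prediction: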

In [None]:
class StudentModel(nn.Module):
    """Single-hidden-layer student network with two sigmoid outputs.

    Args:
        input_shape: Number of input features per sample after flattening.
    """

    def __init__(self, input_shape):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept fixed.
        self.flatten = nn.Flatten()
        self.layer1 = nn.Linear(input_shape, 16)
        self.relu = nn.ReLU()
        self.output = nn.Linear(16, 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a ``(batch, 2)`` tensor of sigmoid activations for ``x``."""
        flat = self.flatten(x)
        hidden = self.relu(self.layer1(flat))
        logits = self.output(hidden)
        return self.sigmoid(logits)

## Loss Functions

We implement the focal loss and knowledge distillation loss functions:

In [None]:
def focal_loss(y_true, y_pred, gamma=2.0):
    """Binary focal loss on probabilities, averaged over all elements.

    Both classes contribute: positives are weighted by ``(1 - p) ** gamma``
    and negatives by ``p ** gamma``. Without the negative-class term a
    sample labelled 0 would add nothing to the loss, so predicting 1
    everywhere would be optimal.

    Args:
        y_true: Tensor of targets in ``[0, 1]``, broadcastable to ``y_pred``.
        y_pred: Tensor of predicted probabilities (e.g. sigmoid outputs).
        gamma: Focusing exponent. ``gamma=0`` reduces the loss to plain
            binary cross entropy.

    Returns:
        Scalar tensor holding the mean focal loss.
    """
    epsilon = 1e-7
    # Keep probabilities strictly inside (0, 1) so both logs stay finite.
    y_pred = torch.clamp(y_pred, epsilon, 1.0 - epsilon)
    positive_term = y_true * (1.0 - y_pred) ** gamma * torch.log(y_pred)
    negative_term = (1.0 - y_true) * y_pred ** gamma * torch.log(1.0 - y_pred)
    return torch.mean(-(positive_term + negative_term))

def knowledge_distillation_loss(y_true, y_pred, beta=0.1):
    """Blend focal loss on the labels with BCE against the teacher.

    Args:
        y_true: 2-D tensor; column 0 holds the ground-truth label and the
            remaining column(s) hold the teacher's predictions.
        y_pred: 2-D tensor of student outputs; column 0 is compared with the
            label and the remaining column(s) with the teacher.
        beta: Weight of the focal-loss term; the distillation term is
            weighted by ``1 - beta``.

    Returns:
        Scalar tensor ``beta * focal + (1 - beta) * bce``.
    """
    label_target, teacher_target = y_true[:, :1], y_true[:, 1:]
    label_head, distill_head = y_pred[:, :1], y_pred[:, 1:]

    hard_term = focal_loss(label_target, label_head)
    bce = nn.BCELoss()
    soft_term = bce(distill_head, teacher_target)

    return beta * hard_term + (1 - beta) * soft_term

## Data Loading

Create a custom Dataset class for loading the training data:

In [None]:
class CustomDataset(Dataset):
    """In-memory dataset pairing feature rows with target rows.

    Both inputs are converted to ``torch.FloatTensor`` once at construction.
    """

    def __init__(self, features, targets):
        feature_tensor = torch.FloatTensor(features)
        target_tensor = torch.FloatTensor(targets)
        self.features = feature_tensor
        self.targets = target_tensor

    def __len__(self):
        """Number of samples, taken from the feature tensor."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, targets)`` pair stored at ``idx``."""
        sample = self.features[idx]
        target = self.targets[idx]
        return sample, target

## Training Loop

Define the training function with k-fold cross validation:

In [None]:
def _stack_targets(y, teacher_preds, idx):
    """Build the target matrix for rows ``idx``: labels, then teacher column(s) if any."""
    columns = [y[idx]]
    if teacher_preds is not None:
        columns.append(teacher_preds[idx])
    return np.column_stack(columns)


def _criterion_loss(criterion, targets, outputs):
    """Call ``criterion`` with the argument order its kind expects."""
    if isinstance(criterion, nn.Module):
        # torch.nn loss modules are called as (input, target).
        return criterion(outputs, targets)
    # Plain callables follow this notebook's (y_true, y_pred) convention.
    return criterion(targets, outputs)


def _run_epoch(model, loader, criterion, device, use_teacher, optimizer=None):
    """Run one pass over ``loader``; weights are updated only if ``optimizer`` is given."""
    training = optimizer is not None
    model.train(training)
    losses = []
    with torch.set_grad_enabled(training):
        for batch_X, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            if not use_teacher:
                # Targets hold only the label column, so score only the
                # student's first output against it.
                outputs = outputs[:, :1]
            loss = _criterion_loss(criterion, batch_y, outputs)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            losses.append(loss.item())
    return losses


def train_model(X, y, teacher_preds, n_splits=5, epochs=100, batch_size=32,
                criterion=None, learning_rate=0.001):
    """Train a fresh StudentModel per fold and return the mean best validation loss.

    For each fold the weights with the lowest mean validation loss are saved
    to ``best_model_fold_{fold}.pt`` in the working directory; a later call
    overwrites the files written by an earlier one.

    Args:
        X: 2-D array of features, indexable by integer index arrays.
        y: Array of labels, row-aligned with ``X``.
        teacher_preds: Array of teacher predictions row-aligned with ``X``,
            or ``None`` to train on the labels only. Without a teacher the
            targets have a single column and only the first student output
            is passed to the loss.
        n_splits: Number of shuffled K-fold splits.
        epochs: Training epochs per fold.
        batch_size: Mini-batch size for both loaders.
        criterion: Loss to optimise. ``nn.Module`` losses are called as
            ``criterion(outputs, targets)``; any other callable is called as
            ``criterion(targets, outputs)``. Defaults to
            ``knowledge_distillation_loss``, which needs ``teacher_preds``.
        learning_rate: Adam learning rate.

    Returns:
        Mean over folds of each fold's best (lowest) validation loss.

    Raises:
        ValueError: If both ``criterion`` and ``teacher_preds`` are ``None``.
    """
    use_teacher = teacher_preds is not None
    if criterion is None:
        if not use_teacher:
            raise ValueError(
                'criterion must be given when teacher_preds is None: the '
                'default knowledge distillation loss needs teacher predictions'
            )
        criterion = knowledge_distillation_loss

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    kfold = KFold(n_splits=n_splits, shuffle=True)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f'Training fold {fold + 1}/{n_splits}')

        # Prepare data
        train_dataset = CustomDataset(X[train_idx], _stack_targets(y, teacher_preds, train_idx))
        val_dataset = CustomDataset(X[val_idx], _stack_targets(y, teacher_preds, val_idx))
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # A new model and optimizer per fold, so folds do not share weights.
        model = StudentModel(X.shape[1]).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        best_val_loss = float('inf')
        for epoch in range(epochs):
            train_losses = _run_epoch(
                model, train_loader, criterion, device, use_teacher, optimizer
            )
            val_losses = _run_epoch(model, val_loader, criterion, device, use_teacher)

            avg_val_loss = np.mean(val_losses)
            if avg_val_loss < best_val_loss:
                # Checkpoint on every improvement of the validation loss.
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), f'best_model_fold_{fold}.pt')

            if epoch % 10 == 0:
                print(f'Epoch {epoch}: Train Loss = {np.mean(train_losses):.4f}, Val Loss = {avg_val_loss:.4f}')

        fold_scores.append(best_val_loss)

    return np.mean(fold_scores)

## Normalize and split data

Prepare the dataset using RankGauss normalization:

In [None]:
class GaussRankScaler():
    """Rank-based Gaussian scaler (RankGauss).

    ``fit_transform`` replaces every value by its rank along axis 0, rescales
    the ranks to the open interval (-1, 1) and applies the inverse error
    function. ``transform`` maps new data by linear interpolation between the
    fitted (value, score) pairs of each column.
    """

    def __init__(self):
        # epsilon keeps the rescaled ranks strictly inside (-1, 1), where
        # erfinv is finite.
        self.epsilon = 1e-9
        self.lower = -1 + self.epsilon
        self.upper = 1 - self.epsilon
        self.range = self.upper - self.lower
        # Per-column (sorted distinct values, scores), set by fit_transform.
        self._reference = None

    @staticmethod
    def _as_columns(values):
        """View ``values`` as a 2-D (rows, columns) array."""
        if values.ndim == 1:
            return values[:, None]
        return values.reshape(values.shape[0], int(np.prod(values.shape[1:])))

    def fit_transform(self, X):
        """Fit on ``X`` and return its rank-Gaussian transform.

        Args:
            X: Array-like with samples along axis 0 (at least 2 rows).

        Returns:
            Transformed data with the same shape as ``X``.

        Raises:
            ValueError: If ``X`` has fewer than 2 rows, because the rank range
                would be zero.
        """
        # Imported here because the notebook's import cell does not bring
        # scipy into scope.
        from scipy.special import erfinv

        if len(X) < 2:
            raise ValueError('GaussRankScaler needs at least 2 samples to rank')

        # argsort of argsort yields each element's rank within its column.
        i = np.argsort(X, axis=0)
        j = np.argsort(i, axis=0)

        j_range = len(j) - 1
        self.divider = j_range / self.range

        transformed = j / self.divider
        transformed = transformed - self.upper
        transformed = erfinv(transformed)

        # Remember the value -> score mapping per column for transform().
        # Tied values get different ranks, so their scores are averaged to
        # keep the interpolation grid strictly increasing.
        reference = []
        value_columns = self._as_columns(np.asarray(X)).T
        score_columns = self._as_columns(np.asarray(transformed)).T
        for column, scores in zip(value_columns, score_columns):
            distinct, inverse = np.unique(column, return_inverse=True)
            inverse = inverse.ravel()
            counts = np.bincount(inverse, minlength=len(distinct))
            totals = np.bincount(inverse, weights=scores, minlength=len(distinct))
            reference.append((distinct.astype(float), totals / counts))
        self._reference = reference

        return transformed

    def transform(self, X):
        """Map new data onto the scale learned by ``fit_transform``.

        Values between two fitted values are linearly interpolated; values
        outside the fitted range take the score of the nearest fitted extreme.

        Args:
            X: Array-like with the same number of columns as the fitted data.

        Returns:
            Float ndarray with the same shape as ``X``.

        Raises:
            RuntimeError: If called before ``fit_transform``.
            ValueError: If the column count differs from the fitted data.
        """
        if self._reference is None:
            raise RuntimeError('GaussRankScaler.transform called before fit_transform')

        values = np.asarray(X, dtype=float)
        columns = self._as_columns(values)
        if columns.shape[1] != len(self._reference):
            raise ValueError(
                f'expected {len(self._reference)} columns, got {columns.shape[1]}'
            )

        result = np.empty(columns.shape, dtype=float)
        for k, (known_values, known_scores) in enumerate(self._reference):
            result[:, k] = np.interp(columns[:, k], known_values, known_scores)
        return result.reshape(values.shape)

In [None]:
# Load data
# Three CSV files are read from the working directory: the training set, the
# test set, and a file whose 'prediction' column is later used as the teacher
# signal.
# NOTE(review): lgbm_train.csv is assumed to be row-aligned with train.csv,
# because both are indexed with the same fold indices during training — confirm.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
lgb_train = pd.read_csv('lgbm_train.csv')

# Feature engineering
def add_features(df):
    """Add ``log1p`` versions of ``f_2`` and ``f_28`` to ``df`` and return it.

    The frame is modified in place; the returned object is the same ``df``.
    """
    derived = {'f_2_log': 'f_2', 'f_28_log': 'f_28'}
    for new_column, source_column in derived.items():
        df[new_column] = np.log1p(df[source_column])
    return df

# Apply the same feature engineering to both frames so they share columns.
train = add_features(train)
test = add_features(test)

# Prepare features
# Every column whose name starts with 'f_' is a model input; this includes the
# *_log columns created by add_features, since their names also start with 'f_'.
feature_cols = [col for col in train.columns if col.startswith('f_')]
X = train[feature_cols].values
y = train['target'].values
X_test = test[feature_cols].values
teacher_preds = lgb_train['prediction'].values

# Earlier scaling approach (QuantileTransformer), kept for reference:
#scaler = QuantileTransformer(n_quantiles=1000, output_distribution='normal')

# Replace QuantileTransformer with GaussRankScaler
# The scaler is fitted on the training matrix; the test matrix is then mapped
# with that same scaler instance.
scaler = GaussRankScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

print('Data normalized and split successfully')

## Experiment 1: Simple Neural Network

In this experiment, we train a simple neural network with binary cross entropy loss:

In [None]:
# Hyperparameters shared by all experiments below.
# NOTE(review): 'learning_rate' is stored here, but none of the train_model
# calls in this notebook read it from params.
params = {
    'n_splits': 5,        # number of K-fold splits
    'epochs': 100,        # training epochs per fold
    'batch_size': 1024,   # mini-batch size for the data loaders
    'learning_rate': 0.001
}

def train_simple_nn(X, y, params):
    """Run ``train_model`` with BCE loss and no teacher predictions.

    Args:
        X: Feature matrix passed through to ``train_model``.
        y: Labels passed through to ``train_model``.
        params: Mapping providing ``'n_splits'``, ``'epochs'`` and
            ``'batch_size'``.

    Returns:
        Whatever ``train_model`` returns for this configuration.
    """
    settings = {key: params[key] for key in ('n_splits', 'epochs', 'batch_size')}
    # None in the third position means "no teacher predictions".
    return train_model(X, y, None, criterion=nn.BCELoss(), **settings)

# Run Experiment 1. The reported "score" is the value returned by train_model:
# the mean over folds of each fold's best validation loss, so lower is better.
exp1_score = train_simple_nn(X, y, params)
print(f'Experiment 1 - Mean validation score: {exp1_score:.4f}')

## Experiment 2: Neural Network with Knowledge Distillation

Train with knowledge distillation using teacher predictions:

In [None]:
# NOTE(review): 'beta' is recorded in params but is not passed to train_model
# in the call below, so the loss uses its own default beta.
params['beta'] = 0.1  # Knowledge distillation weight
# No criterion is given here; teacher predictions are supplied as the third argument.
exp2_score = train_model(
    X, y, teacher_preds,
    n_splits=params['n_splits'],
    epochs=params['epochs'],
    batch_size=params['batch_size']
)
print(f'Experiment 2 - Mean validation score: {exp2_score:.4f}')

## Experiment 3: Neural Network with Focal Loss

Train using focal loss without knowledge distillation:

In [None]:
# Labels only (teacher predictions are None), optimised with focal_loss.
exp3_score = train_model(
    X, y, None,
    n_splits=params['n_splits'],
    epochs=params['epochs'],
    batch_size=params['batch_size'],
    criterion=focal_loss
)
print(f'Experiment 3 - Mean validation score: {exp3_score:.4f}')

## Experiment 4: Neural Network with Knowledge Distillation and Focal Loss

Combine both focal loss and knowledge distillation:

In [None]:
# Teacher predictions plus knowledge_distillation_loss, which itself combines
# focal loss on the labels with BCE against the teacher.
exp4_score = train_model(
    X, y, teacher_preds,
    n_splits=params['n_splits'],
    epochs=params['epochs'],
    batch_size=params['batch_size'],
    criterion=knowledge_distillation_loss
)
print(f'Experiment 4 - Mean validation score: {exp4_score:.4f}')

# Compare experiment results
# One row per experiment, in the order they were run above.
# NOTE(review): the scores are validation losses computed with a different
# criterion per experiment, so they may not be directly comparable — confirm.
results = pd.DataFrame({
    'Experiment': ['Simple NN', 'Knowledge Distillation', 'Focal Loss', 'KD + Focal Loss'],
    'Validation Score': [exp1_score, exp2_score, exp3_score, exp4_score]
})
print('\nExperiment Results:')
print(results)

## Making Predictions

Use the trained model to make predictions:

In [None]:
def predict(model, X):
    """Run ``model`` on ``X`` in eval mode and return the outputs as a NumPy array.

    The input is moved to the device the model's parameters live on, and the
    result is moved back to the CPU before conversion, so the function works
    for models on either CPU or GPU.

    Args:
        model: A ``torch.nn.Module``; it is switched to eval mode.
        X: Array-like of features accepted by ``torch.FloatTensor``.

    Returns:
        ``numpy.ndarray`` holding the model outputs.
    """
    model.eval()
    # A model without parameters has no device of its own; fall back to CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    with torch.no_grad():
        X_tensor = torch.FloatTensor(X).to(device)
        predictions = model(X_tensor)
    return predictions.cpu().numpy()

# Make predictions with each fold's model
# NOTE(review): every train_model call saves to the same
# best_model_fold_{fold}.pt paths, so the checkpoints loaded here come from
# whichever training call ran last — confirm that is the intended model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_preds = []

for fold in range(params['n_splits']):
    model = StudentModel(X.shape[1]).to(device)
    model.load_state_dict(torch.load(f'best_model_fold_{fold}.pt'))
    # Targets are not used at inference time; the zeros only satisfy
    # CustomDataset's two-argument constructor.
    test_dataset = CustomDataset(X_test, np.zeros((len(X_test), 2)))
    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'])
    
    model.eval()
    fold_preds = []
    with torch.no_grad():
        for batch_X, _ in test_loader:
            batch_X = batch_X.to(device)
            outputs = model(batch_X)
            # Keep only the first output column (the one trained against the label).
            fold_preds.append(outputs.cpu().numpy()[:, 0])
    
    fold_preds = np.concatenate(fold_preds)
    test_preds.append(fold_preds)

# Average predictions across folds
final_preds = np.mean(test_preds, axis=0)

# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'target': final_preds
})
submission.to_csv('submission.csv', index=False)
print('Submission saved to submission.csv')

# Cleanup
# Delete the per-fold checkpoint files written during training.
for fold in range(params['n_splits']):
    os.remove(f'best_model_fold_{fold}.pt')
gc.collect()