In [1]:
import numpy as np

from kaggle_movie_genres.config import load_config
from kaggle_movie_genres.labelhandler import LabelHandler
from kaggle_movie_genres.featurizer import create_tokenizer_and_embedder
from kaggle_movie_genres.dataloader import create_dataloader
from kaggle_movie_genres.cls_classifier import CLS_Classifier
from kaggle_movie_genres.submission import format_predictions
from kaggle_movie_genres.trainpredict import TrainPredict
import logging
# Root logger: INFO and above, each record prefixed with "YYYY-MM-DD HH:MM:SS".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)
import torch.nn as nn
import torch
import tqdm


### Helpers: configuration and label handling

In [2]:
# `config` holds all constants, paths and settings; it is passed to most helpers below
config = load_config()

# `label_handler` converts labels between their different formats
label_handler = LabelHandler(config)



### Tokenizer, embedder and the model

In [3]:
# Build the tokenizer/embedder pair from the config, then wrap the embedder in
# the classifier; `num_labels` comes from the label handler's multi-hot length.
tokenizer, embedder = create_tokenizer_and_embedder(config)
num_labels = label_handler.get_multi_hot_length()
model = CLS_Classifier(embedder, num_labels=num_labels, config=config)

### Create train / validation sets

In [4]:
# The training file is loaded with validation_split=True and yields two loaders.
train_set, validation_set = create_dataloader(
    'data/train.csv',
    tokenizer,
    label_handler,
    config,
    validation_split=True,
)
# The test file is loaded with validation_split=False; the second returned
# value is discarded.
test_set, _ = create_dataloader(
    'data/test.csv',
    tokenizer,
    label_handler,
    config,
    validation_split=False,
)

2025-11-15 00:21:29 - INFO - Loaded 8000 records from data/train.csv
2025-11-15 00:21:29 - INFO - Using max token length: 256
2025-11-15 00:21:29 - INFO - Loaded 2000 records from data/test.csv
2025-11-15 00:21:29 - INFO - Using max token length: 256


In [None]:
# Name handed to TrainPredict as its first argument.
TRAIN_NAME = "CLS_Classifier_v1"

trainer = TrainPredict(
    TRAIN_NAME,
    config,
    label_handler,
    model,
    train_set,
    validation_set,
    test_set,
)
trainer.train()

2025-11-15 00:21:32 - INFO - Starting epoch 1/50
W1115 00:21:37.338000 581933 torch/_inductor/utils.py:1250] [0/0] Not enough SMs to use max_autotune_gemm mode
 32%|███▏      | 64/200 [06:10<12:50,  5.66s/it]

In [None]:
# Manual training loop: Adam optimizer + BCELoss. After every epoch the model is
# scored on the validation set and a test-set submission is written to
# `submissions/` (one raw text file and one formatted CSV per epoch).
from pathlib import Path

from sklearn.metrics import f1_score


def batch_to_device(batch, device):
    """Return a copy of ``batch`` with every tensor value moved to ``device``.

    Values that are not ``torch.Tensor`` instances are passed through unchanged.
    """
    return {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in batch.items()
    }


device = config['device']
model = model.to(device)
compiled_model = torch.compile(model)
optimizer = torch.optim.Adam(model.parameters(), float(config['learning_rate']))
criterion = nn.BCELoss()  # built once instead of once per batch

# open() below does not create missing directories.
Path('submissions').mkdir(parents=True, exist_ok=True)

for epoch in range(config['num_epochs']):
    logger.info(f"Starting epoch {epoch+1}/{config['num_epochs']}")

    # ---- Training ----
    model.train()
    all_labels = []
    all_probs = []
    all_losses = []
    for features, labels in tqdm.tqdm(train_set):
        features = batch_to_device(features, device)
        labels = labels.to(device)

        # Clear gradients before the backward pass so that nothing left on the
        # parameters by earlier code leaks into the first update.
        optimizer.zero_grad()
        probs = compiled_model(features['input_tokens'], attention_mask=features['attention_mask'])
        loss = criterion(probs, labels)
        loss.backward()
        optimizer.step()

        all_losses.append(loss.item())
        all_labels.append(labels.cpu())
        # detach() so the stored copy does not carry autograd history.
        all_probs.append(probs.detach().cpu())

    all_labels = torch.cat(all_labels)
    all_probs = torch.cat(all_probs)
    all_preds = (all_probs > 0.5).int()
    train_f1 = f1_score(all_labels, all_preds, average='macro')
    logger.info(f"Training F1 Score: {train_f1:.4f}")
    logger.info(f"Training loss: {np.mean(all_losses):.4f}")

    # ---- Validation after each epoch ----
    model.eval()
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for features, labels in validation_set:
            features = batch_to_device(features, device)
            labels = labels.to(device)
            probs = compiled_model(features['input_tokens'], attention_mask=features['attention_mask'])
            all_labels.append(labels.cpu())
            all_probs.append(probs.cpu())
    all_labels = torch.cat(all_labels)
    all_probs = torch.cat(all_probs)
    all_preds = (all_probs > 0.5).int()
    val_f1 = f1_score(all_labels, all_preds, average='macro')
    logger.info(f"Validation F1 Score after epoch {epoch+1}: {val_f1:.4f} length {len(all_preds)}")

    # ---- Test-set predictions for this epoch's submission ----
    all_movie_ids = []
    all_movie_descriptions = []
    all_probs = []
    with torch.no_grad():
        for features, _ in test_set:
            features = batch_to_device(features, device)
            probs = compiled_model(features['input_tokens'], attention_mask=features['attention_mask'])
            all_movie_ids.append(features['movie_id'].cpu())
            all_probs.append(probs.cpu())
            all_movie_descriptions.extend(features['movie_description'])
    all_movie_ids = torch.cat(all_movie_ids).cpu().numpy()
    all_probs = torch.cat(all_probs).cpu().numpy()
    all_preds = (all_probs > 0.5).astype(np.int32)

    all_pred_labels = [label_handler.array_to_label_names(label_handler.multi_hot_to_array(pred)) for pred in all_preds]

    # Raw dump: one "<description>,<space-separated labels>" line per test movie.
    # The loop variable has its own name so the `all_movie_descriptions` list is
    # still a list after the loop.
    # NOTE(review): '\r\n' written in text mode becomes '\r\r\n' on Windows — confirm intended.
    with open(f'submissions/submission_{epoch+1}_valf1_{val_f1*1000:.0f}_raw.txt', 'w', encoding='utf-8') as f:
        for description, pred_labels in zip(all_movie_descriptions, all_pred_labels):
            f.write(f"{description},{' '.join(map(str, pred_labels))}\r\n")
    format_predictions(f'submissions/submission_{epoch+1}_valf1_{val_f1*1000:.0f}.csv', all_movie_ids, all_preds, label_handler)

In [None]:
# Debug inspection: show the current value of `all_movie_descriptions` left behind by the training cell
all_movie_descriptions