In [1]:
from kaggle_movie_genres.config import load_config
from kaggle_movie_genres.labelhandler import LabelHandler
from kaggle_movie_genres.featurizer import create_tokenizer_and_embedder
from kaggle_movie_genres.dataloader import create_dataloader
from kaggle_movie_genres.cls_classifier import CLS_Classifier
import logging
logging.basicConfig(level=logging.INFO,    
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S'  # Date + short timestamp
)
logger = logging.getLogger(__name__)
import torch.nn as nn
import torch
import tqdm


### Helper stuff    

In [2]:
# config contains all constants pathes and settings
config = load_config()

# label_handler helps to convert labels between different formats
label_handler = LabelHandler(config)



### Tokenizer, embedder and the model

In [3]:
tokenizer, embedder = create_tokenizer_and_embedder(config)
model = CLS_Classifier(embedder, num_labels=label_handler.get_multi_hot_length(), config=config)

### Create train / validation sets

In [4]:
train_set, validation_set = create_dataloader('data/train.csv', tokenizer, label_handler, config, validation_split=True)

2025-11-12 18:36:36 - INFO - Loaded 8000 records from data/train.csv
2025-11-12 18:36:36 - INFO - Using max token length: 256


In [5]:
# do an epoch with Adam optimizer and BCELoss
from sklearn.metrics import f1_score
optimizer = torch.optim.Adam(model.parameters(), float(config['learning_rate']))

device = config['device']
model = model.to(device)
compiled_model = torch.compile(model)

for epoch in range(config['num_epochs']):
    logger.info(f"Starting epoch {epoch+1}/{config['num_epochs']}")
    model.train()
    all_labels = []
    all_probs = []
    all_losses = []
    for features, labels in tqdm.tqdm(train_set):
        features = {k: v.to(device) for k, v in features.items()}
        labels = labels.to(device)
        probs = compiled_model(features['input_tokens'], attention_mask=features['attention_mask'])
        preds = (probs > 0.5).int()
        
        loss = nn.BCELoss()(probs, labels)
        all_losses.append(loss.cpu().item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        all_labels.append(labels.cpu())
        all_probs.append(probs.cpu())
    
    all_labels = torch.cat(all_labels)
    all_probs = torch.cat(all_probs)
    all_preds = (all_probs > 0.5).int()
    train_f1 = f1_score(all_labels, all_preds, average='macro')
    logger.info(f"Training F1 Score: {train_f1:.4f}")    
    
    
    # Do validation after each epoch
    model.eval()
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for features, labels in validation_set:
            features = {k: v.to(device) for k, v in features.items()}
            labels = labels.to(device)
            probs = compiled_model(features['input_tokens'], attention_mask=features['attention_mask'])
            all_labels.append(labels.cpu())
            all_probs.append(probs.cpu())
    all_labels = torch.cat(all_labels)
    all_probs = torch.cat(all_probs)
    all_preds = (all_probs > 0.5).int()
    val_f1 = f1_score(all_labels, all_preds, average='macro')
    logger.info(f"Validation F1 Score after epoch {epoch+1}: {val_f1:.4f}")

2025-11-12 18:36:41 - INFO - Starting epoch 1/1
100%|██████████| 200/200 [01:18<00:00,  2.56it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025-11-12 18:37:59 - INFO - Training F1 Score: 0.1048
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025-11-12 18:38:20 - INFO - Validation F1 Score after epoch 1: 0.1977


In [None]:
model.eval()
# evaluate test set
test_set = create_dataloader('data/test.csv', tokenizer, label_handler, config, validation_split=False)

all_movie_ids = []
all_probs = []
with torch.no_grad():
    for features, _ in test_set:
        features = {k: v.to(device) for k, v in features.items()}
        probs = compiled_model(features['input_tokens'], attention_mask=features['attention_mask'])
        all_movie_ids.append(features['movie_id'].cpu())
        all_probs.append(probs.cpu())
all_movie_ids = torch.cat(all_movie_ids).cpu().numpy()
all_probs = torch.cat(all_probs).cpu().numpy()
all_preds = (all_probs > 0.5).astype(np.int32)

def format_predictions(filename, movie_ids, predictions, label_handler):
    # write to a csv file, with columns movie_id and genres (space separated genre ids)
    import pandas as pd
    rows = []
    for movie_id, pred in zip(movie_ids, predictions):
        genre_ids = label_handler.multi_hot_to_array(pred)
        rows.append({'movie_id': movie_id, 'genres': ' '.join(map(str, genre_ids))})
    df = pd.DataFrame(rows)
    df.to_csv(filename, index=False)
    
format_predictions('submission.csv', all_movie_ids, all_preds, label_handler)


2025-11-12 18:38:20 - INFO - Loaded 2000 records from data/test.csv
2025-11-12 18:38:20 - INFO - Using max token length: 256
W1112 18:38:49.112000 1708036 torch/_inductor/utils.py:1250] [0/2] Not enough SMs to use max_autotune_gemm mode


AttributeError: 'numpy.ndarray' object has no attribute 'int'