In [1]:
import numpy as np

from kaggle_movie_genres.config import load_config
from kaggle_movie_genres.labelhandler import LabelHandler
from kaggle_movie_genres.featurizer import create_tokenizer_and_embedder
from kaggle_movie_genres.dataloader import create_dataloader
from kaggle_movie_genres.cls_classifier import CLS_Classifier
from kaggle_movie_genres.submission import format_predictions
from kaggle_movie_genres.trainpredict import TrainPredict
import logging
# Configure root logging for the notebook: INFO and above, each record
# prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# Logger named after the current module, for messages emitted by this notebook.
logger = logging.getLogger(__name__)
import torch.nn as nn
import torch
import tqdm


### Helpers: configuration and label handling

In [2]:
# config contains all constants, paths and settings; it is passed to every
# component created in this notebook.
config = load_config()

# label_handler helps to convert labels between different formats
# (it also provides the multi-hot label length used to size the classifier).
label_handler = LabelHandler(config)



### Tokenizer, embedder and the model

In [3]:
# Seed the global NumPy and PyTorch random generators before the model is
# built, so that any random draws taken from them by the cells below start
# from the same state after every "Restart Kernel -> Run All".
# NOTE(review): if the project config already seeds the RNGs, reuse that value
# here instead of this constant. Seeding alone does not guarantee bit-identical
# results on GPU.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the tokenizer and the embedder from the project configuration.
tokenizer, embedder = create_tokenizer_and_embedder(config)

# Classifier built on the embedder; the number of output labels is the
# multi-hot vector length reported by the label handler.
model = CLS_Classifier(
    embedder,
    num_labels=label_handler.get_multi_hot_length(),
    config=config,
)

### Create train / validation / test sets

In [4]:
# Training data: a validation split is requested, so the call yields two sets.
train_set, validation_set = create_dataloader(
    'data/train.csv',
    tokenizer,
    label_handler,
    config,
    validation_split=True,
)

# Test data: no split is requested; the second returned value is discarded.
test_set, _ = create_dataloader(
    'data/test.csv',
    tokenizer,
    label_handler,
    config,
    validation_split=False,
)

2025-11-15 09:27:19 - INFO - Loaded 8000 records from data/train.csv
2025-11-15 09:27:19 - INFO - Using max token length: 256
2025-11-15 09:27:19 - INFO - Loaded 2000 records from data/test.csv
2025-11-15 09:27:19 - INFO - Using max token length: 256


In [None]:
# Name handed to TrainPredict to identify this training run.
TRAIN_NAME = "CLS_Classifier_v1"

# Bundle the model with the train / validation / test sets.
trainer = TrainPredict(
    TRAIN_NAME,
    config,
    label_handler,
    model,
    train_set,
    validation_set,
    test_set,
)

# Run training; per-epoch train and validation metrics are written to the log.
trainer.train()

2025-11-15 09:27:25 - INFO - Starting epoch 1/50
W1115 09:27:29.353000 2183 torch/_inductor/utils.py:1250] [0/0] Not enough SMs to use max_autotune_gemm mode
2025-11-15 09:28:51 - INFO - Training epoch 1 completed. Train F1 Score: 0.1938, Train Loss: 0.2997
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025-11-15 09:29:14 - INFO - Validation epoch 1 completed. Val F1 Score: 0.3004, Val Loss: 0.2378
2025-11-15 09:29:41 - INFO - Test epoch 1 completed. 
2025-11-15 09:29:41 - INFO - Epoch 1 completed. Train Loss: 0.2997, Val Loss: 0.2378
2025-11-15 09:29:42 - INFO - Starting epoch 2/50
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025-11-15 09:30:53 - INFO - Training epoch 2 completed. Train F1 Score: 0.3481, Train Loss: 0.2502
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025-11-15 09:31:10 - INFO - Validation epoch 2 completed. Val F1 Score: 0.3895, Val Loss: 0.2270
2025-11-15 09:31:32 - INFO 