In [1]:
import numpy as np

from kaggle_movie_genres.config import load_config
from kaggle_movie_genres.labelhandler import LabelHandler
from kaggle_movie_genres.featurizer import create_tokenizer_and_embedder
from kaggle_movie_genres.dataloader import create_dataloader
from kaggle_movie_genres.cls_classifier import CLS_Classifier
from kaggle_movie_genres.submission import format_predictions
from kaggle_movie_genres.trainpredict import TrainPredict
import logging
# Configure root logging once for the whole notebook: INFO and above,
# rendered as "<date time> - <LEVEL> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',  # date plus time, without fractional seconds
)

# Logger scoped to this notebook/module.
logger = logging.getLogger(__name__)
import torch.nn as nn
import torch
import tqdm


### Configuration and label handling

In [2]:
# `config` holds all constants, paths, and settings used throughout the notebook.
config = load_config()

# `label_handler` converts labels between their different formats
# (it also supplies the multi-hot vector length used when building the model).
label_handler = LabelHandler(config)



### Tokenizer, embedder and the model

In [3]:
# Build the tokenizer/embedder pair from the shared config, then wrap the
# embedder in the classifier. The number of labels is taken from the label
# handler's multi-hot length.
tokenizer, embedder = create_tokenizer_and_embedder(config)
model = CLS_Classifier(
    embedder,
    num_labels=label_handler.get_multi_hot_length(),
    config=config,
)

### Create train / validation / test sets

In [4]:
# Labelled data: with validation_split=True the call returns a
# (train, validation) pair.
train_set, validation_set = create_dataloader(
    'data/train.csv', tokenizer, label_handler, config, validation_split=True
)

# Test data: with validation_split=False only the first returned element is
# used. The second one is bound to a named throwaway rather than `_`, because
# in IPython/Jupyter `_` is the automatic "last output" variable and assigning
# to it stops the kernel from updating `_`, `__` and `___` afterwards.
test_set, _unused_split = create_dataloader(
    'data/test.csv', tokenizer, label_handler, config, validation_split=False
)

2025-11-15 00:46:58 - INFO - Loaded 8000 records from data/train.csv
2025-11-15 00:46:58 - INFO - Using only first 64 records from the dataset
2025-11-15 00:46:58 - INFO - Using max token length: 256
2025-11-15 00:46:58 - INFO - Loaded 2000 records from data/test.csv
2025-11-15 00:46:58 - INFO - Using only first 64 records from the dataset
2025-11-15 00:46:58 - INFO - Using max token length: 256


In [None]:
# Name passed to the trainer to identify this training run.
TRAIN_NAME = "CLS_Classifier_v1"

# Hand the trainer the config, label handler, model, and the three data sets,
# then start training. Per the logged output, each epoch reports a training,
# a validation, and a test pass.
trainer = TrainPredict(
    TRAIN_NAME,
    config,
    label_handler,
    model,
    train_set,
    validation_set,
    test_set,
)
trainer.train()

2025-11-15 00:47:01 - INFO - Starting epoch 1/50
100%|██████████| 2/2 [00:03<00:00,  1.83s/it]
2025-11-15 00:47:05 - INFO - Training epoch 1 completed. Train F1 Score: 0.1805, Train Loss: 0.6699
100%|██████████| 1/1 [00:01<00:00,  1.72s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025-11-15 00:47:06 - INFO - Validation epoch 1 completed. Val F1 Score: 0.0456, Val Loss: 0.5600
100%|██████████| 2/2 [00:00<00:00,  2.66it/s]
2025-11-15 00:47:07 - INFO - Test epoch 1 completed. 
2025-11-15 00:47:07 - INFO - Epoch 1 completed. Train Loss: 0.6699, Val Loss: 0.5600
2025-11-15 00:47:07 - INFO - Starting epoch 2/50
100%|██████████| 2/2 [00:00<00:00,  3.13it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025-11-15 00:47:08 - INFO - Training epoch 2 completed. Train F1 Score: 0.0823, Train Loss: 0.5377
100%|██████████| 1/1 [00:00<00:00,  6.74it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
2025