In [2]:
from kaggle_movie_genres.config import load_config
from kaggle_movie_genres.labelhandler import LabelHandler
from kaggle_movie_genres.featurizer import create_tokenizer_and_embedder
from kaggle_movie_genres.dataloader import create_dataloader
from kaggle_movie_genres.cls_classifier import CLS_Classifier
import logging
# Root logger: INFO level, "<date time> - <level> - <message>" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',  # calendar date plus time, second resolution
)
# Logger named after the current module for use in this notebook.
logger = logging.getLogger(__name__)
import torch.nn as nn
import torch
import tqdm


### Helper stuff    

In [3]:
# config contains all constants, paths and settings
config = load_config()

# label_handler helps to convert labels between different formats
label_handler = LabelHandler(config)



### Tokenizer, embedder and the model

In [4]:
# Tokenizer and embedder are both built from the shared config.
tokenizer, embedder = create_tokenizer_and_embedder(config)

# Classifier on top of the embedder; the number of labels is taken from the
# label handler's multi-hot length.
model = CLS_Classifier(
    embedder,
    num_labels=label_handler.get_multi_hot_length(),
    config=config,
)

### Create train / validation sets

In [5]:
# Training data is loaded with a validation split, giving two loaders.
train_set, val_set = create_dataloader(
    'data/train.csv', tokenizer, label_handler, config, validation_split=True
)
# Test data is loaded without a split; the second return value is discarded.
test_set, _ = create_dataloader(
    'data/test.csv', tokenizer, label_handler, config, validation_split=False
)

2025-11-13 12:07:27 - INFO - Loaded 8000 records from data/train.csv
2025-11-13 12:07:27 - INFO - Using max token length: 256
2025-11-13 12:07:27 - INFO - Loaded 2000 records from data/test.csv
2025-11-13 12:07:27 - INFO - Using max token length: 256


### Get token frequencies

In [6]:
def collect_token_ids(dataloader):
    """Gather every input token id yielded by ``dataloader`` into one flat list.

    Args:
        dataloader: Iterable of ``(features, labels)`` batches where
            ``features['input_tokens']`` supports ``.cpu().numpy()``.

    Returns:
        A flat ``list`` with the token ids of all batches, in iteration order.
    """
    token_ids = []
    for features, _ in dataloader:
        # Labels are not needed here; only the token ids are counted later.
        token_ids.extend(features['input_tokens'].cpu().numpy().flatten().tolist())
    return token_ids


# NOTE(review): padding positions are presumably part of `input_tokens` and are
# therefore counted as well - confirm against the dataloader.
all_input_tokens_train = collect_token_ids(train_set)
all_input_tokens_test = collect_token_ids(test_set)
all_input_tokens_val = collect_token_ids(val_set)

In [7]:
import pandas as pd

# Wrap each flat list of token ids in a single-column frame ('token_id') so the
# ids can be grouped and counted with pandas.
all_input_tokens_train = pd.DataFrame(data=all_input_tokens_train, columns=['token_id'])
all_input_tokens_val = pd.DataFrame(data=all_input_tokens_val, columns=['token_id'])
all_input_tokens_test = pd.DataFrame(data=all_input_tokens_test, columns=['token_id'])

In [8]:
# Number of occurrences of every token id, computed separately per split.
train_stats = (
    all_input_tokens_train
    .groupby('token_id')
    .size()
    .reset_index(name='count')
)
test_stats = (
    all_input_tokens_test
    .groupby('token_id')
    .size()
    .reset_index(name='count')
)
val_stats = (
    all_input_tokens_val
    .groupby('token_id')
    .size()
    .reset_index(name='count')
)

# Right joins keep exactly the token ids that occur in the test / validation
# split; ids that never occur in training get count_train = 0 via fillna.
token_stats_test = (
    train_stats
    .merge(test_stats, on='token_id', how='right', suffixes=('_train', '_test'))
    .fillna(0)
)
token_stats_val = (
    train_stats
    .merge(val_stats, on='token_id', how='right', suffixes=('_train', '_val'))
    .fillna(0)
)

In [9]:
# Deviation between the train count and the test / validation count of each
# token, relative to the bigger of the two counts. Both counts are non-negative,
# so the result lies in [0, 1]. The right merge guarantees the test / validation
# count is at least 1, so the denominator is never zero.
bigger_count_test = token_stats_test[['count_train', 'count_test']].max(axis=1)
token_stats_test['deviation'] = (
    (token_stats_test['count_train'] - token_stats_test['count_test']).abs() / bigger_count_test
)

bigger_count_val = token_stats_val[['count_train', 'count_val']].max(axis=1)
token_stats_val['deviation'] = (
    (token_stats_val['count_train'] - token_stats_val['count_val']).abs() / bigger_count_val
)


In [10]:
# Share of the distinct test-set token ids that also occur in the training set.
token_stats_test['count_train'].ne(0).sum() / len(token_stats_test)

0.904117696795468

In [11]:
# Share of the distinct validation-set token ids that also occur in the training set.
token_stats_val['count_train'].ne(0).sum() / len(token_stats_val)

0.9120473022912047

In [16]:
import numpy as np


In [17]:
# Reference labels: at most the first 1000 rows of the sample submission, read as strings.
# NOTE(review): a sample submission is used as ground truth here - confirm that
# this file really contains the true labels.
test_gt = pd.read_csv('data/sample_submission.csv', dtype=str).iloc[:1000]
# Predictions of the submission under evaluation.
pred = pd.read_csv('submissions/submission_1_valf1_447.csv', dtype=str)
# Left join on movie_id keeps every reference row; movies without a prediction
# end up with a missing genre_ids_pred.
merged = test_gt.merge(pred, on='movie_id', suffixes=('_gt', '_pred'), how='left')
merged

Unnamed: 0,movie_id,genre_ids_gt,genre_ids_pred
0,529,10749 18,18
1,3549,35 18,18
2,7536,35 27,27 53
3,5086,10752 18 28,28 18 53
4,3452,35 18 10749,18 10749
...,...,...,...
995,1259,18,18 27
996,4909,35 80 18 10749 53 36,80 18
997,7340,28 878 12,28 12 878
998,3724,14 10751 27,14 27


In [18]:
# Genre-id strings are parsed into int lists here so that the label handler can
# turn them back into multi-hot vectors.
def parse_genre_ids(genre_ids_str):
    """Parse a whitespace-separated string of genre ids into a list of ints.

    Args:
        genre_ids_str: String such as ``'35 18 10749'``; may also be ``None``,
            NaN or an empty / whitespace-only string.

    Returns:
        The genre ids as a list of ``int`` in their original order, or an
        empty list when no ids are present.

    Raises:
        ValueError: If a token is not a valid integer literal.
    """
    if genre_ids_str is None or pd.isna(genre_ids_str):
        return []
    # split() without an argument ignores leading/trailing whitespace and runs
    # of spaces, which split(' ') would turn into empty tokens that int() rejects.
    return [int(token) for token in genre_ids_str.split()]
def _genre_ids_to_multi_hot(genre_ids_str):
    """Convert one genre-id string into a multi-hot vector via the label handler."""
    return label_handler.array_to_multi_hot(parse_genre_ids(genre_ids_str))


# Same conversion for the reference labels and for the predictions.
merged['genres_gt_multi_hot'] = merged['genre_ids_gt'].apply(_genre_ids_to_multi_hot)
merged['genres_pred_multi_hot'] = merged['genre_ids_pred'].apply(_genre_ids_to_multi_hot)
merged

Unnamed: 0,movie_id,genre_ids_gt,genre_ids_pred,genres_gt_multi_hot,genres_pred_multi_hot
0,529,10749 18,18,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
1,3549,35 18,18,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,7536,35 27,27 53,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,5086,10752 18 28,28 18 53,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
4,3452,35 18 10749,18 10749,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
...,...,...,...,...,...
995,1259,18,18 27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
996,4909,35 80 18 10749 53 36,80 18,"[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."
997,7340,28 878 12,28 12 878,"[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
998,3724,14 10751 27,14 27,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [19]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Macro-averaged F1 over all labels: every label contributes equally,
# regardless of how often it occurs.
# zero_division=0 spells out the score sklearn already assigns by default to a
# label whose F1 is undefined, and suppresses the warning that call would
# otherwise emit. The resulting score is unchanged.
f1_score(
    merged['genres_gt_multi_hot'].tolist(),
    merged['genres_pred_multi_hot'].tolist(),
    average='macro',
    zero_division=0,
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


0.4594664973967337

In [20]:
# Baseline: score random predictions drawn with the overall positive-label rate of the ground truth

In [39]:
# Random baseline: every cell of the prediction matrix is set to 1 with the
# overall positive rate of the ground truth (one rate shared by all labels).
gt_multihot = np.stack(merged['genres_gt_multi_hot'])
true_freq = gt_multihot.sum()/gt_multihot.size

# Fixed seed so that the baseline score is identical on every run.
BASELINE_SEED = 42
baseline_rng = np.random.default_rng(BASELINE_SEED)
random_pred = (baseline_rng.random(gt_multihot.shape) < true_freq).astype(int)
f1_score(merged['genres_gt_multi_hot'].tolist(), random_pred.tolist(), average='macro')

0.11824663176439083

In [25]:
# Per-label positive rate in the ground truth: the element-wise sum of all
# multi-hot vectors divided by the number of rows.
merged['genres_gt_multi_hot'].sum() / len(merged)

array([0.241, 0.173, 0.091, 0.359, 0.161, 0.   , 0.487, 0.11 , 0.116,
       0.057, 0.134, 0.022, 0.085, 0.176, 0.118, 0.013, 0.263, 0.028,
       0.015], dtype=float32)