In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tedy-with-lang/tedx_videos_extended_with_lang.csv
/kaggle/input/tedx-with-lang-translated/tedx_videos_extended_with_lang_translated.csv
/kaggle/input/terdx-finetuning/topics_train.csv
/kaggle/input/terdx-finetuning/sentiment_test.csv
/kaggle/input/terdx-finetuning/topics_test.csv
/kaggle/input/terdx-finetuning/sentiment_train.csv


In [25]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
WANDB_API = user_secrets.get_secret("WANDB_API")

In [None]:
!pip install --upgrade datasets transformers evaluate sentencepiece accelerate

In [2]:
import pandas as pd
tags = pd.read_csv('/kaggle/input/terdx-finetuning/topics_train.csv')
tags

Unnamed: 0,title,best_tag
0,I was held hostage for 317 days. Here's what I...,global issues
1,Planet City -- a sci-fi vision of an astonishi...,sustainability
2,Progress is not a zero-sum game,global issues
3,How I'm making bricks out of ashes and rubble ...,sustainability
4,How vultures can help solve crimes,nature
...,...,...
5059,Why do we hiccup?,health
5060,Why videos go viral,entertainment
5061,A primer on 3D printing,technology
5062,Building a dinosaur from a chicken,science


In [3]:
from datasets import load_dataset

file_dict = {
  "train" : "/kaggle/input/terdx-finetuning/topics_train.csv",
  "test" : "/kaggle/input/terdx-finetuning/topics_test.csv"
}

dataset = load_dataset(
  'csv',
  data_files=file_dict,
  delimiter=',',
  column_names=['title', 'best_tag'],
  skiprows=1,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
# Extract all unique tags from the 'tag' column
all_tags = []
for split in dataset:  # Iterate over 'train' and 'test' splits
    for example in dataset[split]:
        tags = example['best_tag'].split(', ')
        all_tags.extend(tags)
classes = sorted(list(set(all_tags)))

classes

['AI',
 'art',
 'business',
 'climate change',
 'communication',
 'creativity',
 'culture',
 'design',
 'economics',
 'education',
 'entertainment',
 'environment',
 'food',
 'gender',
 'global issues',
 'health',
 'history',
 'humanity',
 'innovation',
 'literature',
 'mental health',
 'music',
 'nature',
 'personal growth',
 'politics',
 'psychology',
 'science',
 'social change',
 'storytelling',
 'sustainability',
 'technology',
 'work']

In [5]:
# Create mappings
class2id = {class_: id for id, class_ in enumerate(classes)}
id2class = {id: class_ for class_, id in class2id.items()}

In [6]:
print(class2id)
print(id2class)

{'AI': 0, 'art': 1, 'business': 2, 'climate change': 3, 'communication': 4, 'creativity': 5, 'culture': 6, 'design': 7, 'economics': 8, 'education': 9, 'entertainment': 10, 'environment': 11, 'food': 12, 'gender': 13, 'global issues': 14, 'health': 15, 'history': 16, 'humanity': 17, 'innovation': 18, 'literature': 19, 'mental health': 20, 'music': 21, 'nature': 22, 'personal growth': 23, 'politics': 24, 'psychology': 25, 'science': 26, 'social change': 27, 'storytelling': 28, 'sustainability': 29, 'technology': 30, 'work': 31}
{0: 'AI', 1: 'art', 2: 'business', 3: 'climate change', 4: 'communication', 5: 'creativity', 6: 'culture', 7: 'design', 8: 'economics', 9: 'education', 10: 'entertainment', 11: 'environment', 12: 'food', 13: 'gender', 14: 'global issues', 15: 'health', 16: 'history', 17: 'humanity', 18: 'innovation', 19: 'literature', 20: 'mental health', 21: 'music', 22: 'nature', 23: 'personal growth', 24: 'politics', 25: 'psychology', 26: 'science', 27: 'social change', 28: 's

In [7]:
from transformers import AutoTokenizer

model_path = 'facebook/bart-large-mnli'

tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [8]:
def preprocess_function(example):
   text = example['title']
   all_labels = example['best_tag'].split(', ')
   labels = [0. for i in range(len(classes))]
   for label in all_labels:
       label_id = class2id[label]
       labels[label_id] = 1.

   example = tokenizer(text, truncation=True)
   example['best_tag'] = labels
   return example

tokenized_dataset = dataset.map(preprocess_function)

Map:   0%|          | 0/5064 [00:00<?, ? examples/s]

Map:   0%|          | 0/1266 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=len(classes),
    id2label=id2class, label2id=class2id,
    problem_type = "multi_label_classification", 
    ignore_mismatched_sizes=True
)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([32]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([32, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import wandb
wandb.login(key=WANDB_API)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mczarnybaranie1[0m ([33mczarnybaranie1-sgh-warsaw-school-of-economics[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [17]:
import os
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="tedx_topic_classification"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="checkpoint"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

In [18]:
# Rename columns to text and labels (needed for model setup)
tokenized_dataset = tokenized_dataset.rename_column("title", "text")
tokenized_dataset = tokenized_dataset.rename_column("best_tag", "labels")

In [None]:
training_args = TrainingArguments(

   output_dir="/kaggle/working/topic_classifier",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=2,
   weight_decay=0.01,
   evaluation_strategy="steps",
   eval_steps=100,
   save_strategy="steps",
   save_steps=500,
   load_best_model_at_end=True,
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


In [None]:
tokenized_dataset

In [None]:
trainer.train()

In [None]:
from huggingface_hub import PyTorchModelHubMixin
model_name = "/kaggle/working/topic_classifier/checkpoint-1500"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.push_to_hub("CzarnyBaranie/topic-classifier-checkpoint-1500",private=True)

In [None]:
###### Usuwanie folderu - KLIKNIJ TYLKO JEDEN RAZ!
import shutil
shutil.rmtree("/kaggle/working/topic_classifier/checkpoint-1688")

In [None]:
training_args = TrainingArguments(

   output_dir="/kaggle/working/topic_classifier",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=4,
   weight_decay=0.01,
   evaluation_strategy="steps",
   eval_steps=100,
   save_strategy="steps",
   save_steps=500,
   load_best_model_at_end=True,
   save_total_limit=4, 
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


In [None]:
# Path to the latest checkpoint
latest_checkpoint = '/kaggle/working/topic_classifier/checkpoint-1500'

# Resume training from the latest checkpoint
trainer.train(resume_from_checkpoint=latest_checkpoint)

In [None]:
###### Usuwanie folderu - KLIKNIJ TYLKO JEDEN RAZ!
import shutil
shutil.rmtree("/kaggle/working/topic_classifier/checkpoint-3376")

In [None]:
training_args = TrainingArguments(

   output_dir="/kaggle/working/topic_classifier",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=5,
   weight_decay=0.01,
   evaluation_strategy="steps",
   eval_steps=100,
   save_strategy="steps",
   save_steps=500,
   load_best_model_at_end=True,
   save_total_limit=3, 
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


In [None]:
# Path to the latest checkpoint
latest_checkpoint = '/kaggle/working/topic_classifier/checkpoint-3000'

# Resume training from the latest checkpoint
trainer.train(resume_from_checkpoint=latest_checkpoint)

In [None]:
training_args = TrainingArguments(

   output_dir="/kaggle/working/topic_classifier",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=6,
   weight_decay=0.01,
   evaluation_strategy="steps",
   eval_steps=100,
   save_strategy="steps",
   save_steps=500,
   load_best_model_at_end=True,
   save_total_limit=3, 
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


In [24]:
last_run_id = "uiul4m7h"  # fetch the run_id from your wandb workspace

# resume the wandb run from the run_id
with wandb.init(
    project=os.environ["WANDB_PROJECT"],
    id=last_run_id,
    resume="must",
) as run:
    # Connect an Artifact to the run
    my_checkpoint_name = f"checkpoint-{last_run_id}:latest"
    my_checkpoint_artifact = run.use_artifact("model-uiul4m7h:v10")

    # Download checkpoint to a folder and return the path
    checkpoint_dir = my_checkpoint_artifact.download()

    # reinitialize your model and trainer
    model = AutoModelForSequenceClassification.from_pretrained(
        "/kaggle/working/artifacts/model-uiul4m7h:v10", num_labels=len(classes)
    )
    # your awesome training arguments here.
    training_args = TrainingArguments(

       output_dir="/kaggle/working/topic_classifier",
       learning_rate=2e-5,
       per_device_train_batch_size=3,
       per_device_eval_batch_size=3,
       num_train_epochs=6,
       weight_decay=0.01,
       evaluation_strategy="steps",
       eval_steps=100,
       save_strategy="steps",
       save_steps=500,
       load_best_model_at_end=True,
       save_total_limit=3, 
    )
    
    trainer = Trainer(
    
       model=model,
       args=training_args,
       train_dataset=tokenized_dataset["train"],
       eval_dataset=tokenized_dataset["test"],
       tokenizer=tokenizer,
       data_collator=data_collator,
       compute_metrics=compute_metrics,
    )

    # make sure use the checkpoint dir to resume training from the checkpoint
    trainer.train(resume_from_checkpoint=checkpoint_dir)

[34m[1mwandb[0m: Downloading large artifact model-uiul4m7h:v10, 4667.12MB. 12 files... 
[34m[1mwandb[0m:   12 of 12 files downloaded.  
Done. 0:0:6.0
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'AI', '1': 'art', '2': 'business', '3': 'climate change', '4': 'communication', '5': 'creativity', '6': 'culture', '7': 'design', '8': 'economics', '9': 'education', '10': 'entertainment', '11': 'environment', '12': 'food', '13': 'gender', '14': 'global issues', '15': 'health', '16': 'history', '17': 'humanity', '18': 'innovation', '19': 'literature', '20': 'mental health', '21': 'music', '22': 'nature', '23': 'personal growth', '24': 'politics', '25': 'psychology', '26': 'science', '27': 'social change', '28': 'storytelling', '29': 'sustainability', '30': 'technology', '31': 'work'}. The number of labels wil be overwritten to 32.
You are resuming training from a checkpoint trained with 4.47.1 of Transformers but your current version is 4.44.2. This is not r

Step,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
4100,0.0229,0.062643,0.66242,0.80412,0.612164
4200,0.0229,0.06396,0.654777,0.800451,0.605845
4300,0.0229,0.063859,0.661311,0.804808,0.616114
4400,0.0229,0.063616,0.663543,0.803802,0.612164
4500,0.0217,0.064191,0.662988,0.804515,0.613744
4600,0.0217,0.063434,0.670908,0.808566,0.622433
4700,0.0217,0.063608,0.664977,0.806821,0.616114
4800,0.0217,0.064099,0.669779,0.807777,0.621643
4900,0.0217,0.064007,0.666106,0.808719,0.623223
5000,0.0189,0.06389,0.670598,0.810401,0.626382


Non-default generation parameters: {'forced_eos_token_id': 2}
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/topic_classifier/checkpoint-4500)... Done. 34.0s
Non-default generation parameters: {'forced_eos_token_id': 2}
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/topic_classifier/checkpoint-5000)... Done. 31.4s
Non-default generation parameters: {'forced_eos_token_id': 2}
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/topic_classifier/checkpoint-5064)... Done. 38.7s
Could not locate the best model at /kaggle/working/topic_classifier/checkpoint-2600/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.
Non-default generation parameters: {'forced_eos_token_id': 2}


0,1
eval/accuracy,▃▁▅▃▄▇▅▆▇█
eval/f1,▄▁▄▅▅█▅█▆█
eval/loss,▁▇▇▅█▅▅█▇▇
eval/roc_auc,▄▁▄▃▄▇▅▆▇█
eval/runtime,█▂▃▄▂▄▁█▆▃
eval/samples_per_second,▁▇▆▅▇▅█▁▃▆
eval/steps_per_second,▁▇▆▅▇▅█▁▃▆
train/epoch,▁▂▂▃▄▄▅▅▆▇███
train/global_step,▁▂▂▃▄▄▅▅▆▇███
train/grad_norm,█▁

0,1
eval/accuracy,0.62638
eval/f1,0.6706
eval/loss,0.06389
eval/roc_auc,0.8104
eval/runtime,42.8456
eval/samples_per_second,29.548
eval/steps_per_second,4.925
total_flos,933381470013696.0
train/epoch,6.0
train/global_step,5064.0


In [29]:
from huggingface_hub import PyTorchModelHubMixin
model_name = "/kaggle/working/topic_classifier/checkpoint-5064"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.push_to_hub("CzarnyBaranie/tedx-topic-classifier",private=True)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'AI', '1': 'art', '2': 'business', '3': 'climate change', '4': 'communication', '5': 'creativity', '6': 'culture', '7': 'design', '8': 'economics', '9': 'education', '10': 'entertainment', '11': 'environment', '12': 'food', '13': 'gender', '14': 'global issues', '15': 'health', '16': 'history', '17': 'humanity', '18': 'innovation', '19': 'literature', '20': 'mental health', '21': 'music', '22': 'nature', '23': 'personal growth', '24': 'politics', '25': 'psychology', '26': 'science', '27': 'social change', '28': 'storytelling', '29': 'sustainability', '30': 'technology', '31': 'work'}. The number of labels wil be overwritten to 32.
Non-default generation parameters: {'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CzarnyBaranie/tedx-topic-classifier/commit/be9c1c3acc6f9bac905aa8e689b026979bbde9c9', commit_message='Upload BartForSequenceClassification', commit_description='', oid='be9c1c3acc6f9bac905aa8e689b026979bbde9c9', pr_url=None, pr_revision=None, pr_num=None)

# inference

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('CzarnyBaranie/bart-finetuned-for-tedx-topics')

model = AutoModelForSequenceClassification.from_pretrained("CzarnyBaranie/bart-finetuned-for-tedx-topics")

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'AI', '1': 'art', '2': 'business', '3': 'climate change', '4': 'communication', '5': 'creativity', '6': 'culture', '7': 'design', '8': 'economics', '9': 'education', '10': 'entertainment', '11': 'environment', '12': 'food', '13': 'gender', '14': 'global issues', '15': 'health', '16': 'history', '17': 'humanity', '18': 'innovation', '19': 'literature', '20': 'mental health', '21': 'music', '22': 'nature', '23': 'personal growth', '24': 'politics', '25': 'psychology', '26': 'science', '27': 'social change', '28': 'storytelling', '29': 'sustainability', '30': 'technology', '31': 'work'}. The number of labels wil be overwritten to 32.


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [57]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification",
                      model="CzarnyBaranie/bart-finetuned-for-tedx-topics", 
                     device=0)


config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'AI', '1': 'art', '2': 'business', '3': 'climate change', '4': 'communication', '5': 'creativity', '6': 'culture', '7': 'design', '8': 'economics', '9': 'education', '10': 'entertainment', '11': 'environment', '12': 'food', '13': 'gender', '14': 'global issues', '15': 'health', '16': 'history', '17': 'humanity', '18': 'innovation', '19': 'literature', '20': 'mental health', '21': 'music', '22': 'nature', '23': 'personal growth', '24': 'politics', '25': 'psychology', '26': 'science', '27': 'social change', '28': 'storytelling', '29': 'sustainability', '30': 'technology', '31': 'work'}. The number of labels wil be overwritten to 32.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'AI', '1': 'art', '2': 'business', '3': 'climate change', '4': 'communication', '5': 'creativity', '6': 'culture', '7': 'design', '8': 'economics', '9': 'education', '10': 'entertainment', '11': 'environment',

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [58]:
list(model.config.id2label.values())

['AI',
 'art',
 'business',
 'climate change',
 'communication',
 'creativity',
 'culture',
 'design',
 'economics',
 'education',
 'entertainment',
 'environment',
 'food',
 'gender',
 'global issues',
 'health',
 'history',
 'humanity',
 'innovation',
 'literature',
 'mental health',
 'music',
 'nature',
 'personal growth',
 'politics',
 'psychology',
 'science',
 'social change',
 'storytelling',
 'sustainability',
 'technology',
 'work']

In [67]:
labels=list(model.config.id2label.values())
labels

['AI',
 'art',
 'business',
 'climate change',
 'communication',
 'creativity',
 'culture',
 'design',
 'economics',
 'education',
 'entertainment',
 'environment',
 'food',
 'gender',
 'global issues',
 'health',
 'history',
 'humanity',
 'innovation',
 'literature',
 'mental health',
 'music',
 'nature',
 'personal growth',
 'politics',
 'psychology',
 'science',
 'social change',
 'storytelling',
 'sustainability',
 'technology',
 'work']

In [68]:
print(model.config.id2label)
print(model.config.label2id)


{0: 'AI', 1: 'art', 2: 'business', 3: 'climate change', 4: 'communication', 5: 'creativity', 6: 'culture', 7: 'design', 8: 'economics', 9: 'education', 10: 'entertainment', 11: 'environment', 12: 'food', 13: 'gender', 14: 'global issues', 15: 'health', 16: 'history', 17: 'humanity', 18: 'innovation', 19: 'literature', 20: 'mental health', 21: 'music', 22: 'nature', 23: 'personal growth', 24: 'politics', 25: 'psychology', 26: 'science', 27: 'social change', 28: 'storytelling', 29: 'sustainability', 30: 'technology', 31: 'work'}
{'AI': 0, 'art': 1, 'business': 2, 'climate change': 3, 'communication': 4, 'creativity': 5, 'culture': 6, 'design': 7, 'economics': 8, 'education': 9, 'entertainment': 10, 'environment': 11, 'food': 12, 'gender': 13, 'global issues': 14, 'health': 15, 'history': 16, 'humanity': 17, 'innovation': 18, 'literature': 19, 'mental health': 20, 'music': 21, 'nature': 22, 'personal growth': 23, 'politics': 24, 'psychology': 25, 'science': 26, 'social change': 27, 'story

In [65]:
labels=list(model.config.id2label.values())
labels

['AI',
 'art',
 'business',
 'climate change',
 'communication',
 'creativity',
 'culture',
 'design',
 'economics',
 'education',
 'entertainment',
 'environment',
 'food',
 'gender',
 'global issues',
 'health',
 'history',
 'humanity',
 'innovation',
 'literature',
 'mental health',
 'music',
 'nature',
 'personal growth',
 'politics',
 'psychology',
 'science',
 'social change',
 'storytelling',
 'sustainability',
 'technology',
 'work']

In [74]:
sequence_to_classify = "How poetry saved me from a cult"
labels=list(model.config.id2label.values())
classification = classifier(sequence_to_classify,labels, multi_label=False)
classification

{'sequence': 'How poetry saved me from a cult',
 'labels': ['work',
  'mental health',
  'economics',
  'food',
  'business',
  'gender',
  'climate change',
  'AI',
  'literature',
  'culture',
  'nature',
  'education',
  'communication',
  'personal growth',
  'creativity',
  'history',
  'entertainment',
  'humanity',
  'art',
  'environment',
  'politics',
  'storytelling',
  'design',
  'global issues',
  'psychology',
  'innovation',
  'health',
  'music',
  'technology',
  'science',
  'social change',
  'sustainability'],
 'scores': [0.0683063343167305,
  0.06420928239822388,
  0.06332815438508987,
  0.06300736963748932,
  0.050007857382297516,
  0.04787668585777283,
  0.04645151644945145,
  0.0328267440199852,
  0.03210783749818802,
  0.03205064684152603,
  0.03203142434358597,
  0.02991892211139202,
  0.029693666845560074,
  0.029527846723794937,
  0.02822027914226055,
  0.02795073762536049,
  0.026581551879644394,
  0.02576313354074955,
  0.025392694398760796,
  0.024150889

In [24]:
print(classification["labels"][0]) # get label with highest score

economics


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Dane wejściowe
sequence_to_classify = "How poetry saved me from a cult"
candidate_labels = list(model.config.id2label.values())

# Przygotowanie danych: łączenie tekstu z każdą etykietą jako hipotezą
inputs = tokenizer(
    [sequence_to_classify] * len(candidate_labels),  # Tekst powtarzany dla każdej etykiety
    [f"This text is about {label}." for label in candidate_labels],  # Hipotezy
    return_tensors="pt",  # Zwróć tensory
    padding=True,  # Wyrównanie do największej długości
    truncation=True  # Obcięcie do maksymalnej długości modelu
).to("cuda")

# Predykcja
outputs = model(**inputs)
logits = outputs.logits  # Surowe logity
probs = torch.softmax(logits, dim=1)  # Prawdopodobieństwa

# Wyświetlanie wyników
for label, prob in zip(candidate_labels, probs[:, 1].tolist()):  # Zakładamy, że "1" oznacza zgodność
    print(f"Label: {label}, Probability: {prob:.4f}")


Label: AI, Probability: 0.0072
Label: art, Probability: 0.9805
Label: business, Probability: 0.0010
Label: climate change, Probability: 0.0023
Label: communication, Probability: 0.0027
Label: creativity, Probability: 0.0133
Label: culture, Probability: 0.0058
Label: design, Probability: 0.0257
Label: economics, Probability: 0.0018
Label: education, Probability: 0.0002
Label: entertainment, Probability: 0.1318
Label: environment, Probability: 0.0021
Label: food, Probability: 0.0024
Label: gender, Probability: 0.0005
Label: global issues, Probability: 0.0477
Label: health, Probability: 0.0003
Label: history, Probability: 0.1197
Label: humanity, Probability: 0.2154
Label: innovation, Probability: 0.0015
Label: literature, Probability: 0.0138
Label: mental health, Probability: 0.0008
Label: music, Probability: 0.0009
Label: nature, Probability: 0.0028
Label: personal growth, Probability: 0.0015
Label: politics, Probability: 0.1086
Label: psychology, Probability: 0.0003
Label: science, Prob

In [73]:
print(classifier.model.config.id2label)
print(classifier.model.config.label2id)

{0: 'AI', 1: 'art', 2: 'business', 3: 'climate change', 4: 'communication', 5: 'creativity', 6: 'culture', 7: 'design', 8: 'economics', 9: 'education', 10: 'entertainment', 11: 'environment', 12: 'food', 13: 'gender', 14: 'global issues', 15: 'health', 16: 'history', 17: 'humanity', 18: 'innovation', 19: 'literature', 20: 'mental health', 21: 'music', 22: 'nature', 23: 'personal growth', 24: 'politics', 25: 'psychology', 26: 'science', 27: 'social change', 28: 'storytelling', 29: 'sustainability', 30: 'technology', 31: 'work'}
{'AI': 0, 'art': 1, 'business': 2, 'climate change': 3, 'communication': 4, 'creativity': 5, 'culture': 6, 'design': 7, 'economics': 8, 'education': 9, 'entertainment': 10, 'environment': 11, 'food': 12, 'gender': 13, 'global issues': 14, 'health': 15, 'history': 16, 'humanity': 17, 'innovation': 18, 'literature': 19, 'mental health': 20, 'music': 21, 'nature': 22, 'personal growth': 23, 'politics': 24, 'psychology': 25, 'science': 26, 'social change': 27, 'story

# On full dataset

## test dataset

In [50]:
import pandas as pd
test_df = pd.read_csv('/kaggle/input/terdx-finetuning/topics_test.csv')
test_df

Unnamed: 0,title,best_tag
0,What is melatonin -- and should you take it to...,health
1,The world in 2200,social change
2,The real story behind Archimedes' Eureka!,history
3,Are you technically fit to parent?,psychology
4,Big data is better data,technology
...,...,...
1261,"What ""Orwellian"" really means",literature
1262,"1,000 TED Talks in six words",storytelling
1263,We actually have a shot at stopping the climat...,climate change
1264,The billion-dollar campaign to electrify trans...,sustainability


In [52]:
test_df['predicted_tag'] = None
test_df

Unnamed: 0,title,best_tag,predicted_tag
0,What is melatonin -- and should you take it to...,health,
1,The world in 2200,social change,
2,The real story behind Archimedes' Eureka!,history,
3,Are you technically fit to parent?,psychology,
4,Big data is better data,technology,
...,...,...,...
1261,"What ""Orwellian"" really means",literature,
1262,"1,000 TED Talks in six words",storytelling,
1263,We actually have a shot at stopping the climat...,climate change,
1264,The billion-dollar campaign to electrify trans...,sustainability,


In [60]:
from tqdm import tqdm

for i in tqdm(range(len(test_df))):
    text = test_df.loc[i,"title"]
    # Perform zero-shot classification on the text
    candidate_labels = list(model.config.id2label.values())
    results = classifier(
        text,
        candidate_labels=candidate_labels,
        device="cuda"
    )
    # Get the predicted labels and assign a value to the target column
    label = results["labels"][0]
    test_df.loc[i, "predicted_tag"] = label

100%|██████████| 1266/1266 [12:06<00:00,  1.74it/s]


In [61]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report

# Zakładamy, że dane są w pandas DataFrame, np. df
y_true = test_df["best_tag"]  # Poprawne etykiety
y_pred = test_df["predicted_tag"]  # Przewidywania modelu

# Obliczanie metryk
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred, average="weighted")  # Dostosuj 'average' do przypadku (binary/multiclass)
f1 = f1_score(y_true, y_pred, average="weighted")
classification_rep = classification_report(y_true, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Classification Report:")
print(classification_rep)

# Obliczanie ROC-AUC (tylko dla klasyfikacji binarnej lub wieloetykietowej z wartościami prawdopodobieństwa)
# Załóżmy, że `classifier` zwraca również prawdopodobieństwa w `probs`.
# probs = classifier(..., return_probas=True)  # Prawdopodobieństwa dla każdej klasy
# y_prob = [probs[i][positive_class_idx] for i in range(len(probs))]  # Wyodrębnienie prawdopodobieństw dla pozytywnej klasy
# roc_auc = roc_auc_score(y_true, y_prob, multi_class="ovr")  # Ustaw multi_class w przypadku wieloklasowego problemu
# print(f"ROC-AUC: {roc_auc}")


Accuracy: 0.01579778830963665
Recall: 0.01579778830963665
F1 Score: 0.002786937185488641
Classification Report:
                 precision    recall  f1-score   support

             AI       0.00      0.00      0.00        19
            art       0.00      0.00      0.00        36
       business       0.00      0.00      0.00        29
 climate change       0.00      0.00      0.00        21
  communication       0.00      0.00      0.00        22
     creativity       0.00      0.00      0.00        25
        culture       0.00      0.00      0.00        40
         design       0.00      0.00      0.00        22
      economics       0.00      0.00      0.00        21
      education       0.00      0.00      0.00        60
  entertainment       0.00      0.00      0.00        15
    environment       0.00      0.00      0.00        15
           food       0.06      0.12      0.09        16
         gender       0.02      0.03      0.03        33
  global issues       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
test_df

Unnamed: 0,title,best_tag,predicted_tag
0,What is melatonin -- and should you take it to...,health,mental health
1,The world in 2200,social change,mental health
2,The real story behind Archimedes' Eureka!,history,gender
3,Are you technically fit to parent?,psychology,work
4,Big data is better data,technology,work
...,...,...,...
1261,"What ""Orwellian"" really means",literature,work
1262,"1,000 TED Talks in six words",storytelling,work
1263,We actually have a shot at stopping the climat...,climate change,mental health
1264,The billion-dollar campaign to electrify trans...,sustainability,mental health


In [42]:
import pandas as pd
tedx_df = pd.read_csv('/kaggle/input/tedy-with-lang/tedx_videos_extended_with_lang.csv')
tedx_df

Unnamed: 0,full_title,views,date_str,date,year,title,speaker,event,language
0,What Shakespeare teaches us about modern consp...,15518.0,23 godziny temu 13 minut i 11 sekund,2024-12-24 01:00:00,2024,What Shakespeare teaches us about modern consp...,Dr. Paul Budra,TEDxSurreySalon,en
1,How poetry saved me from a cult | Diannely Ant...,14758.0,1 dzień temu 21 minut,2024-12-24 00:00:00,2024,How poetry saved me from a cult,Diannely Antigua,TEDxPortsmouth,en
2,Why language shapes identity (more than race) ...,25684.0,2 dni temu 13 minut i 52 sekundy,2024-12-23 00:00:00,2024,Why language shapes identity (more than race),Malaka Grant,TEDxGeorge,en
3,On designing a presidential library | Craig Dy...,14181.0,3 dni temu 20 minut,2024-12-22 00:00:00,2024,On designing a presidential library,Craig Dykers,TEDxFargo,es
4,Why chasing happiness is nuts: What to do inst...,10858.0,4 dni temu 16 minut,2024-12-21 00:00:00,2024,Why chasing happiness is nuts: What to do instead,Lenorë Lambert,TEDxBillings,en
...,...,...,...,...,...,...,...,...,...
226747,TEDxWarwick - Professor Vinesh Raja - 2/28/09,2474.0,15 years ago 29 minutes,2009-12-25,2009,TEDxWarwick - Professor Vinesh Raja - 2/28/09,,,en
226748,TEDxWarwick - Professor Steve Fuller - 2/28/09,8460.0,15 years ago 24 minutes,2009-12-25,2009,TEDxWarwick - Professor Steve Fuller - 2/28/09,,,en
226749,TEDxWarwick - Francois Grey - 2/28/09,3480.0,15 years ago 27 minutes,2009-12-25,2009,TEDxWarwick - Francois Grey - 2/28/09,,,en
226750,TEDxWarwick - Professor Andrew Oswald - 2/28/09,6390.0,15 years ago 24 minutes,2009-12-25,2009,TEDxWarwick - Professor Andrew Oswald - 2/28/09,,,en


In [43]:
columns_list = list(tedx_df.columns)
columns_list

['full_title',
 'views',
 'date_str',
 'date',
 'year',
 'title',
 'speaker',
 'event',
 'language']

In [None]:
from tqdm import tqdm

for i in tqdm(range(len(test_df))):
    text = tedx_df.iloc[i,"title"]
    # Perform zero-shot classification on the text
    results = classifier(
        text,
        candidate_labels=["disaster",  "normal"],
    )
    # Get the predicted labels and assign a value to the target column
    labels = results["labels"]
    prediction = 1 if labels[0] == "disaster" else 0
    test_df.loc[i, "target"] = prediction

In [31]:
from datasets import load_dataset

file_dict = {
  "test" : "/kaggle/input/tedy-with-lang/tedx_videos_extended_with_lang.csv"
}

dataset = load_dataset(
  'csv',
  data_files=file_dict,
  delimiter=',',
  column_names=columns_list,
  skiprows=1,
)

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
def data():
    for i in range(1000):
        yield f"My example {i}"


pipe = pipeline(model="openai-community/gpt2", device=0)
generated_characters = 0
for out in pipe(data()):
    generated_characters += len(out[0]["generated_text"])

In [32]:
dataset

DatasetDict({
    test: Dataset({
        features: ['full_title', 'views', 'date_str', 'date', 'year', 'title', 'speaker', 'event', 'language'],
        num_rows: 226752
    })
})

In [34]:
%%time
candidate_labels = list(model.config.id2label.values())
zeroshot_preds = classifier(dataset["test"]["title"], candidate_labels, batch_size=16)

KeyboardInterrupt: 

In [35]:
zeroshot_preds

NameError: name 'zeroshot_preds' is not defined

In [48]:
all_model_outputs = []
for preprocessed in classifier.preprocess(dataset["test"]["title"][3],candidate_labels):
    model_outputs = classifier.forward(preprocessed)
    all_model_outputs.append(model_outputs)
outputs = classifier.postprocess(all_model_outputs)

In [49]:
outputs

{'sequence': 'On designing a presidential library',
 'labels': ['mental health',
  'food',
  'economics',
  'AI',
  'literature',
  'work',
  'business',
  'communication',
  'innovation',
  'education',
  'gender',
  'culture',
  'design',
  'creativity',
  'nature',
  'environment',
  'humanity',
  'entertainment',
  'technology',
  'history',
  'science',
  'politics',
  'art',
  'climate change',
  'health',
  'storytelling',
  'personal growth',
  'global issues',
  'psychology',
  'music',
  'social change',
  'sustainability'],
 'scores': [0.06548116356134415,
  0.06246044859290123,
  0.06139868497848511,
  0.05238157883286476,
  0.039313409477472305,
  0.036209989339113235,
  0.035150595009326935,
  0.034618258476257324,
  0.03361990302801132,
  0.0329565592110157,
  0.032920874655246735,
  0.032761067152023315,
  0.03238565847277641,
  0.032188501209020615,
  0.03218214586377144,
  0.03208659589290619,
  0.02996162138879299,
  0.02914244867861271,
  0.027894774451851845,
  0.0

# Final inference

In [4]:
import pandas as pd
test_df = pd.read_csv('/kaggle/input/terdx-finetuning/topics_test.csv')
test_df

Unnamed: 0,title,best_tag
0,What is melatonin -- and should you take it to...,health
1,The world in 2200,social change
2,The real story behind Archimedes' Eureka!,history
3,Are you technically fit to parent?,psychology
4,Big data is better data,technology
...,...,...
1261,"What ""Orwellian"" really means",literature
1262,"1,000 TED Talks in six words",storytelling
1263,We actually have a shot at stopping the climat...,climate change
1264,The billion-dollar campaign to electrify trans...,sustainability


In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_path = "CzarnyBaranie/bart-finetuned-for-tedx-topics"  # Update with your path
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'AI', '1': 'art', '2': 'business', '3': 'climate change', '4': 'communication', '5': 'creativity', '6': 'culture', '7': 'design', '8': 'economics', '9': 'education', '10': 'entertainment', '11': 'environment', '12': 'food', '13': 'gender', '14': 'global issues', '15': 'health', '16': 'history', '17': 'humanity', '18': 'innovation', '19': 'literature', '20': 'mental health', '21': 'music', '22': 'nature', '23': 'personal growth', '24': 'politics', '25': 'psychology', '26': 'science', '27': 'social change', '28': 'storytelling', '29': 'sustainability', '30': 'technology', '31': 'work'}. The number of labels wil be overwritten to 32.


In [16]:
text = "On designing a presidential library"
inputs = tokenizer(text, return_tensors="pt")

In [20]:
model.config.id2label

{0: 'AI',
 1: 'art',
 2: 'business',
 3: 'climate change',
 4: 'communication',
 5: 'creativity',
 6: 'culture',
 7: 'design',
 8: 'economics',
 9: 'education',
 10: 'entertainment',
 11: 'environment',
 12: 'food',
 13: 'gender',
 14: 'global issues',
 15: 'health',
 16: 'history',
 17: 'humanity',
 18: 'innovation',
 19: 'literature',
 20: 'mental health',
 21: 'music',
 22: 'nature',
 23: 'personal growth',
 24: 'politics',
 25: 'psychology',
 26: 'science',
 27: 'social change',
 28: 'storytelling',
 29: 'sustainability',
 30: 'technology',
 31: 'work'}

In [17]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**inputs)
    logits = outputs.logits

In [21]:
import torch

# Apply sigmoid to get probabilities
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())

# Apply threshold to get predictions (e.g., 0.5)
predictions = (probs >= 0.5).numpy().astype(int)

# Get predicted labels
predicted_labels = [model.config.id2label[idx] for idx, label in enumerate(predictions) if label == 1] 
print(predicted_labels)

['design']


In [9]:
test_df

Unnamed: 0,title,best_tag,predicted_tag
0,What is melatonin -- and should you take it to...,health,art
1,The world in 2200,social change,art
2,The real story behind Archimedes' Eureka!,history,art
3,Are you technically fit to parent?,psychology,art
4,Big data is better data,technology,art
...,...,...,...
1261,"What ""Orwellian"" really means",literature,art
1262,"1,000 TED Talks in six words",storytelling,art
1263,We actually have a shot at stopping the climat...,climate change,art
1264,The billion-dollar campaign to electrify trans...,sustainability,art


In [10]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report

# Zakładamy, że dane są w pandas DataFrame, np. df
y_true = test_df["best_tag"]  # Poprawne etykiety
y_pred = test_df["predicted_tag"]  # Przewidywania modelu

# Obliczanie metryk
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred, average="weighted")  # Dostosuj 'average' do przypadku (binary/multiclass)
f1 = f1_score(y_true, y_pred, average="weighted")
classification_rep = classification_report(y_true, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Classification Report:")
print(classification_rep)

# Obliczanie ROC-AUC (tylko dla klasyfikacji binarnej lub wieloetykietowej z wartościami prawdopodobieństwa)
# Załóżmy, że `classifier` zwraca również prawdopodobieństwa w `probs`.
# probs = classifier(..., return_probas=True)  # Prawdopodobieństwa dla każdej klasy
# y_prob = [probs[i][positive_class_idx] for i in range(len(probs))]  # Wyodrębnienie prawdopodobieństw dla pozytywnej klasy
# roc_auc = roc_auc_score(y_true, y_prob, multi_class="ovr")  # Ustaw multi_class w przypadku wieloklasowego problemu
# print(f"ROC-AUC: {roc_auc}")


Accuracy: 0.02764612954186414
Recall: 0.02764612954186414
F1 Score: 0.0015563106544286302
Classification Report:
                 precision    recall  f1-score   support

             AI       0.00      0.00      0.00        19
            art       0.03      0.97      0.05        36
       business       0.00      0.00      0.00        29
 climate change       0.00      0.00      0.00        21
  communication       0.00      0.00      0.00        22
     creativity       0.00      0.00      0.00        25
        culture       0.00      0.00      0.00        40
         design       0.00      0.00      0.00        22
      economics       0.00      0.00      0.00        21
      education       0.00      0.00      0.00        60
  entertainment       0.00      0.00      0.00        15
    environment       0.00      0.00      0.00        15
           food       0.00      0.00      0.00        16
         gender       0.00      0.00      0.00        33
  global issues       0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# final inference2

In [46]:
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd  # Ensure pandas is imported

def predict_on_dataset_batched(model, tokenizer, test_df, batch_size=16):
    """
    Predicts labels for a dataset using the specified model, tokenizer,
    and batch size. Handles KeyError by ensuring predicted indices 
    are within the range of id2class keys.

    Args:
        model: The trained model.
        tokenizer: The tokenizer used for the model.
        test_df: The DataFrame containing the text data.
        batch_size: The size of the batch for inference.

    Returns:
        A pandas Series containing the predicted labels.
    """

    # Create id2class from model.config.id2label
    id2class = {int(k): v for k, v in model.config.id2label.items()}

    num_labels = len(id2class)  # Get the number of labels
    all_predictions = []

    for i in tqdm(range(0, len(test_df), batch_size)):
        batch_texts = test_df.iloc[i : i + batch_size]["title"].tolist()
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(logits.cpu())
        predicted_ids = (probs >= 0.5).numpy().astype(int)  

        for j in range(len(batch_texts)):
            predicted_labels = [id2class[idx] for idx, label in enumerate(predicted_ids[j]) if label == 1 and idx in id2class]

            # If no labels are predicted above the threshold, 
            # select the label with the highest probability within valid range
            if not predicted_labels:
                highest_prob_index = np.argmax(probs[j])
                # Ensure highest_prob_index is within valid range and convert to int
                highest_prob_index = int(min(highest_prob_index, num_labels - 1))
                predicted_labels = [id2class[highest_prob_index]]

            all_predictions.append(predicted_labels[0])  # Take the first label

    return pd.Series(all_predictions, index=test_df.index, name="target")


In [25]:
test_df["target"] = predict_on_dataset(model, tokenizer, test_df, model.config.id2label)

100%|██████████| 1266/1266 [04:53<00:00,  4.31it/s]


In [40]:
test_df["target"] = predict_on_dataset_batched(model, tokenizer, test_df)

100%|██████████| 80/80 [01:17<00:00,  1.03it/s]


In [41]:
test_df

Unnamed: 0,title,best_tag,predicted_tag,target
0,What is melatonin -- and should you take it to...,health,art,science
1,The world in 2200,social change,art,global issues
2,The real story behind Archimedes' Eureka!,history,art,history
3,Are you technically fit to parent?,psychology,art,psychology
4,Big data is better data,technology,art,technology
...,...,...,...,...
1261,"What ""Orwellian"" really means",literature,art,history
1262,"1,000 TED Talks in six words",storytelling,art,entertainment
1263,We actually have a shot at stopping the climat...,climate change,art,climate change
1264,The billion-dollar campaign to electrify trans...,sustainability,art,sustainability


In [32]:
test_df[test_df['target'] is None]

KeyError: False

In [42]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report

# Zakładamy, że dane są w pandas DataFrame, np. df
y_true = test_df["best_tag"]  # Poprawne etykiety
y_pred = test_df["target"]  # Przewidywania modelu

# Obliczanie metryk
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred, average="weighted")  # Dostosuj 'average' do przypadku (binary/multiclass)
f1 = f1_score(y_true, y_pred, average="weighted")
classification_rep = classification_report(y_true, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Classification Report:")
print(classification_rep)

# Obliczanie ROC-AUC (tylko dla klasyfikacji binarnej lub wieloetykietowej z wartościami prawdopodobieństwa)
# Załóżmy, że `classifier` zwraca również prawdopodobieństwa w `probs`.
# probs = classifier(..., return_probas=True)  # Prawdopodobieństwa dla każdej klasy
# y_prob = [probs[i][positive_class_idx] for i in range(len(probs))]  # Wyodrębnienie prawdopodobieństw dla pozytywnej klasy
# roc_auc = roc_auc_score(y_true, y_prob, multi_class="ovr")  # Ustaw multi_class w przypadku wieloklasowego problemu
# print(f"ROC-AUC: {roc_auc}")


Accuracy: 0.6666666666666666
Recall: 0.6666666666666666
F1 Score: 0.6658223995330474
Classification Report:
                 precision    recall  f1-score   support

             AI       0.79      0.79      0.79        19
            art       0.60      0.58      0.59        36
       business       0.72      0.62      0.67        29
 climate change       0.86      0.86      0.86        21
  communication       0.50      0.41      0.45        22
     creativity       0.63      0.48      0.55        25
        culture       0.47      0.50      0.48        40
         design       0.71      0.68      0.70        22
      economics       0.76      0.76      0.76        21
      education       0.72      0.70      0.71        60
  entertainment       0.33      0.47      0.39        15
    environment       0.42      0.67      0.51        15
           food       0.77      0.62      0.69        16
         gender       0.72      0.70      0.71        33
  global issues       0.45      0.51

In [43]:
tedx_df = pd.read_csv('/kaggle/input/tedy-with-lang/tedx_videos_extended_with_lang.csv')
tedx_df

Unnamed: 0,full_title,views,date_str,date,year,title,speaker,event,language
0,What Shakespeare teaches us about modern consp...,15518.0,23 godziny temu 13 minut i 11 sekund,2024-12-24 01:00:00,2024,What Shakespeare teaches us about modern consp...,Dr. Paul Budra,TEDxSurreySalon,en
1,How poetry saved me from a cult | Diannely Ant...,14758.0,1 dzień temu 21 minut,2024-12-24 00:00:00,2024,How poetry saved me from a cult,Diannely Antigua,TEDxPortsmouth,en
2,Why language shapes identity (more than race) ...,25684.0,2 dni temu 13 minut i 52 sekundy,2024-12-23 00:00:00,2024,Why language shapes identity (more than race),Malaka Grant,TEDxGeorge,en
3,On designing a presidential library | Craig Dy...,14181.0,3 dni temu 20 minut,2024-12-22 00:00:00,2024,On designing a presidential library,Craig Dykers,TEDxFargo,es
4,Why chasing happiness is nuts: What to do inst...,10858.0,4 dni temu 16 minut,2024-12-21 00:00:00,2024,Why chasing happiness is nuts: What to do instead,Lenorë Lambert,TEDxBillings,en
...,...,...,...,...,...,...,...,...,...
226747,TEDxWarwick - Professor Vinesh Raja - 2/28/09,2474.0,15 years ago 29 minutes,2009-12-25,2009,TEDxWarwick - Professor Vinesh Raja - 2/28/09,,,en
226748,TEDxWarwick - Professor Steve Fuller - 2/28/09,8460.0,15 years ago 24 minutes,2009-12-25,2009,TEDxWarwick - Professor Steve Fuller - 2/28/09,,,en
226749,TEDxWarwick - Francois Grey - 2/28/09,3480.0,15 years ago 27 minutes,2009-12-25,2009,TEDxWarwick - Francois Grey - 2/28/09,,,en
226750,TEDxWarwick - Professor Andrew Oswald - 2/28/09,6390.0,15 years ago 24 minutes,2009-12-25,2009,TEDxWarwick - Professor Andrew Oswald - 2/28/09,,,en


In [48]:
test_df["target"] = predict_on_dataset_batched(model, tokenizer, tedx_df)

  0%|          | 10/14172 [00:19<7:40:24,  1.95s/it]


KeyboardInterrupt: 

In [49]:
# To GPU
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd  # Ensure pandas is imported

def predict_on_dataset_batched(model, tokenizer, test_df, batch_size=16, device="cuda"):
    """
    Predicts labels for a dataset using the specified model, tokenizer,
    and batch size. Handles KeyError by ensuring predicted indices 
    are within the range of id2class keys.

    Args:
        model: The trained model.
        tokenizer: The tokenizer used for the model.
        test_df: The DataFrame containing the text data.
        batch_size: The size of the batch for inference.
        device: The device to run the model on ('cuda' for GPU or 'cpu' for CPU).

    Returns:
        A pandas Series containing the predicted labels.
    """

    # Move model to the specified device
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode

    # Create id2class from model.config.id2label
    id2class = {int(k): v for k, v in model.config.id2label.items()}

    num_labels = len(id2class)  # Get the number of labels
    all_predictions = []

    for i in tqdm(range(0, len(test_df), batch_size)):
        batch_texts = test_df.iloc[i : i + batch_size]["title"].tolist()
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(logits).cpu()  # Move probabilities back to CPU for processing
        predicted_ids = (probs >= 0.5).numpy().astype(int)  

        for j in range(len(batch_texts)):
            predicted_labels = [id2class[idx] for idx, label in enumerate(predicted_ids[j]) if label == 1 and idx in id2class]

            # If no labels are predicted above the threshold, 
            # select the label with the highest probability within valid range
            if not predicted_labels:
                highest_prob_index = np.argmax(probs[j])
                # Ensure highest_prob_index is within valid range and convert to int
                highest_prob_index = int(min(highest_prob_index, num_labels - 1))
                predicted_labels = [id2class[highest_prob_index]]

            all_predictions.append(predicted_labels[0])  # Take the first label

    return pd.Series(all_predictions, index=test_df.index, name="target")


In [50]:
test_df["target"] = predict_on_dataset_batched(model, tokenizer, test_df)

100%|██████████| 80/80 [00:07<00:00, 10.71it/s]


In [51]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, classification_report

# Zakładamy, że dane są w pandas DataFrame, np. df
y_true = test_df["best_tag"]  # Poprawne etykiety
y_pred = test_df["target"]  # Przewidywania modelu

# Obliczanie metryk
accuracy = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred, average="weighted")  # Dostosuj 'average' do przypadku (binary/multiclass)
f1 = f1_score(y_true, y_pred, average="weighted")
classification_rep = classification_report(y_true, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Classification Report:")
print(classification_rep)

# Obliczanie ROC-AUC (tylko dla klasyfikacji binarnej lub wieloetykietowej z wartościami prawdopodobieństwa)
# Załóżmy, że `classifier` zwraca również prawdopodobieństwa w `probs`.
# probs = classifier(..., return_probas=True)  # Prawdopodobieństwa dla każdej klasy
# y_prob = [probs[i][positive_class_idx] for i in range(len(probs))]  # Wyodrębnienie prawdopodobieństw dla pozytywnej klasy
# roc_auc = roc_auc_score(y_true, y_prob, multi_class="ovr")  # Ustaw multi_class w przypadku wieloklasowego problemu
# print(f"ROC-AUC: {roc_auc}")


Accuracy: 0.6666666666666666
Recall: 0.6666666666666666
F1 Score: 0.6658223995330474
Classification Report:
                 precision    recall  f1-score   support

             AI       0.79      0.79      0.79        19
            art       0.60      0.58      0.59        36
       business       0.72      0.62      0.67        29
 climate change       0.86      0.86      0.86        21
  communication       0.50      0.41      0.45        22
     creativity       0.63      0.48      0.55        25
        culture       0.47      0.50      0.48        40
         design       0.71      0.68      0.70        22
      economics       0.76      0.76      0.76        21
      education       0.72      0.70      0.71        60
  entertainment       0.33      0.47      0.39        15
    environment       0.42      0.67      0.51        15
           food       0.77      0.62      0.69        16
         gender       0.72      0.70      0.71        33
  global issues       0.45      0.51

In [52]:
tedx_df["target"] = predict_on_dataset_batched(model, tokenizer, tedx_df)

  1%|          | 98/14172 [00:14<35:25,  6.62it/s]


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

# FINAL INFERENCE ! (REALLY)

In [11]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('CzarnyBaranie/bart-finetuned-for-tedx-topics')

model = AutoModelForSequenceClassification.from_pretrained("CzarnyBaranie/bart-finetuned-for-tedx-topics")

tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'AI', '1': 'art', '2': 'business', '3': 'climate change', '4': 'communication', '5': 'creativity', '6': 'culture', '7': 'design', '8': 'economics', '9': 'education', '10': 'entertainment', '11': 'environment', '12': 'food', '13': 'gender', '14': 'global issues', '15': 'health', '16': 'history', '17': 'humanity', '18': 'innovation', '19': 'literature', '20': 'mental health', '21': 'music', '22': 'nature', '23': 'personal growth', '24': 'politics', '25': 'psychology', '26': 'science', '27': 'social change', '28': 'storytelling', '29': 'sustainability', '30': 'technology', '31': 'work'}. The number of labels wil be overwritten to 32.


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [12]:
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd  # Ensure pandas is imported

def predict_on_dataset_batched(model, tokenizer, test_df, batch_size=16, device="cuda"):
    """
    Predicts labels for a dataset using the specified model, tokenizer,
    and batch size. Handles KeyError by ensuring predicted indices 
    are within the range of id2class keys and ensures text inputs are valid.

    Args:
        model: The trained model.
        tokenizer: The tokenizer used for the model.
        test_df: The DataFrame containing the text data.
        batch_size: The size of the batch for inference.
        device: The device to run the model on ('cuda' for GPU or 'cpu' for CPU).

    Returns:
        A pandas Series containing the predicted labels or None for invalid rows.
    """

    # Move model to the specified device
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode

    # Create id2class from model.config.id2label
    id2class = {int(k): v for k, v in model.config.id2label.items()}

    num_labels = len(id2class)  # Get the number of labels
    all_predictions = []

    for i in tqdm(range(0, len(test_df), batch_size)):
        # Extract batch titles
        batch_texts = test_df.iloc[i : i + batch_size]["translated_title"].tolist()

        # Replace non-string entries with None
        valid_texts = [text if isinstance(text, str) else None for text in batch_texts]

        # Prepare inputs for valid texts only
        valid_indices = [j for j, text in enumerate(valid_texts) if text is not None]
        if valid_indices:
            inputs = tokenizer([valid_texts[j] for j in valid_indices], return_tensors="pt", padding=True, truncation=True).to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits

            sigmoid = torch.nn.Sigmoid()
            probs = sigmoid(logits).cpu()  # Move probabilities back to CPU for processing
            predicted_ids = (probs >= 0.5).numpy().astype(int)

            # Process predictions
            for k, idx in enumerate(valid_indices):
                predicted_labels = [id2class[idx] for idx, label in enumerate(predicted_ids[k]) if label == 1 and idx in id2class]

                # If no labels are predicted above the threshold, 
                # select the label with the highest probability within valid range
                if not predicted_labels:
                    highest_prob_index = np.argmax(probs[k])
                    # Ensure highest_prob_index is within valid range and convert to int
                    highest_prob_index = int(min(highest_prob_index, num_labels - 1))
                    predicted_labels = [id2class[highest_prob_index]]

                all_predictions.append(predicted_labels[0])  # Take the first label

        # For invalid entries, append None
        for j in range(len(batch_texts)):
            if j not in valid_indices:
                all_predictions.append(None)

    return pd.Series(all_predictions, index=test_df.index, name="tag")


In [13]:
import pandas as pd
tedx_df = pd.read_csv('/kaggle/input/tedx-with-lang-translated/tedx_videos_extended_with_lang_translated.csv')
tedx_df

Unnamed: 0.1,Unnamed: 0,full_title,views,date_str,date,year,title,speaker,event,final_language,translated_title
0,0,What Shakespeare teaches us about modern consp...,15518.0,23 godziny temu 13 minut i 11 sekund,2024-12-24 01:00:00,2024,What Shakespeare teaches us about modern consp...,Dr. Paul Budra,TEDxSurreySalon,en,What Shakespeare teaches us about modern consp...
1,1,How poetry saved me from a cult | Diannely Ant...,14758.0,1 dzień temu 21 minut,2024-12-24 00:00:00,2024,How poetry saved me from a cult,Diannely Antigua,TEDxPortsmouth,en,How poetry saved me from a cult
2,2,Why language shapes identity (more than race) ...,25684.0,2 dni temu 13 minut i 52 sekundy,2024-12-23 00:00:00,2024,Why language shapes identity (more than race),Malaka Grant,TEDxGeorge,en,Why language shapes identity (more than race)
3,3,On designing a presidential library | Craig Dy...,14181.0,3 dni temu 20 minut,2024-12-22 00:00:00,2024,On designing a presidential library,Craig Dykers,TEDxFargo,en,On designing a presidential library
4,4,Why chasing happiness is nuts: What to do inst...,10858.0,4 dni temu 16 minut,2024-12-21 00:00:00,2024,Why chasing happiness is nuts: What to do instead,Lenorë Lambert,TEDxBillings,en,Why chasing happiness is nuts: What to do instead
...,...,...,...,...,...,...,...,...,...,...,...
226747,226747,TEDxWarwick - Professor Vinesh Raja - 2/28/09,2474.0,15 years ago 29 minutes,2009-12-25,2009,TEDxWarwick - Professor Vinesh Raja - 2/28/09,,,en,TEDxWarwick - Professor Vinesh Raja - 2/28/09
226748,226748,TEDxWarwick - Professor Steve Fuller - 2/28/09,8460.0,15 years ago 24 minutes,2009-12-25,2009,TEDxWarwick - Professor Steve Fuller - 2/28/09,,,en,TEDxWarwick - Professor Steve Fuller - 2/28/09
226749,226749,TEDxWarwick - Francois Grey - 2/28/09,3480.0,15 years ago 27 minutes,2009-12-25,2009,TEDxWarwick - Francois Grey - 2/28/09,,,en,TEDxWarwick - Francois Grey - 2/28/09
226750,226750,TEDxWarwick - Professor Andrew Oswald - 2/28/09,6390.0,15 years ago 24 minutes,2009-12-25,2009,TEDxWarwick - Professor Andrew Oswald - 2/28/09,,,en,TEDxWarwick - Professor Andrew Oswald - 2/28/09


In [14]:
tedx_df["tag"] = predict_on_dataset_batched(model, tokenizer, tedx_df)

100%|██████████| 14172/14172 [36:44<00:00,  6.43it/s] 


In [16]:
tedx_df.to_csv("/kaggle/working/tedx_df_tag_output.csv")