In [None]:
%pip install -q transformers datasets scikit-learn pandas tqdm

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from transformers import (
    BertForSequenceClassification,
    BertTokenizerFast,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Read data/youtube_transcripts.csv into a DataFrame and report its row count.
print("Loading dataset...")
df = pd.read_csv("data/youtube_transcripts.csv")
print(f"Loaded {df.shape[0]} rows.")

# Keep the preview as the cell's last expression so the notebook renders it.
df.head()

In [None]:
print("Preprocessing text and encoding labels...")

# Combine title and transcript into a single model input.
# Missing values are replaced with "" first: with plain `+`, a NaN in either
# column would turn the whole `text` value into NaN instead of a string.
df["text"] = (
    df["title"].fillna("").astype(str)
    + " "
    + df["transcript"].fillna("").astype(str)
)

# Encode topic names as integer class ids (the position of each topic in
# `le.classes_`).
le = LabelEncoder()
df["label"] = le.fit_transform(df["topic"])

# Keep both directions of the mapping so the model config and the prediction
# helper can translate between topic names and class ids.
label2id = {label: i for i, label in enumerate(le.classes_)}
id2label = {i: label for label, i in label2id.items()}

print("Preprocessing complete.")
df[["text", "topic", "label"]].head()

In [None]:
print("Splitting dataset...")

# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded label and uses a fixed random_state so it is repeatable.
train_df, test_df = train_test_split(
    df[["text", "label"]],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Convert each split to a `datasets.Dataset`, resetting the pandas index first.
train_ds, test_ds = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, test_df)
)

print(f"Training samples: {len(train_ds)}, Testing samples: {len(test_ds)}")

In [None]:
print("Tokenizing dataset...")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize the "text" column of a batch, truncating to at most 512 tokens.

    No padding is applied here. The Trainer is configured with
    ``DataCollatorWithPadding``, which pads each batch to the length of its
    longest sequence; padding every example to 512 tokens up front would make
    that collator a no-op and spend compute on padding tokens.

    Args:
        batch: Mapping with a "text" entry holding a list of strings
            (``Dataset.map`` is called with ``batched=True``).

    Returns:
        The tokenizer output for the batch (unpadded, variable length).
    """
    return tokenizer(batch["text"], truncation=True, max_length=512)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# Expose only the model inputs and the label, as torch tensors.
train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

print("Tokenization complete.")

In [None]:
print("Loading model...")

# Start from the pretrained checkpoint, size the classification head to the
# number of topics, and pass both label mappings along with it.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

print("Model loaded.")

In [None]:
print("Starting training...")

# Evaluation, logging and checkpointing all run on the same per-epoch
# schedule; the best checkpoint is reloaded when training finishes and only
# one checkpoint is kept on disk.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none",  # no external experiment tracker
)

# NOTE: the held-out split (`test_ds`) is used as the evaluation set during
# training, so it also drives best-checkpoint selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    # Pads each batch with the tokenizer at collation time.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train()
print("Training complete.")

In [None]:
print("Saving model and tokenizer...")

# Write the model first, then the tokenizer, into the same directory so the
# pair can be reloaded together from `model_path`.
model_path = "./topic_classifier_bert"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(model_path)

print(f"Model and tokenizer saved to {model_path}")

In [None]:
from transformers import BertForSequenceClassification, BertTokenizerFast

print("Loading saved model and tokenizer...")

# Rebuild both objects from the directory written by the save step, replacing
# the in-memory `tokenizer` and `model` with the versions read from disk.
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

print("Model reloaded and ready.")

In [None]:
def predict_topic(title, transcript):
    """Predict the topic of one video from its title and transcript.

    Uses the notebook-level ``model`` and ``tokenizer``. The label name is
    read from ``model.config.id2label`` so the function also works when only
    the saved model has been reloaded.

    Args:
        title: Video title; ``None`` is treated as an empty string.
        transcript: Video transcript; ``None`` is treated as an empty string.

    Returns:
        The topic name mapped to the highest-scoring class id.
    """
    # Same "title + space + transcript" layout as the training text.
    text = (title or "") + " " + (transcript or "")

    # A single sequence needs no padding; only truncate to 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Run on whatever device the model currently lives on.
    inputs = inputs.to(model.device)

    # Make sure dropout is disabled even if the model comes straight from training.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = torch.argmax(logits, dim=-1).item()
    return model.config.id2label[predicted_class_id]

# Smoke-test the prediction helper on a hand-written title/transcript pair.
example_title = "The Rise of Renewable Energy in 2024"
example_transcript = "In recent years, the shift toward wind and solar energy..."

predicted_topic = predict_topic(
    title=example_title,
    transcript=example_transcript,
)
print(f"Predicted topic: {predicted_topic}")