## TED Lens BERT V1

In [2]:
!pip install transformers -v

Using pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Importing Libraries

Data Preprocessing

In [3]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,multilabel_confusion_matrix
import numpy as np
import torch
import seaborn as sns

import gensim
import gensim.downloader as api


In [4]:
data = pd.read_csv('/content/ted_talks_en.csv')

xtraining_column = 'description'
useful_data = data[[xtraining_column, 'topics']]
useful_data.head()

Unnamed: 0,description,topics
0,With the same humor and humanity he exuded in ...,"['alternative energy', 'cars', 'climate change..."
1,You've never seen data presented like this. Wi...,"['Africa', 'Asia', 'Google', 'demo', 'economic..."
2,New York Times columnist David Pogue takes aim...,"['computers', 'entertainment', 'interface desi..."
3,"In an emotionally charged talk, MacArthur-winn...","['MacArthur grant', 'activism', 'business', 'c..."
4,Sir Ken Robinson makes an entertaining and pro...,"['children', 'creativity', 'culture', 'dance',..."


Exploring the Topics available in the Dataset

In [5]:
# Load the CSV dataset
data = useful_data

# converting the text to
all_topics = []
for topics,i in zip(data['topics'],range(len(data['topics']))):
    all_topics+=eval(topics)
    data['topics'][i] = eval(topics)

unique_topics = sorted(list(set(all_topics)))
print(unique_topics)

num_labels = len(unique_topics)
print(num_labels,' Topics')

['3D printing', 'AI', 'AIDS', 'Africa', "Alzheimer's", 'Antarctica', 'Anthropocene', 'Asia', 'Audacious Project', 'Autism spectrum disorder', 'Best of the Web', 'Brand', 'Brazil', 'Buddhism', 'CRISPR', 'Christianity', 'DNA', 'Debate', 'Egypt', 'Europe', 'Foreign Policy', 'Gender spectrum', 'God', 'Google', 'HIV', 'Humanities', 'Internet', 'Iran', 'Islam', 'LGBT', 'Latin America', 'MacArthur grant', 'Mars', 'Middle East', 'Moon', 'NASA', 'New York', 'Nobel Prize', 'PTSD', 'Planets', 'Science (hard)', 'Senses', 'Slavery', 'Social Science', 'South America', 'String theory', 'Sun', 'Surgery', 'Syria', 'TED Books', 'TED Connects', 'TED Fellows', 'TED Prize', 'TED Residency', 'TED en Español', 'TED-Ed', 'TEDMED', 'TEDNYC', 'TEDYouth', 'TEDx', 'Transgender', 'United States', 'Vaccines', 'activism', 'addiction', 'adventure', 'advertising', 'aging', 'agriculture', 'aircraft', 'algorithm', 'alternative energy', 'ancient world', 'animals', 'animation', 'anthropology', 'ants', 'apes', 'archaeology

Function to convert a DataFrame column to its BERT Embeddings

In [6]:
data.head()

Unnamed: 0,description,topics
0,With the same humor and humanity he exuded in ...,"[alternative energy, cars, climate change, cul..."
1,You've never seen data presented like this. Wi...,"[Africa, Asia, Google, demo, economics, global..."
2,New York Times columnist David Pogue takes aim...,"[computers, entertainment, interface design, m..."
3,"In an emotionally charged talk, MacArthur-winn...","[MacArthur grant, activism, business, cities, ..."
4,Sir Ken Robinson makes an entertaining and pro...,"[children, creativity, culture, dance, educati..."


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
topics_labels = mlb.fit_transform(data['topics'])

for i in range(len(data['topics'])):
    data['topics'][i] = topics_labels[i]

In [8]:
data.head()

Unnamed: 0,description,topics
0,With the same humor and humanity he exuded in ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,You've never seen data presented like this. Wi...,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,New York Times columnist David Pogue takes aim...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"In an emotionally charged talk, MacArthur-winn...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Sir Ken Robinson makes an entertaining and pro...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
from torch.utils.data import Dataset, DataLoader

class MultiLabelDataset(Dataset):

    def __init__(self, df, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = df
        self.text = df.description
        self.new_data = new_data
        self.max_len = max_len

        if not new_data:
            self.targets = self.data.topics

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])

        inputs = self.tokenizer.encode_plus(text,None,
                                           add_special_tokens=True,
                                           max_length=self.max_len,
                                           pad_to_max_length=True,
                                           return_token_type_ids=True)
        out = {
            "input_ids": torch.tensor(inputs['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(inputs['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs['token_type_ids'], dtype=torch.long)
        }
        if not self.new_data:
            out["targets"] = torch.tensor(self.targets[index], dtype=torch.float)

        return out

## Model Testing and Evaluation

In [10]:
MAX_LEN = 512
EPOCHS = 100
LEARNING_RATE = 1e-05
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda:0'

In [11]:
import os
import random
import torch

def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

## Preparing Data for the Model

In [12]:
train_size = 0.7

train_df, df_temp = train_test_split(data, test_size=0.3, random_state=42)

val_df, test_df = train_test_split(df_temp, test_size=0.1, random_state=42)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [13]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
train_set = MultiLabelDataset(train_df, tokenizer, MAX_LEN)
val_set = MultiLabelDataset(val_df, tokenizer, MAX_LEN)
test_set = MultiLabelDataset(test_df, tokenizer, MAX_LEN)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=8)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False, num_workers=8)
test_loader = DataLoader(test_set, batch_size=32, shuffle=False, num_workers=8)

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



## Model Architecture

In [14]:
from transformers import DistilBertModel

class DistilBertClass(torch.nn.Module):
    def __init__(self):
        super(DistilBertClass, self).__init__()

        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = torch.nn.Sequential(torch.nn.Linear(768, 457))


    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:,0]
        out = self.classifier(out)
        return out

In [15]:
model = DistilBertClass()
model.to(DEVICE)
print(f"Model on {DEVICE}")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model on cuda:0


In [16]:
# Load the pretrained weights from a .pth file
model_path = 'saved_model.pth'
checkpoint = torch.load(model_path)

# Load the state_dict into the model
model.load_state_dict(checkpoint)

<All keys matched successfully>

In [17]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def loss_fn(outputs, targets):
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs,targets)

## Training Function

In [18]:
from tqdm.auto import tqdm

loss_values = []

def train(epoch):
    model.train()

    for _, data in tqdm(enumerate(train_loader)):
        input_ids = data['input_ids'].to(DEVICE, dtype=torch.long)
        attention_mask = data['attention_mask'].to(DEVICE, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
        targets = data['targets'].to(DEVICE, dtype=torch.float)

        outputs = model(input_ids, attention_mask, token_type_ids)

        loss = loss_fn(outputs, targets)


        if _ % 5000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")
            loss_values.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


## Display Confusion Function

In [19]:
import matplotlib.pyplot as plt

def display_confusion(confusion_matrix):
  for matrix,label in zip(confusion_matrix,unique_topics):
    plt.figure(figsize=(4, 4))
    sns.heatmap(matrix, annot=True, cmap='Blues')
    plt.title(label)
    plt.close()


## Validate Function

In [20]:
from sklearn import metrics

def validate(last=False):
  model.eval()

  fin_targets = []
  fin_outputs = []

  with torch.inference_mode():
    for _, data in tqdm(enumerate(val_loader, 0)):
      ids = data['input_ids'].to(DEVICE, dtype=torch.long)
      mask = data['attention_mask'].to(DEVICE, dtype=torch.long)
      token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
      targets = data['targets'].to(DEVICE, dtype=torch.float)

      outputs = model(ids, mask, token_type_ids)
      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

  fin_outputs = np.array(fin_outputs) >= 0.2
  fin_targets = np.array(fin_targets) >=0.5
  accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
  f1_score_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro',zero_division=1.0)
  f1_score_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro',zero_division=1.0)
  precision = metrics.precision_score(fin_targets, fin_outputs, average='macro',zero_division=1.0)
  recall = metrics.recall_score(fin_targets, fin_outputs, average='macro',zero_division=1.0)

  if last:
    confusion_matrix = multilabel_confusion_matrix(fin_targets, fin_outputs)
    display_confusion(confusion_matrix)

  return {
      "Accuracy Score": accuracy,
      "F1 score(micro)": f1_score_micro,
      "F1 score(macro)": f1_score_macro,
      "Precision":precision,
      "Recall":recall
  }

## Training

In [21]:
'''
training_data = {
      "Accuracy Score": [],
      "F1 score(micro)": [],
      "F1 score(macro)": [],
      "Precision":[],
      "Recall":[],
  }

for epoch in range(EPOCHS):

  ## Train Step
  train(epoch)
  if epoch == EPOCHS - 1:
    val_data = validate(True)
    print(val_data)

    ## Saving the metrics
    for metric in val_data:
      training_data[metric].append(val_data[metric])

  ## Validation Step
  else:
    val_data = validate()
    print(val_data)

    ## Saving the metrics
    for metric in val_data:
      training_data[metric].append(val_data[metric])

'''

'\ntraining_data = {\n      "Accuracy Score": [],\n      "F1 score(micro)": [],\n      "F1 score(macro)": [],\n      "Precision":[],\n      "Recall":[],\n  }\n\nfor epoch in range(EPOCHS):\n\n  ## Train Step\n  train(epoch)\n  if epoch == EPOCHS - 1:\n    val_data = validate(True)\n    print(val_data)\n\n    ## Saving the metrics\n    for metric in val_data:\n      training_data[metric].append(val_data[metric])\n\n  ## Validation Step\n  else:\n    val_data = validate()\n    print(val_data)\n\n    ## Saving the metrics\n    for metric in val_data:\n      training_data[metric].append(val_data[metric])\n\n'

## Testing

In [25]:
from sklearn import metrics

def test(last=False):
  model.eval()

  fin_targets = []
  fin_outputs = []

  with torch.inference_mode():
    for _, data in tqdm(enumerate(test_loader, 0)):
      ids = data['input_ids'].to(DEVICE, dtype=torch.long)
      mask = data['attention_mask'].to(DEVICE, dtype=torch.long)
      token_type_ids = data['token_type_ids'].to(DEVICE, dtype=torch.long)
      targets = data['targets'].to(DEVICE, dtype=torch.float)

      outputs = model(ids, mask, token_type_ids)
      fin_targets.extend(targets.cpu().detach().numpy().tolist())
      fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

  fin_outputs = np.array(fin_outputs) >= 0.2
  fin_targets = np.array(fin_targets) >= 0.5
  accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
  f1_score_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro',zero_division=1.0)
  f1_score_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro',zero_division=1.0)
  precision = metrics.precision_score(fin_targets, fin_outputs, average='macro',zero_division=1.0)
  recall = metrics.recall_score(fin_targets, fin_outputs, average='macro',zero_division=1.0)

  if last:
    confusion_matrix = multilabel_confusion_matrix(fin_targets, fin_outputs)
    display_confusion(confusion_matrix)



  print(mlb.inverse_transform(fin_targets)[0])
  print(mlb.inverse_transform(fin_outputs)[0])

  targets_sum = 0
  outputs_sum = 0

  for i,j in zip(mlb.inverse_transform(fin_targets), mlb.inverse_transform(fin_outputs)):
    targets_sum += len(i)
    outputs_sum += len(j)

  print(f"Target Average {targets_sum/len(fin_targets)} Output Average {outputs_sum/len(fin_outputs)}")

  return {
      "Accuracy Score": accuracy,
      "F1 score(micro)": f1_score_micro,
      "F1 score(macro)": f1_score_macro,
      "Precision":precision,
      "Recall":recall
  }

In [26]:
test()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

0it [00:00, ?it/s]

('TED-Ed', 'animation', 'education', 'math')
('TED-Ed', 'animation', 'math', 'science')
Target Average 7.933884297520661 Output Average 6.776859504132231


{'Accuracy Score': 0.0,
 'F1 score(micro)': 0.4258426966292135,
 'F1 score(macro)': 0.5174744171819595,
 'Precision': 0.8063862863445241,
 'Recall': 0.5516238341827221}