## TED Lens BERT V1

Importing Libraries

In [1]:
import ast

import numpy as np
import pandas as pd
import spacy
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer

import gensim
import gensim.downloader as api


  from .autonotebook import tqdm as notebook_tqdm


Data Preprocessing

In [2]:
data = pd.read_csv('./TED_Talks_Dataset/2020-05-01/ted_talks_en.csv')

# Column that supplies the model's input text.
xtraining_column = 'description'

# Keep only the input text and the labels. The explicit .copy() makes this an
# independent frame: later cells overwrite the 'topics' column, and doing that
# on a column-subset of `data` is what triggers pandas' SettingWithCopyWarning.
useful_data = data[[xtraining_column, 'topics']].copy()
useful_data.head()

Unnamed: 0,description,topics
0,With the same humor and humanity he exuded in ...,"['alternative energy', 'cars', 'climate change..."
1,You've never seen data presented like this. Wi...,"['Africa', 'Asia', 'Google', 'demo', 'economic..."
2,New York Times columnist David Pogue takes aim...,"['computers', 'entertainment', 'interface desi..."
3,"In an emotionally charged talk, MacArthur-winn...","['MacArthur grant', 'activism', 'business', 'c..."
4,Sir Ken Robinson makes an entertaining and pro...,"['children', 'creativity', 'culture', 'dance',..."


Exploring the Topics available in the Dataset

In [3]:
# Work on an independent copy so this cell always starts from the raw string
# column in `useful_data`; re-running the cell therefore gives the same result.
data = useful_data.copy()

# Each 'topics' value is the string form of a Python list, e.g. "['AI', 'cars']".
# ast.literal_eval only parses literals, so a malformed or hostile CSV field
# cannot execute arbitrary code the way eval() would.
data['topics'] = data['topics'].apply(ast.literal_eval)

# Flatten every talk's topic list into one long list of topic names.
all_topics = [topic for talk_topics in data['topics'] for topic in talk_topics]

# The sorted set of distinct names is the label vocabulary.
unique_topics = sorted(set(all_topics))
print(unique_topics)

num_labels = len(unique_topics)
print(num_labels,' Topics')

['3D printing', 'AI', 'AIDS', 'Africa', "Alzheimer's", 'Antarctica', 'Anthropocene', 'Asia', 'Audacious Project', 'Autism spectrum disorder', 'Best of the Web', 'Brand', 'Brazil', 'Buddhism', 'CRISPR', 'Christianity', 'DNA', 'Debate', 'Egypt', 'Europe', 'Foreign Policy', 'Gender spectrum', 'God', 'Google', 'HIV', 'Humanities', 'Internet', 'Iran', 'Islam', 'LGBT', 'Latin America', 'MacArthur grant', 'Mars', 'Middle East', 'Moon', 'NASA', 'New York', 'Nobel Prize', 'PTSD', 'Planets', 'Science (hard)', 'Senses', 'Slavery', 'Social Science', 'South America', 'String theory', 'Sun', 'Surgery', 'Syria', 'TED Books', 'TED Connects', 'TED Fellows', 'TED Prize', 'TED Residency', 'TED en Español', 'TED-Ed', 'TEDMED', 'TEDNYC', 'TEDYouth', 'TEDx', 'Transgender', 'United States', 'Vaccines', 'activism', 'addiction', 'adventure', 'advertising', 'aging', 'agriculture', 'aircraft', 'algorithm', 'alternative energy', 'ancient world', 'animals', 'animation', 'anthropology', 'ants', 'apes', 'archaeology

Encoding the topics as multi-hot label vectors and wrapping the data in a Dataset that tokenizes descriptions for BERT

In [4]:
# Preview the first rows: 'topics' should now hold Python lists rather than strings.
data.head()

Unnamed: 0,description,topics
0,With the same humor and humanity he exuded in ...,"[alternative energy, cars, climate change, cul..."
1,You've never seen data presented like this. Wi...,"[Africa, Asia, Google, demo, economics, global..."
2,New York Times columnist David Pogue takes aim...,"[computers, entertainment, interface design, m..."
3,"In an emotionally charged talk, MacArthur-winn...","[MacArthur grant, activism, business, cities, ..."
4,Sir Ken Robinson makes an entertaining and pro...,"[children, creativity, culture, dance, educati..."


In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the topic lists: one column per distinct topic, with a 1
# wherever the talk carries that topic.
mlb = MultiLabelBinarizer()
topics_labels = mlb.fit_transform(data['topics'])

# One row per talk and one column per topic counted earlier; fail loudly here
# rather than later inside the model if the two ever disagree.
assert topics_labels.shape == (len(data), num_labels)

# Replace the whole column in a single assignment (one label vector per row)
# instead of writing cell-by-cell through chained indexing, which pandas may
# apply to a temporary copy.
data['topics'] = list(topics_labels)

In [6]:
# Preview the first rows: 'topics' should now hold the 0/1 label vectors.
data.head()

Unnamed: 0,description,topics
0,With the same humor and humanity he exuded in ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,You've never seen data presented like this. Wi...,"[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,New York Times columnist David Pogue takes aim...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"In an emotionally charged talk, MacArthur-winn...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Sir Ken Robinson makes an entertaining and pro...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
from torch.utils.data import Dataset, DataLoader

class MultiLabelDataset(Dataset):
    """Map-style dataset that tokenizes one description per item.

    Args:
        df: DataFrame with a ``description`` column and, unless ``new_data``
            is true, a ``topics`` column holding one label vector per row.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: Every item is padded or truncated to exactly this many tokens.
        new_data: Set to True for unlabeled data; items then carry no
            ``"targets"`` entry.

    Each item is a dict of 1-D tensors: ``input_ids``, ``attention_mask``,
    ``token_type_ids`` and, for labeled data, ``targets`` (float).
    """

    def __init__(self, df, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = df
        self.text = df.description
        self.new_data = new_data
        self.max_len = max_len

        if not new_data:
            self.targets = self.data.topics

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        # Positional lookup: samplers hand out 0..len-1, which only matches the
        # frame's labels when its index was reset. .iloc works either way.
        text = str(self.text.iloc[index])

        # padding='max_length' replaces the deprecated pad_to_max_length=True,
        # and truncation is requested explicitly so every item has exactly
        # max_len tokens and batches can be stacked.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        out = {
            "input_ids": torch.tensor(inputs['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(inputs['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }

        if not self.new_data:
            # Float targets, as required by the BCE-with-logits loss.
            out["targets"] = torch.tensor(self.targets.iloc[index], dtype=torch.float)

        return out

Model Training and Evaluation

In [8]:
# Training configuration.
MAX_LEN = 320          # token length passed to the dataset for each description
EPOCHS = 5             # number of train + validate passes over the data
LEARNING_RATE = 1e-5   # step size handed to the optimizer

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
DEVICE

'cpu'

In [9]:
import os
import random
import torch

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), exports ``PYTHONHASHSEED``, and switches cuDNN to
    deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    # NOTE(review): exporting PYTHONHASHSEED from a running interpreter
    # presumably only affects processes started afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


seed_everything(42)

In [10]:
# Fraction of rows used for training; the remainder becomes the validation set.
train_size = 0.8

# Draw the training rows with a fixed seed, then treat every row that was not
# drawn as validation. Both frames get a fresh 0..n-1 index.
train_df = data.sample(frac=train_size, random_state=42)
val_df = data.drop(index=train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

In [11]:
from transformers import DistilBertTokenizer

# Pretrained uncased DistilBERT vocabulary, shared by both datasets.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    truncation=True,
    do_lower_case=True,
)

# Wrap each split so that items are tokenized to MAX_LEN tokens on access.
train_set, val_set = (
    MultiLabelDataset(frame, tokenizer, MAX_LEN) for frame in (train_df, val_df)
)

# Only the training batches are shuffled; validation keeps the frame's row order.
train_loader = DataLoader(dataset=train_set, shuffle=True, batch_size=32, num_workers=8)
val_loader = DataLoader(dataset=val_set, shuffle=False, batch_size=32, num_workers=8)

In [12]:
from transformers import DistilBertModel

class DistilBertClass(torch.nn.Module):
    """DistilBERT encoder followed by a two-layer multi-label classifier head.

    Args:
        num_labels: Width of the output layer (one logit per topic). Defaults
            to 457, the value previously hard-coded; pass
            ``num_labels=num_labels`` to follow the dataset's topic count.
        dropout: Dropout probability inside the classifier head.

    ``forward`` returns raw logits of shape ``(batch, num_labels)``; apply a
    sigmoid to obtain per-topic probabilities.
    """

    def __init__(self, num_labels=457, dropout=0.1):
        super().__init__()

        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Read the encoder width from its config instead of hard-coding 768,
        # so the head always matches the checkpoint that was loaded.
        hidden_size = self.bert.config.dim

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_size, num_labels),
        )

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        # token_type_ids is accepted for call-site compatibility but is not
        # forwarded to the encoder, so it is optional.
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = encoder_output[0]

        # Keep only the first token's vector for each sequence (the [CLS]
        # position when the tokenizer adds special tokens).
        first_token_state = hidden_state[:, 0]
        return self.classifier(first_token_state)

In [13]:
# Build the network and move its parameters to the selected device in one step
# (Module.to returns the module itself).
model = DistilBertClass().to(DEVICE)
print(f"Model on {DEVICE}")

Model on cpu


In [14]:
# Adam over every parameter of the model (encoder and classifier head alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 float targets.

    Args:
        outputs: Logits (no sigmoid applied).
        targets: Float tensor of the same shape holding 0.0 / 1.0 labels.

    Returns:
        Scalar tensor averaged over all elements.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [15]:
from tqdm.auto import tqdm

def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``DEVICE``.

    Args:
        epoch: Epoch number, used only for the progress bar and log line.

    Returns:
        Mean training loss over the batches of this epoch (``nan`` when the
        loader yields no batches).
    """
    model.train()

    running_loss = 0.0
    num_batches = 0

    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show a total / ETA.
    for step, batch in enumerate(tqdm(train_loader, desc=f"train epoch {epoch}")):
        input_ids = batch['input_ids'].to(DEVICE, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(DEVICE, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(DEVICE, dtype=torch.long)
        targets = batch['targets'].to(DEVICE, dtype=torch.float)

        # Clear gradients before the forward pass so nothing stale (e.g. from
        # an interrupted earlier run) leaks into this step.
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        if step % 5000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    return running_loss / num_batches if num_batches else float('nan')

In [16]:
from sklearn import metrics

def validate():
    """Score the model on ``val_loader``.

    Uses the notebook-level ``model`` and ``DEVICE``. Logits are passed through
    a sigmoid and thresholded at 0.5 to obtain 0/1 predictions per topic.

    Returns:
        dict with keys ``"Accuracy Score"``, ``"F1 score(micro)"`` and
        ``"F1 score(macro)"``.
    """
    model.eval()

    fin_targets = []
    fin_outputs = []

    with torch.inference_mode():
        # Wrap the loader itself (not enumerate(...)) so tqdm can read its
        # length and show a total / ETA.
        for batch in tqdm(val_loader, desc="validate"):
            ids = batch['input_ids'].to(DEVICE, dtype=torch.long)
            mask = batch['attention_mask'].to(DEVICE, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(DEVICE, dtype=torch.long)
            targets = batch['targets'].to(DEVICE, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)

            # Tensors created under inference_mode carry no autograd graph, so
            # they can be converted to Python lists directly.
            fin_targets.extend(targets.cpu().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())

    # Probability >= 0.5 counts as "topic present".
    fin_outputs = np.array(fin_outputs) >= 0.5

    # NOTE(review): for multilabel indicator input, scikit-learn's
    # accuracy_score is exact-match (subset) accuracy - confirm this is the
    # intended metric.
    accuracy = metrics.accuracy_score(fin_targets, fin_outputs)
    f1_score_micro = metrics.f1_score(fin_targets, fin_outputs, average='micro')
    f1_score_macro = metrics.f1_score(fin_targets, fin_outputs, average='macro')

    return {
        "Accuracy Score": accuracy,
        "F1 score(micro)": f1_score_micro,
        "F1 score(macro)": f1_score_macro
    }

In [None]:
# Alternate one training pass with one validation pass, printing the validation
# metrics after every epoch.
for epoch in range(EPOCHS):
    train(epoch)
    epoch_metrics = validate()
    print(epoch_metrics)