In [3]:
import numpy as np
import pandas as pd
import time
import datetime
import random

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split

from transformers import BertForSequenceClassification, AdamW,BertTokenizer,get_linear_schedule_with_warmup

from nltk.stem import PorterStemmer
from sklearn.metrics import classification_report


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [5]:
df = pd.read_csv("../data/preprocessed_data_bert.csv")

def map_popularity(value):
    if value == "super positiv":
        return 0
    elif value == "positiv":
        return 1
    elif value == "negativ":
        return 2
    else:
        return 3

df["popularity"] = df["popularity"].apply(map_popularity)

df

Unnamed: 0.1,Unnamed: 0,video_id,popularity,sentence
0,0,--14w5SOEUs,1,Channel with title : MigosVEVO has posted vide...
1,1,--40TEbZ9Is,1,Channel with title : Television Academy has po...
2,2,-0PZSxZuAXQ,1,Channel with title : Breakfast Club Power 105....
3,3,-0QSEZIqVWc,1,Channel with title : VarietyJay has posted vid...
4,4,-0Yxqcm0K2I,1,Channel with title : TheMacLife has posted vid...
...,...,...,...,...
67244,79956,,0,Channel with title : SpaceX has posted video w...
67245,79957,,3,Channel with title : Inside Edition has posted...
67246,79958,,3,Channel with title : Thomas Bikias has posted ...
67247,79959,,1,Channel with title : Saturday Night Live has p...


In [9]:
text = df.sentence.values
labels = df.popularity.values

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [7]:
def tokenize_funciton(text) :
    max_len = 512
    input_ids = []
    attention_masks = []


    for sentence in text:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        stemmer = PorterStemmer()

        stemmed_input_ids = [stemmer.stem(token) for token in tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0].tolist())]

        stemmed_input_ids = tokenizer.convert_tokens_to_ids(stemmed_input_ids)

        encoded_dict['input_ids'] = torch.tensor(stemmed_input_ids).unsqueeze(0)

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])


    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    return input_ids, attention_masks

labels = np.array(labels, dtype=object)
labels = np.array(labels, dtype=np.int64)
labels = torch.tensor(labels)


NameError: name 'labels' is not defined

In [16]:
input_ids, attention_masks = tokenize_funciton(text)

dataset = TensorDataset(input_ids, attention_masks, labels)


train_size = int(0.8 * len(dataset))
val_size = int(len(dataset) - train_size)

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))


53,799 training samples
13,450 validation samples


In [17]:
batch_size = 8

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )


In [18]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 4,
    output_attentions = False,
    output_hidden_states = False
)

model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )



In [20]:
epochs = 4

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [21]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [23]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = {
    'epoch': [],
    'Training Loss': [],
    'Valid. Loss': [],
    'Valid. Accur.': [],
    'Training Time': [],
    'Validation Time': [],
    'Accuracy' : []
}

total_t0 = time.time()

for epoch_i in range(0, epochs):
    predictions = []
    ground_truth = []
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()
        
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend(list(pred_flat))
        ground_truth.extend(list(label_ids.flatten()))

    print(classification_report(ground_truth,predictions,labels=[0,1,2,3]))

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    print("")
    print("Running Validation...")
    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        predictions = []
        ground_truth = []
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            output= model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()

        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
    
    print(classification_report(ground_truth,predictions,labels=[0,1,2,3]))
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, '../models/bert_model.pth')
        best_eval_accuracy = avg_val_accuracy
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats['epoch'].append(epoch_i+1)
    training_stats['Training Loss'].append(avg_train_loss)
    training_stats['Valid. Loss'].append(avg_val_loss)
    training_stats['Valid. Accur.'].append(avg_val_accuracy)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
              precision    recall  f1-score   support

           0       0.42      0.45      0.44      9665
           1       0.64      0.64      0.64     19214
           2       0.51      0.51      0.51     16116
           3       0.61      0.57      0.59      8804

    accuracy                           0.56     53799
   macro avg       0.55      0.54      0.54     53799
weighted avg       0.56      0.56      0.56     53799


  Average training loss: 1.00
  Training epcoh took: 1:15:27

Running Validation...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
           3       0.00      0.00      0.00       0.0

   micro avg       0.00      0.00      0.00       0.0
   macro avg       0.00      0.00      0.00       0.0
weighted avg       0.00      0.00      0.00       0.0

  Accuracy: 0.62
  Validation Loss: 0.90
  Validation took: 0:06:16

Training...
              precision    recall  f1-score   support

           0       0.57      0.59      0.58      9665
           1       0.68      0.72      0.70     19214
           2       0.66      0.60      0.63     16116
           3       0.73      0.73      0.73      8804

    accuracy                           0.66     53799
   macro avg       0.66      0.66      0.66     53799
weighted avg       0.66      0.66      0.66     53799


  Average training loss: 0.81
  Training epcoh t

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
           3       0.00      0.00      0.00       0.0

   micro avg       0.00      0.00      0.00       0.0
   macro avg       0.00      0.00      0.00       0.0
weighted avg       0.00      0.00      0.00       0.0

  Accuracy: 0.65
  Validation Loss: 0.84
  Validation took: 0:06:16

Training...
              precision    recall  f1-score   support

           0       0.69      0.70      0.70      9665
           1       0.75      0.79      0.77     19214
           2       0.75      0.70      0.72     16116
           3       0.82      0.82      0.82      8804

    accuracy                           0.75     53799
   macro avg       0.75      0.75      0.75     53799
weighted avg       0.75      0.75      0.75     53799


  Average training loss: 0.64
  Training epcoh t

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
           3       0.00      0.00      0.00       0.0

   micro avg       0.00      0.00      0.00       0.0
   macro avg       0.00      0.00      0.00       0.0
weighted avg       0.00      0.00      0.00       0.0

  Accuracy: 0.68
  Validation Loss: 0.88
  Validation took: 0:06:16

Training...
              precision    recall  f1-score   support

           0       0.80      0.81      0.80      9665
           1       0.81      0.85      0.83     19214
           2       0.83      0.78      0.81     16116
           3       0.88      0.87      0.88      8804

    accuracy                           0.83     53799
   macro avg       0.83      0.83      0.83     53799
weighted avg       0.83      0.83      0.83     53799


  Average training loss: 0.48
  Training epcoh t

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
           3       0.00      0.00      0.00       0.0

   micro avg       0.00      0.00      0.00       0.0
   macro avg       0.00      0.00      0.00       0.0
weighted avg       0.00      0.00      0.00       0.0

  Accuracy: 0.68
  Validation Loss: 1.00
  Validation took: 0:06:16

Training complete!
Total training took 5:27:04 (h:mm:ss)


In [15]:

model = torch.load('../models/bert_model.pth',map_location='cpu')

In [16]:
b_input_ids, attention_masks = tokenize_funciton(df.head(5).sentence.values)



In [17]:
def map_result(value):
    if value == 0:
        return "super positiv"
    elif value == 1:
        return "positiv"
    elif value == 3:
        return "negativ"
    else:
        return "super negativ"

In [18]:
output= model(b_input_ids, 
            token_type_ids=None, 
            attention_mask=attention_masks)

logits = output.logits
logits = logits.detach().cpu().numpy()
pred_flat = np.argmax(logits, axis=1).flatten()

for i in range(0,len(pred_flat)):
    print(map_result(pred_flat[i]))

positiv
positiv
positiv
positiv
positiv
