In [2]:
import numpy as np
import pandas as pd
import random
import torch
from nltk.stem import PorterStemmer
from torch.utils.data import TensorDataset, DataLoader,random_split
from transformers import BertForSequenceClassification, AdamW,BertTokenizerFast,get_linear_schedule_with_warmup
from sklearn.metrics import classification_report,accuracy_score

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
df = pd.read_csv("../data/preprocessed_data_bert.csv")

def map_popularity(value):
    if value == "super positiv":
        return 0
    elif value == "positiv":
        return 1
    elif value == "negativ":
        return 2
    else:
        return 3

df["popularity"] = df["popularity"].apply(map_popularity)



In [3]:
df

Unnamed: 0.1,Unnamed: 0,video_id,popularity,sentence
0,0,--14w5SOEUs,1,Channel with title : MigosVEVO has posted vide...
1,1,--40TEbZ9Is,1,Channel with title : Television Academy has po...
2,2,-0PZSxZuAXQ,1,Channel with title : Breakfast Club Power 105....
3,3,-0QSEZIqVWc,1,Channel with title : VarietyJay has posted vid...
4,4,-0Yxqcm0K2I,1,Channel with title : TheMacLife has posted vid...
...,...,...,...,...
67244,79956,,0,Channel with title : SpaceX has posted video w...
67245,79957,,3,Channel with title : Inside Edition has posted...
67246,79958,,3,Channel with title : Thomas Bikias has posted ...
67247,79959,,1,Channel with title : Saturday Night Live has p...


In [5]:
sentences = df.sentence.values
popularities = df.popularity.values

In [4]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [6]:
def tokenize_funciton(text) :
    input_ids = []
    attention_masks = []

    for sentence in text:
        encoded_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        stemmer = PorterStemmer()
        stemmed_input_ids = [stemmer.stem(token) for token in tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0].tolist())]
        stemmed_input_ids = tokenizer.convert_tokens_to_ids(stemmed_input_ids)

        encoded_dict['input_ids'] = torch.tensor(stemmed_input_ids).unsqueeze(0)

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks




In [8]:
labels = np.array(popularities, dtype=object)
labels = np.array(labels, dtype=np.int64)
labels = torch.tensor(labels)

In [9]:
input_ids, attention_masks = tokenize_funciton(sentences)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
dataset = TensorDataset(input_ids, attention_masks, labels)

train_data_size = int(0.8 * len(dataset))
validation_data_size = int(len(dataset) - train_data_size)

train_dataset, validating_dataset = random_split(dataset, [train_data_size, validation_data_size])

print('Length of training data : ' + str(train_data_size))
print('Length of validating data : ' + str(validation_data_size))


Length of training data : 53799
Length of validating data : 13450


In [16]:
train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': 8,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(train_dataset, **train_params)
validating_loader = DataLoader(validating_dataset, **val_params)

In [17]:
class BERTModel(torch.nn.Module):
    def __init__(self):
        super(BERTModel, self).__init__()
        self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 4)

    def forward(self, ids, mask, b_labels):
        output = self.model(ids,token_type_ids=None,attention_mask=mask,labels=b_labels)
        return output



In [18]:
model = BERTModel()
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTModel(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias

In [19]:
optimizer = AdamW(model.parameters(),lr = 2e-5,eps = 1e-8)



In [20]:
epochs = 4

num_steps_for_training = len(training_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps = 0,num_training_steps = num_steps_for_training)

In [21]:
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

In [22]:
best_accuracy=0
for i in range(0, epochs):
    predictions = []
    ground_truth = []
    print('\nEpoch ' + str(i+1))
    print('\nTraining : ')

    loss = 0
    model.train()
    for step,data in enumerate(training_loader,0):
        ids = data[0].to(device)
        mask = data[1].to(device)
        targets = data[2].to(device)

        optimizer.zero_grad()

        output = model.forward(ids, mask, targets)
        current_loss = output.loss
        loss += current_loss.item()
        current_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        logits = output.logits
        logits = logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        target_ids = targets.to('cpu').numpy()

        predictions.extend(list(pred_flat))
        ground_truth.extend(list(target_ids.flatten()))

    print(classification_report(ground_truth,predictions,labels=[0,1,2,3]))

    print("\nTraining loss: " + str(loss/len(training_loader)) + "\n")

    print("\nValidation :")

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        predictions = []
        ground_truth = []
        for _,data in enumerate(validating_loader,0):

            ids = data[0].to(device)
            mask = data[1].to(device)
            targets = data[2].to(device)

            output = model.forward(ids, mask, targets)

            eval_loss += output.loss.item()

            logits = output.logits
            logits = logits.detach().cpu().numpy()
            target_ids = targets.to('cpu').numpy()

            predictions.extend(list(np.argmax(logits, axis=1).flatten()))
            ground_truth.extend(list(target_ids.flatten()))

    print(classification_report(ground_truth, predictions, labels=[0, 1, 2, 3]))
    print("\nEvaluating loss: " + str(eval_loss/len(validating_loader)) + "\n")

    accuracy = accuracy_score(ground_truth, predictions)
    print("\nAccuracy: " + str(accuracy))

    if accuracy > best_accuracy:
        torch.save(model, '../models/bert_model.pth')
        best_accuracy = accuracy



Epoch 1

Training : 
              precision    recall  f1-score   support

           0       0.40      0.45      0.42      9666
           1       0.63      0.62      0.63     19180
           2       0.50      0.49      0.49     16097
           3       0.59      0.56      0.57      8856

    accuracy                           0.54     53799
   macro avg       0.53      0.53      0.53     53799
weighted avg       0.54      0.54      0.54     53799


Training loss: 1.0271483793001637


Validation :
              precision    recall  f1-score   support

           0       0.51      0.50      0.51      2440
           1       0.69      0.65      0.67      4824
           2       0.60      0.54      0.57      4067
           3       0.56      0.75      0.64      2119

    accuracy                           0.61     13450
   macro avg       0.59      0.61      0.60     13450
weighted avg       0.61      0.61      0.61     13450


Evaluating loss: 0.9272671154479847


Accuracy: 0.6067657

In [7]:
model = torch.load('../models/bert_model.pth',map_location='cpu')

In [17]:
example = df.sample(1)
ids, masks = tokenize_funciton(example.sentence.values)



In [18]:
def map_result(value):
    if value == 0:
        return "super positiv"
    elif value == 1:
        return "positiv"
    elif value == 3:
        return "negativ"
    else:
        return "super negativ"

In [19]:
output= model(ids,
            token_type_ids=None,
            attention_mask=masks)

logits = output.logits
logits = logits.detach().cpu().numpy()
pred_flat = np.argmax(logits, axis=1).flatten()

for i in range(0,len(pred_flat)):
    print("prediction : " + str(map_result(pred_flat[i])) + " | truth : " + str(map_result(example.popularity.values)))

prediction : positiv | truth : positiv


Bert model korišćen je za predikciju popularnosti youtube snimka na osnovu naziva videa, naziva kanala, opisa i tagova. Zaključeno je da su sve 4 kolone bitne i utiču na tačnost modela.

Vrednosti ove četri kolone spoje se u jednu rečenecu koja je ulaz u bert model a izlaz predstavlja popularnost određenu na osnovu procenta broja dislajkove u odnosu na lajkove.

Nad ulaznim podacima radi se tokenziacija, stemovanje i padding kako bi model što više naučio o sličnosti između reči.

Što se tiče parametara, learning rate kada je 1e-5 i 2e-5 daje isti rezultat a sve veće dovodi do loše tačnosti. Zbog velike količine podataka korišćen je AdamW optimizator ali je testirano i sa SGD koji je dao duplo gori rezultat. Batch size je uticao sasvim minimalno na model, nije davao veće razlike u tačnosti, razlika je u 1% gore ako se koristi veći batch_size.

Trening i test podaci podeljeni su u odnosu 80:20.

U početku je model davao slabu tačnost od svega 58% i na osnovu recall metrike zaključeno je da postoji nedostatak podataka u određenim klasama i onda su dodatno ubačeni podaci o video snimcima pre 2017 godine za klase koje su davale nizak recall. Model je posle toga dao bolju tačnost od 67%.
