In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv("./dataset/sem_eval_2018_task_1_train.csv")

In [None]:
df.head(5)

In [None]:
df.shape[0]

In [None]:
data_en = df.loc[:,["Tweet"]]
data_en.dtypes

In [None]:
data_en.values.tolist()[0]

In [None]:
df.iloc[:,2:]

In [None]:
label_index = df.iloc[:,2:].astype(int)
label_index.head(5)

In [None]:
label_one_hot = label_index.to_numpy()
label_one_hot

In [None]:
# https://huggingface.co/docs/transformers/training
!pip install transformers

In [None]:
import pandas as pd 
import numpy as np

In [None]:
import torch 
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel, BertConfig

import torch.nn as nn
import torch.nn.functional as F

In [None]:
class MyNLPDataset(Dataset):
    """Tweet dataset that tokenizes each sample lazily and caches the result.

    The CSV must contain the text in a ``Tweet`` column; every column from the
    third one onward is cast to ``int`` and used as the label vector.

    Each item is a ``(tokenized_text, labels)`` pair: ``tokenized_text`` is the
    tokenizer output with the leading batch dimension removed, ``labels`` is
    the matching row of the integer label tensor.
    """

    def __init__(self, file_name, model_name, max_length=128):
        """Read the CSV and prepare an empty per-sample token cache.

        Args:
            file_name: Path of the CSV file to read.
            model_name: Name or path handed to ``BertTokenizer.from_pretrained``.
            max_length: Number of tokens every sample is padded/truncated to.
        """
        # data loading
        df = pd.read_csv(file_name)
        # Select the column directly: DataFrame.squeeze() collapses a one-row
        # file to a bare scalar, which has no ``.values`` to turn into a list.
        self.data_en = df["Tweet"].tolist()
        self.label_index = torch.from_numpy(df.iloc[:, 2:].astype(int).to_numpy())
        self.n_samples = df.shape[0]
        self.embedding_model = None
        self.tokenizer = None  # created on first use
        self.model_name = model_name
        self.max_length = max_length
        # One slot per sample; an empty dict marks "not tokenized yet".
        self.tokenized_text = [{} for _ in range(self.n_samples)]

    def __getitem__(self, index):
        """Return ``(tokenized_text, labels)`` for ``index``, tokenizing on first access."""
        if not self.tokenized_text[index]:
            text = self.data_en[index]
            assert isinstance(text, str), "sample {} is not a string: {!r}".format(index, text)
            self.tokenized_text[index] = self.tokenize_function(text)
        return self.tokenized_text[index], self.label_index[index]

    def __len__(self):
        """Number of rows read from the CSV."""
        return self.n_samples

    def load_embedding_model(self):
        """Instantiate the BERT tokenizer for ``self.model_name``."""
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)

    def tokenize_function(self, text):
        """Tokenize one string into fixed-length tensors without a batch axis.

        Args:
            text: The raw string to tokenize.

        Returns:
            The tokenizer output whose ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` entries have had their leading axis removed.
        """
        if self.tokenizer is None:
            self.load_embedding_model()

        # truncation=True is required: with padding="max_length" alone, inputs
        # longer than max_length are NOT cut, so samples would differ in length
        # and could not be stacked into a batch.
        tokenized_text = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the leading batch axis that return_tensors="pt" adds.
        for key in ("input_ids", "token_type_ids", "attention_mask"):
            tokenized_text[key] = tokenized_text[key].squeeze(0)

        return tokenized_text

In [None]:
train_dataset = MyNLPDataset("./dataset/sem_eval_2018_task_1_train.csv", "bert-base-multilingual-uncased")

In [None]:
#text = "Hello Serena"
#tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
#tokenized_text = tokenizer(text, max_length = 128, padding="max_length", return_tensors="pt")

In [None]:
train_dataset[0]

In [None]:
validation_dataset = MyNLPDataset("./dataset/sem_eval_2018_task_1_validation.csv", "bert-base-multilingual-uncased")

In [None]:
training_count = len(train_dataset)
validation_count = len(validation_dataset)
print(training_count, validation_count)

In [None]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=32, shuffle=False)

In [None]:
data_iter = iter(train_dataloader)
inputs, labels = next(data_iter)
labels = labels.type(torch.FloatTensor)

In [None]:
inputs

In [None]:
labels

In [None]:
# PyTorch models inherit from torch.nn.Module
class SentenceMultiClassClassifier(nn.Module):
    """BERT encoder followed by a linear layer and a sigmoid.

    The forward pass returns one value in (0, 1) per class for every input
    sequence; each class is scored independently of the others.
    """

    def __init__(self, number_class, pretrained_model):
        """Build the encoder and the classification head.

        Args:
            number_class: Size of the output vector (number of classes).
            pretrained_model: Name or path handed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        self.number_class = number_class
        self.pretrained = BertModel.from_pretrained(pretrained_model)

        # Size the head from the encoder's own config instead of assuming 768,
        # so checkpoints with a different hidden width also work.
        self.linear = nn.Linear(self.pretrained.config.hidden_size, number_class)
        self.layeroutput = nn.Sigmoid()

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Score a batch of tokenized sequences.

        Args:
            input_ids: Token id tensor for the batch.
            token_type_ids: Segment id tensor for the batch.
            attention_mask: Padding mask tensor for the batch.

        Returns:
            Tensor with ``number_class`` sigmoid outputs per input sequence.
        """
        # Pass by keyword: BertModel.forward takes attention_mask as its second
        # positional argument and token_type_ids as its third, so forwarding
        # these positionally in this method's order would swap the two.
        output_pretrained = self.pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        x = output_pretrained.pooler_output
        x = self.linear(x)
        x = self.layeroutput(x)
        return x

In [None]:
# from transformers import BertTokenizer, BertModel
# import torch

# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
# model = BertModel.from_pretrained("bert-base-multilingual-uncased")

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# print(inputs)
# labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
# pooler_output = model(**inputs).pooler_output
# output_hidden = model(**inputs).last_hidden_state[:,0,:].view(-1,768)

In [None]:
NUM_CLASS = 11
PRETRAINED_MODEL = "bert-base-multilingual-uncased"

model = SentenceMultiClassClassifier(NUM_CLASS, PRETRAINED_MODEL)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
labels = labels.type(torch.FloatTensor)
labels = labels.to(device)
for key, value in inputs.items():
    inputs[key] = inputs[key].to(device)

In [None]:
model.to(device)    

In [None]:
outputs = model(**inputs)
outputs

In [None]:
loss_fn = torch.nn.BCELoss()
# Optimizers specified in the torch.optim package
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

In [None]:
# Compute the loss and its gradients
loss = loss_fn(outputs, labels)
loss.backward()

In [None]:
def train_one_epoch(epoch_index,training_loader, optimizer, model, loss_fn, device):
    """Run one optimization pass over ``training_loader``.

    Args:
        epoch_index: Index of the current epoch. Not used by the computation;
            kept so existing callers keep working.
        training_loader: Iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of keyword name to tensor for ``model``.
        optimizer: Optimizer stepped once per batch.
        model: Module called as ``model(**inputs)``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss over all batches as a float, or NaN when the loader yields
        no batches.
    """
    # Make the function safe to call on its own: models loaded through
    # Hugging Face ``from_pretrained`` start in eval mode (dropout disabled).
    model.train()

    running_loss = 0.0  # loss accumulated since the last progress line
    epoch_loss = 0.0    # loss accumulated over the whole epoch
    num_batches = 0     # counted here so loaders without len() also work

    for i, (inputs, labels) in enumerate(training_loader):
        # The loss expects float targets; build a new dict rather than
        # overwriting the entries of the batch the loader handed out.
        labels = labels.float().to(device)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, weight update.
        outputs = model(**inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # Gather data and report every 10 batches.
        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss
        num_batches += 1
        if i % 10 == 9:
            print('  batch {} loss: {}'.format(i + 1, running_loss / 10))
            running_loss = 0.0

    if num_batches == 0:
        return float("nan")
    return epoch_loss / num_batches

In [None]:
train_one_epoch(0,train_dataloader, optimizer, model, loss_fn, device)

In [None]:
train_dataloader

In [None]:
from datetime import datetime
EPOCHS = 2
epoch_number = 0
best_vloss = 1_000_000.
MODEL_SAVE_LOCATION = "./model"

In [None]:
# The counters (epoch_number, best_vloss) are initialized in a separate cell,
# so re-running this cell adds more epochs to the same run.
for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, train_dataloader, optimizer, model, loss_fn, device)

    # Validation: evaluation mode and no gradient bookkeeping.
    model.eval()
    running_vloss = 0.0
    num_vbatches = 0  # counted explicitly so an empty loader cannot leave it undefined

    with torch.no_grad():
        for vinputs, vlabels in validation_dataloader:
            vlabels = vlabels.float().to(device)
            vinputs = {key: value.to(device) for key, value in vinputs.items()}

            voutputs = model(**vinputs)
            # .item() keeps the running total a plain Python float; summing
            # the loss tensors would leave avg_vloss and best_vloss as device
            # tensors and print them as tensor(...).
            running_vloss += loss_fn(voutputs, vlabels).item()
            num_vbatches += 1

    avg_vloss = running_vloss / num_vbatches if num_vbatches else float("nan")
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        # torch.save does not create missing parent directories.
        os.makedirs(MODEL_SAVE_LOCATION, exist_ok=True)
        model_path = '{}/model.pth'.format(MODEL_SAVE_LOCATION)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

In [None]:
# Launches the standalone training script for 100 epochs on ./dataset, writing to ./model.
# NOTE(review): the learning rate passed here (0.00001) differs from the 0.0001 given to
# AdamW in the notebook cells; the script itself is not visible here - confirm this is intended.
!python ./scripts/train_nlp_bert_sm_compatible.py --epochs 100 --model_id "bert-base-multilingual-uncased" --training_dir "./dataset" --output_dir "./model" --learning_rate 0.00001

Start training ...
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
EPOCH 1:
  batch 10 loss: 0.5953069925308228
  batch 20 loss: 0.5219672858715058
  batch 30 loss: 0.4847619473934174
  batc