In [6]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

# BERT
from transformers import BertTokenizer, BertModel

warnings.filterwarnings('ignore')

In [7]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed = 42
seed_everything(seed)

In [8]:
TOKENIZER_NAME = "Davlan/afro-xlmr-small"
MODEL_NAME = "Davlan/afro-xlmr-small"
TARGET_CLASSES = 3

In [43]:
class AfriSentiDataset(torch.utils.data.Dataset):
    '''
        Holds the dataset and also does the tokenization part
    '''
    def __init__(self, df, max_len=300):
        self.df = df
        self.max_len = max_len
        self.labeled = 'label' in df
        self.labels = {'positive': 0, 'neutral':1, 'negative':2}
        self.tokenizer =  BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case = True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index: int):
        data_row = self.df.iloc[index]

        text = data_row.text

        #print(f"label: {data_row.label}, int: {self.labels[data_row.label]}")
        label = self.labels[data_row.label]

        encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
        )

        return dict(
            text=text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            label=label
        )
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    train_loader = torch.utils.data.DataLoader(
        AfriSentiDataset(train_df), 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=2,
        drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        AfriSentiDataset(val_df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)

    dataloaders_dict = {"train": train_loader, "val": val_loader}

    return dataloaders_dict

def get_test_loader(df, batch_size=32):
    loader = torch.utils.data.DataLoader(
        AfriSentiDataset(df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)    
    return loader

In [49]:
from transformers import BertForSequenceClassification
class AfriSentiModel(nn.Module):
    def __init__(self, model_name):
        super(AfriSentiModel, self).__init__()
        
        # config = AutoConfig.from_pretrained(
        #     model_name,
        #     #use_auth_token = None
        #     )    
        self.model = BertModel.from_pretrained('bert-base-multilingual-uncased')
        for param in self.model.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(768, 3)
        self.relu = nn.ReLU()
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        _,po = self.model(input_ids, attention_mask, return_dict = False)
        x = self.dropout(po)
        x = self.fc(x)
        x = self.relu(x)
          
        return x

In [50]:
def loss_fn():
    '''
        calculates the loss use CE loss function
    '''
    return nn.CrossEntropyLoss()

In [54]:
import gc
gc.collect()

3284

In [55]:
train_config = {
    "num_epochs": 5,
    "batch_size": 64,
    "learning_rate": 1e-5,
    "optimizer": optim.AdamW,
    'model_name': MODEL_NAME,
    'folds': 5
    
}

In [56]:
from tqdm import tqdm


def train(train_data,config):
    '''
        train the model using config as hyperparameters
    '''

    
    skf = StratifiedKFold(n_splits=config['folds'], shuffle=True, random_state=seed)
    train_data['text'] = train_data['text'].astype(str)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")


    for fold, (train_idx, val_idx) in enumerate(skf.split(train_data, train_data.label), start=1): 
        
        print(f'Fold: {fold}')
        model = AfriSentiModel(config['model_name'])
        optimizer = config['optimizer'](model.parameters(), lr=config['learning_rate'], betas=(0.9, 0.999))
        criterion = loss_fn()   
        dataloaders_dict = get_train_val_loaders(train_data, train_idx, val_idx, config['batch_size'])

        train_dataloader, val_dataloader = dataloaders_dict['train'], dataloaders_dict['val']

        if use_cuda:
                model = model.cuda()
                criterion = criterion.cuda()

        gc.collect()
        torch.cuda.empty_cache()
        for epoch_num in range(config['num_epochs']):

                total_acc_train = 0
                total_loss_train = 0
                accum_iter = 8

                for b_id, td in tqdm(enumerate(train_dataloader)):

                    train_label = td['label'].to(device)
                    mask = td['attention_mask'].to(device)
                    input_id = td['input_ids'].squeeze(1).to(device)

                    with torch.set_grad_enabled(True):
                        output = model(input_id, mask)
                        batch_loss = criterion(output, train_label.long())
                        total_loss_train += batch_loss.item()
                        
                        acc = (output.argmax(dim=1) == train_label).sum().item()
                        total_acc_train += acc

                        model.zero_grad()
                        batch_loss /= accum_iter
                        batch_loss.backward()

                        if ((b_id + 1) % accum_iter == 0) or (b_id + 1 == len(train_dataloader)):
                            optimizer.step()
                            optimizer.zero_grad()
                            
                        gc.collect()
                        torch.cuda.empty_cache()
                
                total_acc_val = 0
                total_loss_val = 0

                with torch.no_grad():

                    for val_input in val_dataloader:

                        val_label = val_input['label'].to(device)
                        mask = val_input['attention_mask'].to(device)
                        input_id = val_input['input_ids'].squeeze(1).to(device)

                        output = model(input_id, mask)

                        batch_loss = criterion(output, val_label.long())
                        total_loss_val += batch_loss.item()
                        
                        acc = (output.argmax(dim=1) == val_label).sum().item()
                        total_acc_val += acc
                
                print(
                    f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader.dataset): .3f} \
                    | Train Accuracy: {total_acc_train / len(train_dataloader.dataset): .3f} \
                    | Val Loss: {total_loss_val / len(val_dataloader.dataset): .3f} \
                    | Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')
                  
        torch.save(model.state_dict(), f"{config['model_name']}_FOLD{fold}.pth")


df = pd.read_csv("multilingual_train.tsv", sep='\t', names=['text', 'label'], header=0)
train(df,train_config)

Fold: 1


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 796/796 [19:12<00:00,  1.45s/it]


TypeError: object of type 'int' has no len()

In [None]:
def infer(test_data):
    ids = test_data.iloc[:,0].astype('str').tolist()
    test_data['text'] = test_data['text'].astype(str)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    test_dataloader = get_test_loader(test_data)
    predictions = []
    models = []

    
    for fold in range(train_config['folds']):
        model = AfriSentiModel(train_config['model_name'])
        model.cuda()
        model.load_state_dict(torch.load( f"{train_config['model_name']}_FOLD{fold}.pth"))
        model.eval()
        models.append(model)

    with torch.no_grad():

        for data in test_dataloader:

            mask = data['attention_mask'].to(device)
            input_id = data['input_ids'].squeeze(1).to(device)

            pred = 0
            for model in models:
                output = model(input_id, mask)
                pred += torch.softmax(output, dim=1).cpu().detach().numpy()

            predictions.append(round(pred/train_config['folds']))

        labels = pd.Series(predictions).map({0:'positive', 1: 'neutral', 3: 'negative'})

    df = pd.DataFrame(list(zip(ids,labels)), columns=['ID', 'label'])
    df.to_csv(os.path.join('.', 'pred'+ '.tsv'), sep='\t', index=False)

df = pd.read_csv("multilingual_dev.tsv", sep='\t', names=['text'], header=0)
infer(df)