**I/O device register**

In [10]:
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: ignored

**Install required libraries**

In [3]:
!pip3 install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |▏                               | 10kB 25.4MB/s eta 0:00:01[K     |▎                               | 20kB 33.3MB/s eta 0:00:01[K     |▌                               | 30kB 22.5MB/s eta 0:00:01[K     |▋                               | 40kB 25.8MB/s eta 0:00:01[K     |▉                               | 51kB 25.4MB/s eta 0:00:01[K     |█                               | 61kB 27.6MB/s eta 0:00:01[K     |█▏                              | 71kB 18.5MB/s eta 0:00:01[K     |█▎                              | 81kB 19.5MB/s eta 0:00:01[K     |█▍                              | 92kB 18.4MB/s eta 0:00:01[K     |█▋                              | 102kB 18.4MB/s eta 0:00:01[K     |█▊                              | 112kB 18.4MB/s eta 0:00:01[K     |██                              | 

In [1]:
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html 


Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
[?25l  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (1982.2MB)
[K     |█████████████▌                  | 834.1MB 1.4MB/s eta 0:13:53tcmalloc: large alloc 1147494400 bytes == 0x55f58359a000 @  0x7f72e9344615 0x55f54962e06c 0x55f54970deba 0x55f549630e8d 0x55f54972299d 0x55f5496a4fe9 0x55f54969fb0e 0x55f54963277a 0x55f5496a4e50 0x55f54969fb0e 0x55f54963277a 0x55f5496a186a 0x55f5497237c6 0x55f5496a0ee2 0x55f5497237c6 0x55f5496a0ee2 0x55f5497237c6 0x55f5496a0ee2 0x55f5497237c6 0x55f5497a5431 0x55f549706049 0x55f549670c84 0x55f5496318e9 0x55f5496a5ade 0x55f54963269a 0x55f5496a0a45 0x55f54969fe0d 0x55f54963277a 0x55f5496a0a45 0x55f54963269a 0x55f5496a0a45
[K     |█████████████████               | 1055.7MB 1.3MB/s eta 0:11:36tcmalloc: large alloc 1434370048 bytes == 0x55f5c7bf0000 @  0x7f72e9344615 0x55f54962e06c 0x55f54970deba 0x55f549630e

## BERT 

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import time
import datetime

In [5]:
def setup_classifier(
    model_name: str,
    num_labels: int) -> BertForSequenceClassification:

    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels = num_labels,
        output_attentions = False,
        output_hidden_states = False,
    )
    
    return model

In [6]:
def setup_data(
    model_name: str,
    x: pd.DataFrame, 
    y: pd.DataFrame,
    do_lower_case: bool,
    max_length: int) -> TensorDataset:

    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case = do_lower_case)

    input_ids = []
    attention_masks = []

    for text in x:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            max_length = max_length,
            padding='max_length',
            return_attention_mask = True,
            return_tensors = 'pt',
            truncation = True
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(y)

    dataset = TensorDataset(input_ids, attention_masks, labels)

    return dataset

In [7]:
def train_classifier(
    model: BertForSequenceClassification, 
    dataset: TensorDataset, 
    validation_ratio: float,
    batch_size: int,
    freeze_embeddings_layer: bool,
    freeze_encoder_layers: int,
    epochs: int) -> (BertForSequenceClassification, list):

    device = select_device()

    train_size = int(validation_ratio * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(
        train_dataset,
        sampler = RandomSampler(train_dataset),
        batch_size = batch_size
    )

    validation_dataloader = DataLoader(
        val_dataset,
        sampler = SequentialSampler(val_dataset),
        batch_size = batch_size
    )

    modules = []

    if freeze_embeddings_layer:
        modules.append(model.bert.embeddings)
    
    for i in range(freeze_encoder_layers):
        modules.append(model.bert.encoder.layer[i])

    for module in modules:
        for param in module.parameters():
            param.requires_grad = False
    
    model.to(device)

    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr = 2e-5,
        eps = 1e-8
    )

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps = 0,
        num_training_steps = total_steps
    )

    training_stats = []

    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):

            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()        

            outputs = model(
                b_input_ids, 
                token_type_ids = None, 
                attention_mask = b_input_mask, 
                labels = b_labels
            )

            loss = outputs.loss
            logits = outputs.logits

            total_train_loss += loss.item()

            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)            
        
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))
        

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            with torch.no_grad():

                outputs = model(
                    b_input_ids, 
                    token_type_ids = None, 
                    attention_mask = b_input_mask,
                    labels = b_labels
                )
                
                loss = outputs.loss
                logits = outputs.logits
                
            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.cpu().numpy()

            total_eval_accuracy += flat_accuracy(logits, label_ids)
            

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        
        validation_time = format_time(time.time() - t0)
        
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return model, training_stats

In [8]:
def test_classifier(
    model: BertForSequenceClassification, 
    dataset: TensorDataset,
    batch_size: int):

    device = select_device()

    prediction_dataloader = DataLoader(
        dataset, 
        sampler = SequentialSampler(dataset), 
        batch_size = batch_size
    )

    print("")
    print("Running Prediction...")

    model.eval()

    predictions , true_labels = [], []

    for batch in prediction_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2]
        
        with torch.no_grad():

            outputs = model(
                b_input_ids, 
                token_type_ids = None, 
                attention_mask = b_input_mask
            )

        logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.numpy()
        
        predictions.append(logits)
        true_labels.append(label_ids)
    
    print('DONE.')

    return predictions, true_labels


In [2]:
def save_checkpoint(path, model, optimizer, epoch, loss):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
        }, path)


def save_model(path, model):
    torch.save(model.state_dict(), path)


def load_checkpoint(path):
    checkpoint = torch.load(path)
    return checkpoint['model_state_dict'], checkpoint['optimizer_state_dict'], checkpoint['epoch'], checkpoint['loss']


def load_model(path):
    return torch.load(path)


def select_device():

    if torch.cuda.is_available():
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
        device = torch.device("cuda")
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    
    return device


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


**Accuracy score**



In [10]:
def avg_accuracy(predictions,labels):
  return sum([flat_accuracy(predictions[ind],labels[ind]) for ind in range(len(predictions)) ]) / len(predictions) 

## Training and evaluation

**Load binary dataset**

In [11]:
binary_data = pd.read_csv('drive/MyDrive/Data/Binary/data.csv')
binary_data = binary_data.dropna()
binary_data.reset_index(drop=True, inplace=True)
x = binary_data['preprocessed']
y = binary_data['Label']



In [12]:
    model = setup_classifier(
        model_name = "drive/MyDrive/classifiers/bert/CroSloEngual",
        num_labels = 2
    )
    
    # model.load_state_dict(bert.load_model("models/m1.pt"))
    
    dataset = setup_data(
        model_name = "drive/MyDrive/classifiers/bert/CroSloEngual",
        x = x,
        y = y,
        do_lower_case = False,
        max_length = 180
    )

    test_ratio = 0.8
    train_size = int(test_ratio * len(dataset))
    test_size = len(dataset) - train_size

    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    
    model, stats = train_classifier(
        model = model,
        dataset = train_dataset,
        validation_ratio = 0.9,
        batch_size = 32,
        freeze_embeddings_layer = False,
        freeze_encoder_layers = 0,
        epochs = 3
    )
    
    predictions, true_labels = test_classifier(
        model = model,
        dataset = test_dataset,
        batch_size = 32
    )

Some weights of the model checkpoint at drive/MyDrive/classifiers/bert/CroSloEngual were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized fr

There are 1 GPU(s) available.
We will use the GPU: Tesla T4

Training...
  Batch    40  of  1,514.    Elapsed: 0:00:35.
  Batch    80  of  1,514.    Elapsed: 0:01:13.
  Batch   120  of  1,514.    Elapsed: 0:01:53.
  Batch   160  of  1,514.    Elapsed: 0:02:33.
  Batch   200  of  1,514.    Elapsed: 0:03:14.
  Batch   240  of  1,514.    Elapsed: 0:03:54.
  Batch   280  of  1,514.    Elapsed: 0:04:34.
  Batch   320  of  1,514.    Elapsed: 0:05:14.
  Batch   360  of  1,514.    Elapsed: 0:05:54.
  Batch   400  of  1,514.    Elapsed: 0:06:34.
  Batch   440  of  1,514.    Elapsed: 0:07:14.
  Batch   480  of  1,514.    Elapsed: 0:07:54.
  Batch   520  of  1,514.    Elapsed: 0:08:35.
  Batch   560  of  1,514.    Elapsed: 0:09:15.
  Batch   600  of  1,514.    Elapsed: 0:09:55.
  Batch   640  of  1,514.    Elapsed: 0:10:35.
  Batch   680  of  1,514.    Elapsed: 0:11:15.
  Batch   720  of  1,514.    Elapsed: 0:11:55.
  Batch   760  of  1,514.    Elapsed: 0:12:35.
  Batch   800  of  1,514.    Elaps

**Average accuracy**

In [14]:
avg_accuracy(predictions,true_labels)

0.9095902612826603

**Saving model**

In [15]:
save_model("drive/MyDrive/models/binary.pt", model)

**Multi-class dataset**

In [16]:
multiclass_data = pd.read_csv('drive/MyDrive/Data/Multiclass/data.csv')
multiclass_data = multiclass_data.dropna()
multiclass_data.reset_index(drop=True, inplace=True)
x = multiclass_data['preprocessed']
y = multiclass_data['Label']

In [18]:
    model = setup_classifier(
        model_name = "drive/MyDrive/classifiers/bert/CroSloEngual",
        num_labels = 6
    )
    
    # model.load_state_dict(bert.load_model("models/m1.pt"))
    
    dataset = setup_data(
        model_name = "drive/MyDrive/classifiers/bert/CroSloEngual",
        x = x,
        y = y,
        do_lower_case = False,
        max_length = 180
    )
    
    test_ratio = 0.8
    train_size = int(test_ratio * len(dataset))
    test_size = len(dataset) - train_size

    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])




Some weights of the model checkpoint at drive/MyDrive/classifiers/bert/CroSloEngual were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized fr

In [19]:
    
    model, stats = train_classifier(
        model = model,
        dataset = train_dataset,
        validation_ratio = 0.9,
        batch_size = 32,
        freeze_embeddings_layer = False,
        freeze_encoder_layers = 0,
        epochs = 3
    )

There are 1 GPU(s) available.
We will use the GPU: Tesla T4

Training...
  Batch    40  of  1,327.    Elapsed: 0:00:37.
  Batch    80  of  1,327.    Elapsed: 0:01:16.
  Batch   120  of  1,327.    Elapsed: 0:01:56.
  Batch   160  of  1,327.    Elapsed: 0:02:36.
  Batch   200  of  1,327.    Elapsed: 0:03:16.
  Batch   240  of  1,327.    Elapsed: 0:03:56.
  Batch   280  of  1,327.    Elapsed: 0:04:36.
  Batch   320  of  1,327.    Elapsed: 0:05:16.
  Batch   360  of  1,327.    Elapsed: 0:05:55.
  Batch   400  of  1,327.    Elapsed: 0:06:35.
  Batch   440  of  1,327.    Elapsed: 0:07:15.
  Batch   480  of  1,327.    Elapsed: 0:07:55.
  Batch   520  of  1,327.    Elapsed: 0:08:35.
  Batch   560  of  1,327.    Elapsed: 0:09:15.
  Batch   600  of  1,327.    Elapsed: 0:09:55.
  Batch   640  of  1,327.    Elapsed: 0:10:35.
  Batch   680  of  1,327.    Elapsed: 0:11:15.
  Batch   720  of  1,327.    Elapsed: 0:11:55.
  Batch   760  of  1,327.    Elapsed: 0:12:35.
  Batch   800  of  1,327.    Elaps

In [20]:
    
    predictions, true_labels = test_classifier(
        model = model,
        dataset = test_dataset,
        batch_size = 32
    )

There are 1 GPU(s) available.
We will use the GPU: Tesla T4

Running Prediction...
DONE.


**Average accuracy**

In [24]:
avg_accuracy(predictions,true_labels)

0.9778963414634146

**Saving model**

In [3]:
save_model("drive/MyDrive/models/multiclass.pt", model)

NameError: ignored