In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [23]:
!pip install transformers==2.8.0
!pip install torchtext==0.8.0

Collecting transformers==2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 25.2MB/s eta 0:00:01[K     |█▏                              | 20kB 16.8MB/s eta 0:00:01[K     |█▊                              | 30kB 14.9MB/s eta 0:00:01[K     |██▎                             | 40kB 14.2MB/s eta 0:00:01[K     |███                             | 51kB 11.2MB/s eta 0:00:01[K     |███▌                            | 61kB 11.5MB/s eta 0:00:01[K     |████                            | 71kB 11.6MB/s eta 0:00:01[K     |████▋                           | 81kB 11.8MB/s eta 0:00:01[K     |█████▎                          | 92kB 11.2MB/s eta 0:00:01[K     |█████▉                          | 102kB 12.2MB/s eta 0:00:01[K     |██████▍                         | 112kB 12.2MB/s eta 0:00:01[K     |███████                    



In [1]:
import transformers
transformers.__version__

'2.8.0'

In [2]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
!nvidia-smi

Tue Dec 29 16:31:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, target_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.target_len = target_len
        self.English = self.data.English
        self.Yoruba = self.data.Yoruba

    def __len__(self):
        return len(self.English)

    def __getitem__(self, index):
        Yoruba = str(self.Yoruba[index])
        Yoruba = ' '.join(Yoruba.split())

        English = str(self.English[index])
        English = ' '.join(English.split())

        source = self.tokenizer.batch_encode_plus([Yoruba], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([English], max_length= self.target_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_mask': target_mask.to(dtype=torch.long)
        }

In [6]:
class Inference(Dataset):

    def __init__(self, dataframe, tokenizer, source_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.Yoruba = self.data.Yoruba

    def __len__(self):
        return len(self.Yoruba)

    def __getitem__(self, index):
        Yoruba = str(self.Yoruba[index])
        Yoruba = ' '.join(Yoruba.split())

        source = self.tokenizer.batch_encode_plus([Yoruba], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt',truncation=True)

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long),
        }

In [7]:
def infer(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            if _%100==0:
                print(f'Completed {_}')
    return preds

In [8]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    epoch_loss = 0
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
        loss = outputs[0]
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

In [26]:
def evaluate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    epoch_loss = 0
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            yt_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            preds = model(input_ids = ids,attention_mask = mask,decoder_input_ids=yt_ids, lm_labels=lm_labels)
            # generated_ids = model.generate(
            #     input_ids = ids,
            #     attention_mask = mask)
            val_loss = preds[0]
            epoch_loss += val_loss.item()
            # preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            # target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            # if _%100==0:
            #     print(f'Completed {_}')

            # predictions.extend(preds)
            # actuals.extend(target)
    return epoch_loss / len(loader)#,predictions, actuals

In [27]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [28]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
from torchtext.data.metrics import bleu_score

In [30]:
import time

In [31]:
pd.read_csv('drive/MyDrive/yorubatoEN/Train.csv')['Yoruba'].apply(len).describe()

count    10054.000000
mean        99.887607
std         77.817411
min          4.000000
25%         41.000000
50%         78.000000
75%        141.000000
max       1183.000000
Name: Yoruba, dtype: float64

In [32]:
pd.read_csv('drive/MyDrive/yorubatoEN/Test.csv')['Yoruba'].apply(len).describe()

count    6816.00000
mean      100.87632
std        67.69594
min         7.00000
25%        49.00000
50%        85.00000
75%       135.00000
max       605.00000
Name: Yoruba, dtype: float64

In [36]:
import math
# from transformers import MT5ForConditionalGeneration,MT5Tokenizer
TRAIN_BATCH_SIZE = 4    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 2020               # random seed (default: 2020)
MAX_LEN = 150 
import random
# Set random seeds and deterministic pytorch for reproducibility
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.cuda.manual_seed_all(SEED)

# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Importing and Pre-Processing the domain data
# Selecting the needed columns only. 
# Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
df = pd.read_csv('drive/MyDrive/yorubatoEN/Train.csv')
inf_data = pd.read_csv('drive/MyDrive/yorubatoEN/Test.csv')
df = df[['Yoruba','English']]
df.Yoruba = 'translate: ' + df.Yoruba
print(df.head())


# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
train_size = 0.95
train_dataset=df.sample(frac=train_size, random_state = SEED).reset_index(drop=True)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))


# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, MAX_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, MAX_LEN)
inf_set = Inference(inf_data,tokenizer,MAX_LEN)

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
inf_loader = DataLoader(inf_set)


# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Training loop
print('Initiating Fine-Tuning for the model on our dataset')
best_valid_score = float('-inf')
start_time = time.time()
for epoch in range(TRAIN_EPOCHS):
    train_loss= train(epoch, tokenizer, model, device, training_loader, optimizer)
    valid_loss = evaluate(epoch, tokenizer, model, device, val_loader)
    # valid_score = bleu_score(predictions, actuals)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    torch.save(model.state_dict(), 't5.pt_{}'.format(epoch))
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\tVal Loss: {valid_loss:.3f}' )#| Val Bleu: {Bleu_score:7.3f}')
    # print(f'Valid Bleu: {valid_scorre}')

                                              Yoruba                                            English
0  translate: A ṣètò Ìgbìmọ̀ Tó Ń Ṣètò Ìrànwọ́ Ní...  A Disaster Relief Committee was formed to orga...
1  translate: Ìrọ̀lẹ́ May 22, 2018 ni wọ́n fàṣẹ ọ...  Brother Solovyev was arrested on the evening o...
2            translate: Iléeṣẹ́ Creative Commons náà                  Creative Commons the Organization
3  translate: Pè̩lú Egypt, Morocco àti Tunisia tí...  With Egypt, Morocco and Tunisia out of the Wor...
4  translate: Adájọ́ àgbà lórílẹ̀ èdè Náíjíríà (A...  The Attorney General of the Federation, Justic...
FULL Dataset: (10054, 2)
TRAIN Dataset: (9551, 2)
TEST Dataset: (503, 2)
Initiating Fine-Tuning for the model on our dataset
Epoch: 01 | Time: 14m 55s
	Train Loss: 3.437 | Train PPL:  31.091
	Val Loss: 2.765
Epoch: 02 | Time: 29m 53s
	Train Loss: 2.796 | Train PPL:  16.381
	Val Loss: 2.300


In [17]:
train_loss= train(epoch, tokenizer, model, device, training_loader, optimizer)
        valid_loss,predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        Bleu_score = bleu_score(predictions, actuals)
        end_time = time.time()
    
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        if valid_scorre > best_valid_score:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'tut6-model.pt')
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\tVal Loss: {train_loss:.3f} | Val Bleu: {Bleu_score:7.3f}')

IndentationError: ignored

In [None]:
predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
valid_scorre = bleu_score(predictions, actuals)

In [None]:
from transformers import MT5Tokenizer

In [None]:
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-xxl")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4309802.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=376.0, style=ProgressStyle(description_…




In [None]:
predictions,actuals

In [48]:
inf_data = pd.read_csv('drive/MyDrive/yorubatoEN/Test.csv')
inf_data.Yoruba = 'translate: ' + inf_data.Yoruba
print(inf_data.head())

            ID                                             Yoruba
0  ID_AAAitMaH  translate: Nínú ìpè kan lẹ́yìn ìgbà náà, wọ́n ...
1  ID_AAKKdQwr  translate: Nítorí kò sí nǹkan tí ọkùnrin ò lè ...
2  ID_ABgAyEOp       translate: Bí i kó pariwo. Kí ó kígbe mọ́ ẹ?
3  ID_ACFgfKQs  translate: Tí ó ń lé e lọ sọ́nà etí odò Akókur...
4  ID_ACNPmlhf  translate: Èṣúńiyì mọ̀ iṣẹ́ rẹ̀ dunjú. Màmá ti...


In [49]:
inf_set = Inference(inf_data,tokenizer,MAX_LEN)
inf_loader = DataLoader(inf_set,shuffle=False,batch_size=32)

In [50]:
model.eval()
predictions = []
with torch.no_grad():
  for _, data in enumerate(inf_loader, 0):
    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = ids,
        do_sample=True, 
        
        top_k=50,
        early_stopping=True
        )
    preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
    if _%100==0:
        print(f'Completed {_}')

    predictions.extend(preds)

Completed 0
Completed 100
Completed 200


In [52]:
inf_data['English'] = predictions
inf_data[['ID','English']].to_csv('t5yor.csv',index=False)

In [None]:
for epoch in range(VAL_EPOCHS):
    predict = infer(epoch, tokenizer, model, device, inf_loader)
    ab_df = pd.DataFrame({'Generated Text':predict})
    ab_df.to_csv('abpredictions.csv')

In [None]:
model.eval()
predictions = []
actuals = []
with torch.no_grad():
  for _, data in enumerate(val_loader, 0):
    y = data['target_ids'].to(device, dtype = torch.long)
    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = ids,
        do_sample=True, 
        
        top_k=50,
        early_stopping=True
        )
    preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
    target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
    if _%100==0:
        print(f'Completed {_}')

    predictions.extend(preds)
    actuals.extend(target)
    # print(bleu_score(actuals,predictions))