In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, which
# provides the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install transformers module

!pip install transformers



!pip install sentencepiece
!pip install plotly

In [None]:
# NOTE(review): `my_library`, `my_data` and `my_url` look like unfilled template
# placeholders -- nothing else visible in this notebook refers to them.
# Confirm, then either replace them with real values or delete this cell.
!pip install my_library
!mkdir my_data
!wget my_url

In [None]:
# Libraries
import math
import torch
import pandas as pd
import time
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# Torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch import optim
from functools import partial

# BERT
from transformers import BertModel, BertTokenizer

%matplotlib notebook

In [None]:
# Get Data from github 
# capture supresses output
%%capture

!mkdir data

!wget -O en-de.tar.gz https://github.com/facebookresearch/mlqe/blob/main/data/en-de.tar.gz?raw=true
!tar -xzvf en-de.tar.gz -C data

!wget -O en-zh.tar.gz https://github.com/facebookresearch/mlqe/blob/main/data/en-zh.tar.gz?raw=true
!tar -xzvf en-zh.tar.gz -C data

!wget -O en-de.tar.gz https://github.com/facebookresearch/mlqe/blob/main/data/en-de_test.tar.gz?raw=true
!tar -xzvf en-de.tar.gz -C data

!wget -O en-zh.tar.gz https://github.com/facebookresearch/mlqe/blob/main/data/en-zh_test.tar.gz?raw=true
!tar -xzvf en-zh.tar.gz -C data

!wget -O ro-en.tar.gz https://github.com/facebookresearch/mlqe/blob/main/data/ro-en.tar.gz?raw=true
!tar -xzvf ro-en.tar.gz -C data

!wget -O et-en.tar.gz https://github.com/facebookresearch/mlqe/blob/main/data/et-en.tar.gz?raw=true
!tar -xzvf et-en.tar.gz -C data

!wget -O ne-en.tar.gz https://github.com/facebookresearch/mlqe/blob/main/data/ne-en.tar.gz?raw=true
!tar -xzvf ne-en.tar.gz -C data

In [None]:
# Create dataframes with data, preprocess data

# Pairs in which either sentence has this many whitespace-separated words (or
# more) are dropped. This is a crude way of preventing overly long inputs.
MAX_WORDS = 128


def _read_tsv(path):
    """Load one tab-separated data file into a DataFrame, skipping malformed rows."""
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines='skip'` is its replacement.
    # NOTE(review): default CSV quoting is still active here. If the sentences
    # contain unbalanced double quotes, rows may get merged; consider
    # `quoting=csv.QUOTE_NONE` -- TODO confirm against the data.
    return pd.read_csv(path, sep='\t', on_bad_lines='skip')


def _drop_long_pairs(df, max_words=MAX_WORDS):
    """Return `df` without the rows whose source or translation has >= `max_words` words."""
    too_long = (
        df['original'].apply(lambda text: len(text.split()) >= max_words) |
        df['translation'].apply(lambda text: len(text.split()) >= max_words)
    )
    # Filter with the boolean mask instead of `drop(<index labels>)`: after
    # `pd.concat` several rows can share one index label, and dropping by label
    # would also remove unrelated rows that happen to carry the same label.
    return df[~too_long].reset_index(drop=True)


# Main sets (rows with missing values removed):
df_en_de = _read_tsv('data/en-de/train.ende.df.short.tsv').dropna()
df_en_zh = _read_tsv('data/en-zh/train.enzh.df.short.tsv').dropna()
df_ro_en = _read_tsv('data/ro-en/train.roen.df.short.tsv').dropna()
df_et_en = _read_tsv('data/et-en/train.eten.df.short.tsv').dropna()
df_ne_en = _read_tsv('data/ne-en/train.neen.df.short.tsv').dropna()

# Dev sets (rows with missing values removed):
df_en_de_dev = _read_tsv('data/en-de/dev.ende.df.short.tsv').dropna()
df_en_zh_dev = _read_tsv('data/en-zh/dev.enzh.df.short.tsv').dropna()

# Test sets (left as loaded; `dropna` is intentionally not applied here):
df_en_de_test = _read_tsv('data/en-de/test20.ende.df.short.tsv')
df_en_zh_test = _read_tsv('data/en-zh/test20.enzh.df.short.tsv')

# Training pool. Alternative selections that were tried:
#   [df_ro_en, df_et_en, df_ne_en, df_en_de, df_en_zh]
#   [df_ro_en, df_et_en, df_ne_en, df_en_de_dev, df_en_zh_dev]
#   [df_en_de] / [df_en_zh]
#   [df_ro_en, df_et_en, df_ne_en, df_en_zh_dev]
train_df = pd.concat([df_ro_en, df_et_en, df_ne_en, df_en_de_dev], ignore_index=True)

# Test pool. Alternative selections that were tried:
#   [df_en_de_test, df_en_zh_test] / [df_en_zh_test] / [df_en_de, df_en_zh]
test_df = pd.concat([df_en_de_test], ignore_index=True)

train_df = _drop_long_pairs(train_df)
test_df = _drop_long_pairs(test_df)

# Consumed by `split`: two frames -> first is train/validation, second is test.
data = [train_df, test_df]


In [None]:
def split(dataframes, train_test_split = .8, train_val_split = .75):
    """Turn one, two or three dataframes into train / validation / test frames.

    No shuffling is performed; rows are split by position.

    Args:
        dataframes: sequence of 1, 2 or 3 DataFrames.
            * 1 frame: the first ``train_test_split`` fraction is train+val
              (of which ``train_val_split`` is train), the rest is test.
            * 2 frames: the first is split into train/val by
              ``train_val_split``; the second is the test set.
            * 3 frames: used as train, validation and test, in that order.
        train_test_split: fraction of rows kept for train+val (1-frame case).
        train_val_split: fraction of train+val rows used for training.

    Returns:
        ``(train_data, val_data, test_data)``, each restricted to the columns
        ``index``, ``original``, ``translation`` and ``mean`` and carrying a
        fresh 0..n-1 row index.

    Raises:
        ValueError: if ``dataframes`` is empty or holds more than three frames.
    """
    n_frames = len(dataframes)

    if n_frames == 0:
        # Previously reported as "Too many different dataframes".
        raise ValueError("Expected 1, 2 or 3 dataframes, got none")

    # if one dataframe, split into train, val, and test.
    if n_frames == 1:
        full_data = dataframes[0]
        num_rows = len(full_data)
        print(f'Total rows in dataset: {num_rows}')

        val_start = int(train_test_split * train_val_split * num_rows)
        test_start = int(train_test_split * num_rows)

        train_df = full_data.iloc[:val_start]
        val_df = full_data.iloc[val_start:test_start]
        test_df = full_data.iloc[test_start:]

    # if two dataframes, split the first into train and val, use second as test.
    elif n_frames == 2:
        train_val_df, test_df = dataframes

        num_rows = len(train_val_df) + len(test_df)
        print(f'Total rows in dataset: {num_rows}')

        val_start = int(train_val_split * len(train_val_df))

        train_df = train_val_df.iloc[:val_start]
        val_df = train_val_df.iloc[val_start:]

    # if three dataframes, use as train, validation, and test (in order)
    elif n_frames == 3:
        train_df, val_df, test_df = dataframes

        num_rows = len(train_df) + len(val_df) + len(test_df)
        print(f'Total rows in dataset: {num_rows}')

    else:
        raise ValueError("Too many different dataframes")

    print(f'Train Split: {len(train_df)}')
    print(f'Validation Split: {len(val_df)}')
    print(f'Test Split: {len(test_df)}')

    columns = ['index', 'original', 'translation', 'mean']

    # Re-index every output the same way, whichever branch produced it.
    train_data = train_df.reset_index(drop=True)[columns]
    val_data = val_df.reset_index(drop=True)[columns]
    test_data = test_df.reset_index(drop=True)[columns]

    return train_data, val_data, test_data

# `data` holds [train_df, test_df]; split it into train / validation / test frames.
train_data, val_data, test_data = split(dataframes=data)

In [None]:
class LanguageDataset(Dataset):
    """Map-style dataset that serves the rows of a DataFrame by position."""

    def __init__(self, df):
        # Only a reference is stored; the frame is not copied.
        self.df = df

    def __getitem__(self, idx):
        """Return the row at integer position ``idx`` (an ``iloc`` lookup)."""
        row = self.df.iloc[idx]
        return row

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

In [None]:
# Wrap each split in a LanguageDataset so it can be fed to a DataLoader.
train_dataset, val_dataset, test_dataset = (
    LanguageDataset(part) for part in (train_data, val_data, test_data)
)

In [None]:
# Hyper-parameters
BATCH_SIZE = 128
EPOCHS = 6
RNN_HIDDEN_SIZE = 256

# Use the GPU when one is available, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")

print("Using device:", device)

In [None]:
class QualityEstimator(nn.Module):
    """Frozen multilingual BERT -> bidirectional LSTM -> MLP regressor.

    ``forward`` returns one score per input pair in the range (0, 100).

    Args:
        rnn_hidden_size: hidden size of each LSTM direction.
    """

    def __init__(self, rnn_hidden_size):
        super(QualityEstimator, self).__init__()
        self.rnn_hidden_size = rnn_hidden_size
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        # BERT emits (batch, seq, hidden). Without `batch_first=True` the LSTM
        # would treat the *batch* dimension as time and mix information
        # between the samples of a batch.
        self.rnn = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=rnn_hidden_size,
            bidirectional=True,
            batch_first=True,
        )
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(2*rnn_hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        )
        self.sigmoid = nn.Sigmoid()

        # freeze the BERT model:
        for param in self.bert.parameters():
            param.requires_grad = False

    def forward(self, **x):
        """Score a batch of tokenized sentence pairs.

        Args:
            **x: must contain ``input_ids``, ``attention_mask`` and
                ``token_type_ids``, each of shape (batch, seq). The attention
                mask is assumed to be right-padded (ones first, then zeros),
                which is what ``pad_sequence`` / tokenizer padding produce.

        Returns:
            1-D tensor with one score per batch element.
        """
        embeddings = self.bert(
            input_ids = x["input_ids"],
            attention_mask= x["attention_mask"],
            token_type_ids = x["token_type_ids"]
            ).last_hidden_state

        # Pack the sequences so the LSTM stops at each sample's last real token
        # instead of running over padding. Lengths must live on the CPU.
        lengths = x["attention_mask"].sum(dim=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.rnn(packed)

        # h_n is (2, batch, hidden): index 0 is the forward direction's state
        # after the last real token, index 1 the backward direction's state
        # after reaching the first token.
        output = torch.cat((h_n[0], h_n[1]), dim=-1)
        logits = self.linear_relu_stack(output)
        score = 100.0 * self.sigmoid(logits)
        return score.flatten()

In [None]:
#train a given model, using a pytorch dataloader, optimizer
def train(model, dataloader, criterion, optimizer):
    """Run one training epoch and return the average loss per sample.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            token_type_ids=...)``.
        dataloader: iterable of ``(input_ids, attention_mask, token_types,
            labels)`` batches.
        criterion: loss function applied to ``(output, labels)``.
        optimizer: optimizer stepped once per batch.

    Returns:
        Sum of the per-batch loss values divided by the number of labels seen
        (a per-sample average when ``criterion`` uses ``reduction='sum'``), or
        ``nan`` when no sample was processed.
    """
    model.train()

    running_loss = 0.0
    seen = 0
    for input_ids, attention_mask, token_types, labels in tqdm(dataloader):
        if input_ids is None:
            # The original guard was a no-op (`pass`); actually skip the batch.
            continue
        seen += labels.shape[0]
        output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_types)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    if seen == 0:
        # Nothing to average; avoid a ZeroDivisionError on an empty loader.
        return float("nan")
    return running_loss / seen

In [None]:
def test(model, dataloader, criterion):
    model.eval()
    
    test_loss = 0.0
    size = len(dataloader.dataset)
    matrix = torch.zeros(2, size)

    with torch.no_grad():
      with tqdm(dataloader, desc="Eval", unit="batch", total=len(dataloader)) as batch_iterator:
        for i, (input_ids, attention_mask, token_types, labels) in enumerate(batch_iterator):
            output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_types)
            test_loss += criterion(output, labels).item()

            matrix[0,i*BATCH_SIZE:i*BATCH_SIZE+output.numel()] = output.flatten()
            matrix[1,i*BATCH_SIZE:i*BATCH_SIZE+labels.numel()] = labels.flatten()

    plt.scatter(matrix[0,:], matrix[1,:])
    plt.show()

    test_loss /= size
    pearson = torch.corrcoef(matrix)[0,1]
    return test_loss, pearson

In [None]:
#count the number of trainable parameters in the model
def count_parameters(model: nn.Module):
    """Return the total element count of all parameters with ``requires_grad`` set."""
    total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            total += tensor.numel()
    return total

#computes the amount of time that a training epoch took and displays it in human readable form
def epoch_time(start_time: int, end_time: int):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
# SET UP MODEL
# Build the estimator, switch it to training mode and move it to `device`.
# (`Module.train()` and `Module.to()` both return the module itself.)
model = QualityEstimator(RNN_HIDDEN_SIZE).train().to(device)

In [None]:
# Mean and standard deviation of the training labels (the 'mean' column).
train_scores = train_data['mean'].astype(float)
mean, std = np.mean(train_scores), np.std(train_scores)
print(mean)
print(std)

In [None]:
def collate_fn(batch, tokenizer, max_length=512):
    """Tokenize a list of dataset rows into padded model inputs.

    Each row is encoded as the pair (translation, original). The whole batch is
    tokenized in one call, padded to the longest pair in the batch and
    truncated to ``max_length`` tokens so that over-long pairs cannot exceed
    the model's maximum input length.

    Args:
        batch: sequence of rows supporting ``row['translation']``,
            ``row['original']`` and ``row['mean']``.
        tokenizer: callable tokenizer (used with ``BertTokenizer`` in this
            notebook).
        max_length: maximum number of tokens per encoded pair.

    Returns:
        ``(input_ids, attention_mask, token_type_ids, labels)``, all moved to
        the notebook-level ``device``.
    """
    translations = [row['translation'] for row in batch]
    originals = [row['original'] for row in batch]
    labels = [float(row['mean']) for row in batch]

    encoded = tokenizer(
        translations,
        originals,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)
    token_types = encoded['token_type_ids'].to(device)

    return input_ids, attention_mask, token_types, torch.tensor(labels).to(device)

In [None]:
#help(BertTokenizer)

In [None]:
#def collate_fn(batch, tokenizer):
#    src_input, tar_input, src_masks, tar_masks, labels = [], [], [], [], []
#    for data in batch:
#        src_tokens = tokenizer(data['original'], return_tensors='pt')
#        tar_tokens = tokenizer(data['translation'], return_tensors='pt')
#
#        src_input.append(src_tokens['input_ids'].flatten())
#        src_masks.append(src_tokens['attention_mask'].flatten())
#
#        tar_input.append(tar_tokens['input_ids'].flatten())
#        tar_masks.append(tar_tokens['attention_mask'].flatten())
#        labels.append(float(data['mean']))
#    
#    input = src_input + tar_input
#    input = pad_sequence(input, batch_first=True).to(device)
#    
#    attention_mask = src_masks + tar_masks
#    attention_mask = pad_sequence(attention_mask, batch_first=True).to(device)
#    return input[input.shape[0]//2:].to(device), attention_mask[input.shape[0]//2:].to(device), torch.tensor(labels).to(device)

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSELoss(input, target) + eps)``.

    Args:
        eps: small constant added under the square root.
        reduction: forwarded to the wrapped ``nn.MSELoss``.
    """

    def __init__(self, eps=1e-6, reduction='mean'):
        super().__init__()
        self.eps = eps
        self.mse = nn.MSELoss(reduction=reduction)

    def forward(self, input, target):
        """Return the square root of the (eps-shifted) MSE between ``input`` and ``target``."""
        squared_error = self.mse(input, target)
        return torch.sqrt(squared_error + self.eps)

In [None]:
# SET UP OTHER STUFF

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# One collate callable shared by every loader.
collate = partial(collate_fn, tokenizer=tokenizer)

# Initialize DataLoaders. Only the training loader is shuffled; the evaluation
# loaders keep a fixed order so that repeated evaluations visit the data
# identically.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate, shuffle=False)

# set up train loop
optimizer = optim.Adam(model.parameters())
# Summed squared error per batch; `train` / `test` divide the running total by
# the number of samples to obtain a per-sample MSE.
criterion = nn.MSELoss(reduction='sum')

In [None]:
def examples(model, df, tokenizer, num_examples):
    """Print the model's prediction for randomly drawn rows of ``df``.

    Rows are drawn with replacement via ``torch.randint``. The model is left in
    evaluation mode afterwards. Nothing is printed when ``df`` is empty.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...,
            token_type_ids=...)`` and returning a single-element tensor.
        df: DataFrame with ``original``, ``translation`` and ``mean`` columns.
        tokenizer: tokenizer used to encode the (translation, original) pair.
        num_examples: number of rows to draw.
    """
    model.eval()
    if len(df) == 0:
        # `torch.randint(0, ...)` would raise on an empty range.
        return

    with torch.no_grad():
        for _ in range(num_examples):
            idx = torch.randint(len(df), (1, 1)).item()
            row = df.iloc[idx]
            source = row['original']
            translation = row['translation']
            mean = float(row['mean'])

            # `truncation=True` makes the `max_length` limit explicit instead
            # of relying on the tokenizer's fallback behaviour.
            encoded_input = tokenizer(
                translation, source,
                return_tensors='pt', truncation=True, max_length=512
            ).to(device)

            # return_tensors='pt' already yields (1, seq) tensors.
            output = model(
                input_ids=encoded_input['input_ids'],
                attention_mask=encoded_input['attention_mask'],
                token_type_ids=encoded_input['token_type_ids']
            )

            print('source:', source)
            print('translation:', translation)
            print('output:', output.item())
            print('mean:', mean)
            print()

In [None]:
# Per-epoch history, consumed by the plotting cells further down.
epoch_loss = []        # training loss
validation_loss = []   # validation loss
accuracy = []          # validation Pearson correlation (name kept for the plotting cells)

#count number of params
print(f'The model has {count_parameters(model):,} trainable parameters')

for epoch in range(EPOCHS):
    # Only the training pass is timed; validation runs after `end_time`.
    start_time = time.time()
    train_loss = train(model, train_dataloader, criterion, optimizer)
    end_time = time.time()
    valid_loss, valid_pearson = test(model, val_dataloader, criterion)
    # `test` returns the correlation as a 0-dim tensor; keep a plain float in
    # the history so it can be printed and plotted directly.
    valid_pearson = float(valid_pearson)
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValid Loss: {valid_loss:.3f}')
    # The metric is a Pearson correlation, not an accuracy.
    print(f'\tValid Pearson: {valid_pearson:.3f}')

    examples(model, test_data, tokenizer, 2)

    epoch_loss.append(train_loss)
    validation_loss.append(valid_loss)
    accuracy.append(valid_pearson)

In [None]:
# `test` returns (loss, Pearson correlation) -- the first value is a loss, not
# an accuracy, and the printed metric is the Pearson correlation.
test_loss, test_pearson = test(model, test_dataloader, criterion)
print(f'Final Test Pearson: {test_pearson:.3f}')

# Training loss per epoch.
plt.plot(epoch_loss)
plt.show()

In [None]:
# Print predictions for five randomly drawn test rows.
examples(model, test_data, tokenizer, num_examples=5)

In [None]:
# Training loss first, validation loss second, on the same axes.
for loss_history in (epoch_loss, validation_loss):
    plt.plot(loss_history)
plt.show()

In [None]:
import plotly.graph_objects as go

# `accuracy` holds one validation Pearson value per epoch. Every epoch is
# plotted: the old `[1:]` slice dropped the first epoch, which only made sense
# while an initial pre-training evaluation was prepended to the history.
# float() accepts both plain floats and 0-dim tensors.
pearson_history = [float(value) for value in accuracy]

fig = go.Figure(
    layout_title_text="A Figure Displayed with the 'colab' Renderer"
)
# go.Line is deprecated; go.Scatter with mode="lines" is the supported form.
fig.add_trace(go.Scatter(
    x=list(range(1, len(pearson_history) + 1)),
    y=pearson_history,
    mode="lines",
    name="Validation Pearson"       # this sets its legend entry
))
fig.update_layout(
    title="Pearson German/German",
    xaxis_title="Epoch",
    yaxis_title="Validation Pearson"
)
fig.show(renderer="colab")

In [None]:
import plotly.graph_objects as go

# Every epoch is plotted: the old `[1:]` slices dropped the first epoch, which
# only made sense while an initial pre-training evaluation was prepended to
# the histories.
epochs_axis = list(range(1, len(epoch_loss) + 1))

fig = go.Figure(
    layout_title_text="A Figure Displayed with the 'colab' Renderer"
)
# go.Line is deprecated; go.Scatter with mode="lines" is the supported form.
fig.add_trace(go.Scatter(
    x=epochs_axis,
    y=epoch_loss,
    mode="lines",
    name="Training Loss"       # this sets its legend entry
))
fig.add_trace(go.Scatter(
    x=epochs_axis,
    y=validation_loss,
    mode="lines",
    name="Validation Loss"       # this sets its legend entry
))
# NOTE(review): the title says RMSE, but the criterion configured in this
# notebook is nn.MSELoss(reduction='sum') averaged per sample -- confirm which
# label is intended.
fig.update_layout(
    title="Loss(RMSE) German/German",
    xaxis_title="Epoch",
    yaxis_title="Loss"
)
fig.show(renderer="colab")

In [None]:
# Raw per-epoch training losses collected by the training loop.
print(epoch_loss)

In [None]:
# Evaluate on the En-De and En-Zh training files as additional held-out sets.
test_dataset_en_de = LanguageDataset(df_en_de)
test_dataset_en_zh = LanguageDataset(df_en_zh)
test_en_de_dataloader = DataLoader(test_dataset_en_de,batch_size=BATCH_SIZE,collate_fn=partial(collate_fn, tokenizer=tokenizer), shuffle = True)
test_en_zh_dataloader = DataLoader(test_dataset_en_zh,batch_size=BATCH_SIZE,collate_fn=partial(collate_fn, tokenizer=tokenizer), shuffle = True)

# `test` returns (loss, Pearson correlation); `test_acc` therefore holds a loss.
test_acc, test_pearson = test(model, test_en_de_dataloader, criterion)
print(f'Final Test En De Pearson: {test_pearson:.3f}')

# Use the En-Zh loader here; the En-De loader was previously evaluated twice
# and its result reported as En-Zh.
test_acc, test_pearson = test(model, test_en_zh_dataloader, criterion)
print(f'Final Test En Zh Pearson: {test_pearson:.3f}')
