In [None]:
# Install the Hugging Face `transformers` package, which provides the
# BertTokenizer / BertModel classes imported below.
!pip install transformers

In [None]:
import os, torch
import pandas as pd
import time
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
from transformers import BertTokenizer, BertModel
import numpy as np

In [None]:
# Colab only: mount Google Drive at /content/drive so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on the mounted Drive so that the relative
# paths used later in the notebook resolve against it.
os.chdir("/content/drive/MyDrive/project")
# Show the folder contents as the cell output.
os.listdir('./')

In [None]:
# Choose the torch device used by the rest of the notebook:
# CUDA when PyTorch reports a usable GPU, the CPU otherwise.
if not torch.cuda.is_available():
    # No GPU: report it and fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # GPU present: select it, then report how many there are and the name of GPU 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
def get_all_data_from_filename(full_filename):
    """Read a tab-separated file into a DataFrame with fixed column names.

    The file is parsed with ``header=None``, so every line of the file,
    including its first one, becomes a data row.

    Args:
        full_filename: Path of the tab-separated file to read.

    Returns:
        A pandas DataFrame with the columns ``iid``, ``src``,
        ``native_speaker``, ``original``, ``dest``, ``text``, ``direct``
        and ``label``.
    """
    column_names = [
        'iid', 'src', 'native_speaker', 'original',
        'dest', 'text', 'direct', 'label',
    ]
    return pd.read_csv(full_filename, sep='\t', header=None, names=column_names)

In [None]:
def get_text_and_label(df):
    """Return the ``text`` and ``label`` columns of `df` without their first row.

    Row 0 is dropped from both columns.
    NOTE(review): presumably row 0 is the file's own header line, which ends
    up as data because the file is read with ``header=None`` - confirm.

    Args:
        df: DataFrame containing ``text`` and ``label`` columns.

    Returns:
        A ``(texts, labels)`` tuple of arrays, each starting at row 1.
    """
    texts, labels = (df[column].values[1:] for column in ('text', 'label'))
    return texts, labels

In [None]:
def import_model(name='bert-base-uncased'):
    """Load the pretrained BERT tokenizer and model identified by `name`.

    Args:
        name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple: a ``BertTokenizer`` and a ``BertModel``.
    """
    # The tokenizer is loaded first, then the model (tuple items evaluate left to right).
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)

In [None]:
def create_test_set(tokenizer, sentences, labels, max_length=256, batch_size=32):
    """Tokenize `sentences` and bundle them with `labels` in a sequential DataLoader.

    Every sentence is encoded with the special tokens added, then padded or
    truncated to exactly `max_length` token ids, and an attention mask is
    produced alongside the ids.

    Args:
        tokenizer: Callable tokenizer (for example the ``BertTokenizer``
            returned by ``import_model``). It is called once on the whole
            list of sentences and must return a mapping with the keys
            ``'input_ids'`` and ``'attention_mask'``.
        sentences: Iterable of sentences to encode.
        labels: Iterable of labels, one per sentence; each is converted with
            ``int()``, so numeric strings are accepted.
        max_length: Length every encoded sentence is padded or truncated to.
        batch_size: Number of examples per batch in the returned DataLoader.

    Returns:
        A ``DataLoader`` that yields ``(input_ids, attention_mask, labels)``
        batches in the original order.
    """
    # Padding and truncation are requested explicitly: `pad_to_max_length` is
    # deprecated, and passing only `max_length` relies on the library silently
    # switching truncation on (with a warning).
    encoded = tokenizer(
        list(sentences),
        add_special_tokens=True,        # Add '[CLS]' and '[SEP]'.
        max_length=max_length,
        padding='max_length',           # Pad every sentence up to `max_length`.
        truncation=True,                # Cut sentences longer than `max_length`.
        return_attention_mask=True,     # Mask separating real tokens from padding.
        return_tensors='pt',            # Return PyTorch tensors.
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    # Labels may arrive as strings (e.g. read from a headerless CSV column).
    labels = torch.tensor([int(i) for i in labels], dtype=torch.long)

    print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

    # Sequential sampling keeps the outputs aligned with the input order.
    prediction_data = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(
        prediction_data, sampler=prediction_sampler, batch_size=batch_size
    )

    return prediction_dataloader

In [None]:
def run(model, prediction_dataloader, target_device=None):
    """Run `model` over every batch and collect its pooled outputs and the labels.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            return_dict=True)`` whose result exposes ``pooler_output``.
        prediction_dataloader: Iterable yielding
            ``(input_ids, attention_mask, labels)`` tensor batches.
        target_device: Device to run on. When ``None`` (the default) the
            notebook-level ``device`` variable is used.

    Returns:
        A ``(cls_output, label_output)`` tuple: the per-batch
        ``pooler_output`` tensors and the per-batch label tensors, each
        concatenated along dimension 0.
    """
    if target_device is None:
        # Fall back to the device picked by the notebook's device-selection cell.
        target_device = device

    # Move the model to the same device the batches are sent to. An
    # unconditional `model.cuda()` would fail on a CPU-only runtime.
    model.to(target_device)
    # Put model in evaluation mode.
    model.eval()
    cls_output = []
    label_output = []

    for batch in prediction_dataloader:
        # Move every tensor of the batch to the target device.
        batch = tuple(t.to(target_device) for t in batch)

        # Unpack the inputs from the dataloader.
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for a forward-only pass; skipping them
        # saves memory and time.
        with torch.no_grad():
            result = model(b_input_ids,
                           attention_mask=b_input_mask,
                           return_dict=True)

        cls_output.append(result.pooler_output)
        label_output.append(b_labels)

    cls_output = torch.cat(cls_output, 0)
    label_output = torch.cat(label_output, 0)

    print(f"CLS shape = {cls_output.shape}")
    print(f"Labels shape = {label_output.shape}")
    print('    DONE.')
    return cls_output, label_output

In [None]:
def save_files(filename, output, labels):
    """Save `output` and `labels` as two ``.npy`` files sharing a path prefix.

    Args:
        filename: Path prefix; ``_cls_output.npy`` and ``_labels.npy`` are
            appended to it to form the two file names.
        output: Array written to ``<filename>_cls_output.npy``.
        labels: Array written to ``<filename>_labels.npy``.
    """
    targets = (('_cls_output.npy', output), ('_labels.npy', labels))
    for suffix, array in targets:
        np.save(filename + suffix, array)
    print(f"Saved files for filename = {filename}\n---------------------------")

In [None]:
# Folder holding the input .tsv files and folder receiving the .npy outputs.
data_path = './dataset'
output_path = './cls_output'

# Create the output folder the first time the notebook runs.
if not os.path.exists(output_path):
    os.mkdir(output_path)

tokenizer, model = import_model()

filename = "mono_en_es_train.tsv"
# Only names ending in 'tsv' are processed.
if filename.endswith('tsv'):
    start = time.time()
    # Output prefix: the input name without its last four characters ('.tsv').
    output_prefix = os.path.join(output_path, filename[:-4])
    print(f"Working on file = {output_prefix}")

    # Load the file, encode its sentences and run them through the model.
    all_data = get_all_data_from_filename(os.path.join(data_path, filename))
    propositions, labels = get_text_and_label(all_data)
    prediction_dataloader = create_test_set(tokenizer, propositions, labels)
    cls_output, b_labels = run(model, prediction_dataloader)

    end = time.time()
    elapsed = end - start
    print(f"Duration is {int(elapsed)} seconds which is {int(elapsed / 60)} minutes")

    # Copy the tensors to the CPU before converting them to NumPy for saving.
    save_files(output_prefix, cls_output.cpu().numpy(), b_labels.cpu().numpy())

In [None]:
# Sanity check: load every file in the output folder and print its array shape.
saved_dir = './cls_output'
for fil in os.listdir(saved_dir):
    a = np.load(os.path.join(saved_dir, fil))
    print(f"{fil} \t   \t= {a.shape}")