# Package and definitions


In [43]:
import requests as req
import pandas as pd
import os
import re
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install termcolor
from termcolor import colored
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
try :
  import transformers
except :
  !pip install transformers
  import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import AutoModelForMaskedLM, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.utils.class_weight import compute_class_weight
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from itertools import compress
import seaborn as sns
from tqdm import *
import time
! pip install datasets
# torch.cuda.is_available() returns a boolean to check if the GPU can be used or not
if torch.cuda.is_available():
  # if CUDA is available set 'cuda' as the device
  device = 'cuda'
  # and then print the name of the GPU
  print('DEVICE = ', colored(torch.cuda.get_device_name(0), "green" ) )
else:
  # else, set 'cpu' as device
  device = 'cpu'
  # just print than the CPU is used. Alternatively you can check your CPU with the following command (linux based) in the next cell:
  # ! lscpu
  print('DEVICE = ', colored('CPU', "blue"))

git_url = "https://raw.githubusercontent.com/chlolv/NLP_Project/main/Data/"
H1_url = "H1.txt"
H2_url = "H2.txt"
H3_url = "H3.txt"
H4_url = "H4.txt"
H5_url = "H5.txt"
H6_url = "H6.txt"
H7_url = "H7.txt"

H1 = req.get(git_url + H1_url)
H1 = H1.text
H2 = req.get(git_url + H2_url)
H2 = H2.text
H3 = req.get(git_url + H3_url)
H3 = H3.text
H4 = req.get(git_url + H4_url)
H4 = H4.text
H5 = req.get(git_url + H5_url)
H5 = H5.text
H6 = req.get(git_url + H6_url)
H6 = H6.text
H7 = req.get(git_url + H7_url)
H7 = H7.text



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chloe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chloe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


DEVICE =  [34mCPU[0m


### Pipeline to prepare data for prediction

In [1]:
with open('C:/Users/chloe/OneDrive/Bureau/3A/NLP/Git/NLP_Project/text_0.txt', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

In [3]:
from transformers import RobertaForMaskedLM, BertTokenizerFast, RobertaTokenizer, AutoModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base', max_len=512, bos_token = '<s>', eos_token = '</s>',
                                             sep_token = '</s>', cls_token = '<s>', unk_token = '<unk>',
                                             pad_token = '<pad>', mask_token = '<mask>' )

In [4]:
new_names = ['Harry', 'Potter', 'Ron', 'Weasley','Hermione', 'Granger','Snape', 'Severus',
              'Albus', 'Dumbledore','Dursley', 'Vernon', 'Dudley', 'Petunia','Draco', 'Malfoy']
num_added_tokens = tokenizer.add_tokens(new_names)

In [6]:
new_tokens = [tokenizer(token)['input_ids'][1] for token in new_names]
#new_tokens

[29345,
 50265,
 27674,
 50266,
 50267,
 50268,
 50269,
 50270,
 50271,
 50272,
 50273,
 50274,
 50275,
 50276,
 50277,
 50278]

In [11]:
special_tokens = [tokenizer(token)['input_ids'][1] for token in ["<s>","</s>","<pad>","<unk>","<mask>"]]
#special_token

[0, 2, 1, 3, 50264]

In [10]:
batch = tokenizer(lines, max_length=512, padding='max_length', truncation=True)

In [12]:
import torch

labels = torch.tensor([x for x in batch.input_ids])
mask = torch.tensor([x for x in batch.attention_mask])

In [46]:
# make copy of labels tensor, this will be input_ids
input_ids = labels.detach().clone()
# create random array of floats with equal dims to input_ids
rand = torch.rand(input_ids.shape)
mask_arr = (rand < .75) * np.prod([input_ids == ele for ele in new_tokens])
# loop through each row in input_ids tensor (cannot do in parallel)
for i in range(input_ids.shape[0]):
    # get indices of mask positions from mask array
    selection = torch.flatten(mask_arr[i].nonzero()).tolist()
    # mask input_ids
    input_ids[i, selection] = special_tokens[4]  # our custom [MASK] token == 3

In [49]:
encodings = {'input_ids': input_ids, 'attention_mask': mask,
             'labels': labels}

In [50]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        # store encodings internally
        self.encodings = encodings

    def __len__(self):
        # return the number of samples
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        # return dictionary of input_ids, attention_mask, and labels for index i
        return {key: tensor[i] for key, tensor in self.encodings.items()}


In [51]:
dataset = Dataset(encodings)

In [52]:
loader = torch.utils.data.DataLoader(dataset, batch_size=12, shuffle=True)

### Training the model

In [53]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=30959,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [54]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config)

In [55]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30959, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [56]:
torch.cuda.empty_cache()

In [None]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=1e-4)

In [None]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

### Testing the model on names 

In [None]:
# need to feed the model a text with [MASK] labels

In [None]:
model.save_pretrained('pretrained_harrypotbert3')

In [None]:
from transformers import pipeline

In [None]:
fill = pipeline('fill-mask', model='pretrained_harrypotbert3', tokenizer='harrypotbert')

In [None]:
fill(f"Ron held {fill.tokenizer.mask_token}'s hand.")

In [None]:
fill(f"Harry was happy to see {fill.tokenizer.mask_token} in the castle.")