<a href="https://colab.research.google.com/github/emil565a/NLP/blob/main/NLP_Project_week_38.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install sklearn
!pip3 install nltk
!pip3 install transformers
!pip install transformers datasets evaluate
!pip install transformers[torch]
!pip install accelerate -U
!pip3 install transformers[torch]
!pip3 install accelerate -U



In [None]:
from datasets import load_dataset
import pandas as pd
import nltk

import tensorflow as tf

import transformers
from transformers import AutoTokenizer
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForQuestionAnswering
from transformers import DefaultDataCollator
from transformers import TFTrainer #Used for training model
from transformers import TFTrainingArguments #Used fo training arguments
from transformers import BertTokenizer, squad_convert_examples_to_features, SquadV2Processor
from transformers.data.processors.squad import SquadExample

In [None]:
# Tokenizer models used by nltk.word_tokenize (see tokenizerOld).
nltk.download('punkt')
# Newer NLTK releases look the same models up under the resource name
# `punkt_tab`; fetch it as well so tokenisation works with whichever NLTK
# version pip resolved. On older releases this download simply reports that
# the resource is unknown.
nltk.download('punkt_tab')
dataset = load_dataset("copenlu/answerable_tydiqa")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Keep a pandas copy of both splits around for DataFrame-style filtering later on
train_pd, validation_pd = (
    pd.DataFrame(data=dataset[split_name]) for split_name in ("train", "validation")
)

In [None]:
# Rows of the training split, one frame per language of interest
arabic_train, bengali_train, indo_train = (
    train_pd[train_pd['language'] == language_name]
    for language_name in ('arabic', 'bengali', 'indonesian')
)

# Same selection for the validation split
arabic_validation, bengali_validation, indo_validation = (
    validation_pd[validation_pd['language'] == language_name]
    for language_name in ('arabic', 'bengali', 'indonesian')
)

In [None]:
# Show which columns the Arabic training frame exposes
print(arabic_train.columns)

Index(['question_text', 'document_title', 'language', 'annotations',
       'document_plaintext', 'document_url'],
      dtype='object')


In [None]:
def tokenizerOld(text):
  """Split `text` into NLTK word tokens and return them lower-cased, in order."""
  lowered_tokens = []
  for token in nltk.word_tokenize(text):
    lowered_tokens.append(token.lower())
  return lowered_tokens

# Newer approach: a pretrained tokenizer loaded through transformers.AutoTokenizer.
# NOTE(review): the global `tokenizer` is rebound to a different checkpoint further
# down in the notebook, so code reading it depends on which cell ran last.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
def tokenize_function(text):
    """Encode `text` with the module-level `tokenizer`, padding to the maximum length and truncating."""
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(text, **encoding_options)

class entry:
    """One question/document pair of the dataset plus derived fields.

    Besides the raw columns, every instance stores NLTK-style token lists for
    the question and the document (via `tokenizerOld`) and a set of aliases
    (`answers`, `context`, `id`, `question`, `title`) for use with the
    transformers library.
    """

    def __init__(self, question_text, document_title, language, annotations, document_plaintext, document_url):
        """Store the raw fields and compute the derived ones.

        Args:
            question_text: The question string.
            document_title: Title of the source document.
            language: Language label of the row.
            annotations: Mapping with the keys "answer_start" and "answer_text".
            document_plaintext: The document text the question refers to.
            document_url: URL of the document; also used as `id`.
        """
        self.question_text = question_text
        self.document_title = document_title
        self.language = language
        self.annotations = annotations
        # A missing "answer_start" is treated like the [-1] marker, i.e. not answerable.
        answer_start = annotations.get("answer_start", [-1])
        self.answerable = answer_start != [-1]
        self.document_plaintext = document_plaintext
        self.document_url = document_url
        self.question_tokenized = tokenizerOld(self.question_text)
        self.document_tokenized = tokenizerOld(self.document_plaintext)

        # To use with the transformers library. Uses the same tolerant lookup as
        # `answerable` so a missing key cannot raise KeyError here either.
        self.answers = {'answer_start': answer_start, 'text': annotations.get('answer_text', [''])}
        self.context = document_plaintext
        self.id = document_url
        self.question = question_text
        self.title = document_title

    def __str__(self):
        """Return a multi-line, human-readable dump with one field per line."""
        # Joining with "\n" guarantees every field sits on its own line; the
        # hand-written concatenation previously dropped the break after "Answerable".
        return "\n".join([
            f"Question: {self.question_text}",
            f"Document Title: {self.document_title}",
            f"Language: {self.language}",
            f"Annotations: {self.annotations}",
            f"Document Plaintext: {self.document_plaintext}",
            f"Document URL: {self.document_url}",
            f"Answerable: {self.answerable}",
            f"Question Tokenized: {self.question_tokenized}",
            f"Document Tokenized: {self.document_tokenized}",
        ])

def _build_entries(frame):
    """Wrap every row of `frame` in an `entry` object, keeping the row order."""
    return [
        entry(
            row['question_text'],
            row['document_title'],
            row['language'],
            row['annotations'],
            row['document_plaintext'],
            row['document_url'],
        )
        for _, row in frame.iterrows()
    ]


# One list of `entry` objects per language and split.
arabic_train_entries = _build_entries(arabic_train)
arabic_validation_entries = _build_entries(arabic_validation)

bengali_train_entries = _build_entries(bengali_train)
bengali_validation_entries = _build_entries(bengali_validation)

indo_train_entries = _build_entries(indo_train)
indo_validation_entries = _build_entries(indo_validation)



# Emil's Supervised Classifier

### Preprocessing


In [None]:
def preprocess_function(examples, max_length=384):
    """Tokenize question/context pairs and attach answer-span token labels.

    Relies on the module-level `tokenizer`, whose output must provide an
    "offset_mapping" entry and a `sequence_ids(i)` method.

    Args:
        examples: A batch with the columns "question_text",
            "document_plaintext" and "annotations". Each annotation provides
            "answer_start" and "answer_text" sequences, of which only the
            first element is used.
        max_length: Length passed to the tokenizer for padding/truncation.
            Only the context (second sequence) is truncated.

    Returns:
        The tokenizer output with "offset_mapping" removed and
        "start_positions" / "end_positions" added. Both labels are 0 whenever
        the answer span is not completely contained in the (possibly
        truncated) context, e.g. when the answer start is negative.
    """
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # Bounds guard: do not run off the end if the context reaches the last token
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends are checked: an answer cut off by truncation would otherwise
        # receive an end label that points past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

### Tokenize the data

In [None]:
# Rebind the module-level `tokenizer` (read by preprocess_function) to the
# tokenizer of the question-answering checkpoint.
tokenizer = AutoTokenizer.from_pretrained("timpal0l/mdeberta-v3-base-squad2")
# Run preprocess_function over the dataset in batches and remove the columns
# named in the train split, leaving what preprocess_function returns.
tokenized_squad = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [None]:
# Collator with default settings; handed to the Trainer as `data_collator` below.
data_collator = DefaultDataCollator()

In [None]:
# NOTE(review): these three names are already imported in the import cell at the
# top of the notebook; this line is redundant there but keeps the cell self-contained.
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Question-answering model loaded from the same checkpoint name as the tokenizer above.
model = AutoModelForQuestionAnswering.from_pretrained("timpal0l/mdeberta-v3-base-squad2")

In [None]:
# Hyperparameters for the fine-tuning run.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.0001,
)

# Wire model, arguments, the tokenized train/validation splits, tokenizer and
# collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start fine-tuning; the table printed below reports training and validation loss per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
