# Getting Sentencewise data into Dataframe

In [None]:
import pandas as pd
import re

In [None]:
#reading the train, valid, test dataset
#converting it into csv
data_dir = '/content/drive/MyDrive/NLP_project/data/output/'

def createFeature(data):

  # Split the text into a list at the empty lines
  sentences = re.split('\n\s*\n', data)

  #getting word: from index 0 and tag: from index 3
  words, tags = [], []
  for sent in sentences:
      #getting list of lines
      lines = sent.split("\n")
      w = []
      t = []
      for line in lines:
        #if the line not empty
        if line.split():
          #print(triData.split())
          w.append(line.split()[0])
          t.append(line.split()[3])
      words.append(w)
      tags.append(t)

  


  return {'word':words, 'tag':tags}

def dataReader(dataPath):
  train_dict = createFeature(open(dataPath+'train.txt').read().strip())
  test_dict = createFeature(open(dataPath+'test.txt').read().strip())
  valid_dict = createFeature(open(dataPath+'valid.txt').read().strip())

  #creating dataframe
  train_df = pd.DataFrame(train_dict)
  test_df = pd.DataFrame(test_dict)
  valid_df = pd.DataFrame(valid_dict)

  #print(len(valid_dict['word']))
  return train_df, test_df, valid_df

train_df, test_df, valid_df = dataReader(data_dir)

#saving the data to csv
train_df.to_csv(data_dir + "train_conll.csv")
test_df.to_csv(data_dir + "test_conll.csv")
valid_df.to_csv(data_dir + "valid_conll.csv")

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Bert Finetuning

In [6]:
#install dependencies
!pip install transformers
!pip install datasets
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import os
import random
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from ast import literal_eval
import pandas as pd
from datasets import Dataset, load_metric


# Set seed for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed(42)

In [8]:
#loading data from csv
def load_data_from_csv(file_path):
    data = pd.read_csv(file_path)
    data['word'] = data['word'].apply(literal_eval)
    data['tag'] = data['tag'].apply(literal_eval)
    return data

In [9]:
# Load your CoNLL-formatted data
train_data = load_data_from_csv("/content/drive/MyDrive/NLP_project/data/output/train_conll.csv")
valid_data = load_data_from_csv("/content/drive/MyDrive/NLP_project/data/output/valid_conll.csv")

train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data)
datasets = {"train": train_dataset, "validation": valid_dataset}

metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [10]:
datasets

{'train': Dataset({
     features: ['Unnamed: 0', 'word', 'tag'],
     num_rows: 5302
 }),
 'validation': Dataset({
     features: ['Unnamed: 0', 'word', 'tag'],
     num_rows: 1090
 })}

In [11]:
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
# Get the number of NER labels from the dataset
# Get the number of NER labels from the dataset
label_list = sorted(set(tag for tags in train_data["tag"] for tag in tags))
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for i, l in enumerate(label_list)}
num_labels = len(label_list)

#define model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [13]:
#tokenizing the data
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["word"], truncation=True, is_split_into_words=True, padding="max_length")
    labels = []
    for i, label in enumerate(examples["tag"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        prev_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != prev_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)
            prev_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [14]:
#tokenizing
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_valid_dataset = valid_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/5302 [00:00<?, ? examples/s]

Map:   0%|          | 0/1090 [00:00<?, ? examples/s]

In [19]:
#adding training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP_project/data/output",
    evaluation_strategy="epoch",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/NLP_project/data/log",
    seed=42,
)


In [20]:
#defining the compute metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    overall_result =  {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"]
    }

    #label-wise f1 score
    for key in results.keys():
      if key not in overall_result.keys():
        overall_result[key+"_f1"] = results[key]["f1"] 

    return overall_result

In [21]:
#trainer definition
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [22]:
#training and saving trained model and tokeinzer
trainer.train()
model.save_pretrained("/content/drive/MyDrive/NLP_project/model")
tokenizer.save_pretrained("/content/drive/MyDrive/NLP_project/model")

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
#loading test data
test_data = load_data_from_csv("/content/drive/MyDrive/NLP_project/data/output/test_conll.csv")
test_dataset = Dataset.from_pandas(test_data)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
tokenized_test_dataset[3]

In [None]:
#testing on test data
# Load and process the test data
test_data = load_data_from_csv("/content/drive/MyDrive/NLP_project/data/output/test_conll.csv")
test_dataset = Dataset.from_pandas(test_data)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Evaluate the model on the test dataset
test_results = trainer.evaluate(tokenized_test_dataset)

# Print the test results
print("Test results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

In [None]:
import os
import matplotlib.pyplot as plt

# Define the data
epochs = [1, 2, 3, 4, 5, 6]
training_losses = [0.368, 0.1502, 0.1084, 0.0448, 0.0332, 0.0218]
validation_losses = [0.213285, 0.209606, 0.214214, 0.246469, 0.271435, 0.284084]
precision = [0.793032, 0.823885, 0.822263, 0.840572, 0.845966, 0.845614]
recall = [0.828129, 0.852294, 0.850394, 0.846049, 0.848493, 0.850665]
f1 = [0.810201, 0.837849, 0.836092, 0.843302, 0.847228, 0.848132]
accuracy = [0.936638, 0.943098, 0.945893, 0.948504, 0.948413, 0.949054]
person_f1 = [0.870690, 0.890398, 0.882865, 0.888780, 0.884407, 0.886709]
problem_f1 = [0.757895, 0.792636, 0.787819, 0.795888, 0.807139, 0.804144]
pronoun_f1 = [0.962766, 0.954424, 0.962963, 0.957447, 0.958115, 0.952632]
test_f1 = [0.771976, 0.814315, 0.816353, 0.828323, 0.831430, 0.833557]
treatment_f1 = [0.798906, 0.822208, 0.825289, 0.831821, 0.837912, 0.842975]

# Plot the training and validation losses
plt.plot(epochs, training_losses, label='Training Loss')
plt.plot(epochs, validation_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.savefig('bert_training_validation_loss.png')
plt.show()

# Plot the precision, recall, and F1 scores
plt.plot(epochs, precision, label='Precision')
plt.plot(epochs, recall, label='Recall')
plt.plot(epochs, f1, label='F1')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.title('Precision, Recall, and F1')
plt.legend()
plt.savefig('bert_precision_recall_f1.png')
plt.show()

# Plot the accuracy
plt.plot(epochs, accuracy, label='Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Accuracy')
plt.legend()
plt.savefig('bert_accuracy.png')
plt.show()

# Plot the F1 scores for each entity type
plt.plot(epochs, person_f1, label='Person')
plt.plot(epochs, problem_f1, label='Problem')
plt.plot(epochs, pronoun_f1, label='Pronoun')
plt.plot(epochs, test_f1, label='Test')
plt.plot(epochs, treatment_f1, label='Treatment')
plt.xlabel('Epoch')
plt.ylabel('F1')
plt.title('Entity Type F1 Scores')
plt.legend()
plt.savefig('bert_entitywise_f1.png')
plt.show()
