In [None]:
# Mount Google Drive at /content/drive; the dataset files and the training
# output directory used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install flair



In [None]:
import json

In [None]:
def convert_to_conll(dataset):
    """Convert span-annotated documents into whitespace tokens with BIO tags.

    Args:
        dataset: Iterable of entries shaped like
            ``{'data': {'text': str},
               'annotations': [{'result': [{'value': {'start': int,
                                                      'end': int,
                                                      'labels': [str, ...]}}]}]}``.
            Only the first annotation set of every entry and the first label
            of every span are used.

    Returns:
        A list with one item per entry; each item is a list of
        ``(token, tag)`` tuples where ``tag`` is ``'O'``, ``'B-<label>'`` or
        ``'I-<label>'``.
    """
    conll_data = []

    for entry in dataset:
        text = entry['data']['text']
        annotations = entry['annotations'][0]['result']

        # Collect (start, end, label) triples ordered by start offset.
        entities = sorted(
            (
                (
                    annotation['value']['start'],
                    annotation['value']['end'],
                    annotation['value']['labels'][0],
                )
                for annotation in annotations
            ),
            key=lambda entity: entity[0],
        )

        conll_lines = []
        current_index = 0
        for start, end, label in entities:
            if start < current_index:
                # Span overlaps one that was already emitted; skipping it keeps
                # the token sequence faithful to the text (no duplicated words).
                continue

            # Text between the previous entity and this one: one 'O' row per
            # whitespace-separated token.
            conll_lines.extend(
                (token, 'O') for token in text[current_index:start].split()
            )

            # First token of the entity gets B-, the rest get I-. A span that
            # contains only whitespace yields no tokens and is ignored.
            for position, token in enumerate(text[start:end].split()):
                prefix = 'B-' if position == 0 else 'I-'
                conll_lines.append((token, prefix + label))

            current_index = end

        # Text after the last entity.
        conll_lines.extend((token, 'O') for token in text[current_index:].split())

        conll_data.append(conll_lines)

    return conll_data

In [None]:
# Load the training split. The encoding is given explicitly so that reading
# does not depend on the platform's default locale encoding.
with open('/content/drive/MyDrive/NLP/NER_TRAIN/NER_TRAIN_ALL.json', 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# Load the dev split; this notebook uses it as the held-out test set.
with open('/content/drive/MyDrive/NLP/NER_DEV/NER_DEV_ALL.json', 'r', encoding='utf-8') as file:
    test_data = json.load(file)

In [None]:
# Convert both splits into lists of (token, BIO-tag) sequences, one sequence
# per dataset entry, using convert_to_conll defined above.
train_conll_format = convert_to_conll(train_data)
test_conll_format = convert_to_conll(test_data)

In [None]:
import flair

In [None]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric
import numpy as np

In [None]:
# Name of the label layer the tagger is configured to predict; passed to
# SequenceTagger as `tag_type` further down.
tag_type = 'ner'

In [None]:
from flair.data import Dictionary

# Entity types annotated in the dataset (bare names, without B-/I- prefixes).
original_label_list = [
    "COURT",
    "PETITIONER",
    "RESPONDENT",
    "JUDGE",
    "DATE",
    "ORG",
    "GPE",
    "STATUTE",
    "PROVISION",
    "PRECEDENT",
    "CASE_NUMBER",
    "WITNESS",
    "OTHER_PERSON",
    "LAWYER"
]

# Build the tag dictionary handed to the SequenceTagger.
# The tagger log in this notebook lists "<unk>" as the first of 16 tags, so
# Dictionary() evidently contributes an "<unk>" entry of its own here.
# NOTE(review): convert_to_conll emits 'B-<label>' / 'I-<label>' tags while
# this dictionary holds the bare label names -- confirm that the way labels
# are attached to the corpus matches what this dictionary expects.
tag_dictionary = Dictionary()
for label in original_label_list:
    tag_dictionary.add_item(label)

# Add the outside ('O') tag. This is the cell's last expression, so its
# return value is what the cell displays.
tag_dictionary.add_item('O')


15

In [None]:
# Token representation: GloVe word embeddings stacked with forward and
# backward Flair language-model embeddings. The model summary printed by the
# training cell shows a combined input size of 4196 features.
embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
]
embeddings = StackedEmbeddings(embeddings=embedding_types)


In [None]:
# Build the sequence tagger on top of the stacked embeddings. hidden_size is
# the per-direction LSTM size (the printed model shows a bidirectional
# LSTM(4196, 256)); the tags it predicts come from tag_dictionary.
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=tag_type)


2024-02-23 11:07:52,045 SequenceTagger predicts: Dictionary with 16 tags: <unk>, COURT, PETITIONER, RESPONDENT, JUDGE, DATE, ORG, GPE, STATUTE, PROVISION, PRECEDENT, CASE_NUMBER, WITNESS, OTHER_PERSON, LAWYER, O


In [None]:
from flair.data import Corpus, Sentence
from flair.datasets import CONLL_03
from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings


def _bio_to_spans(tags):
    """Turn a BIO tag sequence into ``(start, end, label)`` token spans (end exclusive)."""
    spans = []
    span_start = None
    span_label = None
    # A trailing 'O' sentinel closes any span still open at the end.
    for index, tag in enumerate(list(tags) + ['O']):
        prefix, _, name = tag.partition('-')
        continues_span = prefix == 'I' and span_start is not None and name == span_label
        if span_start is not None and not continues_span:
            spans.append((span_start, index, span_label))
            span_start, span_label = None, None
        # 'B-' always opens a span; a stray 'I-' with nothing open is treated
        # leniently as the start of a new span.
        if prefix == 'B' or (prefix == 'I' and span_start is None):
            span_start, span_label = index, name
    return spans


def _build_labeled_sentence(tagged_tokens, label_type='ner'):
    """Create a pre-tokenized flair Sentence and attach its entity spans."""
    # Passing a list of strings keeps exactly one flair token per input token,
    # so tag positions stay aligned (a joined string would be re-tokenized).
    sentence = Sentence([token for token, _ in tagged_tokens])
    for start, end, label in _bio_to_spans([tag for _, tag in tagged_tokens]):
        sentence[start:end].add_label(label_type, label)
    return sentence


# (token, tag) pairs per sentence; entries without any token are dropped
# because they cannot form a usable training example.
train_annotations = [[(token[0], token[1]) for token in sentence]
                     for sentence in train_conll_format if sentence]
test_annotations = [[(token[0], token[1]) for token in sentence]
                    for sentence in test_conll_format if sentence]

# Label the sentences BEFORE building the corpus: Corpus carves a dev split
# out of the training data (see the "No dev split found" log line), so
# zipping corpus.train against the annotation list afterwards is not
# guaranteed to pair each sentence with its own annotations.
train_sentences = [_build_labeled_sentence(annotations) for annotations in train_annotations]
test_sentences = [_build_labeled_sentence(annotations) for annotations in test_annotations]

# NOTE(review): the labels are attached as entity spans with bare label names
# (e.g. 'COURT'). Confirm the tag dictionary given to the tagger is span-aware
# (e.g. one built with corpus.make_label_dictionary(label_type='ner')).
corpus = Corpus(train=train_sentences, test=test_sentences)


2024-02-23 11:10:45,134 No dev split found. Using 0% (i.e. 1100 samples) of the train split as dev data


In [None]:
# Bind the tagger to the corpus; training is started in the next cell.
trainer = ModelTrainer(tagger, corpus)


In [None]:
# Train the model. The first argument is a directory on the mounted Drive
# (presumably where flair writes the model and training logs -- confirm).
# NOTE(review): train_with_dev=True presumably folds the dev split into the
# training data, leaving no held-out data for model selection -- confirm this
# is intended.
# NOTE(review): mini_batch_size=1 combined with max_epochs=150 is likely to be
# very slow; consider a larger batch size.
trainer.train('/content/drive/MyDrive/NLP/resources/taggers/ner-english',
              train_with_dev=True,
              mini_batch_size=1,
              max_epochs=150)

2024-02-23 11:10:45,801 ----------------------------------------------------------------------------------------------------
2024-02-23 11:10:45,803 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, bias=True)
  (rnn): LSTM(4196, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=18, bias=True)
  (loss_f

In [None]:
import numpy as np

def evaluate_model(model, data):
    """Predict tags for every sentence and score them against the gold tags.

    Predicted entity spans are projected back onto tokens as BIO tags so that
    predictions and gold tags have exactly one entry per token and can be
    compared position by position.

    Args:
        model: Tagger exposing ``eval()`` and ``predict(sentence)``.
        data: Iterable of sentences, each a sequence of ``(token, tag)`` pairs
            (the format returned by ``convert_to_conll``).

    Returns:
        The metrics dictionary returned by ``compute_metrics``.
    """
    model.eval()
    predictions = []
    true_labels = []

    for sentence in data:
        tokens = [token[0] for token in sentence]
        if not tokens:
            # Nothing to predict for an empty sentence.
            continue
        gold_tags = [token[1] for token in sentence]

        # Pre-tokenized input: one flair token per gold token, so positions
        # line up with gold_tags (a joined string would be re-tokenized).
        flair_sentence = Sentence(tokens)

        # Predict NER tags
        model.predict(flair_sentence)

        # Project each predicted span onto its tokens as B-/I- tags; tokens
        # outside every span stay 'O'. Token.idx is 1-based.
        predicted_tags = ['O'] * len(tokens)
        for span in flair_sentence.get_spans('ner'):
            for position, token in enumerate(span.tokens):
                prefix = 'B-' if position == 0 else 'I-'
                predicted_tags[token.idx - 1] = prefix + span.tag

        predictions.extend(predicted_tags)
        true_labels.extend(gold_tags)

    # Flat, equally long 1-D arrays (sentence lengths differ, so a 2-D array
    # of per-sentence lists would be ragged).
    predictions = np.array(predictions, dtype=str)
    true_labels = np.array(true_labels, dtype=str)

    # Compute metrics
    metrics = compute_metrics(predictions, true_labels)
    return metrics


In [None]:
def compute_metrics(predictions, true_labels):
    """Compute token-level scores from aligned predicted and gold tags.

    Args:
        predictions: Predicted tags; a numpy array or a (possibly nested or
            ragged) sequence of strings. Flattened before use.
        true_labels: Gold tags in the same layout as ``predictions``; after
            flattening both must have the same length.

    Returns:
        dict with the keys of the original implementation:
            "f1-type-match": support-weighted mean of the per-label F1 scores.
            "f1-partial":    unweighted mean of the per-label F1 scores.
            "f1-strict":     total true positives / total (TP + FN).
            "f1-exact":      total true positives / number of gold non-'O' tags.
        Per-label scores are computed for the labels present in the gold tags.
        All values are 0.0 when the gold tags contain no non-'O' label.

    Raises:
        ValueError: if predictions and gold tags differ in length.
    """
    def _flatten(labels):
        # Recursively flatten arrays / nested sequences into a list of strings.
        if isinstance(labels, str):
            return [labels]
        flat = []
        for item in labels:
            flat.extend(_flatten(item))
        return flat

    pred_flat = np.array(_flatten(predictions), dtype=str)
    true_flat = np.array(_flatten(true_labels), dtype=str)

    if pred_flat.shape != true_flat.shape:
        raise ValueError(
            "predictions and true_labels must align one-to-one, got "
            f"{pred_flat.size} predictions and {true_flat.size} true labels"
        )

    # Drop only the positions where BOTH sides are 'O'. Filtering each array
    # on its own would shift positions and pair up unrelated tokens.
    keep = (true_flat != 'O') | (pred_flat != 'O')
    pred_kept = pred_flat[keep]
    true_kept = true_flat[keep]

    labels = np.unique(true_kept[true_kept != 'O'])
    total_instances = int(np.sum(true_kept != 'O'))

    metric_names = ("f1-type-match", "f1-partial", "f1-strict", "f1-exact")
    if total_instances == 0:
        # No gold entities: avoid 0/0 and report zeros.
        return {name: 0.0 for name in metric_names}

    # Per-label counts and scores.
    tp_counts = {}
    fn_counts = {}
    support = {}
    f1_scores = {}
    for label in labels:
        is_true = true_kept == label
        is_pred = pred_kept == label
        tp = int(np.sum(is_true & is_pred))
        fp = int(np.sum(~is_true & is_pred))
        fn = int(np.sum(is_true & ~is_pred))

        # The small epsilon keeps the ratios defined when a count is zero.
        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)

        tp_counts[label] = tp
        fn_counts[label] = fn
        support[label] = int(np.sum(is_true))
        f1_scores[label] = 2 * precision * recall / (precision + recall + 1e-9)

    total_tp = sum(tp_counts.values())
    total_tp_fn = sum(tp_counts[label] + fn_counts[label] for label in labels)

    weighted_f1_score = sum(
        f1_scores[label] * support[label] / total_instances for label in labels
    )

    return {
        "f1-type-match": float(weighted_f1_score),
        "f1-partial": float(np.mean([f1_scores[label] for label in labels])),
        "f1-strict": float(total_tp / total_tp_fn),
        "f1-exact": float(total_tp / total_instances),
    }

In [None]:
# Score the trained tagger on the converted dev/test sequences; the result is
# the metrics dictionary returned by compute_metrics.
evaluation_metrics = evaluate_model(tagger, test_conll_format)

# Show the metrics dictionary.
print("Evaluation Metrics:", evaluation_metrics)