In [2]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from thinc.optimizers import Adam
from tqdm import tqdm
import json

In [20]:
nlp = spacy.load("en_core_web_sm")

In [21]:
# Add the NER pipeline if it's not present
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

# Add your custom labels to the NER component
labels = ["GPE", "DISASTER"]
for label in labels:
    ner.add_label(label)

# Function to load data from the .jsonl file
def load_training_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    training_data = []
    for line in lines:
        entry = json.loads(line)
        text = entry[0]
        entities = entry[1]["entities"]
        training_data.append((text, {"entities": entities}))
    return training_data

# Convert training data into spaCy's Example objects
def create_training_examples(nlp, train_data):
    examples = []
    for text, annotations in tqdm(train_data):
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    return examples

In [22]:
import json

# Function to remove duplicate entities
def remove_duplicate_entities(records):
    for record in records:
        text, annotations = record
        unique_entities = []
        seen = set()
        for entity in annotations['entities']:
            entity_tuple = tuple(entity)  # Convert the list to a tuple so it can be added to the set
            if entity_tuple not in seen:
                unique_entities.append(entity)
                seen.add(entity_tuple)
        annotations['entities'] = unique_entities
    return records

# Function to read data from a .jsonl file
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

# Function to load and clean test data
def load_and_clean_test_data(file_path):
    data = read_jsonl(file_path)
    cleaned_data = remove_duplicate_entities(data)
    return cleaned_data


In [23]:

# Load your custom NER training data
# train_data = load_training_data("train/train_data_1.jsonl")
train_data = load_and_clean_test_data("train/filtered_annotated_data.jsonl")

In [24]:

# Convert training data to spaCy's Example format
train_examples = create_training_examples(nlp, train_data)

100%|██████████| 29914/29914 [00:06<00:00, 4426.29it/s]


In [25]:
learning_rate = 0.000001  # Change this to your desired value

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()
    nlp.config["training"]["optimizer"]["learn_rate"] = learning_rate
    # Initialize the optimizer with the NER component
    # optimizer = nlp.create_optimizer()  # Just create the optimizer without any parameters
    # nlp.config["training"]["optimizer"]["learn_rate"] = learning_rate
    # Define your custom learning rate and batch size

    # Fine-tune the model on the new dataset
    for iteration in range(20):  # Adjust the number of iterations as needed
        losses = {}
        # Create batches with the specified batch size
        batches = spacy.util.minibatch(train_examples, size=spacy.util.compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Update the model with a custom learning rate
            nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

    # Save the updated model
nlp.to_disk("0fner_model_custom_3_1")

print("Model fine-tuning complete!")

Iteration 1, Losses: {'ner': 8470.175394881833}
Iteration 2, Losses: {'ner': 3331.7147140198053}
Iteration 3, Losses: {'ner': 2330.8960888534557}
Iteration 4, Losses: {'ner': 1870.4056896015618}
Iteration 5, Losses: {'ner': 1528.3078657274255}
Iteration 6, Losses: {'ner': 1301.054835377876}
Iteration 7, Losses: {'ner': 1069.2630843802335}
Iteration 8, Losses: {'ner': 896.4830350697554}
Iteration 9, Losses: {'ner': 767.0087452896743}
Iteration 10, Losses: {'ner': 645.3387696542171}
Iteration 11, Losses: {'ner': 625.5262262065476}
Iteration 12, Losses: {'ner': 548.6241726491025}
Iteration 13, Losses: {'ner': 569.2077308888863}
Iteration 14, Losses: {'ner': 490.5553421924499}
Iteration 15, Losses: {'ner': 414.4696965641796}
Iteration 16, Losses: {'ner': 366.9997418613033}
Iteration 17, Losses: {'ner': 379.5542888001005}
Iteration 18, Losses: {'ner': 391.9755414983169}
Iteration 19, Losses: {'ner': 418.2535545411854}
Iteration 20, Losses: {'ner': 336.04969875900775}
Model fine-tuning compl

In [55]:
# Load the fine-tuned model
nlp_custom = spacy.load("ner_model_custom")

# Test the model with a sample tweet
doc = nlp_custom("The aftershock pattern in purple suggests that the M. Noto  earthquake ruptured bilaterally, with the preliminary NEIC slip model also indicating slip on both sides of the epicenter")
for ent in doc.ents:
    print(ent.text, ent.label_)


M. Noto GPE
earthquake DISASTER
NEIC DISASTER
epicenter DISASTER
