In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from thinc.optimizers import Adam
from tqdm import tqdm
import json

In [18]:
nlp = spacy.load("en_core_web_sm")

In [19]:
# Add the NER pipeline if it's not present
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

# Add your custom labels to the NER component
labels = ["GPE", "DISASTER"]
for label in labels:
    ner.add_label(label)

# Function to load data from the .jsonl file
def load_training_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    training_data = []
    for line in lines:
        entry = json.loads(line)
        text = entry[0]
        entities = entry[1]["entities"]
        training_data.append((text, {"entities": entities}))
    return training_data

# Convert training data into spaCy's Example objects
def create_training_examples(nlp, train_data):
    examples = []
    for text, annotations in tqdm(train_data):
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    return examples

In [20]:

# Load your custom NER training data
train_data = load_training_data("train/data3.jsonl")

# Convert training data to spaCy's Example format
train_examples = create_training_examples(nlp, train_data)

100%|██████████| 29914/29914 [00:26<00:00, 1121.99it/s]


In [21]:
learning_rate = 0.00001  # Change this to your desired value

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()
    nlp.config["training"]["optimizer"]["learn_rate"] = learning_rate
    # Initialize the optimizer with the NER component
    # optimizer = nlp.create_optimizer()  # Just create the optimizer without any parameters
    # nlp.config["training"]["optimizer"]["learn_rate"] = learning_rate
    # Define your custom learning rate and batch size

    # Fine-tune the model on the new dataset
    for iteration in range(30):  # Adjust the number of iterations as needed
        losses = {}
        # Create batches with the specified batch size
        # batches = spacy.util.minibatch(train_examples, size=spacy.util.compounding(4.0, 32.0, 1.001))
        batches = spacy.util.minibatch(train_examples, size=spacy.util.compounding(4.0, 32.0, 1.5))
        for batch in batches:
            # Update the model with a custom learning rate
            nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

    # Save the updated model
nlp.to_disk("0fner_model_custom_4")

print("Model fine-tuning complete!")

Iteration 1, Losses: {'ner': 38328.76468920891}
Iteration 2, Losses: {'ner': 18232.913023904384}


KeyboardInterrupt: 

In [24]:
# Load the fine-tuned model
nlp_custom = spacy.load("0fner_model_custom_2_2")

# Test the model with a sample tweet
doc = nlp_custom("The aftershocks pattern in purple suggests that the M. Noto earthquake ruptured bilaterally, with the preliminary NEIC slip model in Japan also indicating slip on both sides of the epicenter in Ishikawa")
for ent in doc.ents:
    print(ent.text, ent.label_)


aftershocks DISASTER
M. Noto GPE
earthquake DISASTER
Japan GPE
epicenter DISASTER
Ishikawa GPE
