In [2]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from thinc.optimizers import Adam
from tqdm import tqdm
import json

In [13]:
nlp = spacy.load("en_core_web_sm")

In [14]:
# Add the NER pipeline if it's not present
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

# Add your custom labels to the NER component
labels = ["GPE", "DISASTER"]
for label in labels:
    ner.add_label(label)

# Function to load data from the .jsonl file
def load_training_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    training_data = []
    for line in lines:
        entry = json.loads(line)
        text = entry[0]
        entities = entry[1]["entities"]
        training_data.append((text, {"entities": entities}))
    return training_data

# Convert training data into spaCy's Example objects
def create_training_examples(nlp, train_data):
    examples = []
    for text, annotations in tqdm(train_data):
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    return examples

In [15]:

# Load your custom NER training data
train_data = load_training_data("train/data3.jsonl")

# Convert training data to spaCy's Example format
train_examples = create_training_examples(nlp, train_data[:1000])

100%|██████████| 1000/1000 [00:00<00:00, 1276.35it/s]


In [None]:
learning_rate = 0.000001  # Set desired learning rate

# Disable other pipes except 'ner'
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()
    nlp.config["training"]["optimizer"]["learn_rate"] = learning_rate

    # Fine-tune the model on the new dataset
    for iteration in range(40):  # Maximum number of iterations
        losses = {}
        
        # Create batches with the specified batch size
        batches = spacy.util.minibatch(train_examples, size=spacy.util.compounding(4.0, 32.0, 1.001))
        
        # Update model in batches
        for batch in batches:
            nlp.update(batch, sgd=optimizer, drop=0.3, losses=losses)
        
        # Display losses for the current iteration
        print(f"Iteration {iteration + 1}, Losses: {losses}")

        # Stop training if the total loss is below the threshold
        total_loss = sum(losses.values())
        if total_loss < 10:
            print("Early stopping as loss is below threshold.")
            break

    # Save the updated model
    nlp.to_disk("0fner_model_custom_4_2")

print("Model fine-tuning complete!")


Iteration 1, Losses: {'ner': 1568.1188160713702}
Iteration 2, Losses: {'ner': 459.29892484944276}
Iteration 3, Losses: {'ner': 252.24542299692504}
Iteration 4, Losses: {'ner': 138.23971964243464}
Iteration 5, Losses: {'ner': 90.47280653886523}
Iteration 6, Losses: {'ner': 62.04907696036728}
Iteration 7, Losses: {'ner': 62.68806317947039}
Iteration 8, Losses: {'ner': 42.515644518425944}
Iteration 9, Losses: {'ner': 34.906144071241776}
Iteration 10, Losses: {'ner': 44.153884356830304}
Iteration 11, Losses: {'ner': 41.33845888993389}
Iteration 12, Losses: {'ner': 37.745458430482884}
Iteration 13, Losses: {'ner': 46.708330930778345}
Iteration 14, Losses: {'ner': 27.025751817814314}
Iteration 15, Losses: {'ner': 17.85083773468035}
Iteration 16, Losses: {'ner': 29.397530246748083}
Iteration 17, Losses: {'ner': 25.588293585324184}
Iteration 18, Losses: {'ner': 17.472247637660708}
Iteration 19, Losses: {'ner': 16.6206080660219}
Iteration 20, Losses: {'ner': 17.261017455079248}
Iteration 21, Lo

In [24]:
# Load the fine-tuned model
nlp_custom = spacy.load("0fner_model_custom_2_2")

# Test the model with a sample tweet
doc = nlp_custom("The aftershocks pattern in purple suggests that the M. Noto earthquake ruptured bilaterally, with the preliminary NEIC slip model in Japan also indicating slip on both sides of the epicenter in Ishikawa")
for ent in doc.ents:
    print(ent.text, ent.label_)


aftershocks DISASTER
M. Noto GPE
earthquake DISASTER
Japan GPE
epicenter DISASTER
Ishikawa GPE


In [None]:
Iteration 1, Losses: {'ner': 1568.1188160713702}
Iteration 2, Losses: {'ner': 459.29892484944276}
Iteration 3, Losses: {'ner': 252.24542299692504}
Iteration 4, Losses: {'ner': 138.23971964243464}
Iteration 5, Losses: {'ner': 90.47280653886523}
Iteration 6, Losses: {'ner': 62.04907696036728}
Iteration 7, Losses: {'ner': 62.68806317947039}
Iteration 8, Losses: {'ner': 42.515644518425944}
Iteration 9, Losses: {'ner': 34.906144071241776}
Iteration 10, Losses: {'ner': 44.153884356830304}
Iteration 11, Losses: {'ner': 41.33845888993389}
Iteration 12, Losses: {'ner': 37.745458430482884}
Iteration 13, Losses: {'ner': 46.708330930778345}
Iteration 14, Losses: {'ner': 27.025751817814314}
Iteration 15, Losses: {'ner': 17.85083773468035}
Iteration 16, Losses: {'ner': 29.397530246748083}
Iteration 17, Losses: {'ner': 25.588293585324184}
Iteration 18, Losses: {'ner': 17.472247637660708}
Iteration 19, Losses: {'ner': 16.6206080660219}
Iteration 20, Losses: {'ner': 17.261017455079248}
Iteration 21, Losses: {'ner': 25.930193057001894}
Iteration 22, Losses: {'ner': 34.198073189674915}
Iteration 23, Losses: {'ner': 23.332156751474184}
Iteration 24, Losses: {'ner': 24.627552309716123}
Iteration 25, Losses: {'ner': 10.764046509624915}
Iteration 26, Losses: {'ner': 25.373553772957543}
Iteration 27, Losses: {'ner': 25.11540958151469}
Iteration 28, Losses: {'ner': 22.430292268271046}
Iteration 29, Losses: {'ner': 16.838893562292757}
Iteration 30, Losses: {'ner': 26.673109643548035}
Iteration 31, Losses: {'ner': 8.905334486755907}
Iteration 32, Losses: {'ner': 5.098312303031355}
Iteration 33, Losses: {'ner': 18.11413936527767}
Iteration 34, Losses: {'ner': 10.144461859980263}
Iteration 35, Losses: {'ner': 11.884905149733246}
Iteration 36, Losses: {'ner': 13.158588088982128}
Iteration 37, Losses: {'ner': 34.586184823233765}
Iteration 38, Losses: {'ner': 20.433495885370544}
Iteration 39, Losses: {'ner': 20.465540915684613}
Iteration 40, Losses: {'ner': 24.834602603787427}
Model fine-tuning complete!
