In [2]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from tqdm import tqdm
import json

In [3]:
nlp = spacy.load("en_core_web_sm")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# Add the NER pipeline if it's not present
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

# Add your custom labels to the NER component
labels = ["GPE", "DISASTER"]
for label in labels:
    ner.add_label(label)

# Function to load data from the .jsonl file
def load_training_data(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    training_data = []
    for line in lines:
        entry = json.loads(line)
        text = entry[0]
        entities = entry[1]["entities"]
        training_data.append((text, {"entities": entities}))
    return training_data

# Convert training data into spaCy's Example objects
def create_training_examples(nlp, train_data):
    examples = []
    for text, annotations in tqdm(train_data):
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    return examples

In [5]:

# Load your custom NER training data
train_data = load_training_data("train/train_data_1.jsonl")

In [6]:

# Convert training data to spaCy's Example format
train_examples = create_training_examples(nlp, train_data)

100%|██████████| 80883/80883 [00:12<00:00, 6695.81it/s]


In [7]:
learning_rate = 0.0000001  # Change this to your desired value

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()
    nlp.config["training"]["optimizer"]["learn_rate"] = learning_rate
    # Initialize the optimizer with the NER component
    # optimizer = nlp.create_optimizer()  # Just create the optimizer without any parameters
    # nlp.config["training"]["optimizer"]["learn_rate"] = learning_rate
    # Define your custom learning rate and batch size

    # Fine-tune the model on the new dataset
    for iteration in range(20):  # Adjust the number of iterations as needed
        losses = {}
        # Create batches with the specified batch size
        batches = spacy.util.minibatch(train_examples, size=spacy.util.compounding(4.0, 32.0, 1.3))
        for batch in batches:
            # Update the model with a custom learning rate
            nlp.update(batch, sgd=optimizer, drop=0.3, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

    # Save the updated model
nlp.to_disk("0fner_model_custom_1_3")

print("Model fine-tuning complete!")

Iteration 1, Losses: {'ner': 181.4309047089791}
Iteration 2, Losses: {'ner': 6.31446047260324}
Iteration 3, Losses: {'ner': 5.299651846845533}
Iteration 4, Losses: {'ner': 5.788887921566374e-09}
Iteration 5, Losses: {'ner': 2.4691197220999227e-09}
Iteration 6, Losses: {'ner': 4.2586802994046043e-10}
Iteration 7, Losses: {'ner': 4.369511781936642e-10}
Iteration 8, Losses: {'ner': 5.722904400086512e-09}
Iteration 9, Losses: {'ner': 1.680717371773283e-12}
Iteration 10, Losses: {'ner': 1.0152630637494759e-13}
Iteration 11, Losses: {'ner': 2.8335370642971674e-12}
Iteration 12, Losses: {'ner': 5.479805999979503e-15}
Iteration 13, Losses: {'ner': 4.984318006467928e-12}
Iteration 14, Losses: {'ner': 5.131933890476306e-15}
Iteration 15, Losses: {'ner': 7.924070197338322e-16}
Iteration 16, Losses: {'ner': 2.7936907585817966e-15}
Iteration 17, Losses: {'ner': 7.839360066843809e-17}
Iteration 18, Losses: {'ner': 1.2422622557424922e-09}
Iteration 19, Losses: {'ner': 3.454404346949668e-14}
Iteration

In [3]:
# Load the fine-tuned model
nlp_custom = spacy.load("0fner_model_custom_1_3")

# Test the model with a sample tweet
doc = nlp_custom("The aftershock pattern in purple suggests that the M. Noto earthquake ruptured bilaterally, with the preliminary NEIC slip model in Japan also indicating slip on both sides of the epicenter in Ishikawa")
for ent in doc.ents:
    print(ent.text, ent.label_)


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


The aftershock pattern in purple suggests that the M. Noto GPE
bilaterally, with the preliminary NEIC slip model in Japan also indicating slip on both sides of the epicenter in Ishikawa GPE
