In [None]:
# Install spaCy and download a model
!pip install -U spacy spacy-lookups-data
!python -m spacy download en_core_web_sm  # smaller model
# !python -m spacy download en_core_web_trf  # larger, more accurate transformer-based model


In [None]:
# Install required packages
!pip install -U spacy spacy-lookups-data
!python -m spacy download en_core_web_sm

import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
from spacy.pipeline import EntityRuler

# Initialize a blank English model
nlp = spacy.blank("en")

# Add the NER component to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add the new 'SKU' label to the NER component
ner.add_label("SKU")

# Create training data with correct character offsets
TRAIN_DATA = [
    ("Give me cartridge for 007-F5", {"entities": [(23, 29, "SKU")]}),
    ("Provide me cartridge for 008-S7", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 009-B6", {"entities": [(23, 29, "SKU")]}),
    # ... rest of your training data ...
]

# Function to check entity alignment
def check_alignment(nlp, train_data):
    print("Checking entity alignment...")
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        tags = offsets_to_biluo_tags(doc, annotations.get("entities"))
        if "-" in tags:
            print(f"Misaligned entities in: {text}")
            print(f"Tags: {tags}")

check_alignment(nlp, TRAIN_DATA)

# Training loop
with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != "ner"]):
    optimizer = nlp.initialize()
    for itn in range(100):
        losses = {}
        for text, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.2, losses=losses)
        if itn % 20 == 0:
            print(f"Iteration {itn}, Losses: {losses}")

# Save the model
nlp.to_disk("sku_ner_model")

# Test the model
test_texts = [
    "I need parts for 016-H8",
    "Give me cartridge for 007-F5",
    "Need to order ABC-123",
    "Replace XYZ-789",
]

print("\nTesting the model:")
for text in test_texts:
    doc = nlp(text)
    print(f"\nText: {text}")
    print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])

In [1]:
# Install required packages
# !pip install -U spacy spacy-lookups-data

import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
from spacy.pipeline import EntityRuler

# Initialize a blank English model
nlp = spacy.blank("en")

# Add the NER component to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add the new 'SKU' label to the NER component
ner.add_label("SKU")

# Create training data with correct character offsets
TRAIN_DATA = [
    ("Give me cartridge for 007-F5", {"entities": [(23, 29, "SKU")]}),
    ("Provide me cartridge for 008-S7", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 009-B6", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 010-A3", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 011-C4", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 012-D2", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 013-E1", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 014-F6", {"entities": [(23, 29, "SKU")]}),
    ("I need cartridge for 015-G7", {"entities": [(23, 29, "SKU")]}),
    ("I need parts for 016-H8", {"entities": [(19, 25, "SKU")]}),
    ("I need parts for 017-I9", {"entities": [(19, 25, "SKU")]}),
    ("I need parts for 018-J0", {"entities": [(19, 25, "SKU")]}),
    ("Need to order ABC-123", {"entities": [(14, 21, "SKU")]}),
    ("Replace XYZ-789 with 123-A4", {"entities": [(8, 15, "SKU"), (21, 27, "SKU")]}),
    # Additional diverse examples
    ("SKU 016-H8 needs replacement", {"entities": [(4, 10, "SKU")]}),
    ("Product ID: 016-H8", {"entities": [(11, 17, "SKU")]}),
    ("016-H8 is out of stock", {"entities": [(0, 6, "SKU")]}),
    ("Check inventory for ABC-123 and XYZ-789", {"entities": [(19, 26, "SKU"), (31, 38, "SKU")]}),
    ("Order placed for 007-F5", {"entities": [(16, 22, "SKU")]}),
]

# Function to check entity alignment
def check_alignment(nlp, train_data):
    print("Checking entity alignment...")
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        tags = offsets_to_biluo_tags(doc, annotations.get("entities"))
        if "X" in tags or "O" in tags and "B-SKU" not in tags:
            print(f"Misaligned entities in: {text}")
            print(f"Tags: {tags}")

check_alignment(nlp, TRAIN_DATA)

# Initialize the training
optimizer = nlp.begin_training()

# Disable other pipes for training to speed up and avoid interference
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    # Training loop
    for itn in range(100):
        losses = {}
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.2, losses=losses)
        if itn % 20 == 0:
            print(f"Iteration {itn}, Losses: {losses}")

# Save the trained model
nlp.to_disk("sku_ner_model")
print("\nModel saved to 'sku_ner_model'.")

# Test the trained model
print("\nTesting the trained model on various examples:")
TEST_TEXTS = [
    "I need parts for 016-H8",
    "Give me cartridge for 007-F5",
    "Need to order ABC-123",
    "Replace XYZ-789",
    "Check stock of 019-K1",           # New SKU not in training
    "Product reference: 020-L2",      # Different format
]

for text in TEST_TEXTS:
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"\nText: {text}")
    print("Entities found:", entities)

# Load and test the saved model
print("\nLoading and testing the saved model:")
loaded_nlp = spacy.load("sku_ner_model")

for text in TEST_TEXTS:
    doc = loaded_nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"\nText: {text}")
    print("Entities found:", entities)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Users/aviralgarg/.pyenv/versions/3.10.15/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/aviralgarg/.pyenv/versions/3.10.15/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/aviralgarg/code/stocks_3_10/.venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/aviralgarg/code/stocks_3_10/.venv/lib/python3.10/site-packages/traitlets/confi

Checking entity alignment...
Misaligned entities in: Give me cartridge for 007-F5
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: Provide me cartridge for 008-S7
Tags: ['O', 'O', 'O', '-', '-', '-', 'O']
Misaligned entities in: I need cartridge for 009-B6
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need cartridge for 010-A3
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need cartridge for 011-C4
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need cartridge for 012-D2
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need cartridge for 013-E1
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need cartridge for 014-F6
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need cartridge for 015-G7
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need parts for 016-H8
Tags: ['O', 'O', 'O', 'O', '-', '-', '-']
Misaligned entities in: I need



Iteration 0, Losses: {'ner': np.float32(35.755764)}
Iteration 20, Losses: {'ner': np.float32(4.2526622e-07)}
Iteration 40, Losses: {'ner': np.float32(4.765472e-08)}
Iteration 60, Losses: {'ner': np.float32(3.4979305e-08)}
Iteration 80, Losses: {'ner': np.float32(1.1496209e-08)}

Model saved to 'sku_ner_model'.

Testing the trained model on various examples:

Text: I need parts for 016-H8
Entities found: [('016-H8', 'SKU')]

Text: Give me cartridge for 007-F5
Entities found: [('007-F5', 'SKU')]

Text: Need to order ABC-123
Entities found: [('ABC-123', 'SKU')]

Text: Replace XYZ-789
Entities found: [('XYZ-789', 'SKU')]

Text: Check stock of 019-K1
Entities found: [('019-K1', 'SKU')]

Text: Product reference: 020-L2
Entities found: [('020-L2', 'SKU')]

Loading and testing the saved model:

Text: I need parts for 016-H8
Entities found: [('016-H8', 'SKU')]

Text: Give me cartridge for 007-F5
Entities found: [('007-F5', 'SKU')]

Text: Need to order ABC-123
Entities found: [('ABC-123', 'SKU')

In [32]:
# text = "I need parts for 016-H8"

# # Process the text
# doc = nlp(text)

# # Display entities
# for ent in doc.ents:
#     print(f"Entity: {ent.text}, Label: {ent.label_}")

In [None]:
# # First, install required package
# # !pip install spacy-lookups-data

# import spacy
# from spacy.tokens import DocBin
# from spacy.cli.train import train
# from spacy.training import Example
# from spacy.training import offsets_to_biluo_tags

# # Create training data with corrected entity spans
# TRAIN_DATA = [
#     ("Give me cartridge for 007-F5", {"entities": [(23, 29, "SKU")]}),  # Adjusted span
#     ("Provide me cartridge for 008-S7", {"entities": [(23, 29, "SKU")]}),
#     ("I need cartridge for 009-B6", {"entities": [(23, 29, "SKU")]}),
#     ("I need cartridge for 010-A3", {"entities": [(23, 29, "SKU")]}),
#     ("I need cartridge for 011-C4", {"entities": [(23, 29, "SKU")]}),
#     ("I need cartridge for 012-D2", {"entities": [(23, 29, "SKU")]}),
#     ("I need cartridge for 013-E1", {"entities": [(23, 29, "SKU")]}),
#     ("I need cartridge for 014-F6", {"entities": [(23, 29, "SKU")]}),
#     ("I need cartridge for 015-G7", {"entities": [(23, 29, "SKU")]}),
#     ("I need parts for 016-H8", {"entities": [(16, 22, "SKU")]}),
#     ("I need parts for 017-I9", {"entities": [(16, 22, "SKU")]}),
#     ("I need parts for 018-J0", {"entities": [(16, 22, "SKU")]}),
#     ("Need to order ABC-123", {"entities": [(14, 21, "SKU")]}),
#     ("Replace XYZ-789 with 123-A4", {"entities": [(8, 15, "SKU"), (21, 27, "SKU")]}),
#     # Additional diverse examples
#     ("SKU 016-H8 needs replacement", {"entities": [(4, 10, "SKU")]}),
#     ("Product ID: 016-H8", {"entities": [(11, 17, "SKU")]}),
#     ("016-H8 is out of stock", {"entities": [(0, 6, "SKU")]}),
#     ("Check inventory for ABC-123 and XYZ-789", {"entities": [(19, 26, "SKU"), (31, 38, "SKU")]}),
#     ("Order placed for 007-F5", {"entities": [(16, 22, "SKU")]})
# ]

# # Start with pre-trained model
# nlp = spacy.load("en_core_web_sm")

# # Check entity alignment before training
# print("Checking entity alignment...")
# for text, annotations in TRAIN_DATA:
#     doc = nlp.make_doc(text)
#     tags = offsets_to_biluo_tags(doc, annotations.get("entities"))
#     if "-" in tags:
#         print(f"Misaligned entities in: {text}")
#         print(f"Tags: {tags}")

# # Add NER component
# if "ner" not in nlp.pipe_names:
#     ner = nlp.add_pipe("ner")
# else:
#     ner = nlp.get_pipe("ner")

# # Add new label
# ner.add_label("SKU")

# # Disable other pipeline components during training
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
# with nlp.disable_pipes(*other_pipes):
#     # Train the model
#     optimizer = nlp.begin_training()
#     for itn in range(100):
#         losses = {}
#         for text, annotations in TRAIN_DATA:
#             example = Example.from_dict(nlp.make_doc(text), annotations)
#             nlp.update([example], drop=0.2, losses=losses)
#         if itn % 20 == 0:
#             print(f"Losses at iteration {itn}: {losses}")

# # Save the model
# nlp.to_disk("sku_ner_model")

# # Test the model
# print("\nTesting model on various examples:")
# test_texts = [
#     "I need parts for 016-H8",
#     "Give me cartridge for 007-F5",
#     "Need to order ABC-123",
#     "Replace XYZ-789",
#     "Check stock of 019-K1",  # New SKU not in training
#     "Product reference: 020-L2"  # Different format
# ]

# for test_text in test_texts:
#     doc = nlp(test_text)
#     print(f"\nText: {test_text}")
#     print("Entities found:", [(ent.text, ent.label_) for ent in doc.ents])

In [2]:
!pip install streamlit

Collecting streamlit
  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/ae/53/418536f5d0b87bfbe7bbd8c001983c27e9474f82723bd2e529660fd9a534/streamlit-1.40.2-py2.py3-none-any.whl.metadata
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Obtaining dependency information for altair<6,>=4.0 from https://files.pythonhosted.org/packages/aa/f3/0b6ced594e51cc95d8c1fc1640d3623770d01e4969d29c0bd09945fafefa/altair-5.5.0-py3-none-any.whl.metadata
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting cachetools<6,>=4.0 (from streamlit)
  Obtaining dependency information for cachetools<6,>=4.0 from https://files.pythonhosted.org/packages/a4/07/14f8ad37f2d12a5ce41206c21820d8cb6561b728e51fad4530dff0552a67/cachetools-5.5.0-py3-none-any.whl.metadata
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting pyarrow>=7.0 (from streamlit)
  Obtaining dependency in

In [3]:
import streamlit as st
import json

# Initialize session state for annotations
if 'annotations' not in st.session_state:
    st.session_state.annotations = []

# Initialize session state for entity types
if 'entity_types' not in st.session_state:
    st.session_state.entity_types = ['SKU']

st.title("NER Annotation Tool")
st.write("Select text and assign named entities.")

# Text input
text = st.text_area("Enter Text for Annotation:", height=200)

# Container to show selected text and assign entity
if text:
    st.markdown("### Annotate Entities")
    st.markdown("Select a portion of the text below, and assign an entity type from the dropdown menu.")
    
    # Display the text with a unique identifier for selection
    st.write(text, unsafe_allow_html=True)
    
    # Placeholder for annotations
    placeholder = st.empty()
    
    # Button to add a new entity type
    if st.button("Add New Entity Type"):
        new_entity = st.text_input("Enter new entity type:")
        if new_entity:
            st.session_state.entity_types.append(new_entity)
            st.success(f"Added new entity type: {new_entity}")
    
    # Dropdown to select entity type
    entity_type = st.selectbox("Select Entity Type:", st.session_state.entity_types)
    
    # Button to save annotation
    if st.button("Save Annotation"):
        # Here, you'd capture the selected text and its span
        # For simplicity, we'll simulate with placeholders
        # In a real scenario, you'd use JavaScript to get selected text and its indices
        selected_text = st.text_input("Selected Text:")
        start_pos = st.number_input("Start Position:", min_value=0)
        end_pos = st.number_input("End Position:", min_value=0)
        
        if selected_text and start_pos < end_pos:
            annotation = {
                "text": selected_text,
                "start": start_pos,
                "end": end_pos,
                "label": entity_type
            }
            st.session_state.annotations.append(annotation)
            st.success(f"Annotated: {selected_text} as {entity_type}")
    
    # Display current annotations
    st.markdown("### Current Annotations")
    for annot in st.session_state.annotations:
        st.write(f"{annot['text']} --> {annot['label']} [{annot['start']}, {annot['end']}]")
    
    # Save annotations to a JSON file
    if st.button("Save Annotations"):
        with open("annotations.json", "w") as f:
            json.dump(st.session_state.annotations, f, indent=4)
        st.success("Annotations saved to 'annotations.json'")

2024-12-04 17:28:27.400 
  command:

    streamlit run /Users/aviralgarg/code/stocks_3_10/.venv/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [4]:
pip install streamlit-components

[31mERROR: Could not find a version that satisfies the requirement streamlit-components (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for streamlit-components[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import streamlit as st
import json
import streamlit.components.v1 as components

# Initialize session state for annotations
if 'annotations' not in st.session_state:
    st.session_state.annotations = []

# Initialize session state for entity types
if 'entity_types' not in st.session_state:
    st.session_state.entity_types = ['SKU']

st.title("NER Annotation Tool")
st.write("Select text and assign named entities.")

# Text input
text = st.text_area("Enter Text for Annotation:", height=200)

if text:
    st.markdown("### Annotate Entities")
    st.markdown("Select a portion of the text below, and assign an entity type from the dropdown menu.")
    
    # HTML and JavaScript for text selection
    html_code = f"""
    <div id="text-container" style="border:1px solid #ccc; padding:10px; border-radius:5px;">
        {text}
    </div>
    <script>
        const textContainer = document.getElementById("text-container");
        textContainer.addEventListener("mouseup", function() {{
            const selection = window.getSelection();
            const selectedText = selection.toString();
            if (selectedText.length > 0) {{
                const range = selection.getRangeAt(0);
                const preSelectionRange = range.cloneRange();
                preSelectionRange.selectNodeContents(textContainer);
                preSelectionRange.setEnd(range.startContainer, range.startOffset);
                const start = preSelectionRange.toString().length;
                const end = start + range.toString().length;
                // Send the data back to Streamlit
                window.parent.postMessage({{
                    'type': 'selection',
                    'selected_text': selectedText,
                    'start': start,
                    'end': end
                }}, '*');
            }}
        }});
    </script>
    """
    
    # Create a placeholder for the JavaScript component
    components.html(html_code, height=300)
    
    # JavaScript to Streamlit communication
    selection = st.experimental_get_query_params().get("selected_text", [None])[0]
    start = st.experimental_get_query_params().get("start", [0])[0]
    end = st.experimental_get_query_params().get("end", [0])[0]
    
    # Define a component to capture the JavaScript message
    selection_data = st.empty()
    
    # Custom Streamlit component to handle messages
    components_js = """
    <script>
    window.addEventListener("message", function(event) {
        if (event.data && event.data.type === 'selection') {
            const selected_text = event.data.selected_text;
            const start = event.data.start;
            const end = event.data.end;
            // Send to Streamlit
            const data = {selected_text, start, end};
            window.parent.postMessage({ "streamlit-message": JSON.stringify(data) }, "*");
        }
    });
    </script>
    """
    components.html(components_js)
    
    # Placeholder to display selected text
    selected_text = st.text_input("Selected Text:")
    selected_start = st.number_input("Start Position:", min_value=0, value=0)
    selected_end = st.number_input("End Position:", min_value=0, value=0)
    
    # Dropdown to select entity type
    entity_type = st.selectbox("Select Entity Type:", st.session_state.entity_types)
    
    # Button to save annotation
    if st.button("Save Annotation"):
        if selected_text and selected_start < selected_end:
            annotation = {
                "text": selected_text,
                "start": selected_start,
                "end": selected_end,
                "label": entity_type
            }
            st.session_state.annotations.append(annotation)
            st.success(f"Annotated: {selected_text} as {entity_type}")
        else:
            st.error("Please select valid text and positions.")
    
    # Button to add a new entity type
    if st.button("Add New Entity Type"):
        new_entity = st.text_input("Enter new entity type:")
        if new_entity:
            st.session_state.entity_types.append(new_entity)
            st.success(f"Added new entity type: {new_entity}")
    
    # Display current annotations
    st.markdown("### Current Annotations")
    for annot in st.session_state.annotations:
        st.write(f"{annot['text']} --> {annot['label']} [{annot['start']}, {annot['end']}]")
    
    # Save annotations to a JSON file
    if st.button("Save Annotations"):
        with open("annotations.json", "w") as f:
            json.dump(st.session_state.annotations, f, indent=4)
        st.success("Annotations saved to 'annotations.json'")



In [9]:
# save the last cell into a file called ner_annotation_app.py
# then go to terminal and 

!streamlit run ner_annotation_app.py


      👋 [1mWelcome to Streamlit![0m

      If you’d like to receive helpful onboarding emails, news, offers, promotions,
      and the occasional swag, please enter your email address below. Otherwise,
      leave this field blank.

      [34mEmail: [0m ^C
2024-12-04 17:33:08.449 
