In [3]:
import spacy
from spacy.training.example import Example

# Create a blank language model for English.
# Nothing has been added to the pipeline at this point; the NER component
# below is the only one this cell adds.
nlp = spacy.blank("en")

# Add a Named Entity Recognition (NER) component to the pipeline and keep a
# handle on it so labels can be registered before training.
ner = nlp.add_pipe("ner")
ner.add_label("COLOR")  # "COLOR" is the only entity label used by the training data below

# Training data: list of (sentence, annotations) pairs.
# Each entity is (start_char, end_char, label) with an exclusive end offset,
# so text[start_char:end_char] must be exactly the colour word. Offsets that
# start on the preceding space or cut off the last letter do not line up with
# token boundaries and cannot be used as NER annotations.
training_data = [
    ("The apple is red", {"entities": [(13, 16, "COLOR")]}),
    ("The door is blue", {"entities": [(12, 16, "COLOR")]}),
    ("The car is green", {"entities": [(11, 16, "COLOR")]}),
    ("The house is yellow", {"entities": [(13, 19, "COLOR")]}),
    ("The stadium is orange", {"entities": [(15, 21, "COLOR")]}),
    ("The app is purple", {"entities": [(11, 17, "COLOR")]}),
    ("The ball is pink", {"entities": [(12, 16, "COLOR")]}),
    ("The shirt is brown", {"entities": [(13, 18, "COLOR")]}),
    ("The cat is black", {"entities": [(11, 16, "COLOR")]}),
    ("The book is white", {"entities": [(12, 17, "COLOR")]}),
    ("The cloud is gray", {"entities": [(13, 17, "COLOR")]}),
    ("The bike is silver", {"entities": [(12, 18, "COLOR")]}),
]

# Sanity check: every annotated span must be a whole whitespace-delimited word
# of its sentence. Catches off-by-one offsets as soon as the cell runs.
assert all(
    sentence[start:end] in sentence.split()
    for sentence, annotation in training_data
    for start, end, _label in annotation["entities"]
), "entity offsets must cover a whole word"
# Convert the training data into spaCy's format: one Example per sentence,
# built from the tokenized (unannotated) text plus its annotation dict.
spacy_training_data = []
for text, annotations in training_data:
    example = Example.from_dict(nlp.make_doc(text), annotations)
    spacy_training_data.append(example)

# Names of every pipeline component except "ner", so that only the NER
# component is updated during training.
disabled_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

# Training loop.
# select_pipes() and initialize() are the spaCy v3 names for the deprecated
# disable_pipes() and begin_training().
with nlp.select_pipes(disable=disabled_pipes):
    # Hand the real examples to initialize() so the components are set up from
    # the actual training data, and keep the optimizer it returns.
    optimizer = nlp.initialize(lambda: spacy_training_data)
    for iteration in range(100):  # Number of passes over the training data
        losses = {}  # loss per component, accumulated over this pass
        for example in spacy_training_data:
            # One example per update; pass the optimizer explicitly rather
            # than relying on the pipeline's implicit default.
            nlp.update([example], sgd=optimizer, drop=0.5, losses=losses)
        # Uncomment the following line to print losses during training
        # print(losses)

# Persist the trained pipeline in the "color_ner_model" directory ...
nlp.to_disk("color_ner_model")

# ... and read it back, so the prediction below runs on the saved copy
# rather than on the in-memory pipeline that was just trained.
nlp_loaded = spacy.load("color_ner_model")

# Probe text: a bare, space-separated list of colour words
text = "red blue green yellow orange purple pink brown black white gray"

# Run the reloaded pipeline over the probe text
doc = nlp_loaded(text)

# Print one "<entity text> <label>" line per recognized entity
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


red COLOR
blue COLOR
green COLOR
yellow COLOR
orange COLOR
purple COLOR
pink COLOR
brown COLOR
black COLOR
white COLOR
gray COLOR


In [1]:
from nltk import word_tokenize
from nltk.tag import ClassifierBasedPOSTagger
from nltk.classify import MaxentClassifier

# Hand-tagged training corpus for the custom POS tagger: four short sentences,
# each a list of (word, tag) pairs.
sent_train = [
    [
        ("cars", "NNS"), ("are", "VBP"), ("parked", "VBN"),
        ("in", "IN"), ("the", "DT"), ("garage", "NN"),
    ],
    [
        ("cats", "NNS"), ("are", "VBP"), ("playing", "VBG"),
        ("in", "IN"), ("the", "DT"), ("garden", "NN"),
    ],
    [
        ("doors", "NNS"), ("are", "VBP"), ("closed", "VBN"),
    ],
    [
        ("pillows", "NNS"), ("are", "VBP"), ("on", "IN"),
        ("the", "DT"), ("sofa", "NN"),
    ],
]

# I used the nltk library and didn't use the gensim library 
# due to a version issue with gensim.

# Train the POS tagger.
# The tagger is given the tagged sentences in sent_train, and
# MaxentClassifier.train is supplied as the function that builds its
# underlying classifier, i.e. the model is fitted with the Maximum Entropy
# method, a widely used machine learning algorithm for classification.
classifier_tagger = ClassifierBasedPOSTagger(
    train=sent_train,
    classifier_builder=MaxentClassifier.train,
)

# Words to tag. The first four occur in sent_train; the remaining ten are
# capitalized words that do not appear in the training sentences.
test = [
    "doors", "pillows", "cars", "cats",
    "Books", "Houses", "Trees", "Computers", "Flowers",
    "Birds", "Children", "Chairs", "Keys", "Cups",
]

# Test the POS tagger.
# Each entry of `test` is handled on its own: it is tokenized, the resulting
# tokens are passed to the trained tagger, and the entry, the tagger's output
# and a 50-character "=" separator line are printed.
for word in test:
    words = word_tokenize(word)  # token list for this single entry
    pos_tags = classifier_tagger.tag(words)  # tagger output for those tokens
    print("Original Word:", word)
    print("POS Tags: ", pos_tags)
    print("=" * 50)


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.94591        0.200
             2          -0.71757        1.000
             3          -0.45197        1.000
             4          -0.32966        1.000
             5          -0.25940        1.000
             6          -0.21380        1.000
             7          -0.18183        1.000
             8          -0.15817        1.000
             9          -0.13996        1.000
            10          -0.12550        1.000
            11          -0.11376        1.000
            12          -0.10402        1.000
            13          -0.09582        1.000
            14          -0.08882        1.000
            15          -0.08278        1.000
            16          -0.07750        1.000
            17          -0.07286        1.000
            18          -0.06874        1.000
            19          -0.06507        1.000
 