In [16]:
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example

In [17]:
with open("cars.txt") as file:
    dataset = file.read()

In [18]:
nlp = spacy.load("en_core_web_lg")
doc = nlp(dataset)
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])

Entities: [('Fiat', 'ORG'), ('Porsche', 'ORG'), ('Toyota', 'ORG'), ('Skoda', 'ORG'), ('110 million euros', 'MONEY'), ('Boleslav', 'PRODUCT'), ('2', 'CARDINAL'), ('Honda', 'ORG'), ('Civic Type R.', 'PRODUCT'), ('Mondeo', 'ORG'), ('fourth', 'ORDINAL'), ('Audi', 'ORG'), ('Sportback', 'ORG'), ('Bugatti', 'ORG'), ('first', 'ORDINAL'), ('2030', 'DATE'), ('Honda', 'ORG'), ('Toyota', 'ORG'), ('Fiat', 'ORG'), ('500XL', 'PRODUCT'), ('5', 'CARDINAL'), ('Kia', 'ORG'), ('Porsche', 'ORG'), ('Volkswagen', 'ORG'), ('Ferrari', 'PRODUCT'), ('Lotus', 'ORG'), ('The Land Rover', 'ORG'), ('the Kia Stonic', 'ORG'), ('Jaguar', 'ORG'), ('Citroen', 'ORG'), ('Ford', 'ORG'), ('Kia', 'ORG'), ('Ford', 'ORG'), ('Ka', 'PRODUCT'), ('Mercedes', 'ORG'), ('Milan', 'GPE'), ('Smart', 'ORG'), ('Jeep', 'ORG'), ('Wrangler', 'PRODUCT'), ('Suzuki', 'ORG')]


In [19]:
words= [ "Abarth", "Alfa Romeo", "Aston Martin", "Audi", "Bentley", "BMW", "Bugatti", "Cadillac", "Chevrolet", "Chrysler", "Citroën",
        "Dacia","Daewoo","Daihatsu","Dodge","Donkervoort","DS", "Ferrari", "Fiat", "Fisker", "Ford", "Honda", "Hummer", "Hyundai",
        "Infiniti","Iveco","Jaguar","Jeep","Kia","KTM","Lada","Lamborghini","Lancia","Land Rover","Landwind","Lexus","Lotus","Maserati",
        "Maybach","Mazda","McLaren","Mercedes-Benz","MG","Mini","Mitsubishi","Morgan","Nissan","Opel","Peugeot","Porsche","Renault",
        "Rolls-Royce","Saab","Seat","Skoda","Smart","SsangYong","Subaru","Suzuki","Tesla","Toyota","Volkswagen","Volvo",]

In [20]:
train_data = []

In [21]:
with open("cars.txt") as file:
    dataset = file.readlines()
    for sentence in dataset:
        print("######")
        print("sentence: ", sentence)
        print("######")
        sentence = sentence.lower()
        entities = []
        for word in words:
            word = word.lower()
            if word in sentence:
                start_index = sentence.index(word)
                end_index = len(word) + start_index
                print("word: ", word)
                print("----------------")
                print("start index:", start_index)
                print("end index:", end_index)
                pos = (start_index, end_index, "CAR")
                entities.append(pos)
        element = (sentence.rstrip('\n'), {"entities": entities})

        train_data.append(element)
        print('----------------')
        print("element:", element)

        ("this is my sentence", {"entities": [0, 4, "PREP"]})
        ("this is my sentence", {"entities": [6, 8, "VERB"]})

######
sentence:  What is the price of that Fiat 500XL?

######
word:  fiat
----------------
start index: 26
end index: 30
----------------
element: ('what is the price of that fiat 500xl?', {'entities': [(26, 30, 'CAR')]})
######
sentence:  Have you ever driven a Porsche?

######
word:  porsche
----------------
start index: 23
end index: 30
----------------
element: ('have you ever driven a porsche?', {'entities': [(23, 30, 'CAR')]})
######
sentence:  Toyota's new model is something extraordinary.

######
word:  toyota
----------------
start index: 0
end index: 6
----------------
element: ("toyota's new model is something extraordinary.", {'entities': [(0, 6, 'CAR')]})
######
sentence:  Skoda has invested 110 million euros to upgrade the lines of the Boleslav factory.

######
word:  skoda
----------------
start index: 0
end index: 5
----------------
element: ('skoda has invested 110 million euros to upgrade the lines of the boleslav factory.', {'entities': [(0, 5, 'CAR')]})
######
sen

In [22]:
ner = nlp.get_pipe("ner")

for _, annotations in train_data:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [23]:
# Training model
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]


with nlp.disable_pipes(*unaffected_pipes):
    for iteration in range(60):
        print("Iteration #", iteration)
        # Data shuffle for each iteration
        random.shuffle(train_data)
        losses = {}

        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            for text, annotations in batch:
                # Create an Example object
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], losses=losses, drop=0.1)
                # Update the model
        print("Losses:", losses)

Iteration # 0
Losses: {'ner': 51.97861171007611}
Iteration # 1
Losses: {'ner': 21.663507769229497}
Iteration # 2
Losses: {'ner': 2.9478178715584207}
Iteration # 3
Losses: {'ner': 1.858482168662217}
Iteration # 4
Losses: {'ner': 1.2121309533570694}
Iteration # 5
Losses: {'ner': 1.675817336484937}
Iteration # 6
Losses: {'ner': 1.6983436784676362}
Iteration # 7
Losses: {'ner': 1.087571020713953}
Iteration # 8
Losses: {'ner': 1.9918660781384467}
Iteration # 9
Losses: {'ner': 0.951151707052491}
Iteration # 10
Losses: {'ner': 1.4281030047871028}
Iteration # 11
Losses: {'ner': 0.8105468829204088}
Iteration # 12
Losses: {'ner': 27.639045738249134}
Iteration # 13
Losses: {'ner': 28.54100299076437}
Iteration # 14
Losses: {'ner': 1.9229051734174873}
Iteration # 15
Losses: {'ner': 2.709783579269482}
Iteration # 16
Losses: {'ner': 1.4794484479004997}
Iteration # 17
Losses: {'ner': 1.4089912995110128}
Iteration # 18
Losses: {'ner': 0.8913804507396723}
Iteration # 19
Losses: {'ner': 1.890615852630716

In [24]:
output_dir = Path("/ner/")
nlp.to_disk(output_dir)
print("Saved correctly!")

Saved correctly!


In [25]:
print("Loading model...")
nlp_updated = spacy.load(output_dir)

Loading model...


In [26]:
# new sentence, new word
doc = nlp_updated("Research before you buy or lease a new Tesla vehicle with expert ratings")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])

entities: [('Tesla', 'CAR')]


In [None]:
# new sentence, new word
doc = nlp_updated("Research before you buy or lease a new Tesla vehicle with expert ratings")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])

In [27]:
# new sentence, new word
doc = nlp_updated("Read the latest Mercedes new car reviews, put through their paces by our team of expert road testers, covering performance, depreciation, servicing cost, ")
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])

entities: [('Mercedes', 'CAR')]


*Farrukh Bulbulov*