In [1]:
import pandas as pd
import re
import spacy
from spacy import displacy
import spacy_transformers

### Load dataset


In [2]:
with open('../dataset/custom_ner_dataset.jsonl', 'r') as f:
    text_data = f.readlines()

In [3]:
dataset_list = []
for i in text_data:
    dataset_list.append(eval(i))
dataset_list

[{'id': 12673,
  'text': 'Well begin with the clamp. Inspect the jaw covers  Check for damage and missing clips and repair or replace as needed. If new jaw covers are needed, it can help to clamp the new covers around a pedal wrench during install to ensure they seat properly. Also check the blue saddle pad for wear and replace as needed.',
  'label': [[194, 199, 'cycLingo'],
   [237, 241, 'cycLingo'],
   [272, 278, 'cycLingo']],
  'Comments': []},
 {'id': 12674,
  'text': 'Loosen and remove the bolts holding the upper chain retainer to the upright. Remove the upper chain retention bolt and unthread the chain from the carriage.',
  'label': [[46, 51, 'cycLingo'],
   [94, 99, 'cycLingo'],
   [132, 137, 'cycLingo']],
  'Comments': []},
 {'id': 12675,
  'text': 'Next, check the set screws in the coupler that couples the motor to the drive shaft of the carriage. These set screws are in charge of transmitting power and can come loose over time. Remove the set screws one at a time, apply med

In [4]:
df = pd.DataFrame(dataset_list, columns=['id','text','label'])

In [5]:
df.head()

Unnamed: 0,id,text,label
0,12673,Well begin with the clamp. Inspect the jaw cov...,"[[194, 199, cycLingo], [237, 241, cycLingo], [..."
1,12674,Loosen and remove the bolts holding the upper ...,"[[46, 51, cycLingo], [94, 99, cycLingo], [132,..."
2,12675,"Next, check the set screws in the coupler that...",[]
3,12676,The chain inside the carriage of the stand sho...,"[[4, 9, cycLingo], [145, 150, cycLingo], [176,..."
4,12677,"To address this, locate the turnbuckle at the ...","[[154, 159, cycLingo]]"


#### Randomize order of rows and split into 80% training and 20% testing set

In [6]:
import numpy as np
df.set_index('id')
df['random'] = np.random.randint(0, 3000, df.shape[0])

In [7]:
df = df.sort_values(by='random')
df = df.reset_index()

df

Unnamed: 0,index,id,text,label,random
0,460,13133,Installation of a press fit bottom bracket is ...,"[[18, 27, cycLingo], [28, 42, cycLingo], [67, ...",0
1,509,13182,If the bottom bracket is installed and removed...,"[[7, 21, cycLingo], [125, 131, cycLingo], [177...",1
2,587,13260,"When dealing with a carbon steerer tube, it is...","[[27, 40, cycLingo], [68, 80, cycLingo], [108,...",1
3,720,13393,Troubleshooting refers to extra issues or feat...,"[[254, 262, cycLingo]]",3
4,1767,14440,"There are different styles of tubeless valves,...","[[30, 38, cycLingo]]",3
...,...,...,...,...,...
2085,1452,14125,NOTE: Brush off and wipe freehub body clean. D...,"[[25, 32, cycLingo], [57, 64, cycLingo], [92, ...",2992
2086,1092,13765,Inspect under the fork leg ends for an adjustm...,"[[18, 22, cycLingo], [85, 89, cycLingo]]",2992
2087,1600,14273,It can be difficult to outline the cleats as d...,"[[35, 41, cycLingo], [73, 79, cycLingo], [127,...",2992
2088,598,13271,This article will walk through the process of ...,"[[64, 73, cycLingo]]",2993


### Training set = 2090 * 0.8 = 1672, 0-1672
### Test set = 2090 - 1672 = 418

In [8]:
df_training = df[df.index <= 1672]
df_test = df[df.index >1672]
df_training

Unnamed: 0,index,id,text,label,random
0,460,13133,Installation of a press fit bottom bracket is ...,"[[18, 27, cycLingo], [28, 42, cycLingo], [67, ...",0
1,509,13182,If the bottom bracket is installed and removed...,"[[7, 21, cycLingo], [125, 131, cycLingo], [177...",1
2,587,13260,"When dealing with a carbon steerer tube, it is...","[[27, 40, cycLingo], [68, 80, cycLingo], [108,...",1
3,720,13393,Troubleshooting refers to extra issues or feat...,"[[254, 262, cycLingo]]",3
4,1767,14440,"There are different styles of tubeless valves,...","[[30, 38, cycLingo]]",3
...,...,...,...,...,...
1668,1787,14460,The less common 36-degree angular contact stan...,"[[58, 65, cycLingo], [138, 145, cycLingo], [17...",2366
1669,1634,14307,The rear wheel is installed only when the hub-...,"[[9, 14, cycLingo], [42, 45, cycLingo]]",2369
1670,1615,14288,The NuVinci uses a single twist-style shifter....,"[[38, 45, cycLingo], [172, 179, cycLingo], [20...",2373
1671,1133,13806,"After referencing the left side of the bike, c...","[[183, 192, cycLingo], [194, 203, cycLingo], [...",2374


In [9]:
df_test

Unnamed: 0,index,id,text,label,random
1673,1309,13982,Campagnolo drivetrains require the longest pos...,"[[11, 22, cycLingo], [52, 57, cycLingo], [212,...",2375
1674,1888,14561,IMPORTANT NOTE: The common ISO\/English thread...,"[[160, 169, cycLingo], [226, 232, cycLingo], [...",2377
1675,214,12887,Snug the slider knob and lift off the tool. Fl...,"[[53, 58, cycLingo]]",2377
1676,1090,13763,Begin by mounting the bike in a repair stand a...,"[[68, 73, cycLingo], [86, 98, cycLingo], [131,...",2377
1677,584,13257,Install the new stem and spacers. Set the bike...,"[[16, 20, cycLingo], [102, 108, cycLingo]]",2378
...,...,...,...,...,...
2085,1452,14125,NOTE: Brush off and wipe freehub body clean. D...,"[[25, 32, cycLingo], [57, 64, cycLingo], [92, ...",2992
2086,1092,13765,Inspect under the fork leg ends for an adjustm...,"[[18, 22, cycLingo], [85, 89, cycLingo]]",2992
2087,1600,14273,It can be difficult to outline the cleats as d...,"[[35, 41, cycLingo], [73, 79, cycLingo], [127,...",2992
2088,598,13271,This article will walk through the process of ...,"[[64, 73, cycLingo]]",2993


In [10]:
#df_training = df_training.drop(['index'], axis=1)
df_training.set_index('id')

Unnamed: 0_level_0,index,text,label,random
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13133,460,Installation of a press fit bottom bracket is ...,"[[18, 27, cycLingo], [28, 42, cycLingo], [67, ...",0
13182,509,If the bottom bracket is installed and removed...,"[[7, 21, cycLingo], [125, 131, cycLingo], [177...",1
13260,587,"When dealing with a carbon steerer tube, it is...","[[27, 40, cycLingo], [68, 80, cycLingo], [108,...",1
13393,720,Troubleshooting refers to extra issues or feat...,"[[254, 262, cycLingo]]",3
14440,1767,"There are different styles of tubeless valves,...","[[30, 38, cycLingo]]",3
...,...,...,...,...
14460,1787,The less common 36-degree angular contact stan...,"[[58, 65, cycLingo], [138, 145, cycLingo], [17...",2366
14307,1634,The rear wheel is installed only when the hub-...,"[[9, 14, cycLingo], [42, 45, cycLingo]]",2369
14288,1615,The NuVinci uses a single twist-style shifter....,"[[38, 45, cycLingo], [172, 179, cycLingo], [20...",2373
13806,1133,"After referencing the left side of the bike, c...","[[183, 192, cycLingo], [194, 203, cycLingo], [...",2374


In [11]:
training_list = []
for key,value in df_training.iterrows():
    tmp_dict = {}
    tmp_dict['id'] = value['id']
    tmp_dict['text'] = value['text']
    tmp_dict['label'] = value['label']
    training_list.append(tmp_dict)
training_list

[{'id': 13133,
  'text': 'Installation of a press fit bottom bracket is similar to a pressed headset. There are a few tool options and all require a press of some sort. For example, use the Park Tool HHP-2, HHP-3, or the BBP-1.2',
  'label': [[18, 27, 'cycLingo'], [28, 42, 'cycLingo'], [67, 74, 'cycLingo']]},
 {'id': 13182,
  'text': 'If the bottom bracket is installed and removed using threads, there will be tool fittings visible. This is also the case for frames using press fit shells and a thread-together bottom bracket, also known as a thread-thru. For purposes of tool selection, a thread-together bottom bracket is treated the same as a threaded bottom bracket.',
  'label': [[7, 21, 'cycLingo'],
   [125, 131, 'cycLingo'],
   [177, 191, 'cycLingo'],
   [272, 286, 'cycLingo'],
   [321, 335, 'cycLingo']]},
 {'id': 13260,
  'text': 'When dealing with a carbon steerer tube, it is critical to have the steerer tube go all the way through the stem, in order to reduce stress on the end of t

In [12]:
training_data = {'classes' : ['cycLingo'], 'annotations' : []}
for i in training_list:
    temp_dict = {}
    temp_dict["text"] = i["text"]
    temp_dict["entities"] = []
    for annotation in i['label']:
        start = annotation[0]
        end = annotation[1]
        label = annotation[2]
        temp_dict["entities"].append((start,end,label))
    training_data['annotations'].append(temp_dict)

training_data

{'classes': ['cycLingo'],
 'annotations': [{'text': 'Installation of a press fit bottom bracket is similar to a pressed headset. There are a few tool options and all require a press of some sort. For example, use the Park Tool HHP-2, HHP-3, or the BBP-1.2',
   'entities': [(18, 27, 'cycLingo'),
    (28, 42, 'cycLingo'),
    (67, 74, 'cycLingo')]},
  {'text': 'If the bottom bracket is installed and removed using threads, there will be tool fittings visible. This is also the case for frames using press fit shells and a thread-together bottom bracket, also known as a thread-thru. For purposes of tool selection, a thread-together bottom bracket is treated the same as a threaded bottom bracket.',
   'entities': [(7, 21, 'cycLingo'),
    (125, 131, 'cycLingo'),
    (177, 191, 'cycLingo'),
    (272, 286, 'cycLingo'),
    (321, 335, 'cycLingo')]},
  {'text': 'When dealing with a carbon steerer tube, it is critical to have the steerer tube go all the way through the stem, in order to reduce str

In [13]:
test_list = []
for key,value in df_test.iterrows():
    tmp_dict_test = {}
    tmp_dict_test['id'] = value['id']
    tmp_dict_test['text'] = value['text']
    tmp_dict_test['label'] = value['label']
    test_list.append(tmp_dict_test)
test_list

[{'id': 13982,
  'text': 'Campagnolo drivetrains require the longest possible chain length the bike can use. Campagnolo says this results in a feeling of softness in shifting by reducing spring tension at the pulley cage. You will need a chain tool to break the chain and an 8mm hex wrench to use a feeler gauge. If you are running the eleven speed Campagnolo chain, note that our CT-4.3 or CT-6.3 include peening anvils.',
  'label': [[11, 22, 'cycLingo'],
   [52, 57, 'cycLingo'],
   [212, 217, 'cycLingo'],
   [236, 241, 'cycLingo'],
   [334, 339, 'cycLingo']]},
 {'id': 14561,
  'text': 'IMPORTANT NOTE: The common ISO\\/English thread standard found on most bikes uses a left-hand thread on the drive side of the bike. The drive side is also called chainring side, and is the right side as seen from sitting in the saddle. A left-hand thread on the drive side is done so the cups tends to be self tightening. This may on the surface appear counter-intuitive, but because the ball bearings are ro

In [14]:

test_data = {'classes': ['cycLingo'], 'annotations': []}
for i in test_list:
    tmp_dict = {}
    tmp_dict["text"] = i["text"]
    tmp_dict["entities"] = []
    for annotation in i['label']:
        start = annotation[0]
        end = annotation[1]
        label = annotation[2]
        tmp_dict["entities"].append((start, end, label))
    test_data['annotations'].append(tmp_dict)

test_data

{'classes': ['cycLingo'],
 'annotations': [{'text': 'Campagnolo drivetrains require the longest possible chain length the bike can use. Campagnolo says this results in a feeling of softness in shifting by reducing spring tension at the pulley cage. You will need a chain tool to break the chain and an 8mm hex wrench to use a feeler gauge. If you are running the eleven speed Campagnolo chain, note that our CT-4.3 or CT-6.3 include peening anvils.',
   'entities': [(11, 22, 'cycLingo'),
    (52, 57, 'cycLingo'),
    (212, 217, 'cycLingo'),
    (236, 241, 'cycLingo'),
    (334, 339, 'cycLingo')]},
  {'text': 'IMPORTANT NOTE: The common ISO\\/English thread standard found on most bikes uses a left-hand thread on the drive side of the bike. The drive side is also called chainring side, and is the right side as seen from sitting in the saddle. A left-hand thread on the drive side is done so the cups tends to be self tightening. This may on the surface appear counter-intuitive, but because the

In [15]:
print(len(test_data['annotations']))

417


In [16]:
print(len(training_data['annotations']))

1673


### Write training and test dataset to a text file

In [19]:
with open('../dataset/ner_training_data.jsonl', 'w') as f:
    for i in training_data['annotations']:
        f.writelines(str(i)+'\n')

In [20]:
with open('../dataset/ner_test_data.jsonl', 'w') as f:
    for i in test_data['annotations']:
        f.writelines(str(i)+'\n')

#### Convert JSONL dataset from Doccano to Spacy format dataset.

In [21]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin() # create a DocBin object
from spacy.util import filter_spans

for training_example  in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

doc_bin.to_disk("../dataset/ner_training_data.spacy") # save the docbin object

 74%|███████▎  | 1232/1673 [00:00<00:00, 4336.37it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

100%|██████████| 1673/1673 [00:00<00:00, 4253.49it/s]


Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping

In [23]:
nlp_test = spacy.blank("en") # load a new spacy model
doc_bin_test = DocBin() # create a DocBin object

for test_example  in tqdm(test_data['annotations']):
    text_test = test_example['text']
    labels_test = test_example['entities']
    doc_test = nlp_test.make_doc(text_test)
    ents_test = []
    for start, end, label in labels_test:
        span_test = doc_test.char_span(start, end, label=label, alignment_mode="contract")
        if span_test is None:
            print("Skipping entity")
        else:
            ents_test.append(span_test)
    filtered_ents_test = filter_spans(ents_test)
    doc_test.ents = filtered_ents_test
    doc_bin_test.add(doc_test)

doc_bin_test.to_disk("../dataset/ner_test_data.spacy") # save the docbin object

100%|██████████| 417/417 [00:00<00:00, 3993.74it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping




## Train a blank custom NER model with Spacy

In [None]:
# Download and fill out base config file as per documentation: https://spacy.io/usage/training

#!python -m spacy init fill-config base_config.cfg config.cfg

In [3]:
#!python -m spacy train config.cfg --output ./output_parktool_model --paths.train ./training_data_parktool.spacy --paths.dev ./test_data_parktool.spacy --gpu-id 0

[+] Created output directory: output_parktool_model

[2023-02-06 15:37:29,616] [INFO] Set up nlp object from config
[2023-02-06 15:37:29,622] [INFO] Pipeline: ['transformer', 'ner']
[2023-02-06 15:37:29,624] [INFO] Created vocabulary
[2023-02-06 15:37:29,624] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[2023-02-06 15:37:39,098] [INFO] Initialized pipel


[i] Saving to output directory: output_parktool_model
[i] Using GPU: 0
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['transformer', 'ner']
[i] Initial learn rate: 0.0
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        2841.07    238.49    0.25    0.26    0.25    0.00
  3     200      412377.02  54591.29   84.32   84.11   84.53    0.84
  6     400        4179.10   6877.74   87.09   86.34   87.85    0.87
 10     600        1976.84   3304.60   88.92   88.92   88.92    0.89
 13     800         987.51   1756.40   87.84   84.17   91.86    0.88
 16    1000         689.49   1257.57   87.72   84.63   91.05    0.88
 20    1200         485.58    994.39   88.31   87.47   89.17    0.88
 23    1400         392.24    833.77   87.55   84.97   90.29    0.88
 27    1600         398.72    747.25   88.10   85.68   90.67    0.88
 30    1800         266.21    633.33   88.87   88.51   89.23    0.89


In [2]:
import spacy
nlp = spacy.load("./output/model-best")
nlp.add_pipe('sentencizer')

doc = nlp('''In more than 20 years of wrenching for pros and even more years as a bike shop mechanic, I’ve learned a few things about bottom bracket repair. It’s important to remember that bottom bracket squeaks have nothing to do with the age of the bike. Sometimes, even when a bike is new from the factory, the bottom bracket isn't tight and it creaks. Other times, it's not the bottom bracket at all''')

print(doc.ents)
for sent in doc.sents:

    doc_ner = nlp(sent.text)

    if doc_ner.ents:

        colors = {"cycLingo": "#F67DE3"}
        options = {"colors": colors}

        spacy.displacy.render(doc_ner, style="ent", options= options, jupyter=True)

    else:
        print(sent.text)

(bottom bracket,)


It’s important to remember that bottom bracket squeaks have nothing to do with the age of the bike.
Sometimes, even when a bike is new from the factory, the bottom bracket isn't tight and it creaks.
Other times, it's not the bottom bracket at all


In [4]:
nlp = spacy.load("./output/model-best")
nlp.add_pipe('sentencizer')

doc = nlp('''Removing an inner tube for repair or replacement is a lot easier if you know a few helpful tricks and don’t try and rush things. Let our experts talk you through the easiest way to fit a fresh tube and get rolling again as soon as possible.''')

for sent in doc.sents:
    doc_ner = nlp(sent.text)
    if doc_ner.ents:
        colors = {"cycLingo": "#F67DE3"}
        options = {"colors": colors}
        spacy.displacy.render(doc_ner, style="ent", options= options, jupyter=True)
    else:
        print(sent.text)

Removing an inner tube for repair or replacement is a lot easier if you know a few helpful tricks and don’t try and rush things.
Let our experts talk you through the easiest way to fit a fresh tube and get rolling again as soon as possible.


In [8]:
doc = nlp('''Diagnose wear issues by first cleaning the chain, ensuring the hanger is straight and the wheel is fully seated in the drops. Shift the gears. Turn the barrel adjuster (knob on the shift cable) either in or out to center the derailleur over the cog with each shift to eliminate jumping. Check derailleur stops in the highest and lowest gears to prevent the chain from jumping off the cassette.''')

print(doc.ents)
for sent in doc.sents:

    doc_ner = nlp(sent.text)

    if doc_ner.ents:

        colors = {"cycLingo": "#F67DE3"}
        options = {"colors": colors}

        spacy.displacy.render(doc_ner, style="ent", options= options, jupyter=True)

    else:
        print(sent.text)

(chain, hanger, wheel, seated, barrel adjuster, shift cable, derailleur, cog, derailleur, chain, cassette)


Shift the gears.


In [14]:
doc = nlp('''A bicycle spoke wrench is used to adjust wheel spokes in order to true a wheel that is, put it back into alignment. It’s also used when installing a new spoke. Each spoke is secured to the wheel rim by a spoke nipple, which can be turned to either tighten or loosen the spoke’s tension. Ideally, you want every spoke tensioned equally and in a way that keeps the wheel trued and not warped. (Note: If you’re tightening one spoke, check the opposing spoke directly across from it. You might need to loosen it a half-turn so as to not over-tension the rim.''')

print(doc.ents)
for sent in doc.sents:

    doc_ner = nlp(sent.text)

    if doc_ner.ents:

        colors = {"cycLingo": "#F67DE3"}
        options = {"colors": colors}

        spacy.displacy.render(doc_ner, style="ent", options= options, jupyter=True)

    else:
        print(sent.text)

(spoke, wheel, spokes, wheel, spoke, spoke, wheel, rim, spoke nipple, spoke, spoke, wheel, spoke, spoke, rim)
