In [1]:
import pandas as pd
import re
import spacy
from spacy import displacy
import spacy_transformers

In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


You should consider upgrading via the 'C:\Users\danie\PycharmProjects\testCyclingo\bike_env\Scripts\python.exe -m pip install --upgrade pip' command.


In [4]:
nlp = spacy.load("en_core_web_sm")

In [8]:
with open("../dataset/example_text.txt", "r") as f:
    text = f.readlines()

In [9]:
df = pd.DataFrame(text, columns=['sentence'])

In [10]:
df['size'] = df['sentence'].apply(lambda x: len(x))

In [11]:
df_clean = df[df['size'] > 47]
df_clean.sort_values(by='size', ascending=False)

Unnamed: 0,sentence,size
1,The PRS-33.2 is Park Tools most versatile re...,540
3,Pull the clamp out of the top tube. This wil...,422
17,"Next, check the set screws in the coupler th...",398
20,The drive shaft in the carriage goes to a sm...,343
2,Well begin with the clamp. Inspect the jaw c...,317
16,Perform a quick visual inspection of the mot...,291
6,The top tube of the repair stand may become ...,257
14,"To access these screws, you will need to rem...",243
0,This article will review some typical yearly m...,216
13,It is possible for vibrations from the motor...,197


In [12]:
df_clean['sentence'] = df_clean['sentence'].apply(lambda x: x.replace('"','').replace('\n',' ').replace('\t', ' ').strip())

In [13]:
df_clean.sort_values(by='size', ascending=False)

Unnamed: 0,sentence,size
1,The PRS-33.2 is Park Tools most versatile repa...,540
3,Pull the clamp out of the top tube. This will ...,422
17,"Next, check the set screws in the coupler that...",398
20,The drive shaft in the carriage goes to a smal...,343
2,Well begin with the clamp. Inspect the jaw cov...,317
16,Perform a quick visual inspection of the motor...,291
6,The top tube of the repair stand may become lo...,257
14,"To access these screws, you will need to remov...",243
0,This article will review some typical yearly m...,216
13,It is possible for vibrations from the motor o...,197


In [17]:
with open('../dataset/lingo_list.txt', 'r') as f:
    text = f.readlines()
lingo_list = []
for i in text:
    word = i.replace("'",'').replace('\n', '').strip().lower()
    if word not in lingo_list:
        lingo_list.append(word)

In [18]:
sorted(lingo_list)

['aero',
 'aerodynamic',
 'axle',
 'bar ends',
 'bar plugs',
 'barb',
 'bashguard',
 'basket',
 'bead',
 'bearing',
 'bell',
 'belt-drive',
 'bicycle suspension',
 'bicycle tools',
 'bicycle wheel',
 'bikes',
 'bmx',
 'bottle cage',
 'bottom bracket',
 'brake',
 'brake lever',
 'brake shifter',
 'brakes',
 'braze-on',
 'cable',
 'cable guide',
 'cable housing',
 'cadence',
 'cargo bike',
 'cartridge bearing',
 'cassette',
 'chain',
 'chain pin',
 'chain suck',
 'chain tensioner',
 'chainguard',
 'chainring',
 'chainrings',
 'chainset',
 'chainstay',
 'chamois',
 'cleat',
 'cleats',
 'clincher',
 'coaster brake',
 'cog',
 'cogset',
 'combi-pedal',
 'cone',
 'crank',
 'crankarm',
 'cranks',
 'crankset',
 'cross-country cycling',
 'crown race',
 'cup',
 'cycling bib',
 'cyclo-cross',
 'cyclocomputer',
 'cyclocross',
 'derailleur',
 'derailleur gears',
 'derailleur hanger',
 'di2',
 'disc brake',
 'down tube',
 'downhill',
 'downshift',
 'drafting',
 'drivetrain',
 'dropout',
 'drops',
 'd

#### Annotate each sentence if word is in lingo list, with position of the word

In [19]:
import re
import json

def annotate_sentence_spacy(string):
    # Comment, should both words and word chunks be present in entity list? ex. coaster brake, will also add brake to the list.

    temp_list = []
    doc = nlp(string)
    word = [token for token in doc]
    lemma_sentence = " ".join([token.lemma_ for token in doc])
    chunk_list = []
    flag = False
    entity_list = []
    for chunk in doc.noun_chunks:
        print(chunk)
        if all(token.is_punct != True and '-PRON-' not in token.lemma_ for token in chunk) == True:
            if len(chunk) > 1:
                chunk_list.append(chunk)

    # First find noun chunks of words in the lingo list
    for noun in chunk_list:
        print(noun.lemma_.lower())
        #print('start', string.index(noun.text))
        end_index = string.index(noun.text) + len(noun.text) -1
        #print('end', end_index)
        if noun.lemma_.lower() in lingo_list:

            noun_list = [m.start() for m in re.finditer(noun.text, string)]
            #print(word_list)
            for i in noun_list:
                start_index = i
                end_index = start_index + len(noun.text)
                entity = [start_index, end_index, f'"cycLingo"']
                if entity not in entity_list:
                    flag = True
                    entity_list.append(entity)

    # Find occurences of words in lingo list
    for j in word:
        #print('start', string.index(j.text))
        end_index = string.index(j.text) + len(j.text) -1
        #print('end', end_index)
        if j.lemma_ in lingo_list:

            word_list = [m.start() for m in re.finditer(j.text, string)]
            #print(word_list)
            for word in word_list:
                start_index = word
                end_index = start_index + len(j.text)
                entity = [start_index, end_index, f'"cycLingo"']
                if entity not in entity_list:
                    flag = True
                    entity_list.append(entity)
    #print(lemma_sentence)
    sentence_list = [string,flag, entity_list]
    jsonl_anno = {}
    jsonl_anno[f'"text"'] = string
    jsonl_anno[f'"label"'] = entity_list
    temp_list.append(sentence_list)

    return jsonl_anno

In [20]:
string = 'As almost all unicycles are ungeared, Crank length is a major factor in determining how much force is transmitted to the wheel'
annotate_sentence_spacy(string)

almost all unicycles
Crank length
a major factor
how much force
the wheel
almost all unicycle
crank length
a major factor
how much force
the wheel


{'"text"': 'As almost all unicycles are ungeared, Crank length is a major factor in determining how much force is transmitted to the wheel',
 '"label"': [[121, 126, '"cycLingo"']]}

In [21]:
df_park = df_clean.drop_duplicates()
df_park

Unnamed: 0,sentence,size
0,This article will review some typical yearly m...,216
1,The PRS-33.2 is Park Tools most versatile repa...,540
2,Well begin with the clamp. Inspect the jaw cov...,317
3,Pull the clamp out of the top tube. This will ...,422
4,"If the wear is acceptable, grease the threads ...",116
5,"While the clamp is removed, wipe out the insid...",105
6,The top tube of the repair stand may become lo...,257
7,Begin by driving the carriage nearly to the to...,64
8,Tip the stand on its side and onto a sturdy it...,129
9,Loosen the cross bar mounting screws and remov...,64


In [22]:
df_park['anno'] = df_park['sentence'].apply(lambda x: annotate_sentence_spacy(x))
df_park

This article
some typical yearly maintenance procedures
your PRS-33
PRS-33.2
These procedures
your stand
optimal shape
peak season
larger problems
the road
this article
some typical yearly maintenance procedure
your prs-33
these procedure
your stand
optimal shape
peak season
large problem
the road
The PRS-33.2
Park Tools
most versatile repair stand
a powered motor
plenty
moving parts
your PRS-33.2
This article
some typical yearly maintenance procedures
your PRS-33.2
These procedures
your stand
optimal shape
peak season
larger problems
the road
you
any issues
your PRS-33.2
PRS-33
that
the procedures
PRS-33
PRS-33.2
Troubleshooting
us
the prs-33.2
park tools
most versatile repair stand
a powered motor
move part
your prs-33.2
this article
some typical yearly maintenance procedure
your prs-33.2
these procedure
your stand
optimal shape
peak season
large problem
the road
any issue
your prs-33.2
the procedure
the clamp
the jaw
damage
missing clips
repair
new jaw covers
it
the new covers
a ped

Unnamed: 0,sentence,size,anno
0,This article will review some typical yearly m...,216,"{'""text""': 'This article will review some typi..."
1,The PRS-33.2 is Park Tools most versatile repa...,540,"{'""text""': 'The PRS-33.2 is Park Tools most ve..."
2,Well begin with the clamp. Inspect the jaw cov...,317,"{'""text""': 'Well begin with the clamp. Inspect..."
3,Pull the clamp out of the top tube. This will ...,422,"{'""text""': 'Pull the clamp out of the top tube..."
4,"If the wear is acceptable, grease the threads ...",116,"{'""text""': 'If the wear is acceptable, grease ..."
5,"While the clamp is removed, wipe out the insid...",105,"{'""text""': 'While the clamp is removed, wipe o..."
6,The top tube of the repair stand may become lo...,257,"{'""text""': 'The top tube of the repair stand m..."
7,Begin by driving the carriage nearly to the to...,64,"{'""text""': 'Begin by driving the carriage near..."
8,Tip the stand on its side and onto a sturdy it...,129,"{'""text""': 'Tip the stand on its side and onto..."
9,Loosen the cross bar mounting screws and remov...,64,"{'""text""': 'Loosen the cross bar mounting scre..."


In [23]:
def is_anno(x):
    if len(x['"label"']) < 1:
        return False
    else:
        return True

In [24]:
df_park['is_anno'] = df_park['anno'].apply(lambda x: is_anno(x) )
df_park = df_park[df_park['is_anno'] == True]

df_park

Unnamed: 0,sentence,size,anno,is_anno
2,Well begin with the clamp. Inspect the jaw cov...,317,"{'""text""': 'Well begin with the clamp. Inspect...",True
10,Loosen and remove the bolts holding the upper ...,159,"{'""text""': 'Loosen and remove the bolts holdin...",True
11,Inspect the carriage and top tube for any wear...,155,"{'""text""': 'Inspect the carriage and top tube ...",True
17,"Next, check the set screws in the coupler that...",398,"{'""text""': 'Next, check the set screws in the ...",True


In [25]:
with open('../dataset/anno_true.jsonl', 'w') as f:
    for i in df_park['anno']:
        f.writelines(str(i).replace('\'"','\"').replace('\"\'','\"').replace(': \'',':"').replace('\',','",') +'\n')

In [27]:
df_park.to_csv('../dataset/annotated_dataset.csv', sep=';')