In [1]:
import pandas as pd
import re
import spacy
from spacy import displacy
import spacy_transformers

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
with open("parktool_article_text.txt", "r") as f:
    text = f.readlines()

In [4]:
df = pd.DataFrame(text, columns=['sentence'])

In [5]:
df['size'] = df['sentence'].apply(lambda x: len(x))

In [7]:
df_clean = df[df['size'] > 47]
df_clean.sort_values(by='size', ascending=False)

Unnamed: 0,sentence,size
2871,When your bike sustains this much damage its...,1359
3236,"This type of headset system has, unfortunate...",1322
1868,"Similar to the H screw setting, make the L s...",1229
3333,An antiquated but still common system uses i...,993
720,The wheel is the complete unit with all its ...,973
...,...,...
1985,Star Fangled Nut and Expansion Plug Installa...,66
2421,Brake Housing & Cable Installation: Upright ...,66
2422,Brake Lever Mounting & Positioning: Upright ...,66
2822,Star Fangled Nut and Expansion Plug Installa...,66


In [8]:
df_clean['sentence'] = df_clean['sentence'].apply(lambda x: x.replace('"','').replace('\n',' ').replace('\t', ' ').strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['sentence'] = df_clean['sentence'].apply(lambda x: x.replace('"','').replace('\n',' ').replace('\t', ' ').strip())


In [9]:
df_clean.sort_values(by='size', ascending=False)

Unnamed: 0,sentence,size
2871,When your bike sustains this much damage its a...,1359
3236,"This type of headset system has, unfortunately...",1322
1868,"Similar to the H screw setting, make the L scr...",1229
3333,An antiquated but still common system uses inc...,993
720,The wheel is the complete unit with all its co...,973
...,...,...
1985,Star Fangled Nut and Expansion Plug Installati...,66
2421,Brake Housing & Cable Installation: Upright Ba...,66
2422,Brake Lever Mounting & Positioning: Upright Ba...,66
2822,Star Fangled Nut and Expansion Plug Installati...,66


In [13]:
with open('lingo_dict_parktool.txt', 'r') as f:
    text = f.readlines()
lingo_list = []
for i in text:
    word = i.replace("'",'').replace('\n', '').strip().lower()
    if word not in lingo_list:
        lingo_list.append(word)

In [14]:
sorted(lingo_list)

['aero',
 'aerodynamic',
 'axle',
 'bar ends',
 'bar plugs',
 'barb',
 'bashguard',
 'basket',
 'bead',
 'bearing',
 'bell',
 'belt-drive',
 'bicycle suspension',
 'bicycle tools',
 'bicycle wheel',
 'bikes',
 'bmx',
 'bottle cage',
 'bottom bracket',
 'brake',
 'brake lever',
 'brake shifter',
 'brakes',
 'braze-on',
 'cable',
 'cable guide',
 'cable housing',
 'cadence',
 'cargo bike',
 'cartridge bearing',
 'cassette',
 'chain',
 'chain pin',
 'chain suck',
 'chain tensioner',
 'chainguard',
 'chainring',
 'chainrings',
 'chainset',
 'chainstay',
 'chamois',
 'cleat',
 'cleats',
 'clincher',
 'coaster brake',
 'cog',
 'cogset',
 'combi-pedal',
 'cone',
 'crank',
 'crankarm',
 'cranks',
 'crankset',
 'cross-country cycling',
 'crown race',
 'cup',
 'cycling bib',
 'cyclo-cross',
 'cyclocomputer',
 'cyclocross',
 'derailleur',
 'derailleur gears',
 'derailleur hanger',
 'di2',
 'disc brake',
 'down tube',
 'downhill',
 'downshift',
 'drafting',
 'drivetrain',
 'dropout',
 'drops',
 'd

#### Annotate each sentence if word is in lingo list, with position of the word

In [16]:
import re
import json

def annotate_sentence_spacy(string):
    # Comment, should both words and word chunks be present in entity list? ex. coaster brake, will also add brake to the list.

    temp_list = []
    doc = nlp(string)
    word = [token for token in doc]
    lemma_sentence = " ".join([token.lemma_ for token in doc])
    chunk_list = []
    flag = False
    entity_list = []
    for chunk in doc.noun_chunks:
        print(chunk)
        if all(token.is_punct != True and '-PRON-' not in token.lemma_ for token in chunk) == True:
            if len(chunk) > 1:
                chunk_list.append(chunk)

    # First find noun chunks of words in the lingo list
    for noun in chunk_list:
        print(noun.lemma_.lower())
        #print('start', string.index(noun.text))
        end_index = string.index(noun.text) + len(noun.text) -1
        #print('end', end_index)
        if noun.lemma_.lower() in lingo_list:

            noun_list = [m.start() for m in re.finditer(noun.text, string)]
            #print(word_list)
            for i in noun_list:
                start_index = i
                end_index = start_index + len(noun.text)
                entity = [start_index, end_index, f'"cycLingo"']
                if entity not in entity_list:
                    flag = True
                    entity_list.append(entity)

    # Find occurences of words in lingo list
    for j in word:
        #print('start', string.index(j.text))
        end_index = string.index(j.text) + len(j.text) -1
        #print('end', end_index)
        if j.lemma_ in lingo_list:

            word_list = [m.start() for m in re.finditer(j.text, string)]
            #print(word_list)
            for word in word_list:
                start_index = word
                end_index = start_index + len(j.text)
                entity = [start_index, end_index, f'"cycLingo"']
                if entity not in entity_list:
                    flag = True
                    entity_list.append(entity)
    #print(lemma_sentence)
    sentence_list = [string,flag, entity_list]
    jsonl_anno = {}
    jsonl_anno[f'"text"'] = string
    jsonl_anno[f'"label"'] = entity_list
    temp_list.append(sentence_list)

    return jsonl_anno

In [17]:
string = 'As almost all unicycles are ungeared, Crank length is a major factor in determining how much force is transmitted to the wheel'
annotate_sentence_spacy(string)

almost all unicycles
Crank length
a major factor
how much force
the wheel
almost all unicycle
crank length
a major factor
how much force
the wheel


{'"text"': 'As almost all unicycles are ungeared, Crank length is a major factor in determining how much force is transmitted to the wheel',
 '"label"': [[38, 43, '"cycLingo"'], [121, 126, '"cycLingo"']]}

In [20]:
df_park = df_clean.drop_duplicates()
df_park

Unnamed: 0,sentence,size
0,This article will review some typical yearly m...,218
1,The PRS-33.2 is Park Tools most versatile repa...,542
2,Well begin with the clamp. Inspect the jaw cov...,319
3,Pull the clamp out of the top tube. This will ...,424
4,"If the wear is acceptable, grease the threads ...",118
...,...,...
3686,Park Tool Company has discovered a defect in a...,103
3687,"Under certain conditions, the check valve on t...",187
3688,In order to assure that all PFP-2 owners have ...,402
3689,"Please note that no other model (PFP-3, PFP-4,...",103


In [21]:
df_park['anno'] = df_park['sentence'].apply(lambda x: annotate_sentence_spacy(x))
df_park

This article
some typical yearly maintenance procedures
your PRS-33
PRS-33.2
These procedures
your stand
optimal shape
peak season
larger problems
the road
this article
some typical yearly maintenance procedure
your prs-33
these procedure
your stand
optimal shape
peak season
large problem
the road
The PRS-33.2
Park Tools
most versatile repair stand
a powered motor
plenty
parts
its a good idea
your PRS-33.2
This article
some typical yearly maintenance procedures
your PRS-33.2
These procedures
your stand
optimal shape
peak season
larger problems
the road
you
any issues
your PRS-33.2
PRS-33
that
the procedures
PRS-33
PRS-33.2
Troubleshooting
us
the prs-33.2
park tools
most versatile repair stand
a powered motor
its a good idea
your prs-33.2
this article
some typical yearly maintenance procedure
your prs-33.2
these procedure
your stand
optimal shape
peak season
large problem
the road
any issue
your prs-33.2
the procedure
the clamp
the jaw
damage
clips
repair
new jaw covers
it
the new cover

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_park['anno'] = df_park['sentence'].apply(lambda x: annotate_sentence_spacy(x))


Unnamed: 0,sentence,size,anno
0,This article will review some typical yearly m...,218,"{'""text""': 'This article will review some typi..."
1,The PRS-33.2 is Park Tools most versatile repa...,542,"{'""text""': 'The PRS-33.2 is Park Tools most ve..."
2,Well begin with the clamp. Inspect the jaw cov...,319,"{'""text""': 'Well begin with the clamp. Inspect..."
3,Pull the clamp out of the top tube. This will ...,424,"{'""text""': 'Pull the clamp out of the top tube..."
4,"If the wear is acceptable, grease the threads ...",118,"{'""text""': 'If the wear is acceptable, grease ..."
...,...,...,...
3686,Park Tool Company has discovered a defect in a...,103,"{'""text""': 'Park Tool Company has discovered a..."
3687,"Under certain conditions, the check valve on t...",187,"{'""text""': 'Under certain conditions, the chec..."
3688,In order to assure that all PFP-2 owners have ...,402,"{'""text""': 'In order to assure that all PFP-2 ..."
3689,"Please note that no other model (PFP-3, PFP-4,...",103,"{'""text""': 'Please note that no other model (P..."


In [22]:
def is_anno(x):
    if len(x['"label"']) < 1:
        return False
    else:
        return True

In [23]:
df_park['is_anno'] = df_park['anno'].apply(lambda x: is_anno(x) )
df_park = df_park[df_park['is_anno'] == True]

df_park

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_park['is_anno'] = df_park['anno'].apply(lambda x: is_anno(x) )


Unnamed: 0,sentence,size,anno,is_anno
2,Well begin with the clamp. Inspect the jaw cov...,319,"{'""text""': 'Well begin with the clamp. Inspect...",True
10,Loosen and remove the bolts holding the upper ...,161,"{'""text""': 'Loosen and remove the bolts holdin...",True
17,"Next, check the set screws in the coupler that...",400,"{'""text""': 'Next, check the set screws in the ...",True
22,The chain inside the carriage of the stand sho...,186,"{'""text""': 'The chain inside the carriage of t...",True
23,"To address this, locate the turnbuckle at the ...",278,"{'""text""': 'To address this, locate the turnbu...",True
...,...,...,...,...
3682,It is critical the cable at the bridging hole ...,604,"{'""text""': 'It is critical the cable at the br...",True
3683,Figure 13. Cable transition at the 5:00 positi...,84,"{'""text""': 'Figure 13. Cable transition at the...",True
3684,The brake cable must make a sharp transition t...,301,"{'""text""': 'The brake cable must make a sharp ...",True
3685,The Travel Agent are also available in an in-l...,279,"{'""text""': 'The Travel Agent are also availabl...",True


In [24]:
with open('anno_parktool_true.jsonl', 'w') as f:
    for i in df_park['anno']:
        f.writelines(str(i).replace('\'"','\"').replace('\"\'','\"').replace(': \'',':"').replace('\',','",') +'\n')

In [25]:
df_park.to_csv('annotated_parktool_dataset.csv')