In [1]:
# One-time setup: run the project's spaCy setup script through the shell.
# Per the recorded output, it downloads and installs the en_core_web_sm 3.6.0 model.
!source ./setup_spacy.sh

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 13.4 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
from parse_dep import *
import pandas as pd
from spacy import displacy

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
from datasets import load_dataset

In [4]:
# Load the "20220301.en" configuration of the Hugging Face "wikipedia" dataset.
# The recorded output shows it being served from the local datasets cache.
# NOTE(review): wiki_data is not used by any cell visible in this part of the notebook.
wiki_data = load_dataset("wikipedia", "20220301.en")

Found cached dataset wikipedia (/Users/andrejerkelens/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Build the NLP pipeline object that the parsing cells below call as `nlp(...)`.
# get_spacy_pipeline is presumably provided by the `from parse_dep import *` star import — TODO confirm.
nlp = get_spacy_pipeline()

In [6]:
# Read the STREUSLE train/dev/test splits (JSON files from the conllulex repo),
# one DataFrame per split, in train -> dev -> test order.
train_df, dev_df, test_df = map(
    pd.read_json,
    (
        'datasets/streusle.ud_train.json',
        'datasets/streusle.ud_dev.json',
        'datasets/streusle.ud_test.json',
    ),
)

In [7]:
# Example sentence from the paper "LSTMs Compose—and Learn—Bottom-Up".
# Printing the doc shows how the pipeline tokenizes it: the recorded output
# contains subword pieces ("S", "ocrates") and "Ġ"-prefixed tokens.
doc = nlp("Socrates asked the student trick questions")
print(doc)

S ocrates Ġasked Ġthe Ġstudent Ġtrick Ġquestions 


In [12]:
# Parse every training sentence with the shared pipeline and store the result
# in a new 'dependencies' column (one object per row; the recorded preview
# shows dicts of token -> dependency label).
# Only the 'text' column is needed, so apply over that Series directly rather
# than a row-wise DataFrame.apply(axis=1), which materialises a full row Series
# for each sentence just to read a single field.
train_df['dependencies'] = train_df['text'].apply(
    lambda text: parse_dependencies(nlp, text, index=True)
)

In [13]:
# Preview the first ten entries of the 'dependencies' column.
train_df['dependencies'].head(10)

0    {'B': 'npadvmod', 'illing': 'amod', 'ĠIssues':...
1    {'I': 'nsubj', 'Ġhad': 'ROOT', 'Ġa': 'dobj', '...
2    {'My': 'poss', 'Ġinsurance': 'compound', 'Ġcom...
3    {'Then': 'advmod', 'ĠI': 'compound', 'Ġgot': '...
4    {'The': 'det', 'Ġdoctor': 'poss', ''s': 'case'...
5    {'Blue': 'compound', 'Ġcross': 'compound', 'Ġh...
6    {'The': 'det', 'Ġoffice': 'npadvmod', 'Ġrefuse...
7    {'They': 'nsubj', 'Ġeventually': 'advmod', 'Ġt...
8    {'I': 'nmod', 'Ġeventually': 'advmod', 'Ġdecid...
9    {'It': 'nsubj', 'Ġwas': 'ROOT', 'Ġan': 'ROOT',...
Name: dependencies, dtype: object

In [10]:
# Entity spans detected in the example sentence.
# NOTE(review): the recorded output tags "asked the student" as a single ORG entity, which
# may be a side effect of the subword tokenization — confirm the NER output is meaningful here.
doc.ents

(Ġasked Ġthe Ġstudent,)

In [11]:
# Show the per-token entity annotation: token text, IOB tag, entity type.
for token in doc:
    ner_row = [token.text, token.ent_iob_, token.ent_type_]
    print(ner_row)

['S', 'O', '']
['ocrates', 'O', '']
['Ġasked', 'B', 'ORG']
['Ġthe', 'I', 'ORG']
['Ġstudent', 'I', 'ORG']
['Ġtrick', 'O', '']
['Ġquestions', 'O', '']


In [12]:
# Collect each entity span as (text, start offset, end offset, label) and show the list.
ents = [
    (span.text, span.start_char, span.end_char, span.label_)
    for span in doc.ents
]
print(ents)

[('Ġasked Ġthe Ġstudent', 10, 30, 'ORG')]


In [13]:
# Preview the first ten STREUSLE training rows.
# NOTE(review): the recorded output has no 'dependencies' column, so it predates the cell
# that adds it; execution counts in this notebook are out of order — re-run top to bottom.
train_df.head(10)

Unnamed: 0,sent_id,text,streusle_sent_id,mwe,toks,etoks,swes,smwes,wmwes
0,reviews-003418-0001,Billing Issues...,ewtb.r.003418.1,Billing Issues ...,"[{'#': 1, 'word': 'Billing', 'lemma': 'billing...",[],"{'1': {'lexlemma': 'billing', 'lexcat': 'N', '...",{},{}
1,reviews-003418-0002,I had a routine surgery for an ingrown toenail.,ewtb.r.003418.2,I had_ a routine _surgery for an ingrown_toena...,"[{'#': 1, 'word': 'I', 'lemma': 'I', 'upos': '...",[],"{'1': {'lexlemma': 'I', 'lexcat': 'PRON', 'ss'...","{'1': {'lexlemma': 'have surgery', 'lexcat': '...",{}
2,reviews-003418-0003,"My insurance company, Blue Cross/Blue Shield p...",ewtb.r.003418.3,"My insurance company , Blue_Cross_/_Blue_Shiel...","[{'#': 1, 'word': 'My', 'lemma': 'my', 'upos':...",[],"{'1': {'lexlemma': 'my', 'lexcat': 'PRON.POSS'...","{'1': {'lexlemma': 'Blue Cross / Blue Shield',...",{}
3,reviews-003418-0004,Then I got a bill for $483.00.,ewtb.r.003418.4,Then I got~ a ~bill for $ 483.00 .,"[{'#': 1, 'word': 'Then', 'lemma': 'then', 'up...",[],"{'1': {'lexlemma': 'then', 'lexcat': 'ADV', 's...",{},"{'1': {'lexlemma': 'get bill', 'toknums': [3, ..."
4,reviews-003418-0005,The doctor's office said that payments had bee...,ewtb.r.003418.5,The doctor_'s_office said that payments had be...,"[{'#': 1, 'word': 'The', 'lemma': 'the', 'upos...","[{'#': [2, 3, '2-3'], 'word': 'doctor's', 'lem...","{'1': {'lexlemma': 'the', 'lexcat': 'DET', 'ss...","{'1': {'lexlemma': 'doctor 's office', 'lexcat...",{}
5,reviews-003418-0006,Blue cross has no record of aa reversal.,ewtb.r.003418.6,Blue_cross has no record of aa reversal .,"[{'#': 1, 'word': 'Blue', 'lemma': 'Blue', 'up...",[],"{'3': {'lexlemma': 'have', 'lexcat': 'V', 'ss'...","{'1': {'lexlemma': 'Blue cross', 'lexcat': 'N'...",{}
6,reviews-003418-0007,The office refused my requests to see what the...,ewtb.r.003418.7,The office refused my requests to see what the...,"[{'#': 1, 'word': 'The', 'lemma': 'the', 'upos...",[],"{'1': {'lexlemma': 'the', 'lexcat': 'DET', 'ss...","{'1': {'lexlemma': 'BC / BS', 'lexcat': 'N', '...",{}
7,reviews-003418-0008,They eventually turned it over to a collection...,ewtb.r.003418.8,They eventually turned_ it _over to a collecti...,"[{'#': 1, 'word': 'They', 'lemma': 'they', 'up...",[],"{'1': {'lexlemma': 'they', 'lexcat': 'PRON', '...","{'1': {'lexlemma': 'turn over', 'lexcat': 'V.V...","{'3': {'lexlemma': 'discuss the matter', 'tokn..."
8,reviews-003418-0009,I eventually decided to just pay the balance e...,ewtb.r.003418.9,I eventually decided to just pay the balance e...,"[{'#': 1, 'word': 'I', 'lemma': 'I', 'upos': '...",[],"{'1': {'lexlemma': 'I', 'lexcat': 'PRON', 'ss'...","{'1': {'lexlemma': 'even though', 'lexcat': 'S...",{}
9,reviews-003418-0010,It was an ingrown toenail.,ewtb.r.003418.10,It was an ingrown_toenail .,"[{'#': 1, 'word': 'It', 'lemma': 'it', 'upos':...",[],"{'1': {'lexlemma': 'it', 'lexcat': 'PRON', 'ss...","{'1': {'lexlemma': 'ingrown toenail', 'lexcat'...",{}


In [9]:
# Draw the dependency parse of the example sentence with displaCy.
# 'compact' and 'distance' are layout settings passed through `options`.
displacy.render(doc, style='dep', options={'compact': True, 'distance': 100})

In [9]:
# Pairwise syntactic distances for the example sentence.
# With index=True the recorded result is keyed by (i, j) token-position pairs with i < j.
get_syntactic_distance(doc, index=True)

{(0, 1): 1,
 (0, 2): 3,
 (0, 3): 4,
 (0, 4): 4,
 (0, 5): 3,
 (0, 6): 2,
 (1, 2): 2,
 (1, 3): 3,
 (1, 4): 3,
 (1, 5): 2,
 (1, 6): 1,
 (2, 3): 3,
 (2, 4): 3,
 (2, 5): 2,
 (2, 6): 1,
 (3, 4): 2,
 (3, 5): 1,
 (3, 6): 2,
 (4, 5): 1,
 (4, 6): 2,
 (5, 6): 1}

In [11]:
# Pairwise syntactic distances for the example sentence.
# With index=False the recorded result is keyed by the token pairs themselves
# rather than by their positions.
get_syntactic_distance(doc, index=False)

{(S, ocrates): 1,
 (S, Ġasked): 3,
 (S, Ġthe): 4,
 (S, Ġstudent): 4,
 (S, Ġtrick): 3,
 (S, Ġquestions): 2,
 (ocrates, Ġasked): 2,
 (ocrates, Ġthe): 3,
 (ocrates, Ġstudent): 3,
 (ocrates, Ġtrick): 2,
 (ocrates, Ġquestions): 1,
 (Ġasked, Ġthe): 3,
 (Ġasked, Ġstudent): 3,
 (Ġasked, Ġtrick): 2,
 (Ġasked, Ġquestions): 1,
 (Ġthe, Ġstudent): 2,
 (Ġthe, Ġtrick): 1,
 (Ġthe, Ġquestions): 2,
 (Ġstudent, Ġtrick): 1,
 (Ġstudent, Ġquestions): 2,
 (Ġtrick, Ġquestions): 1}