## DHSI22 Workshop "Deep Learning for Humanists" - Hoyeol Kim
#### Day 4 - Introduction to spaCy by Duygu Altinok (Guest Speaker)

##### Code was modified by Hoyeol Kim to make it work in Google Colab.

In [1]:
# Download the `en_core_web_md` model at runtime (the output below reports that
# the download also installs it), then import it as a regular Python package.
# The import is deliberately placed after the download: presumably the package
# is not importable in a fresh Colab runtime until the download has finished.
import spacy.cli
spacy.cli.download("en_core_web_md")
import en_core_web_md
# `nlp` is the loaded pipeline object used by every later cell.
nlp = en_core_web_md.load()

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_md')


In [3]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [4]:
# Inspect the class of the loaded pipeline object.
type(nlp)

spacy.lang.en.English

In [5]:
import re

In [6]:
# Three example sentences with money amounts: a dotted amount with a trailing
# symbol, a plain amount with a trailing symbol, and a leading symbol.
sent1, sent2, sent3 = (
    "I earned 10.000$",
    "I spent 200$",
    "I spent $200",
)

In [7]:
# Regex: one or more digits, an optional whitespace character, then a currency
# symbol. Written as a raw string so that \d and \s reach the regex engine
# unchanged; in a normal string literal they are invalid escape sequences and
# trigger a DeprecationWarning (a SyntaxWarning on Python 3.12+).
pattern = r"\d+\s?[$£€]"

In [8]:
# Try the digits-plus-currency regex on the plain amount in sent2.
re.search(string=sent2, pattern=pattern)

<re.Match object; span=(8, 12), match='200$'>

In [9]:
# Same regex on sent1: the "." inside "10.000" interrupts the run of digits,
# so only the trailing "000$" is matched.
re.search(string=sent1, pattern=pattern)

<re.Match object; span=(12, 16), match='000$'>

In [10]:
# Extended regex: digits, optionally followed by one "." or "," group of more
# digits, then an optional whitespace character and a currency symbol.
# Written as a raw string so that \d and \s are not treated as (invalid)
# string escapes, which warn on current Python versions.
pattern = r"(\d+([.,]\d+)?)\s?[$£€]"

In [11]:
# With the optional separator group, the whole "10.000$" is now matched.
re.search(string=sent1, pattern=pattern)

<re.Match object; span=(9, 16), match='10.000$'>

In [12]:
# The plain amount in sent2 still matches, because the separator group is optional.
re.search(string=sent2, pattern=pattern)

<re.Match object; span=(8, 12), match='200$'>

In [13]:
from spacy.matcher import Matcher

In [14]:
# Create a Matcher instance, giving it the vocabulary of the loaded pipeline.
matcher = Matcher(nlp.vocab)

In [15]:
# Token-level pattern for the Matcher: two consecutive tokens, one dict each.
# Note that this rebinds `pattern`, which held a regex string until now.
pattern = [
    {"IS_DIGIT": True},     # first token: flagged as digits
    {"IS_CURRENCY": True},  # second token: flagged as a currency symbol
]

In [17]:
# Register the pattern under the key "money"; patterns are passed as a list.
matcher.add("money", [pattern])

In [18]:
# Run the pipeline on sent2; the result is the processed document.
doc2 = nlp(sent2)

In [19]:
# Displaying the processed document echoes the original text.
doc2

I spent 200$

In [20]:
# List every attribute and method name on the processed document (long output).
dir(doc2)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_py_tokens',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_disk',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'merge',
 'noun_chunks',
 'noun_chunks_iterator',
 'print_tree',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set_extension',
 'similarity',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'to_byte

In [22]:
# Apply the matcher to the document; the result is a list of match tuples.
matches = matcher(doc2)

In [23]:
# Each entry is a 3-tuple: an integer ID followed by the start and end token
# offsets of the match.
matches

[(14917444839815175757, 2, 4)]

In [28]:
# Print the text of every matched span by slicing the document with the
# start/end token offsets of each match.
# The ID is bound to a named variable instead of `_`: in IPython/Jupyter `_`
# holds the last cell output, and assigning to it stops IPython updating it.
for match_id, start, end in matches:
    print(doc2[start:end])

200$


In [29]:
# Single-token pattern: any token whose POS attribute is "VERB".
pattern2 = [
    {"POS": "VERB"},
]

In [30]:
# Start again with a fresh Matcher, so the earlier "money" pattern is not part of it.
matcher = Matcher(nlp.vocab)

In [31]:
# Register the verb pattern under the key "allverbs".
matcher.add("allverbs", [pattern2])

In [32]:
# Example sentence containing several verb forms, including the auxiliary "had".
sentence = (
    "I visited my uncle "
    "after he had started having coffee"
)

In [36]:
# Process the example sentence with the pipeline.
doc = nlp(sentence)

In [37]:
# Displaying the processed document echoes the sentence text.
doc

I visited my uncle after he had started having coffee

In [38]:
# Run the "allverbs" matcher over the processed sentence.
matches = matcher(doc)

In [39]:
# Three single-token matches are found, at token offsets 1, 7 and 8.
matches

[(14920810774363675476, 1, 2),
 (14920810774363675476, 7, 8),
 (14920810774363675476, 8, 9)]

In [41]:
# Print the token offsets and text of every match.
# The ID is bound to a named variable instead of `_`: in IPython/Jupyter `_`
# holds the last cell output, and assigning to it stops IPython updating it.
for match_id, start, end in matches:
    print(start, end, doc[start:end])

1 2 visited
7 8 started
8 9 having


In [42]:
# Print each token next to its POS tag. This shows why "had" was not matched
# above: it is tagged AUX rather than VERB.
for tok in doc:
    print(f"{tok} {tok.pos_}")

I PRON
visited VERB
my DET
uncle NOUN
after ADP
he PRON
had AUX
started VERB
having VERB
coffee NOUN


In [43]:
# Single-token pattern using the "IN" operator: the token's POS may be either
# "VERB" or "AUX".
pattern3 = [
    {
        "POS": {"IN": ["VERB", "AUX"]},
    },
]

In [44]:
# Re-create the matcher so that only the new pattern will be registered on it.
matcher = Matcher(nlp.vocab)

In [45]:
# Register the VERB-or-AUX pattern, again under the key "allverbs".
matcher.add("allverbs", [pattern3])

In [47]:
# Re-run matching on the same processed sentence with the new pattern.
matches = matcher(doc)

In [48]:
# Four matches now: token offset 6 appears in addition to 1, 7 and 8.
matches

[(14920810774363675476, 1, 2),
 (14920810774363675476, 6, 7),
 (14920810774363675476, 7, 8),
 (14920810774363675476, 8, 9)]

In [49]:
# Print each token next to its POS tag again, for comparison with the new matches.
for tok in doc:
    print(f"{tok} {tok.pos_}")

I PRON
visited VERB
my DET
uncle NOUN
after ADP
he PRON
had AUX
started VERB
having VERB
coffee NOUN


In [50]:
# Print the token offsets and text of every match found with the
# VERB-or-AUX pattern.
# The ID is bound to a named variable instead of `_`: in IPython/Jupyter `_`
# holds the last cell output, and assigning to it stops IPython updating it.
for match_id, start, end in matches:
    print(start, end, doc[start:end])

1 2 visited
6 7 had
7 8 started
8 9 having
