In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [89]:
doc = nlp("My name is Dhiraj.")

In [90]:
print(doc)  # printed, a Doc looks just like the original string

My name is Dhiraj.


In [91]:
type(doc)  # ...but it is actually a special spaCy object: a Doc (document)

spacy.tokens.doc.Doc

In [92]:
[t for t in doc]  # special in the sense that iterating over a Doc yields its tokens

[My, name, is, Dhiraj, .]

In [93]:
t = doc[0]

In [94]:
t

My

In [95]:
type(t)

spacy.tokens.token.Token

**A document is a collection of token objects.**

In [96]:
# A Token object exposes many attributes.
# pos_ is the token's part-of-speech tag as a string (e.g. PRON, NOUN, PUNCT).
for t in doc: 
    print(f"{t}: {t.pos_}")

My: PRON
name: NOUN
is: AUX
Dhiraj: PROPN
.: PUNCT


In [97]:
# Visualize the document with displaCy.
# No style is passed here, so displaCy's default visualization is used.
from spacy import displacy 

displacy.render(doc)

In [98]:
spacy.explain('poss')

'possession modifier'

In [100]:
displacy.render(doc, style='ent')

In [101]:
spacy.explain('GPE')

'Countries, cities, states'

### Let's identify the programming language

For starters, let's begin by detecting the programming language **Go**.

In [19]:
def has_go_token(doc):
    """Return True if any single token names the language and is not a verb.

    A token counts when its lowercased text is "go", "golang" or
    "objective-c" and its part-of-speech tag is anything other than VERB.
    Only individual tokens are inspected, so a name that the tokenizer splits
    into several tokens (as happens with "objective-c" in the cells below)
    is not detected by this function.
    """
    language_spellings = ["go", "golang", "objective-c"]
    return any(
        tok.lower_ in language_spellings and tok.pos_ != 'VERB'
        for tok in doc
    )

In [22]:
doc = nlp("I love programming in objective-c")

In [23]:
has_go_token(doc)

False

In [24]:
# Token-by-token pattern for "objective-c": a token whose lowercase form is
# "objective", then one punctuation token, then a token whose lowercase form is "c".
pattern = [
    dict(LOWER='objective'),
    dict(IS_PUNCT=True),
    dict(LOWER='c'),
]

In [25]:
from spacy.matcher import Matcher

In [26]:
matcher = Matcher(nlp.vocab)

list

In [33]:
matcher.add("OBJ_C", [pattern])  # spaCy 3.0 changed add(): patterns are passed together as one list


In [34]:
matcher(doc)

[(14600203822370300727, 4, 7)]

In [35]:
doc[4:7]

objective-c

In [36]:
# Now we have the power to match spans that cover multiple tokens, according to the rules we define.

In [53]:
# Objective-C spans three tokens: "objective", a punctuation token, then "c"
# (same pattern as in the earlier cell).
obj_c_pattern = [
    dict(LOWER='objective'),
    dict(IS_PUNCT=True),
    dict(LOWER='c'),
]
# Go is a single token, "go" or "golang", whose POS tag is not VERB.
golang_pattern = [
    dict(
        LOWER=dict(IN=['go', 'golang']),
        POS=dict(NOT_IN=['VERB']),
    ),
]

In [54]:
matcher = Matcher(nlp.vocab)

In [58]:
matcher.add("PATTERN", [obj_c_pattern, golang_pattern])

In [65]:
doc = nlp("I love coding in objective-c and golang")

In [66]:
for t in doc:
    print(t, t.pos_)

I PRON
love VERB
coding VERB
in ADP
objective ADJ
- PUNCT
c NOUN
and CCONJ
golang PROPN


In [67]:
matches = matcher(doc)
matches

[(11920309760829426267, 4, 7), (11920309760829426267, 8, 9)]

In [69]:
# Each match is a (match_id, start, end) tuple; slicing the doc with
# start:end gives back the matched span of tokens.
for match_id, start, end in matches:
    print(doc[start : end])

objective-c
golang


## Cleaned-up code

In [70]:
from spacy.matcher import Matcher 

In [83]:
# Objective-C: "objective", a punctuation token, "c" (same pattern as above).
obj_c_pattern = [
    dict(LOWER='objective'),
    dict(IS_PUNCT=True),
    dict(LOWER='c'),
]
# "golang" is unambiguous; a bare "go" only counts when it is not tagged VERB.
golang_pattern1 = [dict(LOWER='golang')]
golang_pattern2 = [dict(LOWER='go', POS=dict(NOT_IN=['VERB']))]

# Single-token languages, compared on the lowercased token text.
ruby_pattern = [dict(LOWER='ruby')]
python_pattern = [dict(LOWER='python')]
js_pattern = [dict(LOWER=dict(IN=['js', 'javascript']))]

In [84]:
# Start from a fresh matcher and register every language pattern under a
# single key, so any of them produces a match with the same match id.
matcher = Matcher(nlp.vocab)
matcher.add(
    "PATTERNS",
    [
        obj_c_pattern,
        golang_pattern1,
        golang_pattern2,
        python_pattern,
        js_pattern,
        ruby_pattern,
    ],
)

In [81]:
# Run the matcher on a sentence mentioning go, golang and objective-c, and
# print every matched span.
# NOTE(review): the recorded output lists only "golang" and "objective-c";
# presumably the bare "go" is tagged VERB here and rejected - confirm via t.pos_.
doc = nlp("I am an iOS dev who codes in both go/golang as well as objective-c")
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start: end])

golang
objective-c


In [85]:
# "go" is missing from the output below: the tagger labels it VERB in this
# sentence (see the POS listing in the next cell), so golang_pattern2, which
# excludes verbs, rejects it.
doc = nlp("I have done some ruby, python, and go programming")
matches = matcher(doc)
for match_id, start, end in matches:
    print(doc[start: end])

ruby
python


In [86]:
[(t, t.pos_) for t in doc]

[(I, 'PRON'),
 (have, 'AUX'),
 (done, 'VERB'),
 (some, 'DET'),
 (ruby, 'NOUN'),
 (,, 'PUNCT'),
 (python, 'PROPN'),
 (,, 'PUNCT'),
 (and, 'CCONJ'),
 (go, 'VERB'),
 (programming, 'VERB')]

## Train an Entity detector model 

In [102]:
import spacy
import pandas as pd
from spacy import displacy 
from spacy.matcher import Matcher 

ModuleNotFoundError: No module named 'pandas'

In [104]:
nlp = spacy.load('en_core_web_sm')

In [105]:
# Build the text from adjacent string literals instead of backslash
# continuations inside one literal: a continuation inside the quotes embeds the
# next line's leading indentation in the text, leaving long runs of spaces in
# the middle of the input handed to the pipeline. The wording is unchanged.
doc = nlp(
    "My name is Vincent and I was born on 23rd June 1987. "
    "I work at Rasa from Haarlem. I just bought a guitar "
    "cost $1000 on ebay and I will get is services here for 20 euro a year."
)

In [106]:
displacy.render(doc, style='ent')

In [107]:
[(e, type(e)) for e in doc.ents]

[(Vincent, spacy.tokens.span.Span),
 (23rd June 1987, spacy.tokens.span.Span),
 (1000, spacy.tokens.span.Span),
 (ebay, spacy.tokens.span.Span),
 (20 euro, spacy.tokens.span.Span)]

In [110]:
[(e, e.label_) for e in doc.ents]

[(Vincent, 'PERSON'),
 (23rd June 1987, 'DATE'),
 (1000, 'MONEY'),
 (ebay, 'ORG'),
 (20 euro, 'QUANTITY')]

In [109]:
# Label of the first entity. Index doc.ents directly: `e` is only a
# list-comprehension variable in the cells above and does not exist at
# notebook scope, so `e.label_` fails with NameError on a fresh kernel.
doc.ents[0].label_

'PERSON'

In [113]:
spacy.explain('PERSON')

'People, including fictional'

In [114]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x280cc643040>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x280cc648720>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x280cc6403a0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x280cc540200>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x280cc4c0c00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x280cc5e9dc0>)]