In [3]:
import spacy

In [9]:
# when you call spacy.blank("de"), it creates a blank German ("de" for Deutsch) language model. 
# this means that the model will have no pipeline components (like tagger, parser, entity recognizer) pre-loaded.
nlp = spacy.blank("en")
doc = nlp("'Let's go to N.Y.C!'")
print(type(nlp))
for token in doc:
    print("token:",token)

    

<class 'spacy.lang.en.English'>
token: '
token: Let
token: 's
token: go
token: to
token: N.Y.C
token: !
token: '


In [7]:
# when you call spacy.blank("de"), it creates a blank German ("de" for Deutsch) language model. 
# this means that the model will have no pipeline components (like tagger, parser, entity recognizer) pre-loaded.
nlp = spacy.blank("en")
doc = nlp("Dr. Strange loves pav bhaji of mumbai as its costs only £2 per plate.")

for token in doc:
    print("token:",token)

    

token: Dr.
token: Strange
token: loves
token: pav
token: bhaji
token: of
token: mumbai
token: as
token: its
token: costs
token: only
token: £
token: 2
token: per
token: plate
token: .


In [18]:
type(doc)
token=doc[1]
print(token)
token.is_currency

Let


False

In [19]:
for token in doc: 
    print(token, "-->", "index:", token.i, 
          "is_alpha:", token.is_alpha, "is_punct:", token.is_punct, "like_num:", token.like_num, "is_currency:", token.is_currency
         )

' --> index: 0 is_alpha: False is_punct: True like_num: False is_currency: False
Let --> index: 1 is_alpha: True is_punct: False like_num: False is_currency: False
's --> index: 2 is_alpha: False is_punct: False like_num: False is_currency: False
go --> index: 3 is_alpha: True is_punct: False like_num: False is_currency: False
to --> index: 4 is_alpha: True is_punct: False like_num: False is_currency: False
N.Y.C --> index: 5 is_alpha: False is_punct: False like_num: False is_currency: False
! --> index: 6 is_alpha: False is_punct: True like_num: False is_currency: False
' --> index: 7 is_alpha: False is_punct: True like_num: False is_currency: False


In [25]:
# we're going to read a text file and find the emails using spacy.. we could do this via regex by spacy has inbuilt functionality to search for emails in text
with open("students.txt") as f: 
    text = f.readlines()
print(text)

text = ' '.join(text)
print(text)

emails = []
doc = nlp(text)
for token in doc:
    if token.like_email:
        emails.append(token.text)
print("emails:",emails)
        

['Name | Email \n', 'Emily | emily@email.com\n', 'Ashton | ashton@email.com\n', 'Billie | bille@email.com']
Name | Email 
 Emily | emily@email.com
 Ashton | ashton@email.com
 Billie | bille@email.com
emails: ['emily@email.com', 'ashton@email.com', 'bille@email.com']


In [33]:
# adding special rules into tokenizer
# slang words - we want our nlp model spacy to recognise slang words like gimme - that should be two words 'give me'
doc = nlp("gimme doube cheese extra large healthy pizza") 
tokens = [token.text for token in doc]
print(tokens)

from spacy.symbols import ORTH
nlp.tokenizer.add_special_case("gimme",[{ORTH: "gim"},{ORTH: "me"}]) # we can't split to characters that do not already exist in the text
doc = nlp("gimme doube cheese extra large healthy pizza") 
tokens = [token.text for token in doc]
print(tokens)

['gim', 'me', 'doube', 'cheese', 'extra', 'large', 'healthy', 'pizza']
['gim', 'me', 'doube', 'cheese', 'extra', 'large', 'healthy', 'pizza']


In [47]:
nlp = spacy.blank("en")
doc = nlp("Dr. Strange loves pav bhaji of mumbai as its costs only £2 per plate. Hulk love chat of delhi")
print(nlp.pipe_names)
# for sentence in doc.sents:
#     print(sentence)

# this gives an error because we have created a 'blank' model and hence we don't have ability to break down by sentences, only tokens
# we need to add a pipeline 'sentencizer'
nlp.add_pipe('sentencizer')
doc = nlp("Dr. Strange loves pav bhaji of mumbai as its costs only £2 per plate. Hulk love chat of delhi")
for sentence in doc.sents:
    print(sentence)


[]
Dr. Strange loves pav bhaji of mumbai as its costs only £2 per plate.
Hulk love chat of delhi
