In [18]:
%run util/installers.ipynb

  "        !conda install tensorflow -y >> ~/install.log        \n",
  "    except:\n",


In [2]:
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


### A brief primer on using SpaCy:

* Spacy Objects:
    * Documents
    * Tokens
    * Spans
* Each can contain metadata
* Metadata is added through Language models.

In [3]:
import spacy
# creates a Language model (skipped when `nlp` already exists, so re-running the cell does not reload it)
if 'nlp' not in globals():
    nlp = spacy.load('en')

# language models are *callable*, returning Document objects:
document = nlp("Welcome to JupyterCon 2017!")

# documents provide *metadata* about the text, including tokenization, parts of speech, etc...
# here: the token itself, its part-of-speech tag, its dependency label and its lemma
for token in document:
    print(token, token.pos_, token.dep_, token.lemma_)
    
    
# When we call a language model, we're calling a *list* of functions that operate on documents:
print()
for function in nlp.pipeline:
    print(function)

Welcome ADJ ROOT welcome
to ADP prep to
JupyterCon PROPN pobj jupytercon
2017 NUM nummod 2017
! PUNCT punct !

<spacy.tagger.Tagger object at 0x7face9592048>
<spacy.pipeline.DependencyParser object at 0x7facececc188>
<spacy.matcher.Matcher object at 0x7faceceb2128>
<spacy.pipeline.EntityRecognizer object at 0x7facececc778>


In [4]:
from IPython.display import IFrame
IFrame('https://spacy.io/docs/usage/lightning-tour', 800, 800)

### Word Tokenization and Sentence Segmentation: What is a lexical unit?
What is a word? [Dictionary entries + alternate word forms]

In [5]:
from IPython.display import IFrame
IFrame('https://en.wikipedia.org/wiki/Special:Random', 800, 400)

# Regex, Deterministic Approach:


### Intro to Regex:

Regex involves defining a sequence of characters, character sequences, quantifiers and rules in order to find certain types of text.




#### Characters:

Most characters are simple; they match themselves:

Pattern | Matches | Example
--- | --- | ---
Jupyter | Instances of "Jupyter" | Welcome to **Jupyter**Con 2017!
<br>
<br>
<br>
<br>
Other characters are special, such as [], which define **character sets**:

Pattern | Matches | Example
--- | --- | ---
[JCe] |  Single instances of the characters of "J", "C", "e" | W**e**lcom**e** to **J**upyt**e**r**C**on 2017!

<br>
<br>
<br>
Or **quantifiers** \*, ?, or +:

Pattern | Matches | Example
--- | --- | ---
e* | Zero or more "e", as many as possible | W**e**lcom**e** to Jupyt**e**rCon 2017!
e? | Zero or one instance of "e" | W**e**lcom**e** to Jupyt**e**rCon 2017!
me+ | "m", then at least one "e", as many as possible | Welco**me** to JupyterCon 2017!
<br>
<br>
<br>
Each quantifier acts on the pattern directly preceding it.
<br>

<center>
What will this match?
</center>

Pattern | Example 
--- | --- 
re? | Welcome to JupyterCon 2017! 

<br>
<br>
<br>

The \*, +, and ? quantifiers are **greedy**: they will match as much text as they can.

Pattern | Matches | Example
--- | --- | ---
[A-Z][a-z]+ | Match the longest sequence of letters beginning with a capital | **Welcome** to **JupyterCon** 2017!

Adding "?" after a quantifier makes the search thrifty (also called lazy or non-greedy) instead, so it matches as little text as possible:

Pattern | Matches | Example
--- | --- | ---
[A-Z][a-z]+? | Match the shortest sequence of letters beginning with a capital | **We**lcome to **Ju**pyter**Co**n 2017!

<br>
<br>
<br>

Characters like \s, \d, are special sequences, denoting groups of characters:

Pattern | Matches | Example
--- | --- | ---
\s | Whitespace. | Welcome**_**to**_**JupyterCon**_**2017!
\d | Digits. | Welcome to JupyterCon **2017**!

Adding an extra "\" can escape special characters

Pattern | Matches | Example
--- | --- | ---
\\\s | Instances of "\s" | Welcome to JupyterCon 2017!

<br>
<br>
<br>

Special characters can also restrict matches to certain positions:


Pattern | Matches | Example
--- | --- | ---
^W | Match "W" if string begins with "W" | **W**elcome to JupyterCon 2017!
W$ | Match "W" if string ends with "W" | Welcome to JupyterCon 2017!


<br>
<br>
<br>

We can also use "|" to match patternA or patternB, given patternA|patternB:

Pattern | Matches | Example
--- | --- | ---
to&#124;Jupyter| Match "to" or "Jupyter" | Welcome **to** **Jupyter**Con 2017!

In [6]:
#%matplotlib notebook
from ipywidgets import Button, Textarea, Layout, Box, Label
from IPython.display import display, Markdown, clear_output
import re
import html


class RegexFinder(object):
    """Interactive widget that highlights regex matches inside a block of text.

    Displays a text area holding the text to search, a text area holding the
    regex pattern, and a "Match Pattern" button. Clicking the button re-renders
    the text as Markdown with every match shown in bold, blue and underlined.

    init_text: string, initial content of the text area
    init_pattern: string, initial content of the pattern area (default: empty)
    height: string, CSS height of the text area
    width: string, CSS width of the text area
    """

    def __init__(self, init_text, init_pattern="", height="100px", width="600px"):

        # the areas for patterns and input strings
        self.text_field = Textarea(init_text, layout=Layout(height=height, width=width))
        self.pattern_field = Textarea(init_pattern, layout=Layout(height='30px'))

        # the boxes containing the fields and respective labels
        self.text_box = Box([Label(value='Text Box'), self.text_field])
        self.pattern_box = Box([Label(value='Pattern Box'), self.pattern_field])

        # a button to display results
        self.match_button = Button(description='Match Pattern', )
        self.match_button.on_click(self.match_pattern)

        # tell jupyter to display everything
        display(self.text_box)
        display(self.pattern_box)
        display(self.match_button)
        # render once immediately so the initial text/pattern is shown without a click
        self.match_pattern(None)

    @property
    def pattern(self):
        """Compile and return the pattern currently typed into the pattern field.

        raises: ValueError if the field does not contain a valid regex
        """
        # whenever we ask for the current pattern, create a re object
        try:
            return re.compile(self.pattern_field.value)
        # only translate regex compilation failures; a bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit
        except (re.error, OverflowError) as err:
            raise ValueError(
                "Bad Regex Pattern: {}, try again please!".format(self.pattern_field.value)
            ) from err

    @property
    def text(self):
        """Return the current content of the text field."""
        # whenever we ask for the string input, grab the text_field's current value
        return self.text_field.value

    def match_pattern(self, b):
        """Button callback: clear the cell output and re-render the text.

        b: the object passed by the button's on_click hook (unused; None when
           called directly from __init__)
        """
        # clear any output
        clear_output()
        # create a formatted string, create a markdown object, and have Jupyter display it
        if self.pattern.pattern != "":
            display(Markdown(self.format_match_markdown(self.text, self.pattern)))
        else:
            # escape here as well so the text renders the same way with or without a pattern
            display(Markdown(html.escape(self.text)))

    def format_match_markdown(self, text, pattern):
        """Return `text` with every match of `pattern` wrapped in highlight tags.

        text: string to search
        pattern: compiled regex object

        returns: string; both the matched and the unmatched segments are
                 HTML-escaped so characters such as "<" or "&" in the input
                 are shown literally instead of being interpreted as markup
        """
        pieces = []
        last = 0

        # for each found match
        for match in pattern.finditer(text):
            # get the string index of the match
            start, stop = match.span()
            # append string_since_last_match + format(match) to result
            pieces.append(html.escape(text[last:start]))
            pieces.append("<b style='color:blue;'><u>" + html.escape(text[start:stop]) + "</u></b>")
            # set string index since last match to end of current match
            last = stop

        # grab anything leftover
        pieces.append(html.escape(text[last:]))
        return "".join(pieces)
        

r = RegexFinder("Welcome to JupyterCon 2017!")

Welcome to JupyterCon 2017!

### Gather some data from Wikipedia

In [7]:
!pip install wikipedia >> ~/wikipedia_install.log
import wikipedia
# fetch the page object for the article with this title
wiki = wikipedia.WikipediaPage('Timeline_of_Solar_System_exploration')
# keep lines 11..39 of the page's text content and join them back into one string
# NOTE(review): the slice indices are hard-coded; presumably chosen to cover the
# 1950s / early-1960s entries — they will select different lines if the article changes
intro_paragraph = "\n".join(wiki.content.split('\n')[11:40])

### Exercise 1: Match the dates in this document

(AA)

e.g. [day] [month] [year]

In [8]:
finder = RegexFinder(intro_paragraph, height="200px")

== 1950s ==

1957
 Sputnik 1 – 4 October 1957 – First Earth orbiter
 Sputnik 2 – 3 November 1957 – Earth orbiter, first animal in orbit, a dog named Laika
1958
 Explorer 1 – 1 February 1958 – Earth orbiter; first American orbiter, discovered Van Allen radiation belts
 Vanguard 1 – 17 March 1958 – Earth orbiter; oldest spacecraft still in Earth orbit
1959
 Luna 1 – 2 January 1959 – First lunar flyby (attempted lunar impact?)
 Pioneer 4 – 3 March 1959 – Lunar flyby
 Luna 2 – 12 September 1959 – First lunar impact
 Luna 3 – 4 October 1959 – Lunar flyby; First images of far side of Moon


== 1960s ==

1960
 Pioneer 5 – 11 March 1960 – Interplanetary space investigations
1961
 Venera 1 – 12 February 1961 – Venus flyby (contact lost before flyby)
 Vostok 1 – 12 April 1961 – First manned Earth orbiter
 Mercury-Redstone 3 – 5 May 1961 – First American in space
 Ranger 1 – 23 August 1961 – Attempted lunar test flight
 Ranger 2 – 18 November 1961 – Attempted lunar test flight
1962
 Ranger 3 – 26 January 1962 – Attempted lunar impact (missed Moon)
 Mercury-Atlas 6 – 20 February 1962 – First American manned Earth orbiter
 Ranger 4 – 23 April 1962 – Lunar impact (but unintentionally became the first spacecraft to hit the lunar farside and returned no data)

### Exercise 2: 
Write a pattern that matches every sentence boundary under "Separate Sentences" and matches nothing under "Single Sentences".

In [9]:
sents = """
Separate Sentences:
-------------------
assumes word senses. Within 

does the clustering. In the

but when? It was hard to tell

he arrive." After she had

mess! He did not let it

it wasn't hers!' She replied

always thought so.) Then

Single Sentences:
-------------------
in the U.S.A., people often

John?", he often thought, but

weighed 17.5 grams

well ... they'd better not

A.I. has long been a very

like that", he thought

but W. G. Grace never had much

"""
r = RegexFinder(sents, init_pattern="", width="400px", height="400px")


Separate Sentences:
-------------------
assumes word senses. Within 

does the clustering. In the

but when? It was hard to tell

he arrive." After she had

mess! He did not let it

it wasn't hers!' She replied

always thought so.) Then

Single Sentences:
-------------------
in the U.S.A., people often

John?", he often thought, but

weighed 17.5 grams

well ... they'd better not

A.I. has long been a very

like that", he thought

but W. G. Grace never had much



### Piecing it Together: segment sentences

In [10]:
import re

def sent_segmenter(doc):
    """Returns a list of sentences from document

    A sentence boundary is a lowercase letter or digit, followed by ".", "?"
    or "!", optionally followed by a closing quote or parenthesis, followed
    by one whitespace character. The terminator (and the closing quote or
    parenthesis, if any) stays with the sentence it ends; the whitespace
    character after it is dropped.

    doc: string

    returns: list of strings; the text after the last boundary is always
             appended, so the list has at least one element (possibly "")
    """

    # our pattern for finding sentence barriers (raw string so the regex
    # escapes are not treated as Python string escapes)
    pattern = re.compile(
        r"""[a-z0-9](?P<end_of_sentence>[.?!]["')]?)(?P<next_sentence>\s)"""
    )

    sents = []
    sent_begins = 0

    # walk the boundaries iteratively; one recursive call per sentence would
    # hit Python's recursion limit on long documents
    for match in pattern.finditer(doc):
        # the current sentence runs up to and including its terminator/closer
        sents.append(doc[sent_begins:match.end('end_of_sentence')])
        # the next sentence starts right after the separating whitespace
        sent_begins = match.end('next_sentence')

    # whatever follows the last boundary (or the whole text if there is none)
    sents.append(doc[sent_begins:])
    return sents

text = "I went to the store. Then I ate garlic. Then I ate some more."
print(list(sent_segmenter(text)))

['I went to the store.', 'Then I ate garlic.', 'Then I ate some more.']


In [11]:
def word_tokenizer(doc):
    """Returns a list of tokens from document, split on whitespace

    The document is split at every single whitespace character, so a run of
    consecutive whitespace characters produces empty strings in the result.

    doc: string

    returns: list of strings
    """
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    pattern = re.compile(r'\s')
    return pattern.split(doc)
    
text = "I  went to the store. Then I ate garlic. Then I ate some more."
print(list(word_tokenizer(text)))

['I', '', 'went', 'to', 'the', 'store.', 'Then', 'I', 'ate', 'garlic.', 'Then', 'I', 'ate', 'some', 'more.']


### But there is a problem:

**Tokenization**:
* Inconsistencies with Rules:
    * Given the pattern "[word]. [next]", is "." a token?
    * "I went to the store**.** Then I..." $\rightarrow$ yes, it's punctuation.
    * "Dr**.** White will now see you" $\rightarrow$ no, it's part of the token "Dr."
    * **SpaCy Solution**: define special exceptions.
* Some characters, even if not separated by a space, should be their own tokens:
    * The film made \$192,200,000 at the box office **(US: \$102,000,000).** $\rightarrow$ ["(", "\$","102,000,000", ")", "."]
    * **SpaCy Solution**: tokenize prefixes and suffixes
* Tokenization "destroys" the white space
    * **SpaCy Solution**: tokens are defined by character indexes into the original text


### How Spacy's Tokenizer Works:
* Split text by whitespace.
* Iterate over space-separated substrings: 
    * The dog doesn't shop at Macy's (anymore).$\rightarrow$ [The, dog, doesn't, shop, at, Macy's, (anymore).]
* Check whether we have an explicitly defined rule for this substring. If we do, use it.
    * doesn't $\rightarrow$ [does, n't]
* Otherwise, try to consume a prefix.
    * (anymore). $\rightarrow$ [(, anymore).]
* If we consumed a prefix, go back to the beginning of the loop, so that special-cases always get priority.
* If we didn't consume a prefix, try to consume a suffix.
    * anymore). $\rightarrow$ [anymore, ), .]
* If we can't consume a prefix or suffix, look for "infixes" — stuff like hyphens etc.
* Once we can't consume any more of the string, handle it as a single token.
    * [The, dog, does, n't, shop, at, Macy's, (, anymore, ), .]

### A Closer Look at Exceptions: how to tokenize "don't"?

In [12]:
### Tokenizer exceptions:
### What is ORTH???
# NOTE(review): judging by the printed output, ORTH holds the literal text of each
# sub-token — confirm against the spaCy attribute documentation
from spacy.en import TOKENIZER_EXCEPTIONS
from spacy.attrs import ORTH



# how we should tokenize "don't"
special_word = "don't"
# the exception entry is a list with one attribute dict per resulting sub-token
exception = TOKENIZER_EXCEPTIONS["don't"]
# the dict keys are integer attribute IDs; look each one up in the vocab's
# string store to get a human-readable attribute name
readable_exception = [{nlp.vocab.strings[j]:i[j] for j in i} for i in exception]
print("Word: {}".format(special_word))
print("How to tokenize it: {}".format(exception))
print("Readble version: {}".format(readable_exception))



Word: don't
How to tokenize it: [{65: 'do', 73: 'do'}, {65: "n't", 75: 'RB', 73: 'not'}]
Readble version: [{'LEMMA': 'do', 'ORTH': 'do'}, {'TAG': 'RB', 'ORTH': "n't", 'LEMMA': 'not'}]


### A toy version of the SpaCy tokenizer:

In [13]:

from spacy.attrs import ORTH
import re

if 'nlp' not in globals():
    import spacy
    nlp = spacy.load('en')


def tokenize(doc):
    """Toy tokenizer: split on whitespace, then expand each candidate token.

    Prints the document being tokenized, splits it on runs of whitespace and
    passes every candidate to iter_tokenize, collecting the results.

    doc: string

    returns: list of token strings
    """
    print("Now tokenizing:\n {}".format(doc))
    print()
    # raw string: "\s" in a normal string literal is an invalid escape sequence
    whitespace = re.compile(r"[\s]+")

    token_list = []

    # begin by iterating over candidate tokens,
    # which are separated by white space
    for token in whitespace.split(doc):
        # leading/trailing whitespace makes split() return empty strings;
        # they are not candidates and must not end up as '' tokens
        if not token:
            continue
        # expand the candidate tokens to the full list
        new_tokens = iter_tokenize(token, [])
        token_list.extend(new_tokens)
    return token_list

def indent(level):
    """Return the indentation prefix for a nesting depth: one tab per level."""
    tab = "\t"
    return "".join(tab for _ in range(level))

def iter_tokenize(token, tokens, level=0):
    """Recursively expand one whitespace-delimited candidate into tokens.

    Order of checks: explicit tokenizer exception, then prefix, then suffix;
    if none applies the candidate is appended as a single token. Each step is
    printed, indented by the recursion depth.

    token: string, the candidate still to be tokenized
    tokens: list the resulting token strings are appended to (mutated in place)
    level: int, recursion depth, used only to indent the printed trace

    returns: the same `tokens` list
    """
    # nothing left to tokenize: happens when the whole candidate was consumed
    # as a prefix or suffix (e.g. a lone "("); without this guard an
    # empty-string token would be appended
    if not token:
        return tokens

    # special cases always get priority
    if token in TOKENIZER_EXCEPTIONS:
        for addition in TOKENIZER_EXCEPTIONS[token]:
            print(indent(level), "appending exception ", addition[ORTH])
            tokens.append(addition[ORTH])
        return tokens

    pre_match = nlp.tokenizer.prefix_search(token)
    # require a non-empty match: a zero-width prefix would recurse on the
    # unchanged token forever
    if pre_match and pre_match.end() > pre_match.start():
        start, end = pre_match.start(), pre_match.end()
        print(indent(level), "Prefix case for {}".format(token))
        prefix, token = token[start:end], token[end:]
        print(indent(level + 1), "appending prefix ", prefix)
        tokens.append(prefix)
        # re-enter from the top so exceptions are re-checked on the remainder
        return iter_tokenize(token, tokens, level+1)

    # only look for a suffix when no prefix was consumed
    suff_match = nlp.tokenizer.suffix_search(token)
    if suff_match and suff_match.end() > suff_match.start():
        start, end = suff_match.start(), suff_match.end()
        print(indent(level), "Suffix case for {}".format(token))
        token, suffix = token[:start], token[start:end]
        # tokenize what precedes the suffix first so output order follows the text
        tokens = iter_tokenize(token, tokens, level+1)
        print(indent(level + 1), "appending suffix ", suffix)
        tokens.append(suffix)
        return tokens

    print(indent(level), "appending token ", token)
    tokens.append(token)
    return tokens

In [14]:
example = "She said 'don't forget to take out the trash!'"
tokens = tokenize(example)
print(tokens)

Now tokenizing:
 She said 'don't forget to take out the trash!'

 appending token  She
 appending token  said
 Prefix case for 'don't
	 appending prefix  '
	 appending exception  do
	 appending exception  n't
 appending token  forget
 appending token  to
 appending token  take
 appending token  out
 appending token  the
 Suffix case for trash!'
	 Suffix case for trash!
		 appending token  trash
		 appending suffix  !
	 appending suffix  '
['She', 'said', "'", 'do', "n't", 'forget', 'to', 'take', 'out', 'the', 'trash', '!', "'"]


# Extending the spacy tokenizer

* add a special case
* modify the tokenizer's prefix/suffix/infix search patterns
* create an entirely new tokenizer

#### Adding a special case
We might want to tokenize "gimme" as ["gim", "me"]

In [15]:
# Default English model tokenization:
example2 = u'Gimme that sandwich!'
for token in nlp(example2):
    print(token)

Gimme
that
sandwich
!


In [16]:
# lets add a special case to our model's tokenizer
from spacy.attrs import ORTH, LEMMA, TAG

nlp.tokenizer.add_special_case('gimme', [{ORTH:u'gim', LEMMA:u'give', TAG:u"VB"}, {ORTH:'me'}])
nlp.tokenizer.add_special_case('Gimme', [{ORTH:u'Gim', LEMMA:u'give', TAG:u"VB"}, {ORTH:'me'}])

# tokenization with our special case added:
example2 = u'Gimme that sandwich!'
for token in nlp(example2):
    print(token)


Gim
me
that
sandwich
!


More Examples:
* willya -> will, you
* tbt -> throw back thursdays
* ...

### Exercise:
* update the tokenizer with 3 additional special cases:

In [19]:
# nlp.tokenizer.add_special_case('', [{ORTH:u'', LEMMA:u'', TAG:u""}])
# nlp.tokenizer.add_special_case('', [{ORTH:u'', LEMMA:u'', TAG:u""}])
# nlp.tokenizer.add_special_case('', [{ORTH:u'', LEMMA:u'', TAG:u""}])

### Modify the tokenizer
We just modified the special-exceptions component of the tokenizer. Sometimes we may also need to modify its definitions of prefixes and suffixes.

**Example**: keep hashtags and mentions as part of the original token:

In [20]:
from spacy.tokenizer import Tokenizer
exceptions = spacy.en.TOKENIZER_EXCEPTIONS
# Only quotes and brackets are split off as prefixes/suffixes, so "@" and "#"
# stay attached to their tokens.
# The patterns are anchored: a prefix must match at the start of the candidate
# and a suffix at its end (as in the spaCy documentation's custom-tokenizer
# example); unanchored, a quote or bracket in the middle of a token would be
# picked up as an affix. Raw strings avoid invalid escape sequences.
prefixes = re.compile(r'''^[\[\]\(\)\'\"]''')
suffixes = re.compile(r'''[\[\]\(\)\'\"]$''')

# reuse the model's vocab and infix rules; replace exceptions, prefix and suffix search
custom_tokenizer = Tokenizer(nlp.vocab
                             , exceptions
                             , prefixes.search
                             , suffixes.search
                             , nlp.tokenizer.infix_finditer)

example3 = "Gimme that sandwich @John #food"

print("Original Tokenizer")
print(list(nlp.tokenizer(example3)))
# blank separator line; a bare `print` is a no-op expression in Python 3
print()
print("New Tokenizer")
print(list(custom_tokenizer(example3)))

    
    

Original Tokenizer
[Gim, me, that, sandwich, @John, #, food]
New Tokenizer
[Gimme, that, sandwich, @John, #food]


### create an entirely new tokenizer
Lastly, you can replace the tokenizer entirely if you want to use a different tokenization strategy.

The Tokenizer must:
* be initialized with a SpaCy language model object.
* be callable, which returns a SpaCy Document object.

In [21]:
from spacy.tokens import Doc
class CustomTokenizer(object):
    """Whitespace-only tokenizer that can stand in for a model's make_doc.

    Initialized with a language model object (its vocab is reused); calling an
    instance with a string returns a Doc whose words are the
    whitespace-separated pieces of that string.
    """

    def __init__(self, nlp):
        self.vocab = nlp.vocab
        # raw string: "\s" in a normal string literal is an invalid escape
        # sequence; "+" treats a run of whitespace as a single separator
        self.whitespace = re.compile(r"\s+")

    def __call__(self, text):
        """Tokenize `text` and wrap the words in a Doc.

        text: string

        returns: Doc built from the words, each followed by a space except the last
        """
        words = self.tokenize(text)
        spaces = [True] * len(words)
        # the final word has no word after it, so it is not followed by a space
        if spaces:
            spaces[-1] = False
        return Doc(self.vocab, words=words, spaces=spaces)

    def tokenize(self, text):
        """Split `text` on whitespace.

        text: string

        returns: list of non-empty strings (leading/trailing whitespace does
                 not produce empty words)
        """
        return [word for word in self.whitespace.split(text) if word]
    

# replace the model's make_doc hook with a CustomTokenizer built from this model,
# then run the model on example3 and list the resulting tokens
nlp.make_doc = CustomTokenizer(nlp)
list(nlp(example3))

[Gimme, that, sandwich, @John, #food]

### Adding your Tokenizer to the language pipeline

In [22]:
#method 1: load the model, then replace its make_doc hook with the custom tokenizer
nlp = spacy.load('en')
nlp.make_doc = custom_tokenizer

#method 2: pass the custom tokenizer as make_doc when loading the model
# NOTE(review): this loads the model a second time and rebinds `nlp`, discarding
# the object configured by method 1 — the two methods are alternatives
nlp = spacy.load('en', make_doc=custom_tokenizer)