# Tokenization

Splitting text into tokens: words, punctuation marks, or numbers.

In [8]:
import re
import spacy
import pandas as pd

# Load the small English pipeline once; every later cell reuses `nlp`.
nlp = spacy.load('en_core_web_sm')

# Read each sample text inside a `with` block so the file handle is closed as
# soon as the text has been read instead of lingering until garbage collection.
# The texts contain curly quotes, so the encoding is stated explicitly rather
# than left to the platform default.
# NOTE(review): assumes the asset files are stored as UTF-8 - confirm.
with open('./assets/sentence_got.txt', encoding='utf-8') as sentence_file:
    doc = nlp(sentence_file.read())

with open('./assets/chapter_got.txt', encoding='utf-8') as chapter_file:
    chapter = nlp(chapter_file.read())

# Show the token texts of the short sample as a plain list.
print([token.text for token in doc])

['“', 'We', 'should', 'start', 'back', ',', '”', 'Gared', 'urged', 'as', 'the', 'woods', 'began', 'to', 'grow', 'dark', 'around', 'them', '.', '\n', '“', 'The', 'wildlings', 'are', 'dead', '.', '”']


# Lemmatization

Reducing tokens to their base forms.

In [9]:
# One row per token: the text as written, next to its lemma.
df = pd.DataFrame(
    data=[(tok.text, tok.lemma_) for tok in doc],
    columns=['Token', 'Lemma'],
)

# Bare last expression, so the notebook renders the styled table.
df.style

Unnamed: 0,Token,Lemma
0,“,""""
1,We,we
2,should,should
3,start,start
4,back,back
5,",",","
6,”,""""
7,Gared,Gared
8,urged,urge
9,as,as


# Coarse-grained part-of-speech

Core parts of speech - noun, pronoun, determiner, adjective, verb, adverb, preposition, conjunction and interjection

In [10]:
# One row per token: the text next to its coarse-grained part-of-speech label.
pos_rows = [(tok.text, tok.pos_) for tok in doc]
pd.DataFrame(pos_rows, columns=['Token', 'POS']).style

Unnamed: 0,Token,POS
0,“,PUNCT
1,We,PRON
2,should,AUX
3,start,VERB
4,back,ADV
5,",",PUNCT
6,”,PUNCT
7,Gared,PROPN
8,urged,VERB
9,as,SCONJ


# Fine-grained part-of-speech

[Part-of-speech tags with morphological features](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py), such as verb tenses and types of pronouns.

In [11]:
# One row per token: text, fine-grained tag, and spaCy's explanation of that tag.
pd.DataFrame(
    [
        (tok.text, tok.tag_, spacy.explain(tok.tag_))
        for tok in doc
    ],
    columns=['Token', 'tag_', 'Explanation.tag_'],
).style

Unnamed: 0,Token,tag_,Explanation.tag_
0,“,``,opening quotation mark
1,We,PRP,"pronoun, personal"
2,should,MD,"verb, modal auxiliary"
3,start,VB,"verb, base form"
4,back,RB,adverb
5,",",",","punctuation mark, comma"
6,”,'',closing quotation mark
7,Gared,NNP,"noun, proper singular"
8,urged,VBD,"verb, past tense"
9,as,IN,"conjunction, subordinating or preposition"


# Find quotes

In [12]:
# Non-greedy match of whatever sits between an opening and a closing curly quote.
quote_pattern = re.compile(r"“(.*?)”")

# Go through the chapter sentence by sentence and print the quoted passages,
# skipping sentences that contain none.
for sentence in chapter.sents:
    quotes = quote_pattern.findall(str(sentence))

    if not quotes:
        continue
    print(quotes)

['We should start back,']
['The wildlings are dead.']
['Do the dead frighten you?']
['Dead is dead,']
['Are they dead?']
['What proof have we?']
['Will saw them,']
['If he says they are dead, that’s proof enough for me.']
['My wet nurse said the same thing, Will,']
['We have a long ride before us,']
['Bet he killed them all himself, he did,', 'twisted their little heads off, our mighty warrior.']


# Syntactic Relations

Shows the syntactic relations between the tokens in a sentence; each pair of related words is connected by a single arc.

In [13]:
# One row per token: its syntactic head, its own text, the dependency label
# linking the two, and spaCy's explanation of that label.
dep_columns = ['Head', 'text', 'dep_', 'explanation']
dep_rows = [
    (tok.head.text, tok.text, tok.dep_, spacy.explain(tok.dep_))
    for tok in doc
]
pd.DataFrame(dep_rows, columns=dep_columns).style

Unnamed: 0,Head,text,dep_,explanation
0,urged,“,punct,punctuation
1,start,We,nsubj,nominal subject
2,start,should,aux,auxiliary
3,urged,start,ccomp,clausal complement
4,start,back,advmod,adverbial modifier
5,urged,",",punct,punctuation
6,urged,”,punct,punctuation
7,urged,Gared,nsubj,nominal subject
8,urged,urged,ROOT,
9,began,as,mark,marker


# Extract descriptions of money

In [14]:
phrase = nlp(u"Taiwan Semiconductor Manufacturing Co. on Tuesday, 6 december, to announce the opening of a second chip plant in Arizona, raising the company’s investment in the state from $12 billion to $40 billion.")

def extract(tok=None):
    """Print the money amount that starts at ``tok``, if there is one.

    A money amount here is a token tagged ``$`` followed by the run of
    consecutive tokens tagged ``CD`` (cardinal number), e.g. ``$12 billion``.
    Tokens with any other tag are ignored and nothing is printed for them.

    Args:
        tok: the spaCy token to inspect. When omitted, the function falls back
            to the module-level loop variable ``token`` so that existing
            zero-argument calls keep working; new code should pass the token
            explicitly instead of relying on that hidden state.
    """
    if tok is None:
        tok = token  # legacy path: read the notebook-level loop variable

    if tok.tag_ != '$':
        return

    # Walk the token's own document rather than a global one, so the function
    # gives the right answer for tokens of any parsed text.
    parent = tok.doc
    numbers = []
    position = tok.i + 1
    # The bounds check prevents an IndexError when the amount is the very last
    # thing in the text (no trailing punctuation after the number).
    while position < len(parent) and parent[position].tag_ == 'CD':
        numbers.append(parent[position].text)
        position += 1

    # Joining with single spaces avoids the trailing blank that plain string
    # concatenation used to leave after the last number.
    print(tok.text + ' '.join(numbers))

for token in phrase:
    extract(token)

# Last expression of the cell: a per-token overview of tags and their meaning.
pd.DataFrame(
    [
        (token.text, token.tag_, token.pos_, spacy.explain(token.tag_), spacy.explain(token.pos_))
        for token in phrase
    ],
    columns=['text', 'tag_', 'pos_', 'expl.tag_', 'expl.pos_'],
)


$12 billion 
$40 billion 


Unnamed: 0,text,tag_,pos_,expl.tag_,expl.pos_
0,Taiwan,NNP,PROPN,"noun, proper singular",proper noun
1,Semiconductor,NNP,PROPN,"noun, proper singular",proper noun
2,Manufacturing,NNP,PROPN,"noun, proper singular",proper noun
3,Co.,NNP,PROPN,"noun, proper singular",proper noun
4,on,IN,ADP,"conjunction, subordinating or preposition",adposition
5,Tuesday,NNP,PROPN,"noun, proper singular",proper noun
6,",",",",PUNCT,"punctuation mark, comma",punctuation
7,6,CD,NUM,cardinal number,numeral
8,december,NNP,PROPN,"noun, proper singular",proper noun
9,",",",",PUNCT,"punctuation mark, comma",punctuation
