# Tokenization

Parsing text into tokens - words, punctuation or numbers

In [8]:
import re
import spacy
import pandas as pd

# Load the small English spaCy pipeline once; the cells below reuse `nlp`,
# `doc` and `chapter`.
nlp = spacy.load('en_core_web_sm')


def _parse_file(path):
    """Read the text file at `path` and return it processed by `nlp`.

    The file is opened in a `with` block so the handle is closed as soon as
    the text has been read, instead of being left for the garbage collector.
    The encoding is set explicitly because the text contains curly quotes
    and must not depend on the platform's default locale.
    NOTE(review): assumes the asset files are UTF-8 encoded; confirm.
    """
    with open(path, encoding='utf-8') as handle:
        return nlp(handle.read())


# A short excerpt for the token-level tables, and a full chapter for the
# sentence-level quote search.
doc = _parse_file('./assets/sentence_got.txt')
chapter = _parse_file('./assets/chapter_got.txt')

print([token.text for token in doc])

['“', 'We', 'should', 'start', 'back', ',', '”', 'Gared', 'urged', 'as', 'the', 'woods', 'began', 'to', 'grow', 'dark', 'around', 'them', '.', '\n', '“', 'The', 'wildlings', 'are', 'dead', '.', '”']


# Lemmatization

Reducing tokens to their base forms.

In [9]:
# Build one (surface form, lemma) record per token, then tabulate them.
lemma_rows = [(token.text, token.lemma_) for token in doc]
df = pd.DataFrame(data=lemma_rows, columns=['Token', 'Lemma'])

# The Styler is the cell's last expression, so the notebook renders it.
df.style

Unnamed: 0,Token,Lemma
0,“,""""
1,We,we
2,should,should
3,start,start
4,back,back
5,",",","
6,”,""""
7,Gared,Gared
8,urged,urge
9,as,as


# Coarse-grained part-of-speech

Core parts of speech - noun, pronoun, determiner, adjective, verb, adverb, preposition, conjunction and interjection

In [10]:
# Pair every token with its coarse-grained part-of-speech label (token.pos_)
# and render the pairs as a styled table.
pos_rows = [(token.text, token.pos_) for token in doc]
pd.DataFrame(data=pos_rows, columns=['Token', 'POS']).style

Unnamed: 0,Token,POS
0,“,PUNCT
1,We,PRON
2,should,AUX
3,start,VERB
4,back,ADV
5,",",PUNCT
6,”,PUNCT
7,Gared,PROPN
8,urged,VERB
9,as,SCONJ


# Fine-grained part-of-speech

[Part-of-speech tags with morphological features](https://github.com/explosion/spaCy/blob/master/spacy/glossary.py), such as verb tense and pronoun type.

In [11]:
# One row per token: its text, its fine-grained tag (token.tag_) and the
# description spacy.explain returns for that tag.
# NOTE: the column labelled 'pos_' holds token.tag_ values, not token.pos_.
tag_rows = [
    (token.text, token.tag_, spacy.explain(token.tag_))
    for token in doc
]
pd.DataFrame(data=tag_rows, columns=['Token', 'pos_', 'Explanation']).style

Unnamed: 0,Token,pos_,Explanation
0,“,``,opening quotation mark
1,We,PRP,"pronoun, personal"
2,should,MD,"verb, modal auxiliary"
3,start,VB,"verb, base form"
4,back,RB,adverb
5,",",",","punctuation mark, comma"
6,”,'',closing quotation mark
7,Gared,NNP,"noun, proper singular"
8,urged,VBD,"verb, past tense"
9,as,IN,"conjunction, subordinating or preposition"


# Find quotes

In [7]:
# Walk the chapter sentence by sentence and print the text found between
# curly double quotes; sentences without any quoted text print nothing.
quote_pattern = re.compile(r"“(.*?)”")

for sentence in chapter.sents:
    quotes = quote_pattern.findall(str(sentence))
    if not quotes:
        continue
    print(quotes)

['We should start back,']
['The wildlings are dead.']
['Do the dead frighten you?']
['Dead is dead,']
['Are they dead?']
['What proof have we?']
['Will saw them,']
['If he says they are dead, that’s proof enough for me.']
['My wet nurse said the same thing, Will,']
['We have a long ride before us,']
['Bet he killed them all himself, he did,', 'twisted their little heads off, our mighty warrior.']


# Syntactic Relations

In [14]:
# For each token, record its syntactic head, the dependency label linking
# the two, and the token's own text, then render the rows as a styled table.
dep_rows = [(token.head.text, token.dep_, token.text) for token in doc]
pd.DataFrame(data=dep_rows, columns=['Head', 'dep_', 'text']).style

Unnamed: 0,Head,dep_,text
0,urged,punct,“
1,start,nsubj,We
2,start,aux,should
3,urged,ccomp,start
4,start,advmod,back
5,urged,punct,","
6,urged,punct,”
7,urged,nsubj,Gared
8,urged,ROOT,urged
9,began,mark,as
