# [spaCy overview](http://spacy.io/docs/#examples)

## Load spaCy resources

In [None]:
# Import spaCy and load the English pipeline into `nlp`; every later cell
# parses text by calling `nlp(...)`.
import spacy

# NOTE(review): 'en' is a model shortcut name, presumably installed/linked
# separately from the spacy package itself -- confirm it resolves for the
# spaCy version in use.
nlp = spacy.load('en')

## Process text

In [None]:
# Process the sentences 'Hello, world. Here are two sentences.' using spaCy.
# The resulting `doc` is the object the following cells inspect.

doc = nlp('Hello, world. Here are two sentences.')

## Get tokens and sentences

In [None]:
# Get the first token of the processed document.
token = doc[0]
# A single call prints the token followed by the blank separator line.
print(token, end='\n\n')

# Print the sentences, one per line.
for sentence in doc.sents:
    print(sentence)


## Part of speech tags

In [None]:
# For each token, print the token next to its part-of-speech tag.
pos_line = '{} - {}'.format
for word in doc:
    print(pos_line(word, word.pos_))

## Visual part of speech tagging ([displaCy](https://displacy.spacy.io))

## Syntactic dependencies

In [None]:
# Write a function that walks up the syntactic tree from the given token and collects all tokens up to the root token (including the root token).
def tokens_to_root(token):
    '''Walk up the syntactic tree, collecting tokens to the root.

    Starting at ``token``, ``.head`` is followed repeatedly until a token
    that is its own head (the root) is reached.

    :param token: token to start from; must expose a ``head`` attribute and
        support equality comparison with other tokens.
    :return: list of tokens from ``token`` up to and including the root.
    '''
    tokens_to_r = [token]
    head = token.head
    # Compare by equality, not identity: `.head` is not guaranteed to return
    # the very same Python object for the root on every access, and an
    # identity test would then never terminate.
    while head != token:
        token = head
        tokens_to_r.append(token)
        head = token.head
    return tokens_to_r

# For every token in the document, print its path of tokens up to the root.
for word in doc:
    path = tokens_to_root(word)
    print('{} --> {}'.format(word, path))

print()
# Print the same paths again, each token annotated with its dependency label.
for word in doc:
    labelled = ('{}-{}'.format(node, node.dep_) for node in tokens_to_root(word))
    print('-> '.join(labelled))
        

## Named entities

In [None]:
# Print all named entities together with their named entity types.
# `doc_2` is a second sample document; the noun-chunk and word-probability
# cells below reuse it.

doc_2 = nlp("I went to Paris where I met my old friend Jack from uni.")
for ent in doc_2.ents:
    # One line per entity: the entity text, then its type label.
    print('{} - {}'.format(ent, ent.label_))

## Noun chunks

In [None]:
# Print the noun chunks of doc_2 as a single list.

print(list(doc_2.noun_chunks))

## Word probabilities

In [None]:
# For every token in doc_2, print the log-probability of the word, estimated
# from counts from a large corpus.

for token in doc_2:
    # Printed as "<token> , <prob>".
    print(token, ',', token.prob)

## Word embedding / Similarity

In [None]:
# For the given document, calculate the similarity between 'apples' and
# 'oranges', and between 'boots' and 'hippos'.
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
# Positions of the four words of interest within `doc`.
apples, oranges, boots, hippos = (doc[index] for index in (0, 2, 6, 8))
for left, right in ((apples, oranges), (boots, hippos)):
    print(left.similarity(right))

print()
# Print the similarity between each sentence and the word 'fruit'.
apples_sent, boots_sent = doc.sents
fruit = doc.vocab['fruit']
for sentence in (apples_sent, boots_sent):
    print(sentence.similarity(fruit))

In [None]:
# Matplotlib Jupyter HACK
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

# Real text analysis

## Load text file

In [None]:
def read_file(file_name, encoding='utf-8'):
    '''Return the whole content of a text file as one string.

    :param file_name: path of the file to read.
    :param encoding: text encoding used to decode the file. Defaults to UTF-8
        so the result does not depend on the platform's locale encoding.
    :return: the decoded file content.
    :raises OSError: if the file cannot be opened.
    :raises UnicodeDecodeError: if the content is not valid in ``encoding``.
    '''
    with open(file_name, 'r', encoding=encoding) as text_file:
        return text_file.read()

## Process full text

In [None]:
# Process `text` with the spaCy NLP parser.
# The whole book is read into one string and parsed in a single call; the
# analysis cells below all operate on the resulting `processed_text`.
text = read_file('data/pride_and_prejudice.txt')
processed_text = nlp(text)

In [None]:
# How many sentences are in the Pride & Prejudice book?
sentences = list(processed_text.sents)
sentence_count = len(sentences)
print(sentence_count)

# Print the sentences at indices 10 up to (but not including) 15, to make sure
# that the correct book has been parsed.
print(sentences[10:15])

## Find all the personal names

In [None]:
# Extract all the personal names from Pride & Prejudice and count their occurrences.
# Expected output is a list in the following form: [('elizabeth', 622), ('darcy', 312), ('jane', 286), ('bennet', 266) ...].

from collections import Counter

# Count the lemma of every entity labelled PERSON, in document order.
actors = Counter(
    entity.lemma_
    for entity in processed_text.ents
    if entity.label_ == 'PERSON'
)

print(actors.most_common(10))

## Plot actors' personal names as a time series

In [None]:
# Plot actor mentions as a time series relative to the position of the actor's occurrence in the book.

from collections import defaultdict

# Maps each PERSON lemma to the list of start offsets of its mentions.
actors_occurences = defaultdict(list)
for entity in processed_text.ents:
    if entity.label_ != 'PERSON':
        continue
    actors_occurences[entity.lemma_].append(entity.start)

In [None]:
# NOTE(review): `hist` is imported here, but this cell calls `plt.hist`.
from matplotlib.pyplot import hist

# Number of histogram bins; the line-plot cell that follows reuses it to
# scale its x positions.
NUM_BINS = 10

def normalize_occurences(occurencies, total=None):
    '''Scale absolute positions to positions relative to a total length.

    :param occurencies: iterable of numeric offsets.
    :param total: length to divide by; when omitted, ``len(processed_text)``
        (the notebook-level parsed book) is used.
    :return: list of floats, each offset divided by ``total``.
    '''
    if total is None:
        total = len(processed_text)
    # Convert once, outside the loop, instead of once per element.
    total = float(total)
    return [o / total for o in occurencies]

# Relative mention positions for the three actors being compared.
tracked_actors = ('elizabeth', 'darcy', 'bingley')
elizabeth_occurences, darcy_occurences, bingly_occurences = [
    normalize_occurences(actors_occurences[name]) for name in tracked_actors
]

x = [elizabeth_occurences, darcy_occurences, bingly_occurences]

# Grouped histogram: one bar series per actor; `n` keeps the per-bin counts
# for the line plot in the next cell.
with plt.style.context('fivethirtyeight'):
    n, bins, patches = plt.hist(
        x,
        bins=NUM_BINS,
        histtype='bar',
        label=['Elizabeth', 'Darcy', 'Bingley'],
    )
    plt.legend(loc='upper right')


In [None]:
# Draw the per-bin counts from the histogram as one line per actor.
with plt.style.context('fivethirtyeight'):
    for counts in n:
        # x positions: the bin index divided by (NUM_BINS - 1).
        positions = [index / (NUM_BINS - 1) for index in range(len(counts))]
        plt.plot(positions, counts)

    plt.legend(['elizabeth', 'darcy', 'bingley'], loc='upper right')

## Spacy parse tree in action

In [None]:
# Find words (adjectives) that describe Mr Darcy.


def _darcy_subtree_tokens():
    '''Yield every token in the subtree of each entity whose lemma is 'darcy'.'''
    for entity in processed_text.ents:
        if entity.lemma_ == 'darcy':
            yield from entity.subtree


# Solution #1: keep the tokens tagged as adjectives.
darcy_adjectives = [
    word.lemma_ for word in _darcy_subtree_tokens() if word.pos_ == 'ADJ'
]
print(darcy_adjectives)

print()
# Solution #2: keep the tokens attached as adjectival modifiers.
# Definition of Adjectival modifier http://universaldependencies.org/en/dep/amod.html
darcy_modifiers = [
    word.lemma_ for word in _darcy_subtree_tokens() if word.dep_ == 'amod'
]
print(darcy_modifiers)


In [None]:
# Find actors that are 'talking', 'saying', 'doing' the most. Find the relationship between
# entities and corresponding root verbs.
from collections import defaultdict, Counter


def _person_counts_by_head_lemma(head_lemma):
    '''Count PERSON entities whose root token's head has the given lemma.'''
    return Counter(
        entity.lemma_
        for entity in processed_text.ents
        if entity.label_ == 'PERSON' and entity.root.head.lemma_ == head_lemma
    )


print(_person_counts_by_head_lemma('say').most_common())


print()
# Find all the actors that got married in the book

# A sentence from which this information could be extracted:
#
# her mother was talking to that one person (Lady Lucas) freely,
# openly, and of nothing else but her expectation that Jane would soon
# be married to Mr. Bingley.
#
print(_person_counts_by_head_lemma('marry').most_common())


## Extract Keywords

In [None]:
# Extract keywords using noun chunks from the news article (file 'article.txt').
# spaCy will pick some noun chunks that are not informative at all (e.g. we, what, who).
# Try to find a way to remove that kind of keyword.

article = read_file('data/article.txt')
doc = nlp(article)

keywords = Counter()
for chunk in doc.noun_chunks:
    lemma = chunk.lemma_
    # Keep only chunks whose vocabulary log-probability is below -8;
    # the value -8 is an arbitrarily selected threshold.
    if nlp.vocab[lemma].prob < -8:
        keywords[lemma] += 1

keywords.most_common(20)