In [None]:
#Visualising Text with SpaCy
#author: Dr Mahmoud El-Haj (with help from the Internet) as part of the "Data Visualisation Workshop for Critical Computational Discourse"

In [None]:
#We'll use spaCy, a Python package for analysing and annotating text.
#http://spacy.io

In [None]:
#installing spaCy
#https://spacy.io/usage
#!pip install -U spacy
#!pip install -U spacy-lookups-data
#!python -m spacy download en_core_web_sm
#!python -m spacy download de_core_news_sm

In [None]:
# spaCy tokenizer construction
import spacy
from spacy.tokenizer import Tokenizer

# Load the small English pipeline by its full package name. The bare "en"
# shortcut only resolves when a shortcut link has been created, and shortcut
# links were removed in spaCy v3, so spacy.load("en") raises an error there.
nlp = spacy.load("en_core_web_sm")

# Create a blank Tokenizer with just the English vocab.
# No prefix/suffix/infix rules are supplied, so it splits on whitespace only and
# punctuation stays attached to the neighbouring word (e.g. "University.").
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Sample sentence that every later cell tokenises, annotates and visualises.
sentence = (
    "Today is March 19th 2020 and Mahmoud is showing us "
    "how to visualising text at Lancaster University."
)

In [None]:
# Tokenise the sample sentence with the tokenizer built above,
# then show how many tokens were produced.
tokens = tokenizer(sentence)
print(len(tokens))

In [None]:
# What about stop words?
# spaCy's English stop-word list (for other languages see: https://spacy.io/usage/models)
from spacy.lang.en.stop_words import STOP_WORDS

# Print the whole collection so the default entries can be inspected.
print(STOP_WORDS)

In [None]:
# Collect the text of every token that is neither a stop word nor punctuation.
tokens_no_stopwords = [
    tok.text
    for tok in tokens
    if not (tok.is_stop or tok.is_punct)
]

In [None]:
# Print each sequence followed by its length: first all tokens, then the filtered ones.
for token_list in (tokens, tokens_no_stopwords):
    print(*token_list, len(token_list))

In [None]:
# What if we want to add/remove to/from the default stop-word list?
# Assume the word 'text' is so frequent in our corpus that it should be treated as a stop word.
# Add 'text' to the language's default stop-word list:
nlp.Defaults.stop_words.add("text")
# The is_stop flag is computed once, when a vocabulary entry (lexeme) is first
# created, and 'text' has already been seen while tokenising the sentence.
# Flag the existing entry directly so the change also applies to this pipeline.
nlp.vocab["text"].is_stop = True

In [None]:
# Print the list again; 'text' should now be an entry
# (this relies on nlp.Defaults.stop_words being the same collection as STOP_WORDS).
# To remove a word from the list use: nlp.Defaults.stop_words.remove("word_to_be_removed")
print(STOP_WORDS)

In [None]:
# Rebuild the pipeline and tokenizer so that freshly created vocabulary entries
# are checked against the updated stop-word list, then tokenise the sentence again.
# The model is loaded by its full package name: the bare "en" shortcut needs a
# shortcut link, and shortcut links were removed in spaCy v3.
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(sentence)
# Loop through the tokens and keep only those that are neither stop words nor punctuation.
tokens_no_stopwords = [token.text for token in tokens if not (token.is_stop or token.is_punct)]

In [None]:
# Show the remaining tokens followed by how many there are.
print(*tokens_no_stopwords, len(tokens_no_stopwords))

In [None]:
# Linguistic annotations: part-of-speech tags and syntactic dependencies.
# spacy.load returns the pipeline object; calling it on the sentence yields the annotated doc.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
# One line per token: its text, part-of-speech tag and dependency label.
for word in doc:
    print(word.text, word.pos_, word.dep_)

In [None]:
# Let's visualise the annotated sentence above

from spacy import displacy

# Same model load and parse as in the previous cell, repeated here before rendering.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
# style="dep" selects displaCy's dependency visualiser.
displacy.render(doc, style="dep")

In [None]:
# Can we make it look a bit cooler? (for more options https://spacy.io/api/top-level#displacy_options)
# Visualiser settings are collected in one dict and passed through the `options` argument.
options = {
    "compact": True,
    "bg": "#09a3d5",
    "color": "white",
    "font": "Source Sans Pro",
}
displacy.render(doc, style="dep", options=options)

In [None]:
# What about named entities?
# style="ent" switches displaCy to its named-entity visualiser for the same doc.

displacy.render(doc, style="ent")