In [None]:
#Visualising Text with spaCy
#By Dr Mahmoud El-Haj, as part of the "Data Visualisation Workshop for Critical Computational Discourse"
#Get words from text (extract tokens)


In [None]:
#We'll use a package called spaCy
#https://spacy.io

In [None]:
#installing spaCy
#https://spacy.io/usage
!pip install -U spacy
!pip install -U spacy-lookups-data
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [1]:
# spaCy Tokenizer construction
import spacy
from spacy.tokenizer import Tokenizer

# Load the English pipeline by its full package name. The install cell
# downloads "en_core_web_sm", which does not create an "en" shortcut link, and
# shortcut links were removed in spaCy v3, so spacy.load("en") raises an
# OSError on a fresh environment.
nlp = spacy.load("en_core_web_sm")

# Create a blank Tokenizer with just the English vocab.
# No prefix/suffix/infix rules are passed, so trailing punctuation stays
# attached to the preceding word (e.g. "University." is a single token).
tokenizer = Tokenizer(nlp.vocab)

In [18]:
# Example sentence that the following cells tokenise, annotate and visualise.
# NOTE: the wording (including "how to visualising") is kept verbatim so the
# recorded outputs stay valid.
sentence = (
    "Today is March 19th and Mahmoud is showing us "
    "how to visualising text at Lancaster University."
)

In [3]:
# Run the tokenizer on the example sentence and report how many tokens it produced.
tokens = tokenizer(sentence)
print(len(tokens))

16


In [4]:
#what about stop-words?

In [5]:
# spaCy's built-in English stop-word set (for other languages see: https://spacy.io/usage/models)
from spacy.lang.en.stop_words import STOP_WORDS

# Print the whole set; being a set, its display order is arbitrary.
print(STOP_WORDS)

{'was', '‘d', 'under', 'how', 'they', 'us', 'amongst', 'mostly', 'namely', 'one', 'her', 'seem', 'other', 'someone', 'whatever', '’s', 'back', 'toward', 'have', 'nor', 'sixty', 'ever', 'whereas', 'least', 'already', 'there', 'first', 'the', 'will', 'whence', 'which', "'m", 'four', 'yourself', 'what', 'on', '‘ve', 'did', 'becoming', 'although', 'through', 'thence', 'hundred', '’ll', 'give', 'here', "'d", 'third', 'them', 'former', 'i', 'should', 'been', 'both', 'front', 'ten', 'unless', 'anything', 'him', 'than', 'anyone', '‘s', '’m', 'five', 'almost', 'all', 'as', 'further', 'his', 'myself', 'others', 'same', 'sometime', 'over', 'please', 'does', 'full', 'has', 'its', 'this', 'either', 'a', 'besides', '’re', 'that', 'herein', 'six', 'except', 'alone', 'is', 'used', 'not', 'often', 'seems', 'but', 'yet', 'nobody', 'after', 'name', 'somehow', 'cannot', 'neither', 'always', 'even', 'something', 'last', 'these', 'nowhere', 'onto', 'moreover', 'two', 'amount', 'few', 'anyhow', 'hers', 'once

In [6]:
# Keep the text of every token that is neither a stop word nor punctuation.
# Punctuation still attached to a word (e.g. "University.") is not a
# punctuation token on its own, so it passes through this filter unchanged.
tokens_no_stopwords = [
    tok.text
    for tok in tokens
    if not (tok.is_stop or tok.is_punct)
]

In [7]:
# Print each sequence space-separated, followed by its length:
# first all tokens, then the tokens left after filtering.
print(*tokens, len(tokens))
print(*tokens_no_stopwords, len(tokens_no_stopwords))

Today is March 19th and Mahmoud is showing us how to visualising text at Lancaster University. 16
Today March 19th Mahmoud showing visualising text Lancaster University. 9


In [8]:
#What if we want to add words to (or remove words from) the default stop-word list?
#Assume the word 'text' is so frequent in our corpus that it should be treated as a stop word.

In [9]:
# Add 'text' to the stop-word set exposed by the loaded pipeline's language defaults.
# NOTE(review): this mutates the set in place, so re-running the cell is harmless,
# but the change persists for the rest of the kernel session.
nlp.Defaults.stop_words.add("text")

In [10]:
# Print the stop-word set again after 'text' was added to nlp.Defaults.stop_words.
# NOTE(review): presumably STOP_WORDS is the same set object as
# nlp.Defaults.stop_words - confirm. The set is large and unordered, so
# `"text" in STOP_WORDS` would be an easier check.
print(STOP_WORDS)

{'was', '‘d', 'under', 'how', 'they', 'us', 'amongst', 'mostly', 'namely', 'one', 'her', 'seem', 'other', 'someone', 'whatever', '’s', 'back', 'toward', 'have', 'nor', 'sixty', 'ever', 'whereas', 'least', 'already', 'there', 'first', 'the', 'will', 'whence', 'which', "'m", 'four', 'yourself', 'what', 'on', '‘ve', 'did', 'becoming', 'although', 'through', 'thence', 'hundred', '’ll', 'give', 'here', "'d", 'third', 'them', 'former', 'i', 'should', 'been', 'both', 'front', 'ten', 'unless', 'anything', 'him', 'than', 'anyone', '‘s', '’m', 'five', 'almost', 'all', 'as', 'further', 'his', 'myself', 'others', 'same', 'sometime', 'over', 'please', 'does', 'full', 'has', 'its', 'this', 'either', 'a', 'besides', '’re', 'that', 'herein', 'six', 'except', 'alone', 'is', 'used', 'not', 'often', 'seems', 'but', 'yet', 'nobody', 'after', 'name', 'somehow', 'cannot', 'neither', 'always', 'even', 'something', 'last', 'these', 'nowhere', 'onto', 'moreover', 'two', 'amount', 'few', 'anyhow', 'hers', 'once

In [11]:
# Rebuild the pipeline and the tokenizer after changing the stop-word list,
# then tokenise and filter the sentence again.
# The pipeline is loaded by its full package name: the "en" shortcut link is
# not created by `spacy download en_core_web_sm` and no longer exists in
# spaCy v3, so spacy.load("en") fails on a fresh environment.
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(sentence)
# Keep the text of tokens that are neither stop words nor punctuation.
tokens_no_stopwords = [
    token.text for token in tokens if not (token.is_stop or token.is_punct)
]

In [12]:
# Show the remaining tokens, space-separated, followed by how many there are.
print(*tokens_no_stopwords, len(tokens_no_stopwords))

Today March 19th Mahmoud showing visualising Lancaster University. 8


In [27]:
# Linguistic annotations (part-of-speech tags and dependencies)

# spacy.load returns a Language object containing all components and data needed to process text.
nlp = spacy.load("en_core_web_sm")
# Run the full pipeline on the sentence (not just the standalone tokenizer used earlier).
doc = nlp(sentence)
# For each token print its text, its part-of-speech tag and its dependency label.
for token in doc:
    print(token.text, token.pos_, token.dep_)

Today NOUN nsubj
is AUX ROOT
March PROPN compound
19th NOUN attr
and CCONJ cc
Mahmoud PROPN conj
is AUX aux
showing VERB conj
us PRON dobj
how ADV advmod
to ADP aux
visualising VERB xcomp
text NOUN dobj
at ADP prep
Lancaster PROPN compound
University PROPN pobj
. PUNCT punct


In [24]:
# Let's visualise the annotated sentence above

from spacy import displacy

# Same pipeline load and parse as in the annotation cell above, repeated here.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)
# style="dep" asks displaCy to draw the dependency parse of the doc.
displacy.render(doc, style="dep")

In [28]:
# Can we make it look a bit cooler? (for more options https://spacy.io/api/top-level#displacy_options)
# Visual settings handed to displaCy: compact layout, background colour,
# text colour and font.
options = dict(
    compact=True,
    bg="#09a3d5",
    color="white",
    font="Source Sans Pro",
)
displacy.render(doc, style="dep", options=options)

In [29]:
# What about named entities?

# style="ent" switches displaCy to the named-entity view of the same doc.
displacy.render(doc, style="ent")