In [None]:
#Visualising Text with SpaCy
#author: Dr Mahmoud El-Haj (with help from the Internet) as part of the "Visualise My Corpus Tutorial", an event by Lancaster University's UCREL and DSG Seminars

In [None]:
#We'll use spaCy, a Python package with the libraries needed to analyse and annotate text.
#http://spacy.io

In [None]:
#installing spaCy
#https://spacy.io/usage
#!pip install -U spacy
#!pip install -U spacy-lookups-data
#!python -m spacy download en_core_web_sm
#!python -m spacy download de_core_news_sm

In [None]:
# Build a spaCy tokenizer.
# Troubleshooting: a ModuleNotFoundError on the imports means spaCy itself is not
# installed; if spacy.load() cannot find the model, download it first (both steps
# are listed in the installation cell above).
import spacy
from spacy.tokenizer import Tokenizer

# Load the English pipeline; swap in "de_core_news_sm" for German.
nlp = spacy.load("en_core_web_sm")

# Alternative: import the model package directly instead of going through spacy.load():
#     import en_core_web_sm
#     nlp = en_core_web_sm.load()

# Blank tokenizer that only carries the vocabulary of the loaded (English) pipeline
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Demo sentence: it contains runs of extra spaces and a comma, which the
# following cells tokenise first and then clean up.
sentence = (
    "Today is   March 18th 2021 and   Mahmoud, "
    "is showing us   how to visualise text online at Lancaster University."
)

In [None]:
# Tokenise the raw sentence as-is. The surplus spaces are still in there; collapsing
# them first with ' '.join(sentence.split()) would avoid the excess whitespace.
# Also notice that punctuation such as ، (the Arabic comma) is considered a token!
tokens = tokenizer(sentence)

print(f"Number of words:  {len(tokens)}")
print('\n>>>>>>>Tokens<<<<<<<:')
for token in tokens:
    print(token)

In [None]:
# Clean the sentence, then tokenise it again.
import re

sentence = ' '.join(sentence.split())  # collapse runs of whitespace into single spaces
sentence = re.sub(r'[^\w\s]', '', sentence)  # drop everything that is neither a word character nor whitespace (i.e. punctuation)

tokens = tokenizer(sentence)  # call the tokenizer again, this time over the cleaned sentence

print('Number of words: ', len(tokens))
print('\n>>>>>>>Tokens<<<<<<<:')
for t in tokens:
    print(t)

In [None]:
# What about stop-words?
# spaCy's English stop-word list (for other languages see: https://spacy.io/usage/models)
from spacy.lang.en.stop_words import STOP_WORDS

print(STOP_WORDS)

In [None]:
# Keep only the text of tokens that are neither stop-words nor punctuation.
# (Punctuation was already stripped by the regex earlier, so the is_punct check
# only acts as a safety net here.)
tokens_no_stopwords = [
    token.text
    for token in tokens
    if not (token.is_stop or token.is_punct)
]

In [None]:
# Compare the two lists: the 2nd output no longer contains the stop-words.
# In a top-to-bottom run the punctuation is already gone from both outputs (removed by the
# regex earlier), and 'text' is still listed because it is only added as a stop-word further down.
print('With stop-words\n',*tokens, '>>>>>>',len(tokens), 'words.')
print('Without stop-words\n',*tokens_no_stopwords, '>>>>>>', len(tokens_no_stopwords), 'words.')

In [None]:
# What if we want to add to / remove from the default stop-word list?
# Assume the word 'text' is so frequent in our corpus that it effectively becomes a stop-word.
# To add 'text' to the stop-word list:
nlp.Defaults.stop_words.add("text")

In [None]:
# Print the list again; notice 'text' is now an entry.
# To remove a word from the list use: nlp.Defaults.stop_words.remove("word_to_be_removed")
print(STOP_WORDS)

In [None]:
# Reload the pipeline and recreate the tokenizer: the previous tokenizer was built
# before the stop-word list was updated.
nlp = spacy.load("en_core_web_sm")
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(sentence)

# Loop through the tokens and keep only the non-stop-words and non-punctuation.
tokens_no_stopwords = [
    token.text
    for token in tokens
    if not (token.is_stop or token.is_punct)
]

In [None]:
# Print the remaining tokens followed by their count
# ('text' is expected to be filtered out now that it was added as a stop-word).
print(*tokens_no_stopwords, len(tokens_no_stopwords))

In [None]:
# Linguistic annotations: part-of-speech tags and dependency labels
# (Universal Dependencies, https://universaldependencies.org).
# Calling nlp() on a string runs the pipeline and returns the annotated document,
# whose tokens carry the attributes printed below.
doc = nlp(sentence)
for tok in doc:
    print(f"{tok.text} {tok.pos_} {tok.dep_}")

# The bare string below is the cell's last expression, so the notebook echoes it as a short legend.
'''nsubj: nominal subject.\t nummod: numeric modifier ...etc. For more visit: https://universaldependencies.org'''

In [None]:
# Let's visualise the annotated sentence above

from spacy import displacy

#nlp = spacy.load("en_core_web_sm") #uncomment if not loaded previously
doc = nlp(sentence)# the cleaned sentence (only extra spaces and punctuation were removed)
displacy.render(doc, style="dep")

In [None]:
# Can we make it look a bit cooler? (more options: https://spacy.io/api/top-level#displacy_options)
options = {
    "compact": True,            # compact layout
    "bg": "#ebc334",            # background colour
    "color": "black",           # text colour
    "font": "Source Sans Pro",  # font name
}
displacy.render(doc, style="dep", options=options)

In [None]:
# Save the dependency plot as Scalable Vector Graphics (SVG) so you can view it in full screen.
from pathlib import Path

# jupyter=False makes render() return the markup as a string instead of displaying it inline
svg = displacy.render(doc, style="dep", options=options, jupyter=False)

output_path = Path("./models/dependency_plot.svg")
# Create ./models if it is missing; writing would otherwise fail with FileNotFoundError on a fresh checkout
output_path.parent.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file in one call (the bare open().write() left the handle unclosed)
output_path.write_text(svg, encoding="utf-8")

In [None]:
# What about named entities?
# One line per entity: its text, its character span in the sentence, and its label.
for entity in doc.ents:
    print(f"[ {entity.text} ] from {entity.start_char} to {entity.end_char} [ {entity.label_} ]")
    

In [None]:
# Can we visualise named entities? Well, of course! :-)
# style="ent" switches displaCy from the dependency view to the entity view
displacy.render(doc, style="ent")

In [None]:
# Save the named-entity plot as an HTML file so you can view it in full screen.
from pathlib import Path

# jupyter=False makes render() return the markup as a string; without it, inside a
# notebook the plot is displayed inline and there is no string to write to disk.
html = displacy.render(doc, style="ent", jupyter=False)

output_path = Path("./models/ner_plot.html")
# Create ./models if it is missing; writing would otherwise fail with FileNotFoundError on a fresh checkout
output_path.parent.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file in one call (the bare open().write() left the handle unclosed)
output_path.write_text(html, encoding="utf-8")