In [None]:
#Tutorial 3: Visualising with Word Clouds
#author: Dr Mahmoud El-Haj (with help from the Internet) as part of the "Visualise My Corpus Tutorial", an event by Lancaster University's UCREL and DSG Seminars
#GitHub repository: https://github.com/drelhaj/NLP_ML_Visualization_Tutorial

In [None]:
#We go a step forward by showing you how to create noun-clouds and verb-clouds using SpaCy.
#Our data-set is a list of talks and abstracts from the CCC conference https://gitlab.com/maxigas/cccongresstalks/

import warnings
warnings.filterwarnings("ignore")

import random
import helpers
import matplotlib as mpl
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from imageio import imread
from wordcloud import WordCloud, STOPWORDS

#Appearance and caption settings used by the Word-Cloud plots later on.

#--- look and feel ---
mpl.style.use('ggplot')
font = 'font/Ubuntu-B.ttf'  #font file handed to WordCloud as font_path
fontcolor = '#fafafa'       #colour of the plot title and footer text
bgcolor = '#000000'         #background colour of the grey word-cloud
infosize = 12               #font size of the footer annotation

#--- captions ---
limit = 10000               #upper bound on the number of words drawn (max_words)
title = 'Most frequent words'
chartinfo = 'Author: Mahmoud El-Haj'
footer = 'The {} most frequent words, excluding English stopwords.\n{}'.format(limit, chartinfo)

#Build one combined set of English and German stop-words from NLTK.

import nltk

#Fetch the NLTK resources; both lines can be commented out once the data
#has already been downloaded on this machine.
nltk.download('wordnet')
nltk.download('stopwords')

en_stop = set(nltk.corpus.stopwords.words('english'))
de_stop = set(nltk.corpus.stopwords.words('german'))

#a word is treated as a stop-word if it appears in either language's list
both_stop_words = en_stop | de_stop

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Load the 2019 CCC talks from a CSV file.
#The file is pipe-separated rather than comma-separated, so the separator is
#given explicitly - check your own files first.
File2019_df = pd.read_csv("csvs/2019.csv", sep='|', header=0)

title_count = len(File2019_df)
print('Number of titles: {:,}\n'.format(title_count))

#show five randomly chosen rows as a quick look at the columns
File2019_df.sample(5)

In [None]:
#reading abstracts (abstract is a column in the csv file as shown in the sample above)

#Collect every distinct abstract once, in order of first appearance.
#Some talks have no abstract; pandas reads those as NaN, so they are dropped.
#NOTE: despite its name, abstract_dict is a plain list of abstracts - the name
#is kept because the plotting cell further down joins over it.
abstract_dict = (
    File2019_df["abstract"]
    .dropna()
    .drop_duplicates()
    .tolist()
)

In [None]:
#a method for a grey colour wordcloud
def grey_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random light-grey HSL colour string for one word.

    The signature follows the ``color_func`` callback used by
    ``WordCloud.recolor``; only ``random_state`` influences the result.

    Parameters
    ----------
    word, font_size, position, orientation :
        Accepted for callback compatibility; not used.
    random_state : random.Random or None
        Random generator to draw the lightness from. When given, the colour is
        reproducible for a seeded generator (e.g. ``recolor(random_state=3)``).
        When ``None``, the global ``random`` module is used.
    **kwargs :
        Any further keyword arguments passed by the caller; ignored.

    Returns
    -------
    str
        A string of the form ``'hsl(0, 0%, N%)'`` with ``N`` between 50 and
        100 inclusive.
    """
    #honour the caller's generator so that seeding the recolor call takes effect
    rng = random if random_state is None else random_state
    return 'hsl(0, 0%%, %d%%)' % rng.randint(50, 100)

In [None]:
#plotting the abstracts most frequent words as a wordcloud!

import warnings
warnings.filterwarnings("ignore")

import re

from PIL import Image

#The mask image will guide the word-cloud to take the shape of that image.
#In our case it's a silhouette of a hacker (goes along with the CCC conference)
#Forward slashes keep the path portable; a backslash only works on Windows.
mask = np.array(Image.open('img/1.jpg'))
plt.imshow(mask)
plt.axis("off")

#abstracts text: all abstracts joined into one long string
abstract_text = " ".join(x for x in abstract_dict)

#Turn angle brackets into spaces BEFORE stripping other characters, so the
#text on either side of leftover markup is not glued together.
abstract_text = abstract_text.replace('>', ' ').replace('<', ' ')
#Keep only ASCII letters, spaces, apostrophes and hyphens. The hyphen is
#escaped so it is a literal character and not a range operator inside [...].
#NOTE(review): non-ASCII letters (e.g. German umlauts) are removed as well.
abstract_text = re.sub(r"[^a-zA-Z '\-]+", '', abstract_text)
#collapse runs of spaces left behind by the steps above
abstract_text = re.sub(' +', ' ', abstract_text)

#English and German stop words are excluded through both_stop_words.
#The mask array loaded above is reused instead of reading the image again.
wordcloud = WordCloud(
    max_words=limit,
    stopwords=both_stop_words,
    mask=mask,
    background_color=bgcolor,
    font_path=font
).generate(abstract_text)

#set width and height
fig = plt.figure()
fig.set_figwidth(14)
fig.set_figheight(18)

#plot! (the words are recoloured in random shades of grey)
plt.imshow(wordcloud.recolor(color_func=grey_color, random_state=3))
plt.title(title, color=fontcolor, size=30, y=1.01)
plt.annotate(footer, xy=(0, -.025), xycoords='axes fraction', fontsize=infosize, color=fontcolor)
plt.axis('off')
plt.show()


In [None]:
#Same as above but the word-cloud follows (masks) the colours in the image itself rather than the grey colour we chose earlier
#English and German stop words are excluded through both_stop_words.

from PIL import Image
from wordcloud import ImageColorGenerator

#setting mask image
#Forward slashes keep the path portable; a backslash only works on Windows.
mask = np.array(Image.open('img/4.jpg'))
plt.imshow(mask)
plt.axis("off")

#--------------------------------------------------------------------------------------

#white background this time, with the cloud shaped by the mask image
#NOTE(review): per the wordcloud docs, width/height are ignored when a mask is
#given, and contour_color only shows when contour_width > 0 - confirm whether
#a contour was intended here.
wordcloud = WordCloud(
    max_words=limit,
    stopwords=both_stop_words,
    width=2000, height=1000,
    contour_color="black",
    relative_scaling = 0,
    mask=mask,
    background_color="white",
    font_path=font
).generate(abstract_text)

#creating wordcloud: each word is coloured from the mask image
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[20,15])
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#here we go a step further than plain word frequency:
#we work on a cleaner version of the text - no stop words, no punctuation,
#only tokens longer than 4 characters and only NOUNS (see other options below)

import spacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')
doc = nlp(abstract_text)

#noun-cloud input: filter the spaCy tokens down to the nouns we want to keep
nouns = [
    tok.text
    for tok in doc
    if not tok.is_stop
    and len(tok) > 4
    and not tok.is_punct
    and tok.pos_ == "NOUN"
]
#alternative selections (uncomment to use):
#words = [tok.text for tok in doc if not tok.is_stop and not tok.is_punct] #plain word frequency
#verbs = [tok.text for tok in doc if not tok.is_stop and not tok.is_punct and tok.pos_ == "VERB"] #verbs only (verb-cloud)

# 500 most common noun tokens (you can change the number)
noun_freq = Counter(nouns)
common_nouns = noun_freq.most_common(500)
print(common_nouns[:10])

#uncomment the examples below if you'd like other text rather than nouns

# 20 most common word tokens
#word_freq = Counter(words)
#common_words = word_freq.most_common(20)
#print (common_words)

# 20 most common verb tokens
#verb_freq = Counter(verbs)
#common_verbs = verb_freq.most_common(20)
#print (common_verbs)




In [None]:
#glue the most common nouns into one space-separated string for the word-cloud plotter
#(common_nouns holds (noun, count) pairs; only the noun is kept)
allNouns = (noun for noun, _count in common_nouns)
allNounsText = ' '.join(map(str, allNouns))

In [None]:
#--------------------------------------------------------------------------------------
#Noun-cloud: same image-coloured plot as before, but built from the most
#common nouns (allNounsText) instead of the full abstract text.

from PIL import Image
from wordcloud import ImageColorGenerator

#setting mask image
#Forward slashes keep the path portable; a backslash only works on Windows.
mask = np.array(Image.open('img/4.jpg'))
plt.imshow(mask)
plt.axis("off")

#--------------------------------------------------------------------------------------

#creating wordcloud
#The text passed in is allNounsText, the joined noun list built above; passing
#abstract_text here would simply redraw the previous word-cloud.
#NOTE(review): each noun occurs once in allNounsText, so its frequency is not
#carried over; WordCloud.generate_from_frequencies(dict(common_nouns)) would
#keep the counts if frequency-based sizing is wanted.
wordcloud = WordCloud(
    max_words=limit,
    stopwords=both_stop_words,
    width=2000, height=1000,
    contour_color="black",
    relative_scaling = 0,
    mask=mask,
    background_color="white",
    font_path=font
).generate(allNounsText)

#each word is coloured from the mask image
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[20,15])
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()