In [None]:
#The First International Workshop on Arabic Big Data & AI (IWABigDAI) May 11 and May 12 2022
#https://sites.google.com/view/arabicbigdata/home

#Tutorial 2: Visualising with Word Clouds
#author: Dr Mahmoud El-Haj (with help from the Internet)
#GitHub repository: https://github.com/drelhaj/NLP_ML_Visualization_Tutorial

In [None]:
#We go a step forward by showing you how to create noun-clouds and verb-clouds using SpaCy.
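#Our data-set in this notebook is a CSV file of tweets (csvs/2023-Crete.csv, TWEETTEXT column); the tutorial was written around the list of talks and abstracts from the CCC conference https://gitlab.com/maxigas/cccongresstalks/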

import warnings
warnings.filterwarnings("ignore")

import random
import helpers
import matplotlib as mpl
import matplotlib.pyplot as plt
#import arabic_reshaper # this was missing in your code

from nltk.corpus import stopwords
from imageio import imread
from wordcloud import WordCloud, STOPWORDS

#Options that will appear on the word-cloud plots later on.
mpl.style.use('ggplot')

#text shown on the chart: title, footer and the size of the footer text
limit = 10000  # also passed to WordCloud as max_words
chartinfo = 'Author: Mo El-Haj'
title = 'Most frequent words'
footer = 'The {} most frequent words, excluding stopwords.\n{}'.format(limit, chartinfo)
infosize = 12

#font and colours
font = 'font/Ubuntu-B.ttf'#font needed to display arabic text
fontcolor = bgcolor = '#000000'  # text colour and word-cloud background are both black

#loading the English stop-word list from NLTK (only English is loaded here)

import nltk
# nltk.download('wordnet')#you may turn this one off if you've already downloaded the wordnet   
#download the stopword lists from NLTK. Can be turned off if already downloaded
nltk.download('stopwords')
#`stopwords` is the nltk.corpus.stopwords reader imported at the top of this cell
en_stop = set(stopwords.words('english'))

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Reading the tweets, which are stored as a CSV file.
#The file is read as comma-separated with the first row as header; check your own files first.
File2022_df = pd.read_csv("csvs/2023-Crete.csv", sep=',', header=0, encoding='utf8')
print('Number of titles: {:,}\n'.format(len(File2022_df)))
#show five random rows
File2022_df.sample(5)
                          

In [None]:
#Collect the tweet texts (TWEETTEXT is a column of the CSV file loaded above).
#
#Rows without text (NaN) are dropped and every distinct tweet is kept once, in order
#of first appearance. This is what the previous counting loop produced, without the
#throw-away counts, the row-by-row label lookups or the string-based NaN test.
#
#NOTE: despite its name, tweets_dict is a list of strings; the name is kept because
#later cells refer to it.
tweets_dict = (
    File2022_df["TWEETTEXT"]
    .dropna()            # some rows have no text
    .astype(str)         # guarantees " ".join(...) works later on
    .drop_duplicates()   # keep the first occurrence of each tweet
    .tolist()
)

#show a short preview instead of dumping the whole corpus into the output
print('Number of unique tweets: {:,}'.format(len(tweets_dict)))
tweets_dict[:5]

In [None]:
#a colour function producing random shades of green (hue 100) for the wordcloud
def grenshades_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL colour string with a random lightness for one word.

    The signature matches what ``WordCloud.recolor(color_func=...)`` calls;
    ``word``, ``font_size``, ``position`` and ``orientation`` are accepted but
    not used.

    Parameters
    ----------
    random_state : random.Random, int or None
        Source of randomness. A ``random.Random`` instance is used as-is, an
        int is used as a seed for a new generator, and ``None`` falls back to
        the global ``random`` module. Honouring this argument is what makes
        ``recolor(..., random_state=10)`` reproducible.

    Returns
    -------
    str
        A string of the form ``'hsl(100,100%, <n>%)'`` with ``n`` drawn
        uniformly from 0..300.
    """
    if random_state is None:
        rng = random
    elif isinstance(random_state, int):
        rng = random.Random(random_state)
    else:
        rng = random_state
    # NOTE(review): lightness is drawn from 0..300, which goes beyond the usual
    # 0-100% range of an HSL colour - confirm this is intended.
    return 'hsl(100,100%%, %d%%)' % rng.randint(0, 300)

In [None]:
#plotting the tweets' most frequent words as a wordcloud!

import re
import warnings

from PIL import Image

warnings.filterwarnings("ignore")

#The mask image will guide the word-cloud to take the shape of that image.
#It is decoded once here and the same array is both previewed and handed to WordCloud.
with Image.open(r'./img/1.jpg') as mask_file:# source: https://github.com/yassineMrabet/Word_Cloud_Arabic
    mask = np.array(mask_file)
plt.imshow(mask)
plt.axis("off")

#all tweets as one string, with runs of spaces collapsed into a single space
tweetsText = re.sub(' +', ' ', " ".join(tweets_dict))

#NOTE(review): Arabic text may need reshaping and bidi reordering before it renders
#correctly; the two lines below were left disabled in the original - confirm whether
#they are needed (they require the arabic_reshaper import and a get_display function).
#tweetsText = arabic_reshaper.reshape(tweetsText)
#tweetsText = get_display(tweetsText) # add this line

#English stop words are excluded through en_stop; other languages are not handled.
wordcloud = WordCloud(
    max_words=limit,
    stopwords=en_stop,
    mask=mask,
    background_color=bgcolor,
    font_path=font
).generate(tweetsText)

#set width and height (in inches) of a new figure for the word-cloud
fig = plt.figure(figsize=(14, 18))

#plot! random_state=10 is handed to the colour function by recolor
plt.imshow(wordcloud.recolor(color_func=grenshades_color, random_state=10))
plt.title(title, color=fontcolor, size=30, y=1.01)
plt.annotate(footer, xy=(0, -.025), xycoords='axes fraction', fontsize=infosize, color=fontcolor)
plt.axis('off')
plt.show()


In [None]:
#Same as above, but the word-cloud takes its colours from the mask image itself
#rather than from the colour function we chose earlier.
#English stop words are excluded through en_stop.

from PIL import Image
from wordcloud import ImageColorGenerator

#setting mask image and previewing it
mask = np.array(Image.open(r'./img/4.jpg'))
plt.imshow(mask)
plt.axis("off")

#--------------------------------------------------------------------------------------

#word-cloud settings for this cell: black background
cloud_settings = dict(
    max_words=limit,
    stopwords=en_stop,
    width=2000,
    height=1000,
    contour_color="black",
    relative_scaling=0,
    mask=mask,
    background_color="black",
    font_path=font,
)
wordcloud = WordCloud(**cloud_settings).generate(tweetsText)

#recolour every word from the pixels of the mask image, then draw on a large figure
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=(20, 15))
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
#--------------------------------------------------------------------------------------
#Image-coloured word-cloud again, this time on a white background.

from PIL import Image
from wordcloud import ImageColorGenerator

#setting mask image and previewing it
mask = np.array(Image.open(r'img/4.jpg'))
plt.imshow(mask)
plt.axis("off")

#--------------------------------------------------------------------------------------

#word-cloud settings for this cell: white background
cloud_settings = dict(
    max_words=limit,
    stopwords=en_stop,
    width=2000,
    height=1000,
    contour_color="black",
    relative_scaling=0,
    mask=mask,
    background_color="white",
    font_path=font,
)
#creating wordcloud
wordcloud = WordCloud(**cloud_settings).generate(tweetsText)

#recolour every word from the pixels of the mask image, then draw on a large figure
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=(20, 15))
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
plt.show()