In [None]:
#This is part 5 of a 5-part tutorial. It shows how to plot word usage (verbs, in our case) over time.
#The code plots the usage of the most frequent verbs in the CCC talk titles (the top 10, see most_common(10) below), showing how their usage differs across the years.
#Our data set is a list of talks and abstracts from the CCC conference: https://gitlab.com/maxigas/cccongresstalks/

import pandas as pd

# Path of the combined CSV file, relative to the notebook's working directory.
combinedFile = "csvs/combined_csv.csv"

# Comma-separated file whose first row holds the column names.
df = pd.read_csv(combinedFile, delimiter=',', header=0)

# One row per entry, so the row count is reported as the number of titles.
print('Number of titles: {:,}\n'.format(len(df)))

# Display three randomly picked rows to get a feel for the columns.
df.sample(n=3)

In [None]:
#reading titles (title is a column in the csv file as shown in the sample above)

# Collect every distinct title exactly once, in order of first appearance.
# dict.fromkeys() de-duplicates while preserving insertion order, and iterating
# the column directly avoids label-based lookups such as df["title"][i], which
# only work while the frame's index happens to be 0..n-1.
# Talks without a title show up as NaN (str(NaN) == 'nan') and are skipped.
# NOTE: despite its name, title_dict is a list of titles; the name is kept
# because the following cell refers to it.
title_dict = [
    title
    for title in dict.fromkeys(df["title"])
    if str(title) != 'nan'
]

In [None]:
#Cleaning titles text: keep only letters, spaces, hyphens and apostrophes
import re

# One long string with all titles; str() guards against non-text titles.
title_text = " ".join(str(x) for x in title_dict)

# Turn angle brackets into spaces *before* stripping characters; done afterwards
# the replacement can never match because the brackets are already gone.
title_text = title_text.replace('>', ' ').replace('<', ' ')

# Remove everything except letters, space, apostrophe and hyphen. The hyphen is
# placed last in the character class so that it is a literal hyphen: written as
# " -'" it forms a range (space through apostrophe) that keeps !"#$%& and drops
# the hyphen itself.
title_text = re.sub(r"[^a-zA-Z '-]+", '', title_text)

# Collapse runs of spaces left behind by the removals above.
title_text = re.sub(' +', ' ', title_text)

# Hand-written fixes for a few artefacts seen in the data (kept unchanged).
title_text = title_text.replace('hackz', 'hack').replace('securityz', 'security').replace('attackz','attacks').replace('Anfngerz','')

In [None]:
#NLTK stop-word lists (English and German) used to filter tokens; the helpers
#below additionally reject any word with fewer than 5 letters.

import nltk

# Fetch the stop-word corpus (NLTK skips the download when it is up to date).
nltk.download('stopwords')

# Build both lookup sets from the same corpus reader; sets give fast membership tests.
stopword_corpus = nltk.corpus.stopwords
en_stop = set(stopword_corpus.words('english'))
de_stop = set(stopword_corpus.words('german'))

def check_stop_lists(token):
    """Return True when ``token`` is an English or German stop word.

    The check is an exact (case-sensitive) membership test against the
    module-level ``en_stop`` and ``de_stop`` sets, so callers should pass
    the token in the casing used by those lists.
    """
    return token in en_stop or token in de_stop
    
def check_length(token):
    """Return True when ``token`` is longer than four characters, else False."""
    return len(token) > 4

In [None]:
#find the most common words in the CCC talk titles
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")
# spaCy raises an error for texts longer than nlp.max_length (1,000,000
# characters by default). Only ever raise the limit, sized from the actual text;
# a hard-coded smaller number would lower the default and trigger the error
# sooner. Note that very long texts need more memory while being processed.
nlp.max_length = max(nlp.max_length, len(title_text) + 1)
doc = nlp(title_text)


def is_countable_token(token):
    """Return True for tokens that should be counted.

    A token qualifies when it is not punctuation, is longer than four
    characters (check_length) and is not an English or German stop word
    (check_stop_lists, tested on the lower-cased text).
    """
    return (
        not token.is_punct
        and check_length(token.text)
        and not check_stop_lists(token.text.lower())
    )


#keep verbs only. The text is lower-cased so that e.g. "Hacking" and "hacking"
#are counted as one word; the per-year counting in getFrequencies is
#case-insensitive as well, so separate entries would produce identical plots.
verbs = [token.text.lower() for token in doc if token.pos_ == "VERB" and is_countable_token(token)]

#you can repeat the above for nouns and most common words
#nouns = [token.text.lower() for token in doc if token.pos_ == "NOUN" and is_countable_token(token)]
#words = [token.text.lower() for token in doc if is_countable_token(token)]


word_freq = Counter(verbs)
# list of (verb, count) pairs, most frequent first
common_verbs = word_freq.most_common(10)
print(common_verbs)


In [None]:
#get a list of all years in the CCC csvs (to help loop through years)
years = df["year"].unique()

# Build the year labels as strings because they are used to form file names
# (csvs/<year>.csv) later on.
yearsList = []
for y in years:
    # A missing year would otherwise become the label 'nan' and later point at
    # a non-existent csvs/nan.csv file.
    if pd.isna(y):
        continue
    label = str(y)
    # Years may be read as floats (e.g. 1990.0) - pandas does this, for example,
    # when an integer column contains missing values. Strip only a *trailing*
    # ".0" so nothing else in the label is touched.
    if label.endswith('.0'):
        label = label[:-2]
    yearsList.append(label)

In [None]:
def _read_year_csv(path):
    """Read one '|'-separated per-year CSV, skipping malformed lines."""
    try:
        # pandas >= 1.3 spelling
        return pd.read_csv(path, delimiter='|', on_bad_lines='skip')
    except TypeError:
        # older pandas releases only know the (since removed) error_bad_lines flag
        return pd.read_csv(path, delimiter='|', error_bad_lines=False)


#returns frequency of a word over years (35 years in our case)
def getFrequencies(word,yearsList):
    """Count how often ``word`` occurs in the talk titles of each year.

    For every year the file ``csvs/<year>.csv`` ('|'-separated, with a
    ``title`` column) is read. Titles are lower-cased and split on
    whitespace; a token counts when it equals the lower-cased ``word``.

    Parameters
    ----------
    word : str
        Word to look for (matched case-insensitively).
    yearsList : iterable
        Year labels used to build the file names. The argument is not modified.

    Returns
    -------
    list of int
        One count per year, in ascending (sorted) year order; 0 for years in
        which the word does not occur.

    Raises
    ------
    FileNotFoundError
        If the CSV file of a year does not exist.
    KeyError
        If a CSV file has no ``title`` column.
    """
    target = word.lower()
    frequencies = []

    # sorted() works on a copy, so the caller's list keeps its order
    for year in sorted(yearsList):
        data = _read_year_csv('csvs/' + str(year) + '.csv')
        # talks without a title are ignored
        titles = data["title"].dropna().astype(str)
        # The count starts from zero for every year; carrying the value over
        # would report the previous year's count for years without the word.
        count = sum(
            token == target
            for title in titles
            for token in title.lower().split()
        )
        frequencies.append(count)

    return frequencies

In [None]:
#a method to plot the most common words (verbs) and their frequencies over time
import matplotlib.pyplot as plt

def plotFrequencies(word,yearsList,frequencies):
    """Plot the usage of ``word`` over the years, save it as a PDF and show it.

    Parameters
    ----------
    word : str
        Word being plotted; used in the figure title and in the file name
        ``plots/word_usage_<word>.pdf``.
    yearsList : iterable
        Year labels for the x axis. They are plotted in sorted order; the
        argument itself is not modified.
    frequencies : sequence
        One value per year, expected in ascending year order (the order in
        which getFrequencies returns them).
    """
    import os  # local import: only needed to make sure the output folder exists

    years = sorted(yearsList)

    # Pass the size when creating the figure: changing rcParams after the
    # figure exists would only affect later figures and alter global state.
    fig, ax = plt.subplots(figsize=(10, 2))
    fig.suptitle(word+' Usage Over Years', fontsize=10)
    ax.set_xlabel('Years', fontsize=8)
    ax.set_ylabel('Frequency', fontsize=8)

    ax.plot(years, frequencies)
    # One tick per year, with the labels rotated so they do not overlap.
    ax.set_xticks(years)
    ax.set_xticklabels(years, rotation='vertical')

    # Pad margins so that markers don't get clipped by the axes
    ax.margins(0, 0.20)
    # Tweak spacing to prevent clipping of tick-labels
    fig.subplots_adjust(bottom=0.30)

    # savefig does not create missing folders
    os.makedirs('plots', exist_ok=True)
    pltFileName = 'plots'+'/'+'word_usage'+'_'+word+'.pdf'
    fig.savefig(pltFileName)
    plt.show()
    # Release the figure; this function is called in a loop, one figure per word.
    plt.close(fig)



In [None]:
#create one usage-over-time plot for every (verb, count) pair in common_verbs
for verb, _count in common_verbs:
    print(verb)
    # per-year counts first, then the plot for this verb
    verb_frequencies = getFrequencies(verb, yearsList)
    plotFrequencies(verb, yearsList, verb_frequencies)