In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
from cleantext import clean
import nltk
from nltk.stem.snowball import SnowballStemmer

nltk.download('punkt')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 150)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
#Extra cleaning functions for dates and "|" symbol
def remove_dates_from_content(content):
    '''Function that attempts to substitute dates in a document for the token "_DATE_".
    If it fails to do so - for example if the content is not convertable to string, it 
    handles the typeerror exception and doesnt do anything with the content.'''
    date_pattern = re.compile(r"(([0-9]{4}-(0[0-9]|1[0-2])-([0-2][0-9]|[3[01])|[a-z]{,9} [0-9]{1,2}, [0-9]{2,4})|\b(\w+\s)(\d{2})(th)?,?(\s\d{4})\b)")
    try:
        content_without_dates = re.sub(date_pattern, "_DATE_", str(content))
    except TypeError:
        content_without_dates = content
    return content_without_dates 

def remove_bar_from_content(content):
    '''Function for removing every occurence of "|"'''
    content_without_bar = str(content).replace("|", "")
    return content_without_bar

def remove_a_from_content(content):
    '''Function for removing every occurence of "a"'''
    return [word for word in content if word != "a"]

#Reading in stopwords
stopwordsDF = pd.read_csv('stopwords.csv')
stopwordsSeries = stopwordsDF.squeeze()
stopwords = set(stopwordsSeries)

def remove_stopwords(list):
     '''Function that returns a list containing a document with the stopwords removed'''
     return [word for word in list if word not in stopwords]

#Initializing stemmer
stemmer = SnowballStemmer("english")
def list_stemmer (wordlist): #stemmer hvert ord i en liste
    '''Function that stems each word in the given input list and returns this'''
    stemmed_list = []
    for word in wordlist:
        stemmed_list.append(stemmer.stem(word))
    return stemmed_list

# def vocabulary_size(series):
#     '''Function that computes the size of the vocabulary of a corpus'''

#     #Initializing empty set to store unique words
#     unique_words_in_corpus = set()

#     for lst in series:
#         #Updates the set of unique words by union with all lists in the corpus
#         unique_words_in_corpus.update(lst)
    
#     #Computing length (sz of vocabulary)
#     vocab_sz = len(unique_words_in_corpus)

#     return vocab_sz

# def most_frequent_n_words(series, n):
#     '''Function that return the n most frequent words and their frequencies in a series
#     assuming the elements in the series are lists of strings'''
#     words = (word for sublist in series for word in sublist)
    
#     # Calculate word frequencies using Counter
#     word_freq = Counter(words)
    
#     # Select the top 10,000 most common words and frequencies
#     most_common_words = word_freq.most_common(n)

#     return most_common_words

#Jeg har taget foreskellen ml antal ord før processing og efter of så divideret med
# antal ord til at starte med for at få reduction raten
def reduction_rate(after,before):
     '''Computes the reduction rate of the size of the vocabulary
     and returns this rounded to 3 decimal points'''
     return round((before - after)/before, 3)

def word_frequency_plot(counter_dict, title):

    # Select the top 10,000 most common words and frequencies
    most_common_words = counter_dict.most_common(10000)
    words, frequencies = zip(*most_common_words)
    
    # Creating the plot
    plt.figure(figsize=(20, 10))
    plt.bar(range(len(frequencies)), frequencies, width=1.0)
    
    plt.title(title)
    plt.xlabel('Words Ranked by Frequency')
    plt.ylabel('Frequency')
    plt.yscale('log')  # Using logarithmic scale for better visibility of frequencies
    
    # Removing x-ticks
    plt.xticks([])
    
    plt.tight_layout()
    plt.show()

# def count_occurences_in_content(content, str):
#     count = sum(string == str for document in content for string in document)
#     return count

def update_frequency_counter(frequency_counter, content):
    for list in content:
        frequency_counter.update(list)
    return frequency_counter

In [7]:
# Applying data preprocessing pipeline to 995,000_rows.csv dataset split into chunks for better memory management

chunk_size = 124375
large_dataset_chunks = pd.read_csv('995,000_rows.csv', low_memory=False, chunksize=chunk_size)

#Initializing list to save the processed data
preprocessed_content_list = []
#Initializing dictionaries to keep track of words and their frequencies
#before and after both removing stopwords and stemming the corpus.
word_frequencies_tokenized = Counter()
word_frequencies_no_stopwords = Counter()
word_frequencies_stemmed = Counter()

for chunk in large_dataset_chunks:

    #Extracting 'content' column chunks and giving dtype string
    chunk_content = chunk['content'].astype(str)

    #Cleaning the 'content' column chunks
    chunk_content = chunk_content.apply(lambda x : clean(x,
    fix_unicode=False,             # fix various unicode errors
    to_ascii=False,                # transliterate to closest ASCII representation
    lower=True,                    # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=True,                 # remove punctuations
    replace_with_punct="",         # instead of removing punctuations you may replace them
    replace_with_url="_URL_",
    replace_with_email="_EMAIL_",
    replace_with_phone_number="_PHONE_",
    replace_with_number="_NUMBER_",
    replace_with_digit="0",
    replace_with_currency_symbol="_CUR_",
    lang="en"                    
))
    
    #Final cleaning of the 'content' column chunks
    chunk_content = chunk_content.apply(lambda x: remove_dates_from_content(x))
    chunk_content = chunk_content.apply(lambda x: remove_bar_from_content(x))
    print('Cleaning finished')

    #Tokenizing the 'content' column chunks
    chunk_content = chunk_content.apply(lambda x: nltk.word_tokenize(x))
    print('Tokenization finished')

    #Updating the "word_frequency_tokenized" dictionary
    update_frequency_counter(word_frequencies_tokenized, chunk_content)

    #Removing stopwords from the 'content' column chunks
    chunk_content = chunk_content.apply(lambda x: remove_stopwords(x))
    chunk_content = chunk_content.apply(lambda x: remove_a_from_content(x))

    #Updating the "word_frequency_no_stopwords" dictionary
    update_frequency_counter(word_frequencies_no_stopwords, chunk_content)

    #Stemming the 'content' column chunks
    chunk_content = chunk_content.apply(lambda x:list_stemmer(x))

    #Updating the "word_frequency_stemmed" dictionary
    update_frequency_counter(word_frequencies_stemmed, chunk_content)
    
    print('Stemming finished')
    preprocessed_content_list.extend(chunk_content.tolist())

print('Preprocessing finished')
    

Cleaning finished
Tokenization finished
Stemming finished
Cleaning finished
Tokenization finished
Stemming finished
Cleaning finished
Tokenization finished
Stemming finished
Cleaning finished
Tokenization finished
Stemming finished
Preprocessing finished


In [10]:
print(preprocessed_content_list[:2])

[['articl', 'googl', 'ali', 'alfoneh', 'assist', 'compil', 'polit', 'nuclear', 'issu', 'suprem', 'leader', 'tell', 'islam', 'student', 'associ', 'foreign', 'univers', 'conspiraci', 'machin', 'enemi', 'includ', 'scientif', 'apartheid', 'subject', 'nation', 'strengthen', 'uniti', 'peopl', 'head', 'iran', 'nuclear', 'energi', 'agenc', 'condit', 'implement', 'addit', 'protocol', 'reactor', 'come', 'onlin', 'militari', 'admir', 'habiballah', 'sayyari', 'chief', 'islam', 'republ', 'iran', 'navi', 'closur', 'hormuz', 'strait', 'consider', 'upcom', 'war', 'game', 'ad', 'iranianmad', 'submarin', 'leav', 'dock', 'southern', 'iran', 'seyyedyahya', 'rahim', 'safavi', 'irgc', 'head', 'current', 'advisor', 'suprem', 'leader', 'risk', 'attack', 'iran', 'minim', 'iran', 'defens', 'doctrin', 'entail', 'nuclear', 'weapon', 'societi', 'cultur', 'iranian', 'psychologist', 'davar', 'sheikhavandi', 'window', 'shop', 'bring', 'girl', 'boy', 'social', 'danger', 'environ', 'shop', 'mall', 'prelud', 'decad', 'i

In [11]:
preprocessed_content_series = pd.Series(preprocessed_content_list)

preprocessed_content_series.to_pickle('content_preprocessed')

In [None]:
large_dataset.to_pickle('995,000_rows_preprocessed.csv')

In [12]:
url_count = word_frequencies_tokenized['url']
date_count = word_frequencies_tokenized['date']
number_count = word_frequencies_tokenized['number']

print("Count of URL's in content: ", url_count)
print("Count of dates in content: ", date_count)
print("Count of numbers in content: ", number_count)

Count of URL's in content:  263039
Count of dates in content:  44720
Count of numbers in content:  6209052


In [13]:
#100 most common words in tokenized data
most_common_tokenized = word_frequencies_tokenized.most_common(100)

#100 most common words in data after removing stopwords
most_common_no_stopwords = word_frequencies_no_stopwords.most_common(100)

#100 most common words in stemmed data
most_common_stemmed = word_frequencies_stemmed.most_common(100)

print('100 most common words in tokenized data: ', most_common_tokenized)
print('100 most common words in data after removing stopwords: ', most_common_no_stopwords)
print('100 most common words in stemmed data: ', most_common_stemmed)

100 most common words in tokenized data:  [('the', 26149872), ('to', 12608807), ('of', 12462264), ('and', 11434235), ('a', 9847694), ('in', 8649507), ('number', 6209052), ('that', 5463746), ('is', 5063094), ('for', 4369621), ('on', 3487089), ('it', 2991866), ('with', 2827783), ('as', 2795151), ('was', 2443526), ('are', 2437801), ('by', 2329672), ('this', 2275935), ('not', 2188344), ('at', 2184696), ('be', 2165001), ('have', 2095758), ('i', 2047978), ('from', 2038249), ('he', 2023919), ('you', 1986005), ('an', 1717358), ('has', 1706949), ('but', 1638333), ('his', 1629409), ('they', 1588561), ('or', 1510196), ('we', 1483259), ('said', 1410689), ('its', 1364965), ('will', 1330845), ('who', 1326790), ('their', 1268986), ('more', 1156366), ('all', 1156080), ('about', 1105097), ('new', 1057199), ('one', 1055274), ('were', 1026236), ('which', 1021229), ('if', 995090), ('us', 976950), ('can', 967774), ('would', 958311), ('been', 924626), ('up', 923144), ('had', 915040), ('what', 896012), ('peo

In [1]:
#Plotting the frequency of the 10000 most frequent words - tokenized:
word_frequency_plot(word_frequencies_tokenized, 'Tokenized')
#Plotting the frequency of the 10000 most frequent words - no stopwords:
word_frequency_plot(word_frequencies_no_stopwords, 'No stopwords')
#Plotting the frequency of the 10000 most frequent words - stemmed:
word_frequency_plot(word_frequencies_stemmed, 'Stemmed')

NameError: name 'word_frequency_plot' is not defined