In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from cleantext import clean
import nltk
from nltk.stem.snowball import SnowballStemmer

# Download the Punkt tokenizer data that the tokenization step later in the
# notebook depends on.
nltk.download('punkt')

# Display settings: no limit on the number of rows shown, and at most
# 150 characters per cell.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_colwidth', 150),
):
    pd.set_option(option_name, option_value)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Extra cleaning functions for dates and "|" symbol
# Compiled once at definition time instead of on every call. Matches three shapes:
#   1. ISO dates:                      "2020-01-31"
#   2. "<letters> <d or dd>, <yy-yyyy>": "march 5, 2019"
#   3. "<word> <dd>[th][,] <yyyy>":    "march 05th 2019"
# The day alternative is `3[01]` (30 or 31). It used to be the character class
# `[3[01]`, which matched a single character and left the final digit of
# days 30/31 behind in the text.
_DATE_PATTERN = re.compile(
    r"(([0-9]{4}-(0[0-9]|1[0-2])-([0-2][0-9]|3[01])|[a-z]{,9} [0-9]{1,2}, [0-9]{2,4})|\b(\w+\s)(\d{2})(th)?,?(\s\d{4})\b)"
)


def remove_dates_from_content(content):
    '''Replace every date found in a document with the token "_DATE_".

    Args:
        content: The document. It is converted with ``str()`` before matching,
            so non-string values (e.g. ``None`` or NaN) are processed as their
            string representation.

    Returns:
        The string form of ``content`` with each matched date replaced by
        "_DATE_". If the conversion/substitution raises ``TypeError`` the
        original ``content`` is returned unchanged.
    '''
    try:
        content_without_dates = _DATE_PATTERN.sub("_DATE_", str(content))
    except TypeError:
        # Best effort: leave content that cannot be turned into text untouched.
        content_without_dates = content
    return content_without_dates

def remove_bar_from_content(content):
    '''Return the string form of ``content`` with every "|" character removed.'''
    text = str(content)
    # Splitting on the bar and gluing the pieces back together drops all bars.
    return "".join(text.split("|"))

def remove_a_from_content(content):
    '''Return a list with the items of ``content``, leaving out every item equal to "a".'''
    kept = []
    for word in content:
        if word != "a":
            kept.append(word)
    return kept

# Load the stopword list from disk and keep it as a set, which is what
# remove_stopwords tests membership against.
# NOTE(review): read_csv treats the first row as a header by default - confirm
# that stopwords.csv starts with a header line, otherwise the first stopword is lost.
stopwordsDF = pd.read_csv('stopwords.csv')
stopwordsSeries = stopwordsDF.squeeze()
stopwords = {stopword for stopword in stopwordsSeries}

def remove_stopwords(list):
    '''Return the words of a tokenized document that are not in the module-level ``stopwords`` set.'''
    kept_words = []
    for word in list:
        if word not in stopwords:
            kept_words.append(word)
    return kept_words

#Initializing stemmer
#A single module-level English Snowball stemmer; list_stemmer calls its .stem() method.
stemmer = SnowballStemmer("english")
def list_stemmer(wordlist):  # stems every word in a list
    '''Return a new list holding the stem of each word in ``wordlist``, in the same order.'''
    return [stemmer.stem(token) for token in wordlist]

def vocabulary_size(series):
    '''Return the number of distinct words in a corpus.

    ``series`` is iterated document by document and each document is iterated
    item by item; every distinct item counts once.
    '''
    distinct_words = {word for document in series for word in document}
    return len(distinct_words)

def most_frequent_n_words(series, n):
    '''Return the ``n`` most frequent words of a corpus together with their counts.

    ``series`` is expected to hold lists of strings (one list per document).
    The result is a list of ``(word, count)`` tuples ordered from most to
    least frequent.
    '''
    tally = Counter()

    # Count every word of every document
    for document in series:
        for word in document:
            tally[word] += 1

    # Keep only the n highest counts
    return tally.most_common(n)

# The reduction rate is the difference between the number of words before and
# after processing, divided by the number of words at the start.
def reduction_rate(after,before):
    '''Compute the relative reduction of the vocabulary size.

    Args:
        after: Vocabulary size after the processing step.
        before: Vocabulary size before the processing step.

    Returns:
        ``(before - after) / before`` rounded to 3 decimals, or ``0.0`` when
        ``before`` is 0 (an empty vocabulary cannot be reduced, and dividing
        by it would raise ZeroDivisionError).
    '''
    if before == 0:
        return 0.0
    return round((before - after) / before, 3)
# Idea: the removed words may be very frequent, so one could also look at the
# reduction in the total number of words.


# def word_frequency_plot(series, title):
#     '''function for plotting from the third most frequent word to the 1000 most frequent word
#     as well as their corresponding frequencies in a barplot.'''
#     plt.bar(*zip(*most_frequent(series, 0, 9999).items()))
#     plt.title(title)
#     plt.xlabel('words')
#     plt.ylabel('frequency')
#     plt.xticks([])
#     plt.ylim(0, 26149872)
#     plt.show()

# import matplotlib.pyplot as plt

from collections import Counter

def word_frequency_plot(series, title, n=10000):
    """
    Plots the frequencies of the n most frequent words in the series as a bar
    chart with a logarithmic y-axis and without x-ticks.
    Args:
    - series: A Pandas Series where each element is a list of words from a document.
    - title: Title for the plot.
    - n: How many of the most frequent words to include (default 10,000,
      which was the previously hard-coded value).
    """
    # Reuse the shared counting helper so the ranking logic lives in one place
    most_common_words = most_frequent_n_words(series, n)

    # Only the counts are plotted. A comprehension (rather than unpacking
    # zip(*pairs)) also works when the corpus is empty.
    frequencies = [count for _, count in most_common_words]

    # Creating the plot on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(20, 10))  # Large figure for better visibility
    ax.bar(range(len(frequencies)), frequencies, width=1.0)

    ax.set_title(title)
    ax.set_xlabel('Words Ranked by Frequency')
    ax.set_ylabel('Frequency')
    ax.set_yscale('log')  # Logarithmic scale for better visibility of frequencies

    # Removing x-ticks: one tick per word would be unreadable
    ax.set_xticks([])

    fig.tight_layout()  # Adjust layout to not cut off labels
    plt.show()



# Example usage (commented out):
# word_frequency_plot(content_series, "Top 10,000 Most Frequent Words")


def count_occurences_in_content(content, str):
    '''Count how many items across all documents in ``content`` are equal to ``str``.

    ``content`` is iterated document by document, and each document item by item.
    '''
    total = 0
    for document in content:
        for token in document:
            # Adding the comparison result adds 1 for a match and 0 otherwise
            total += (token == str)
    return total

In [3]:
#Applying data preprocessing pipeline to 995,000_rows.csv dataset split into chunks for better memory management

# chunk_size = 248750
# large_dataset_chunks = pd.read_csv('995,000_rows.csv', low_memory=False, chunksize=chunk_size)

# #Initializing list to save the processed data
# preprocessed_content_list = []
# vocabulary_size_with_stopwords = 0
# vocabulary_size_no_stopwords = 0

# for chunk in large_dataset_chunks:

#     #Dropping empty columns
#     chunk = chunk.drop(columns=['keywords', 'summary'])

#     #Extracting 'content' column chunks and giving dtype string
#     chunk_content_column = chunk['content'].astype(str)

#     #Cleaning the 'content' column chunks
#     chunk_content_column = chunk_content_column.apply(lambda x : clean(x,
#     fix_unicode=False,             # fix various unicode errors
#     to_ascii=False,                # transliterate to closest ASCII representation
#     lower=True,                    # lowercase text
#     no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
#     no_urls=True,                  # replace all URLs with a special token
#     no_emails=True,                # replace all email addresses with a special token
#     no_phone_numbers=True,         # replace all phone numbers with a special token
#     no_numbers=True,               # replace all numbers with a special token
#     no_digits=True,                # replace all digits with a special token
#     no_currency_symbols=True,      # replace all currency symbols with a special token
#     no_punct=True,                 # remove punctuations
#     replace_with_punct="",         # instead of removing punctuations you may replace them
#     replace_with_url="_URL_",
#     replace_with_email="_EMAIL_",
#     replace_with_phone_number="_PHONE_",
#     replace_with_number="_NUMBER_",
#     replace_with_digit="0",
#     replace_with_currency_symbol="_CUR_",
#     lang="en"                    
# ))
#     #Final cleaning of the 'content' column chunks
#     chunk_content_column = chunk_content_column.apply(lambda x: remove_dates_from_content(x))
#     chunk_content_column = chunk_content_column.apply(lambda x: remove_bar_from_content(x))
#     print('Cleaning finished')

#     #Tokenizing the 'content' column chunks
#     chunk_content_column = chunk_content_column.apply(lambda x: nltk.word_tokenize(x))
#     print('Tokenization finished')

#     #Counting vocabulary size before stopwords removal
#     vocabulary_size_with_stopwords += vocabulary_size(chunk_content_column)[0]

#     #Removing stopwords from the 'content' column chunks
#     chunk_content_column = chunk_content_column.apply(lambda x: remove_stopwords(x))
    
#     #Counting vocabulary size after stopwords removal
#     vocabulary_size_no_stopwords += vocabulary_size(chunk_content_column)[0]

#     #Stemming the 'content' column chunks
#     chunk_content_column = chunk_content_column.apply(lambda x:list_stemmer(x))
#     print('Stemming finished')
#     preprocessed_content_list.extend(chunk_content_column.tolist())

# print('Preprocessing finished')

# print("vocabulary size before removal of stopwords: ", vocabulary_size_with_stopwords)
# print("vocabulary size after removal of stopwords: ", vocabulary_size_no_stopwords)

# print("Reduction rate of vocabulary size after removing stopwords:", reduction_rate(vocabulary_size_no_stopwords,vocabulary_size_with_stopwords))
    

In [None]:
# Read only the 'content' column of the raw dataset, with a string dtype
content_column = pd.read_csv(
    '995,000_rows.csv',
    usecols=['content'],
    dtype={'content': str},
    low_memory=False,
)['content']

In [None]:
#Settings handed to clean() for every document in the 'content' column
clean_options = dict(
    fix_unicode=False,                     # do not fix unicode errors
    to_ascii=False,                        # do not transliterate to ASCII
    lower=True,                            # lowercase the text
    no_line_breaks=True,                   # strip line breaks completely
    no_urls=True,                          # URLs -> replace_with_url
    no_emails=True,                        # email addresses -> replace_with_email
    no_phone_numbers=True,                 # phone numbers -> replace_with_phone_number
    no_numbers=True,                       # numbers -> replace_with_number
    no_digits=True,                        # digits -> replace_with_digit
    no_currency_symbols=True,              # currency symbols -> replace_with_currency_symbol
    no_punct=True,                         # remove punctuation ...
    replace_with_punct="",                 # ... without putting anything in its place
    replace_with_url="_URL_",
    replace_with_email="_EMAIL_",
    replace_with_phone_number="_PHONE_",
    replace_with_number="_NUMBER_",
    replace_with_digit="0",
    replace_with_currency_symbol="_CUR_",
    lang="en",
)
content_column = content_column.apply(lambda document: clean(document, **clean_options))

#Final cleaning of the 'content' column: date placeholders first, then "|" removal
content_column = (
    content_column
    .apply(remove_dates_from_content)
    .apply(remove_bar_from_content)
)
print('Cleaning finished')

In [None]:
#Tokenizing the 'content' column: every document becomes a list of tokens
content_column = content_column.apply(nltk.word_tokenize)

In [None]:
#Counting placeholder tokens in the tokenized content
# NOTE(review): "url" and "number" assume that clean() (no_punct=True, lower=True)
# reduces its "_URL_" / "_NUMBER_" placeholders to these bare words - confirm.
print("Count of URL's in content: ", count_occurences_in_content(content_column, "url"))
# remove_dates_from_content runs after clean() and inserts the literal token
# "_DATE_", so that token (not the ordinary word "date") is what has to be counted.
# NOTE(review): clean() has already replaced numbers/digits before the date
# substitution runs, so this count may be close to zero - confirm the intended order.
print("Count of dates in content: ", count_occurences_in_content(content_column, "_DATE_"))
print("Count of numbers in content: ", count_occurences_in_content(content_column, "number"))

#Counting vocabulary size before stopwords removal
vocabulary_size_with_stopwords = vocabulary_size(content_column)
print("vocabulary size before removing stopwords: ", vocabulary_size_with_stopwords)

#100 most frequent words before removing stopwords
# (n is 100 to match the printed label and the later cells; it used to be 10000)
most_frequent_with_stopwords = most_frequent_n_words(content_column, 100)
print("100 most frequent words before removing stopwords", most_frequent_with_stopwords)



In [None]:
#Bar chart of the most frequent words while stopwords are still present
word_frequency_plot(series=content_column, title="With stopwords")

In [None]:
#Dropping the stopwords from every tokenized document in the 'content' column
content_column = content_column.apply(remove_stopwords)

In [None]:
#Counting vocabulary size after stopwords removal
vocabulary_size_no_stopwords = vocabulary_size(content_column)
print("vocabulary size after removing stopwords: ", vocabulary_size_no_stopwords)

#100 most frequent words after removing stopwords
most_frequent_no_stopwords = most_frequent_n_words(content_column, 100)
print("100 most frequent words after removing stopwords", most_frequent_no_stopwords)

#Vocabulary reduction after removing stopwords
print("Reduction rate of vocabulary size after removing stopwords:", reduction_rate(vocabulary_size_no_stopwords, vocabulary_size_with_stopwords))

#Plotting the frequency of the 10000 most frequent words after removing stopwords
word_frequency_plot(content_column, "No stopwords")


In [None]:
#Stemming every word of every document in the 'content' column
content_column = content_column.apply(list_stemmer)

In [None]:
#Counting vocabulary size after stemming
vocabulary_size_stemmed = vocabulary_size(content_column)
print("vocabulary size after stemming: ", vocabulary_size_stemmed)

#100 most frequent words after stemming
most_frequent_stemmed = most_frequent_n_words(content_column, 100)
print("most frequent stemmed", most_frequent_stemmed)

#Vocabulary reduction after stemming (relative to the stopword-free vocabulary)
print("Reduction rate of vocabulary size after stemming:", reduction_rate(vocabulary_size_stemmed, vocabulary_size_no_stopwords))

#Plotting the frequency of the 10000 most frequent words after stemming
word_frequency_plot(content_column, "Stemmed")

In [None]:
large_dataset = pd.read_csv("995,000_rows.csv", low_memory=False)

#Dropping empty columns
# drop(..., inplace=True) returns None, so assigning its result would replace
# the dataframe with None; use the returned copy instead.
large_dataset = large_dataset.drop(columns=['keywords', 'summary'])

#Altering the 'content' to contain the preprocessed content
large_dataset['content'] = content_column

#Saving preprocessed dataframe to new csv file
# to_pickle does not accept an `index` argument and would not produce a CSV;
# to_csv matches the file name and the later pd.read_csv of this file.
# NOTE(review): each 'content' cell is a list of tokens and is written as its
# string representation - it has to be parsed back into a list after reading.
large_dataset.to_csv('large_dataset_cleaned.csv', index=False)

In [None]:
# #None of the following code has been run yet!!

# large_dataset_cleaned = pd.read_csv('large_dataset_cleaned.csv')
# #Splitting data into train_test_val sets:

# #Dividing data into features and label (X and y)
# y = large_dataset_cleaned['type'] #target

# X = large_dataset_cleaned.drop(columns=['type']) #features
# data = pd.read_csv('large_dataset_cleaned.csv')


In [None]:
# #Split data into 80% training and 20% test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# #Split the newly created test data equally into validation and test data (10% each of the total dataset)
# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

