In [None]:
import pandas as pd
import numpy as np
import re
from cleantext import clean
import matplotlib.pyplot as plt
import nltk
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from sklearn.model_selection import train_test_split

# Fetch the NLTK 'punkt' resource (nltk.word_tokenize is called later in this notebook).
nltk.download('punkt')

# Display settings: no limit on the number of printed rows, and cell text
# truncated after 150 characters.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 150

In [None]:
#Applying data preprocessing pipeline to 995,000_rows.csv dataset.
#NOTE: the preprocessing cells are slow, so read through the whole pipeline
#and make sure everything is as intended before running it.
large_dataset = pd.read_csv('995,000_rows.csv', low_memory=False)

#Keep the article text as a Series, not a one-column DataFrame: the following
#cells call .apply() expecting one document per call, whereas DataFrame.apply
#hands the function an entire column at once.
#The nullable string dtype makes every element a string regardless of its
#original format (the column was previously dtype "object").
#Missing articles are replaced by "" so they are not stringified into a
#placeholder token by the cleaning step.
large_content_column = large_dataset['content'].astype(pd.StringDtype()).fillna("")
print(large_content_column.head(100))

In [None]:
# How many rows carry each value of the 'type' column.
type_counts = large_dataset['type'].value_counts()
print(type_counts)

In [None]:
#Preprocessing pipeline:
#Run cleantext.clean() on every document and store the result in
#'large_content_column_cleaned'.
#The author reports that this cell takes about 30 minutes on their machine.

#Options passed to clean() for each document
CLEAN_OPTIONS = dict(
    fix_unicode=False,             # fix various unicode errors
    to_ascii=False,                # transliterate to closest ASCII representation
    lower=True,                    # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=True,                 # remove punctuations
    replace_with_punct="",         # instead of removing punctuations you may replace them
    replace_with_url="_URL_",
    replace_with_email="_EMAIL_",
    replace_with_phone_number="_PHONE_",
    replace_with_number="_NUMBER_",
    replace_with_digit="0",
    replace_with_currency_symbol="_CUR_",
    lang="en",
)

#DataFrame.apply calls the function once per column (passing the whole column
#as a Series) instead of once per document, so select the 'content' column
#when the loading cell produced a one-column DataFrame.
if isinstance(large_content_column, pd.DataFrame):
    content_series = large_content_column['content']
else:
    content_series = large_content_column

large_content_column_cleaned = content_series.apply(
    lambda document: clean(document, **CLEAN_OPTIONS)
)

print(large_content_column_cleaned.head(10))


In [None]:
#Remove dates from corpus
#NOTE(review): the cleaning cell is configured with no_numbers/no_digits/no_punct,
#which presumably masks digits and strips '-' and ',' before this cell runs. That
#would explain why no _DATE_ tokens show up in the output - consider replacing
#dates before calling clean(). TODO confirm.

#Compiled once instead of on every call. Three alternatives, tried in this order:
#  1. ISO-style dates, e.g. 2020-01-31
#  2. "<word> 5, 2019" (up to nine lowercase letters, 1-2 digit day, 2-4 digit year)
#  3. "<word> 12th, 2018" / "<word> 12 2018" (two-digit day, optional "th" and comma)
_DATE_PATTERN = re.compile(
    r"(?:[0-9]{4}-(?:0[0-9]|1[0-2])-(?:[0-2][0-9]|3[01])"
    r"|[a-z]{,9} [0-9]{1,2}, [0-9]{2,4}"
    r"|\b\w+\s\d{2}(?:th)?,?\s\d{4}\b)"
)

def remove_dates_from_content(content):
    '''Substitute every date found in a document with the token "_DATE_".

    The content is converted with str() before matching, so non-string input
    (for example None or a number) is returned as its string form.
    If the conversion raises a TypeError, the content is returned untouched.

    content: the document to process.
    Returns the document text with all matched dates replaced by "_DATE_".
    '''
    try:
        text = str(content)
    except TypeError:
        return content
    #The day part used to be written "[3[01]" - a single character class that
    #matched only one character, so "2020-01-31" lost just "2020-01-3".
    return _DATE_PATTERN.sub("_DATE_", text)

# Run the date substitution over every cleaned document, then preview the first ten.
large_content_column_cleaned = large_content_column_cleaned.apply(remove_dates_from_content)
print(large_content_column_cleaned.iloc[:10])

In [None]:
#Standalone preview, so the slow cells above do not have to be re-run just to look at the data
print(large_content_column_cleaned.iloc[:10])

In [None]:
#Tokenize the corpus: each document becomes the output of nltk.word_tokenize
#(the author reports about 33 minutes of run time on their machine)
large_content_column_tokenized = large_content_column_cleaned.apply(nltk.word_tokenize)

In [None]:
# Preview the first ten tokenized documents.
print(large_content_column_tokenized.iloc[:10])

In [None]:
#Remove stopwords from corpus
#Author's note: stopwords.csv holds 571 stopwords; the NLTK list has fewer but could be used as well
stopwordsDF = pd.read_csv(filepath_or_buffer='stopwords.csv')
stopwordsSeries = stopwordsDF.squeeze()
#A set gives fast membership tests when filtering tokens
stopwords = {stopword for stopword in stopwordsSeries}

def remove_stopwords(list, stopword_set=None):
    '''Returns a list containing a document with the stopwords removed.

    list: the tokens of one document. The name shadows the built-in ``list``;
        it is kept so existing keyword callers keep working.
    stopword_set: optional collection of words to drop. When omitted, the
        notebook-level ``stopwords`` set is used, so existing one-argument
        calls behave as before.
    Returns a new list with the remaining tokens in their original order.
    '''
    if stopword_set is None:
        # Looked up at call time so the notebook-level set can be reloaded.
        stopword_set = stopwords
    return [word for word in list if word not in stopword_set]

# Filter the stopwords out of every tokenized document.
large_content_column_no_stopwords = large_content_column_tokenized.apply(remove_stopwords)

In [None]:
#Stemming the corpus
stemmer = SnowballStemmer("english")
def list_stemmer(wordlist):
    '''Return a new list holding the stem of every word in the input list.'''
    # Stems each word in the list, keeping the original order.
    return [stemmer.stem(token) for token in wordlist]

# Stem every token of every stopword-filtered document.
large_content_column_preprocessed = large_content_column_no_stopwords.apply(list_stemmer)

#IMPORTANT / TODO
#More cleaning may be needed before the data is saved to a new 'cleaned' csv file that is used from here on.
#For example, we should decide whether to drop some columns (if they are empty) and possibly also rows,
#if we decide to keep only the rows whose type is 'reliable' or 'fake'.

In [None]:
#Saving preprocessed corpus to dataframe
#Column assignment aligns on the index: a Series whose index differs from the
#frame's index silently turns into NaN values, so fail loudly instead.
assert large_content_column_preprocessed.index.equals(large_dataset.index), (
    "preprocessed corpus is not aligned with large_dataset; "
    "expected exactly one entry per row of the dataset"
)
large_dataset['content'] = large_content_column_preprocessed

#Saving preprocessed dataframe to new csv file
#(currently disabled; the following cell reads 'large_dataset_cleaned.csv', so
#this line has to be run at least once before that cell can work)
# large_dataset.to_csv('large_dataset_cleaned.csv', index=False)

In [None]:
#None of the code from here on has been run yet!

#Reload the preprocessed dataset from disk
large_dataset_cleaned = pd.read_csv(filepath_or_buffer='large_dataset_cleaned.csv')

#Preparing the train/test/validation split:
#separate the label column (y) from the remaining columns (X)
X = large_dataset_cleaned.drop(labels='type', axis='columns') #features
y = large_dataset_cleaned['type'] #target

In [None]:
#Set 20% of the data aside as a hold-out set; the remaining 80% is the training data
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#Halve the hold-out set into validation and test data (10% of the total dataset each)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)