In [1]:
import pandas as pd
import numpy as np
import re
from cleantext import clean
import nltk
from nltk.stem.snowball import SnowballStemmer

# Fetch the Punkt tokenizer data that nltk.word_tokenize relies on further
# down. When the package is already present locally the call only reports that.
nltk.download('punkt')

# Notebook display settings: None lifts the row limit when a frame/series is
# printed (so prefer .head() on large objects), and cell text is truncated
# at 150 characters.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 150)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Extra cleaning functions for dates and "|" symbol

# Compiled once at definition time and reused by every call.
# The pattern has three alternatives:
#   1. ISO-style dates:              2020-01-31
#   2. "<letters> <d|dd>, <yy..yyyy>": january 5, 2020  (lowercase letters only)
#   3. "<word> <dd>[th][,] <yyyy>":  january 05th 2020
# The day part of alternative 1 is "[0-2][0-9]|3[01]". It used to be written as
# the character class "[3[01]", which matched a single character and therefore
# left the last digit of days 30 and 31 behind ("2020-01-31" -> "_DATE_1").
_DATE_PATTERN = re.compile(r"(([0-9]{4}-(0[0-9]|1[0-2])-([0-2][0-9]|3[01])|[a-z]{,9} [0-9]{1,2}, [0-9]{2,4})|\b(\w+\s)(\d{2})(th)?,?(\s\d{4})\b)")

def remove_dates_from_content(content):
    '''Replace every date found in ``content`` with the token "_DATE_".

    Parameters
    ----------
    content : object
        The document. It is converted with ``str()`` before matching, so
        non-string values (numbers, None, NaN) are accepted.

    Returns
    -------
    str
        The text with each date match replaced by "_DATE_". If the conversion
        or substitution raises ``TypeError``, ``content`` is returned unchanged.
    '''
    try:
        content_without_dates = _DATE_PATTERN.sub("_DATE_", str(content))
    except TypeError:
        # Best effort: leave the value untouched rather than abort the pipeline.
        content_without_dates = content
    return content_without_dates

def remove_bar_from_content(content):
    '''Return ``content`` as a string with every "|" character stripped out.'''
    text = str(content)
    # Splitting on the bar and re-joining the pieces drops every occurrence.
    return "".join(text.split("|"))

#Reading in stopwords
# stopwords.csv is loaded into a DataFrame, collapsed with squeeze(), and the
# result is turned into a set so remove_stopwords can do hash-based membership
# tests.
# NOTE(review): this presumably expects a single-column file so that squeeze()
# yields a Series of words - confirm against the actual stopwords.csv layout.
stopwordsDF = pd.read_csv('stopwords.csv')
stopwordsSeries = stopwordsDF.squeeze()
stopwords = set(stopwordsSeries)

def remove_stopwords(list):
    '''Filter the module-level ``stopwords`` out of a tokenized document.

    Takes an iterable of tokens and returns a new list holding, in the
    original order, every token that is not in ``stopwords``.
    '''
    # NOTE: the parameter name shadows the built-in ``list``; it is kept as is
    # so existing callers are unaffected.
    kept_tokens = []
    for token in list:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

#Initializing stemmer
stemmer = SnowballStemmer("english")

def list_stemmer(wordlist): #stems every word in a list
    '''Return a new list with the stem of each word in ``wordlist``, in order.'''
    return [stemmer.stem(token) for token in wordlist]

In [3]:
#Applying data preprocessing pipeline to 995,000_rows.csv dataset split into chunks for better memory management

chunk_size = 248750

# Only the 'content' column is processed in this cell, so only that column is
# parsed; the remaining columns are re-read when the cleaned file is written.
large_dataset_chunks = pd.read_csv('995,000_rows.csv', usecols=['content'], low_memory=False, chunksize=chunk_size)

# Options passed to cleantext.clean for every document.
CLEAN_OPTIONS = dict(
    fix_unicode=False,             # leave unicode as it is
    to_ascii=False,                # no transliteration to ASCII
    lower=True,                    # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=True,                 # remove punctuations
    replace_with_punct="",         # instead of removing punctuations you may replace them
    replace_with_url="_URL_",
    replace_with_email="_EMAIL_",
    replace_with_phone_number="_PHONE_",
    replace_with_number="_NUMBER_",
    replace_with_digit="0",
    replace_with_currency_symbol="_CUR_",
    lang="en",
)

def clean_content(text):
    '''Run cleantext.clean on one document with the notebook's CLEAN_OPTIONS.'''
    return clean(text, **CLEAN_OPTIONS)

#Initializing list to save the processed data (reset here so re-running the cell does not append twice)
preprocessed_content_list = []

for chunk in large_dataset_chunks:

    #Extracting 'content' column chunks and giving dtype string.
    #Missing values are filled first: astype(str) alone would turn NaN into the
    #literal word "nan", which would then survive as a token. With the fill,
    #a missing article ends up as an empty token list.
    chunk_content_column = chunk['content'].fillna('').astype(str)

    #Cleaning the 'content' column chunks
    # NOTE(review): remove_dates_from_content runs after clean() has already
    # replaced numbers/digits, so its digit-based pattern has little left to
    # match (the saved output contains runs such as 'sep', 'number', 'number').
    # Consider replacing dates before number replacement - confirm intended order.
    chunk_content_column_cleaned = (
        chunk_content_column
        .apply(clean_content)
        .apply(remove_dates_from_content)
        .apply(remove_bar_from_content)
    )
    print('Cleaning finished')

    #Tokenizing the 'content' column chunks
    chunk_content_column_tokenized = chunk_content_column_cleaned.apply(nltk.word_tokenize)
    print('Tokenization finished')

    #Removing stopwords from the 'content' column chunks
    chunk_content_column_no_stopwords = chunk_content_column_tokenized.apply(remove_stopwords)

    #Stemming the 'content' column chunks
    chunk_content_column_preprocessed = chunk_content_column_no_stopwords.apply(list_stemmer)
    print('Stemming finished')

    #One token list per row, appended in file order
    preprocessed_content_list.extend(chunk_content_column_preprocessed.tolist())

print('Preprocessing finished')
    

Cleaning finished
Tokenization finished
Stemming finished
Cleaning finished
Tokenization finished
Stemming finished
Cleaning finished
Tokenization finished
Stemming finished
Cleaning finished
Tokenization finished
Stemming finished
Preprocessing finished


In [4]:
#Converting preprocessed list to pandas series
preprocessed_content_series = pd.Series(preprocessed_content_list)

#Reading in the original dataset
# low_memory=False is used here as in the chunked read of the same file, so
# each column is type-inferred as a whole instead of per internal parser chunk.
large_dataset = pd.read_csv('995,000_rows.csv', low_memory=False)

# The assignment below aligns on the index. If the two lengths differed, rows
# would silently be dropped or filled with NaN, so fail loudly instead.
if len(preprocessed_content_series) != len(large_dataset):
    raise ValueError(
        f"Preprocessed content has {len(preprocessed_content_series)} rows "
        f"but the dataset has {len(large_dataset)} rows"
    )

#Altering the 'content' to contain the preprocessed content
large_dataset['content'] = preprocessed_content_series

#Saving preprocessed dataframe to new csv file
# Each token list is written as its string representation; it has to be parsed
# back (e.g. with ast.literal_eval) when real lists are needed after loading.
large_dataset.to_csv('large_dataset_cleaned.csv', index=False)

  large_dataset = pd.read_csv('995,000_rows.csv')


In [7]:
# #None of the following code has been run yet!!
# Draft: load the cleaned dataset and separate the 'type' label (y) from the
# remaining columns (X). Left commented out until it has been executed.

# large_dataset_cleaned = pd.read_csv('large_dataset_cleaned.csv')
# #Splitting data into train_test_val sets:

# #Dividing data into features and label (X and y)
# y = large_dataset_cleaned['type'] #target

# X = large_dataset_cleaned.drop(columns=['type']) #features

# Load the preprocessed dataset from large_dataset_cleaned.csv.
# NOTE(review): 'content' was saved as one token list per row, so after a CSV
# round trip it appears to come back as text such as "['articl', 'googl', ...]"
# rather than a list - parse it (e.g. ast.literal_eval) if lists are needed.
data = pd.read_csv('large_dataset_cleaned.csv')


  data = pd.read_csv('large_dataset_cleaned.csv')


['articl', 'googl', 'ali', 'alfoneh', 'assist', 'compil', 'polit', 'nuclear', 'issu', 'suprem', 'leader', 'tell', 'islam', 'student', 'associ', 'foreign', 'univers', 'conspiraci', 'machin', 'enemi', 'includ', 'scientif', 'apartheid', 'subject', 'nation', 'strengthen', 'uniti', 'peopl', 'head', 'iran', 'nuclear', 'energi', 'agenc', 'condit', 'implement', 'addit', 'protocol', 'reactor', 'come', 'onlin', 'militari', 'admir', 'habiballah', 'sayyari', 'chief', 'islam', 'republ', 'iran', 'navi', 'closur', 'hormuz', 'strait', 'consider', 'upcom', 'war', 'game', 'ad', 'iranianmad', 'submarin', 'leav', 'dock', 'southern', 'iran', 'seyyedyahya', 'rahim', 'safavi', 'irgc', 'head', 'current', 'advisor', 'suprem', 'leader', 'risk', 'attack', 'iran', 'minim', 'iran', 'defens', 'doctrin', 'entail', 'nuclear', 'weapon', 'societi', 'cultur', 'iranian', 'psychologist', 'davar', 'sheikhavandi', 'window', 'shop', 'bring', 'girl', 'boy', 'social', 'danger', 'environ', 'shop', 'mall', 'a', 'prelud', 'decad'

In [8]:
# Draft (not executed): 80/10/10 train/validation/test split of X and y.
# NOTE(review): no import of train_test_split is visible in this part of the
# notebook - add one before enabling the lines below.
# #Split data into 80% training and 20% test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# #Split the newly created test data equally into validation and test data (10% each of the total dataset)
# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Preview the first 20 rows of the preprocessed 'content' column.
print(data['content'].head(20))



0     ['articl', 'googl', 'ali', 'alfoneh', 'assist', 'compil', 'polit', 'nuclear', 'issu', 'suprem', 'leader', 'tell', 'islam', 'student', 'associ', 'f...
1     ['cost', 'senat', 'bank', 'committe', 'jp', 'morgan', 'buy', 'cur000000', 'bribe', 'news', 'hedg', 'hour', 'time', 'jami', 'dimon', 'sit', 'senat'...
2     ['man', 'awoken', 'numberyear', 'coma', 'commit', 'suicid', 'learn', 'donald', 'trump', 'lead', 'presidenti', 'race', 'fatal', 'error', 'call', 'u...
3     ['julia', 'geist', 'ask', 'draw', 'a', 'pictur', 'a', 'comput', 'scientist', 'year', 'numberyearold', 'sketch', 'a', 'businessman', 'wear', 'glass...
4     ['number', 'compil', 'studi', 'vaccin', 'danger', 'activist', 'post', 'sep', 'number', 'number', 'shortag', 'research', 'negat', 'effect', 'a', 'w...
5     ['spend', 'major', 'wake', 'hour', 'stare', 'content', 'a', 'comput', 'smartphon', 'will', 'ignor', 'ocular', 'havoc', 'blue', 'light', 'electron'...
6     ['disclaim', 'general', 'inform', 'a', 'law', 'topic', 'di