In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import re
from cleantext import clean
import nltk
from nltk.stem.snowball import SnowballStemmer

nltk.download('punkt')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 150)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#Functions for cleaning and further preprocessing

def remove_dates_from_content(content):
    '''Function that attempts to substitute dates in a document for the token "_DATE_".
    If it fails to do so - for example if the content is not convertable to string, it 
    handles the typeerror exception and doesnt do anything with the content.'''
    date_pattern = re.compile(r"(([0-9]{4}-(0[0-9]|1[0-2])-([0-2][0-9]|[3[01])|[a-z]{,9} [0-9]{1,2}, [0-9]{2,4})|\b(\w+\s)(\d{2})(th)?,?(\s\d{4})\b)")
    try:
        content_without_dates = re.sub(date_pattern, "_DATE_", str(content))
    except TypeError:
        content_without_dates = content
    return content_without_dates 

def remove_bar_from_content(content):
    '''Function for removing every occurence of "|"'''
    content_without_bar = str(content).replace("|", "")
    return content_without_bar

def remove_a_from_content(content):
    '''Function for removing every occurence of "a"'''
    return [word for word in content if word != "a"]

#Reading in stopwords
stopwordsDF = pd.read_csv('data/stopwords.csv')
stopwordsSeries = stopwordsDF.squeeze()
stopwords = set(stopwordsSeries)

def remove_stopwords(list):
     '''Function that returns a list containing a document with the stopwords removed'''
     return [word for word in list if word not in stopwords]

#Initializing stemmer
stemmer = SnowballStemmer("english")
def list_stemmer (wordlist): #stemmer hvert ord i en liste
    '''Function that stems each word in the given input list and returns this'''
    stemmed_list = []
    for word in wordlist:
        stemmed_list.append(stemmer.stem(word))
    return stemmed_list


def reduction_rate(after,before):
     '''Computes the reduction rate of the size of the vocabulary
     and returns this rounded to 3 decimal points'''
     return round((before - after)/before, 3)

def word_frequency_plot(counter_dict, title):
    '''Plots the frequency of the 10000 most common words given a counter object
    of words and their frequencies and a title'''

    # Select the top 10,000 most common words and frequencies
    most_common_words = counter_dict.most_common(10000)
    words, frequencies = zip(*most_common_words)
    
    # Creating the plot
    plt.figure(figsize=(20, 10))
    plt.bar(range(len(frequencies)), frequencies, width=1.0)
    
    plt.title(title)
    plt.xlabel('Words Ranked by Frequency')
    plt.ylabel('Frequency')
    # Using logarithmic scale for better visibility of frequencies
    plt.yscale('log')  
    
    # Removing x-ticks
    plt.xticks([])
    
    plt.tight_layout()
    plt.show()

def update_frequency_counter(frequency_counter, content):
    '''function that updates a counter object with the words from each document
    in a corpus. If theyre already in the counter the values are simply updated'''
    for list in content:
        frequency_counter.update(list)
    return frequency_counter

In [40]:
liar_train_data = pd.read_csv('data/LIAR/train.tsv', sep='\t', header=None)
liar_test_data = pd.read_csv('data/LIAR/test.tsv', sep='\t', header=None)
liar_valid_data = pd.read_csv('data/LIAR/valid.tsv', sep='\t', header=None)

In [66]:
train_statements = pd.Series(liar_train_data.iloc[:, 2])
test_statements = pd.Series(liar_test_data.iloc[:, 2])
valid_statements = pd.Series(liar_valid_data.iloc[:, 2])

train_labels = pd.Series(liar_train_data.iloc[:, 1])
test_labels = pd.Series(liar_test_data.iloc[:, 1])
valid_labels = pd.Series(liar_valid_data.iloc[:, 1])

all_statements = pd.concat([train_statements, test_statements, valid_statements], axis=0, ignore_index=True)
all_labels = pd.concat([train_labels, test_labels, valid_labels], axis=0, ignore_index=True)

labeled_statements = pd.concat([all_statements, all_labels], axis=1)
labeled_statements.columns = ['Statement', 'Label']
print(labeled_statements)

                                                                                                                                                   Statement  \
0                                                                         Says the Annies List political group supports third-trimester abortions on demand.   
1              When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.   
2                                                  Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."   
3                                                                             Health care reform legislation is likely to mandate free sex change surgeries.   
4                                                                                                     The economic turnaround started at the end of my term.   
5      The Chicago Bears have had more s

In [67]:
#Preprocess pipeline
#Extracting 'statement' column
statements = labeled_statements['Statement']

#Cleaning the 'statement' column
statements = statements.apply(lambda x : clean(x,
fix_unicode=False,             # fix various unicode errors
to_ascii=False,                # transliterate to closest ASCII representation
lower=True,                    # lowercase text
no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
no_urls=True,                  # replace all URLs with a special token
no_emails=True,                # replace all email addresses with a special token
no_phone_numbers=True,         # replace all phone numbers with a special token
no_numbers=True,               # replace all numbers with a special token
no_digits=True,                # replace all digits with a special token
no_currency_symbols=True,      # replace all currency symbols with a special token
no_punct=True,                 # remove punctuations
replace_with_punct="",         # instead of removing punctuations you may replace them
replace_with_url="_URL_",
replace_with_email="_EMAIL_",
replace_with_phone_number="_PHONE_",
replace_with_number="_NUMBER_",
replace_with_digit="0",
replace_with_currency_symbol="_CUR_",
lang="en"                    
))

#Final cleaning of the 'statement' column
statements = statements.apply(lambda x: remove_dates_from_content(x))
statements = statements.apply(lambda x: remove_bar_from_content(x))

#Tokenizing the 'content' column chunks
statements = statements.apply(lambda x: nltk.word_tokenize(x))

#Removing stopwords from the 'content' column chunks
statements = statements.apply(lambda x: remove_stopwords(x))
statements = statements.apply(lambda x: remove_a_from_content(x))

#Stemming the 'content' column chunks
statements = statements.apply(lambda x:list_stemmer(x))

print('Preprocessing finished')
    

Preprocessing finished


In [70]:
labeled_statements_preprocessed = pd.concat([statements, labeled_statements['Label']], axis=1)
print(labeled_statements_preprocessed)

                                                                                                                                                   Statement  \
0                                                                                           [anni, list, polit, group, support, thirdtrimest, abort, demand]   
1                                                                     [declin, coal, start, start, natur, gas, start, begin, presid, georg, bush, administr]   
2                                                                      [hillari, clinton, agre, john, mccain, vote, give, georg, bush, benefit, doubt, iran]   
3                                                                                          [health, care, reform, legisl, mandat, free, sex, chang, surgeri]   
4                                                                                                                     [econom, turnaround, start, end, term]   
5                                       

In [73]:
labeled_statements_preprocessed.to_pickle('labeled_liar_statements_preprocessed.pkl')