In [1]:
import csv

import nltk
import pandas as pd
from cleantext import clean

import functions as funs

nltk.download('punkt')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 150)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emilo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#Reading in stopwords
stopwordsDF = pd.read_csv('stopwords.csv')
stopwordsSeries = stopwordsDF.squeeze()
stopwords = set(stopwordsSeries)

In [2]:
liar_train_data = pd.read_csv('data/LIAR/train.tsv', sep='\t', header=None)
liar_test_data = pd.read_csv('data/LIAR/test.tsv', sep='\t', header=None)
liar_valid_data = pd.read_csv('data/LIAR/valid.tsv', sep='\t', header=None)

In [3]:
train_statements = pd.Series(liar_train_data.iloc[:, 2])
test_statements = pd.Series(liar_test_data.iloc[:, 2])
valid_statements = pd.Series(liar_valid_data.iloc[:, 2])

train_labels = pd.Series(liar_train_data.iloc[:, 1])
test_labels = pd.Series(liar_test_data.iloc[:, 1])
valid_labels = pd.Series(liar_valid_data.iloc[:, 1])

all_statements = pd.concat([train_statements, test_statements, valid_statements], axis=0, ignore_index=True)
all_labels = pd.concat([train_labels, test_labels, valid_labels], axis=0, ignore_index=True)

labeled_statements = pd.concat([all_statements, all_labels], axis=1)
labeled_statements.columns = ['Statement', 'Label']

In [4]:
#Preprocess pipeline
#Extracting 'statement' column
statements = labeled_statements['Statement']

#Cleaning the 'statement' column
statements = statements.apply(lambda x : clean(x,
fix_unicode=False,             # fix various unicode errors
to_ascii=False,                # transliterate to closest ASCII representation
lower=True,                    # lowercase text
no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
no_urls=True,                  # replace all URLs with a special token
no_emails=True,                # replace all email addresses with a special token
no_phone_numbers=True,         # replace all phone numbers with a special token
no_numbers=True,               # replace all numbers with a special token
no_digits=True,                # replace all digits with a special token
no_currency_symbols=True,      # replace all currency symbols with a special token
no_punct=True,                 # remove punctuations
replace_with_punct="",         # instead of removing punctuations you may replace them
replace_with_url="_URL_",
replace_with_email="_EMAIL_",
replace_with_phone_number="_PHONE_",
replace_with_number="_NUMBER_",
replace_with_digit="0",
replace_with_currency_symbol="_CUR_",
lang="en"                    
))

#Final cleaning of the 'statement' column
statements = statements.apply(lambda x: funs.remove_dates_from_content(x))
statements = statements.apply(lambda x: funs.remove_bar_from_content(x))

#Tokenizing the 'content' column chunks
statements = statements.apply(lambda x: nltk.word_tokenize(x))

#Removing stopwords from the 'content' column chunks
statements = statements.apply(lambda x: funs.remove_stopwords(x, stopwords))
statements = statements.apply(lambda x: funs.remove_a_from_content(x))

#Stemming the 'content' column chunks
statements = statements.apply(lambda x:funs.list_stemmer(x))

print('Preprocessing finished')
    

Preprocessing finished


In [5]:
labeled_statements_preprocessed = pd.concat([statements, labeled_statements['Label']], axis=1)

In [6]:
# Uncomment the line below to save the preprocessed statements to a pickle file
# labeled_statements_preprocessed.to_pickle('labeled_liar_statements_preprocessed.pkl')