## Dataset Preprocessing

Pre-process the training and test datasets once and save the cleaned results as CSV files, so the text-cleaning step does not have to be re-run every time.

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw training and test sets.
# header=None makes pandas treat the first row as data and label the columns 0, 1, ...
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'
df = pd.read_csv(train_path, header=None)
test_df = pd.read_csv(test_path, header=None)

print(f'Total rows, Total Columns: {df.shape}')
df.sample(5)  # Peek at a few random rows

In [None]:
# Same sanity check for the raw test set: shape first, then a few random rows
print(f'Total rows, Total Columns: {test_df.shape}')
test_df.sample(5)  # Peek at a few random rows

### Text-Cleaning Function

In [None]:
import string 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Build the English stopword collection once so every preprocessing call can reuse it.
# It is stored as a set because the helpers below only test membership
# (`token in stopword_set`).
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally - confirm.
stopword_set = set(stopwords.words('english'))
# print(stopword_set)

def lemmatize_text(text):
    """Lemmatize *text* and return the lemmas joined by single spaces.

    The text is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; each tag is converted with ``get_wordnet_pos``.
    Tokens for which ``get_wordnet_pos`` returns ``''`` are dropped from
    the output entirely (they are not kept in unlemmatized form).
    """
    # Tag with Penn Treebank POS tags
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in tagged_tokens:
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos == '':
            # No WordNet counterpart for this tag: leave the token out
            continue
        lemmas.append(lemmatizer.lemmatize(word=token, pos=wordnet_pos))
    return ' '.join(lemmas)

# Convert a Penn Treebank POS tag into the matching WordNet POS constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for *treebank_tag*, or ``''`` if none applies.

    Only the leading letter of the tag is inspected:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Every other tag (including the empty string) yields ``''``.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # Anything else has no mapping here
    return ''

def to_lower_case(text):
    """Return *text* converted to lower case via ``str.lower``."""
    lowered = text.lower()
    return lowered

def tokenise_text(text):
    """Return the list of tokens produced by ``nltk.word_tokenize`` for *text*."""
    return nltk.word_tokenize(text)

def remove_stopwords(text, stopword_set):
    """Return *text* without the whitespace-separated tokens found in *stopword_set*.

    Matching is exact (case-sensitive). The surviving tokens are re-joined
    with single spaces, so any run of whitespace in the input collapses to
    one space in the output.
    """
    kept_tokens = [token for token in text.split() if token not in stopword_set]
    return ' '.join(kept_tokens)

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call: remove_punctuation is mapped over
# every row of the datasets and the table never changes.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Remove all punctuations - Affects words such as U.S.A etc
# Removal of stop words has to be done prior to punctuation removal
def remove_punctuation(text):
    """Return *text* with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced, so ``"U.S.A"`` becomes ``"USA"``.
    Only the ASCII punctuation listed in ``string.punctuation`` is affected.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Prevent concatenation of statistics and names
def replace_hyphens(text):
    """Return *text* with each ``-`` replaced by a single space."""
    pieces = text.split('-')
    return ' '.join(pieces)

# Combine all processes into a single preprocess text function to call on df
# Default one used for training
def preprocess_text(text):
    """Run the full lower-casing clean-up pipeline on *text*.

    Steps, in order: replace hyphens with spaces, lower-case, remove
    stopwords, tokenise and re-join the tokens with single spaces, delete
    punctuation, then remove stopwords a second time.
    """
    cleaned = to_lower_case(replace_hyphens(text))
    # Stopwords are removed before punctuation is stripped
    cleaned = remove_stopwords(cleaned, stopword_set)
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    # Second pass over the tokenised, depunctuated text
    return remove_stopwords(cleaned, stopword_set)

# Testing dataset 1
def preprocess_text_keep_punctuation(text):
    """Lower-case *text* and remove stopwords, leaving hyphens and punctuation in place."""
    return remove_stopwords(to_lower_case(text), stopword_set)

def remove_stopwords_two(text, stopword_set):
    """Case-insensitive variant of stopword removal that preserves original casing.

    Each whitespace-separated token is lower-cased only for the membership
    test against *stopword_set*; tokens that survive are emitted exactly as
    they appeared in *text*, joined by single spaces.
    """
    kept_tokens = [
        token for token in text.split()
        if token.lower() not in stopword_set
    ]
    return ' '.join(kept_tokens)

# Testing dataset 2
def preprocess_text_capitalised(text):
    """Run the clean-up pipeline on *text* without lower-casing it.

    Steps, in order: replace hyphens with spaces, remove stopwords
    (matched case-insensitively by ``remove_stopwords_two``), tokenise and
    re-join the tokens with single spaces, delete punctuation, then remove
    stopwords a second time. Surviving tokens keep their original casing.
    """
    cleaned = replace_hyphens(text)
    # Stopwords are removed before punctuation is stripped
    cleaned = remove_stopwords_two(cleaned, stopword_set)
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    # Second pass over the tokenised, depunctuated text
    return remove_stopwords_two(cleaned, stopword_set)

In [None]:
# Clean the training text in place: column 1 is replaced by the output of
# preprocess_text_capitalised (the case-preserving pipeline) applied to each value.
print('Cleaning training text...')
df[1] = df[1].map(preprocess_text_capitalised)
print('Preprocessing done!')
df.sample(10) # Random sample values to see

In [None]:
# Clean the test text in place with the same pipeline used for the training
# set, so both saved files go through identical preprocessing.
print('Cleaning test text...')
test_df[1] = test_df[1].map(preprocess_text_capitalised)
print('Preprocessing done!')
test_df.sample(10) # Random sample values to see

In [None]:
# Persist the cleaned training set without an index column or header row,
# matching the header=None layout the files are read with.
df.to_csv('./raw_data/capitalized_fulltrain.csv', index=False, header=False)

In [None]:
# Persist the cleaned test set without an index column or header row,
# matching the header=None layout the files are read with.
test_df.to_csv('./raw_data/capitalized_balancedtest.csv', index=False, header=False)

In [None]:
# Reload the cleaned CSVs to verify that they were written correctly.
# NOTE(review): pandas reads empty fields back as NaN by default, so any
# document that was cleaned down to an empty string would appear as NaN
# here - confirm whether downstream code needs to handle that.
clean_train_path = './raw_data/capitalized_fulltrain.csv'
clean_test_path = './raw_data/capitalized_balancedtest.csv'
clean_df = pd.read_csv(clean_train_path, header=None)
clean_test_df = pd.read_csv(clean_test_path, header=None)

print(f'Total rows, Total Columns: {clean_df.shape}')
clean_df.sample(5)  # Peek at a few random rows

In [None]:
# Shape and a few random rows of the reloaded, cleaned test set
print(f'Total rows, Total Columns: {clean_test_df.shape}')
clean_test_df.sample(5)  # Peek at a few random rows

In [None]:
# Spot-check one cleaned test document: column 1 (the cleaned text) at row label 2649
print(clean_test_df[1][2649])