## Pre-Processing the Test Data

In [247]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK corpora used below. `quiet=True` silences the download log;
# the trailing semicolon stops the notebook from echoing the boolean that the
# last call returns.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True);

True

First we import the test data from the given .csv file.

In [248]:
# Choose which test file to pre-process:
#   switch_set == 0 -> labelled test set (columns: body, subreddit)
#   anything else   -> Kaggle test set   (columns: id, body)
switch_set = 0
if switch_set == 0:
    csv_path, column_names = '../Datasets/test_w_labels.csv', ['body', 'subreddit']
else:
    csv_path, column_names = '../Datasets/Kaggle/test.csv', ['id', 'body']

# The file is decoded as cp1252 and its columns are renamed to fixed names
df = pd.read_csv(csv_path, encoding='cp1252')
df.columns = column_names

# Lowercase the text so that later matching is case-insensitive
df['body'] = df['body'].str.lower()

Align things like apostrophes that have different representations in different encodings.

In [249]:
# Align encodings: map typographic quotes and dashes to plain ASCII and turn
# separators into spaces. The pairs are applied in order, and the order matters:
# long dashes first become '-', which the last pair then turns into a space, and
# '#x200b' must be replaced before the digit-stripping step below.
# NOTE(review): '#x200b' is presumably the residue of the HTML entity for a
# zero-width space - confirm against the raw data.
literal_replacements = [
    ('“', '"'),
    ('”', '"'),
    ('’', "'"),
    ('‘', "'"),
    ('—', '-'),
    ('–', '-'),
    ('\n', ' '),
    ('/', ' '),
    ('#x200b', ' '),
    ('-', ' '),
]
for old, new in literal_replacements:
    # regex=False: every pattern is meant literally, and the default value of
    # `regex` is not the same across pandas versions.
    df['body'] = df['body'].str.replace(old, new, regex=False)

# Remove basic punctuation and digits. The backslash is written as '\\' so the
# literal contains no invalid escape sequence; the apostrophe is deliberately
# not in this set and is therefore kept.
translator = str.maketrans('', '', '<>"°œ!()*+,.:;=?[\\]^_`{|}~1234567890')
df['body'] = df['body'].str.translate(translator)

# Replace accented characters with unaccented characters
translator = str.maketrans('àáâãäåçèéêëìíîïñòóôõöùúûüýÿ', 'aaaaaaceeeeiiiinooooouuuuyy')
df['body'] = df['body'].str.translate(translator)

Some preprocessing is helpful prior to tokenization. This includes lemmatization and removing stop-words (a, an, the) in both English and French, since we are dealing with cities like Toronto, Montreal, and Paris.

In [250]:
# Replace words with their lemmas
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_verbs(text):
    """Return a list with every token of `text` lemmatized as a verb (pos='v').

    `text` is an iterable of word strings, not a single string.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas


def lemmatize_nouns(text):
    """Return a list with every token of `text` lemmatized using the
    lemmatizer's default part of speech (no `pos` argument is passed).

    `text` is an iterable of word strings, not a single string.
    """
    return list(map(lemmatizer.lemmatize, text))

def _lemmatize_text(text):
    """Split `text` on whitespace, lemmatize the tokens as nouns and then as
    verbs, and re-join them with single spaces."""
    return ' '.join(lemmatize_verbs(lemmatize_nouns(text.split())))


def _drop_words(text, unwanted):
    """Return `text` without the whitespace-separated tokens found in `unwanted`."""
    return ' '.join(token for token in text.split() if token not in unwanted)


# Lemmatize every sample and turn it back into a single string
df['body'] = df['body'].apply(_lemmatize_text)

# Remove stopwords: one pass per language, English first and then French
for language in ('english', 'french'):
    stop_words = set(stopwords.words(language))
    df['body'] = df['body'].apply(_drop_words, args=(stop_words,))

We can transfer the knowledge of common words from the training set to further process the test set.

In [251]:
# Load vocab.txt into a list (one vocabulary word per line)
with open('vocab.txt', 'r') as file:
    vocab = file.read().splitlines()

print(len(vocab))

# Remove words not in vocab from df.
# Each sample is filtered token by token against a set, so that:
#   * every out-of-vocabulary word is dropped, not just the first one per sample,
#   * only whole tokens are removed (no substring hits inside longer words),
#   * a trailing out-of-vocabulary word is removed as well,
#   * the work is a single pass over the column instead of one full-column
#     replace per out-of-vocabulary word.
vocab_set = set(vocab)
df['body'] = df['body'].apply(
    lambda text: ' '.join(word for word in text.split() if word in vocab_set)
)

4050


The cleaned test set is saved for future use.

In [252]:
# Write the cleaned frame to the "_cleaned" counterpart of the file selected by
# `switch_set`; the row index is not written.
cleaned_path = (
    '../Datasets/test_w_labels_cleaned.csv'
    if switch_set == 0
    else '../Datasets/Kaggle/test_cleaned.csv'
)
df.to_csv(cleaned_path, index=False)