## Pre-Processing the Test Data

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK corpora used later in this notebook: the stop-word lists and
# WordNet (needed by the lemmatizer). quiet=True silences the download log, and
# calling download() inside a loop (a statement rather than a trailing
# expression) keeps Jupyter from echoing its True/False return value.
for corpus in ('stopwords', 'wordnet'):
    nltk.download(corpus, quiet=True)

True

First we import the test data from the given .csv file.

In [2]:
# Read the test set, decoding the file as cp1252 (no characters are dropped at
# this point), and relabel its two columns as 'id' and 'body'.
# NOTE(review): characters such as 'ã', 'ï' and '€' in the frequency table below
# may indicate a UTF-8 file being decoded as cp1252 — confirm the source encoding.
df = pd.read_csv('../Datasets/test.csv', encoding='cp1252').set_axis(['id', 'body'], axis=1)

# Lowercase every body so that later character and word matching ignores case
df = df.assign(body=df['body'].str.lower())

Before pre-processing the data, it can be helpful to identify the characters we are dealing with in the text.

In [3]:
# Find the frequency of each appearance of a character in the dataset
def find_frequency(df):
    """Count how often each character occurs across the 'body' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'body' column of text. Cells that are not strings
        (e.g. NaN/None for a missing body) are skipped instead of raising.

    Returns
    -------
    dict
        Maps each character to its total number of occurrences, with keys in
        order of first appearance.
    """
    from collections import Counter

    counts = Counter()
    for text in df['body']:
        # Missing values are read by pandas as float NaN / None, which cannot
        # be iterated character by character.
        if isinstance(text, str):
            counts.update(text)
    return dict(counts)

# Make function to create pandas dataframe of frequency of each character
def make_frequency_df(frequency):
    """Build a ranked character-frequency table from a {character: count} dict.

    Parameters
    ----------
    frequency : dict
        Maps each character to the number of times it occurs.

    Returns
    -------
    pandas.DataFrame
        Columns 'character' and 'frequency', sorted by descending frequency,
        with a fresh 0..n-1 index.
    """
    return (
        pd.DataFrame.from_dict(frequency, orient='index', columns=['frequency'])
        .sort_values(by=['frequency'], ascending=False)
        # Name the index so that reset_index() emits it as the 'character'
        # column, placed ahead of 'frequency'.
        .rename_axis('character')
        .reset_index()
    )


# Count every character in the lowercased bodies, rank the counts from most to
# least frequent, and print the resulting table for inspection.
freq = find_frequency(df)
freq_df = make_frequency_df(freq)
print(freq_df)

   character  frequency
0                 22179
1          e      12796
2          t       8807
3          a       8030
4          o       7568
..       ...        ...
84         ï          3
85         ~          2
86         ü          2
87         @          1
88         ã          1

[89 rows x 2 columns]


We see that there is quite a wide variety of characters here. We will try to keep as many as possible, while normalizing characters such as apostrophes that have different representations in different encodings.

In [4]:
# Align encodings
# '#x200b' is a multi-character token, so it cannot go into a translate table.
# regex=False pins literal matching regardless of the pandas version's default.
df['body'] = df['body'].str.replace('#x200b', ' ', regex=False)

# Single-character normalizations applied in one pass: curly quotes become
# straight quotes, while dashes, hyphens, newlines and slashes become spaces.
alignment = str.maketrans({
    '“': '"',
    '”': '"',
    '’': "'",
    '‘': "'",
    '—': ' ',
    '–': ' ',
    '-': ' ',
    '\n': ' ',
    '/': ' ',
})
df['body'] = df['body'].str.translate(alignment)

# Remove basic punctuation and digits. The backslash is written as '\\' because
# '\(' is not a valid escape sequence; the set of removed characters is unchanged.
translator = str.maketrans('', '', '<>"°œ!\\()*+,.:;=?[]^_`{|}~1234567890')
df['body'] = df['body'].str.translate(translator)

# Replace accented characters with unaccented characters
translator = str.maketrans('àáâãäåçèéêëìíîïñòóôõöùúûüýÿ', 'aaaaaaceeeeiiiinooooouuuuyy')
df['body'] = df['body'].str.translate(translator)

Now we can re-examine the frequency of each character:

In [5]:
# Recount the characters in df['body'] as it stands now, rank them by
# descending frequency, and print the table so it can be compared with the
# earlier one.
freq_aligned = find_frequency(df)
freq_df_aligned = make_frequency_df(freq_aligned)
print(freq_df_aligned)

   character  frequency
0                 23538
1          e      13390
2          t       8807
3          a       8175
4          o       7588
5          i       7207
6          n       7192
7          s       7141
8          r       6050
9          l       4355
10         u       4052
11         h       3567
12         d       3558
13         c       3220
14         m       2621
15         p       2560
16         g       1851
17         f       1756
18         y       1742
19         b       1487
20         w       1410
21         v       1268
22         '        858
23         k        716
24         q        456
25         j        396
26         x        260
27         z         86
28         $         20
29                   20
30         %         17
31         …         11
32         &         11
33         «         10
34         »         10
35         £          6
36         €          6
37         #          3
38         @          1


Some preprocessing is helpful prior to tokenization. This includes lemmatization and removing stop-words (e.g. a, an, the) in both English and French, since we are dealing with cities like Toronto, Montreal, and Paris.

In [6]:
# Replace words with their lemmas
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_verbs(text):
    """Return a list with each word of `text` (an iterable of words) lemmatized as a verb."""
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas


def lemmatize_nouns(text):
    """Return a list with each word of `text` lemmatized using the lemmatizer's default part of speech."""
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


def _lemmatize_body(body):
    """Split a body on whitespace, lemmatize as nouns then as verbs, and rejoin with single spaces."""
    tokens = body.split()
    return ' '.join(lemmatize_verbs(lemmatize_nouns(tokens)))


# Noun pass first, then verb pass, then reconcatenate the words into a string
df['body'] = df['body'].apply(_lemmatize_body)

# Remove stopwords: one filtering pass per language, English first and then French.
# NOTE(review): this runs on text that has already been lemmatized and stripped of
# accents, so stop words altered by those steps will not match the NLTK lists —
# confirm this mirrors how the training set was processed.
for language in ('english', 'french'):
    stop_words = set(stopwords.words(language))
    df['body'] = df['body'].apply(
        lambda text: ' '.join(token for token in text.split() if token not in stop_words)
    )

We can transfer the knowledge of common words from the training set to further process the test set.

In [7]:
# Load vocab.txt into a list (one entry per line)
with open('vocab.txt', 'r') as file:
    vocab = file.read().splitlines()

# Load agg_index.txt into a list (one entry per line); it is not used in this cell
with open('agg_index.txt', 'r') as file:
    agg_index = file.read().splitlines()

# Remove words not in vocab from df.
# Each body is filtered token by token, so every out-of-vocabulary word is
# dropped: repeated words, the last word of a sample, and all such words in a
# sample rather than only the first. Matching whole tokens also prevents an
# unknown word from clipping the end of a longer, known word. A set makes each
# membership test constant time.
vocab_set = set(vocab)
df['body'] = df['body'].apply(
    lambda text: ' '.join(word for word in text.split() if word in vocab_set)
)

In [8]:
# Print the first rows of the cleaned frame (DataFrame.head() shows five by
# default) as a quick sanity check of the preprocessing.
print(df.head())

   id                                               body
0   0  even people uber address toronto people live p...
1   1  undoubtedly commuter electric transit rather p...
2   2  shopper use decent sale loblaws remember every...
3   3  yeah anti immigration talk lead something like...
4   4  talk female assistant help nygard lure woman b...


A cleaned test set is saved for future use

In [9]:
# Save the cleaned dataframe to a csv file. index=False leaves the row index
# out, so only the dataframe's own columns are written.
df.to_csv('../Datasets/test_cleaned.csv', index=False)