## Pre-Processing the Data

In [572]:
import pandas as pd
import nltk
import contractions
from nltk.corpus import stopwords
from googletrans import Translator

# Fetch the NLTK corpora used below. quiet=True silences the download log, and
# the trailing semicolon stops the notebook from echoing the return value of
# the last call (True/False) as the cell output.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True);

True

First we import the training data from the given .csv file.

In [573]:
# Load the training set, decoding the file as Windows-1252 (cp1252).
# Nothing is filtered out at this point; the encoding only controls how the
# bytes of the file are turned into characters.
df = pd.read_csv('../Datasets/train.csv', encoding='cp1252')
# Rename the two columns to the names used throughout the notebook
df.columns = ['body', 'subreddit']

# Convert all characters to lowercase
df['body'] = df['body'].str.lower()

Before pre-processing the data, it can be helpful to identify the characters we are dealing with in the text.

In [574]:
# Count how often every character occurs in the 'body' column of the dataset
def find_frequency(df):
    """Return a dict mapping each character found in df['body'] to its count.

    Keys appear in the order in which the characters are first encountered.
    """
    counts = {}
    for text in df['body']:
        for symbol in text:
            counts[symbol] = counts.get(symbol, 0) + 1
    return counts

# Turn a {character: count} dict into a dataframe sorted by descending count
def make_frequency_df(frequency):
    """Build a two-column dataframe ('character', 'frequency') from `frequency`.

    Rows are ordered from the most to the least frequent character and are
    re-numbered from 0.
    """
    counts = pd.DataFrame.from_dict(frequency, orient='index', columns=['frequency'])
    ranked = counts.sort_values(by=['frequency'], ascending=False)
    # The characters are the index at this point; expose them as a regular column
    ranked['character'] = ranked.index
    return ranked.reset_index(drop=True)[['character', 'frequency']]


# Tabulate the character counts of the lower-cased text, most frequent first
freq = find_frequency(df)
freq_df = make_frequency_df(freq)
print(freq_df)

   character  frequency
0                 59253
1          e      34050
2          t      24093
3          a      21378
4          o      20234
..       ...        ...
88         œ          2
89         °          2
90         ã          1
91         á          1
92         ë          1

[93 rows x 2 columns]


We see that the text contains a wide variety of characters. We will try to keep as many of them as possible, while unifying characters such as apostrophes, quotation marks and dashes that have several different representations.

In [575]:
# Align encodings: map typographic variants onto their plain equivalents and
# turn separators into spaces. The order matters: long dashes are first unified
# to '-', and only the last step turns every '-' into a space.
replacements = [
    ('“', '"'),
    ('”', '"'),
    ('’', "'"),
    ('‘', "'"),
    ('—', '-'),
    ('–', '-'),
    ('\n', ' '),
    ('/', ' '),
    ('#x200b', ' '),
    ('-', ' '),
]
for old, new in replacements:
    # regex=False: every pattern is a literal string, so the result does not
    # depend on the pandas version's default for `regex`
    df['body'] = df['body'].str.replace(old, new, regex=False)

# Remove basic punctuation, a few stray symbols and all digits.
# The backslash is listed once as '\\'; the original additionally wrote '\(',
# which is an invalid escape sequence and triggers a warning on recent Python
# versions. The set of deleted characters is unchanged.
# NOTE(review): 'œ' is deleted rather than expanded to 'oe' — confirm this is intended.
translator = str.maketrans('', '', '<>"°œ!()*+,.:;=?[\\]^_`{|}~1234567890')
df['body'] = df['body'].str.translate(translator)

# Replace accented characters with unaccented characters
# (lowercase only: the text was already lower-cased when it was loaded)
translator = str.maketrans('àáâãäåçèéêëìíîïñòóôõöùúûüýÿ', 'aaaaaaceeeeiiiinooooouuuuyy')
df['body'] = df['body'].str.translate(translator)

Now we can re-examine the frequency of each character to confirm that the clean-up worked.

In [576]:
# Recount the characters after the clean-up above to see which ones remain
freq_aligned = find_frequency(df)
freq_df_aligned = make_frequency_df(freq_aligned)
print(freq_df_aligned)

   character  frequency
0                 62727
1          e      35451
2          t      24093
3          a      21712
4          o      20262
5          i      19139
6          n      19085
7          s      18262
8          r      16288
9          l      12045
10         u      10417
11         h      10076
12         d       9608
13         c       8282
14         m       6890
15         p       6748
16         g       5190
17         y       5031
18         f       4831
19         w       4116
20         b       4001
21         v       3329
22         '       2085
23         k       2034
24         q        991
25         j        951
26         x        643
27         z        212
28         %        127
29         …         40
30         $         37
31         &         30
32         £         26
33                   16
34         #         15
35         «          9
36         »          9
37         €          8
38         @          7


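Some preprocessing is helpful prior to tokenization. This includes lemmatizing words and removing stop-words (e.g. a, an, the) in both English and French, since we are dealing with cities like Toronto, Montreal, and Paris. Expanding contracted words is also intended (the `contractions` package is imported for it), but the cell below does not apply that step yet.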

In [577]:
# Replace words with their lemmas
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_verbs(text):
    """Return the verb lemma of every word in the list `text`."""
    return [lemmatizer.lemmatize(word, pos='v') for word in text]

def lemmatize_nouns(text):
    """Return the lemma of every word in the list `text`, using the
    lemmatizer's default part of speech."""
    return [lemmatizer.lemmatize(word) for word in text]

# Stop-words of both languages present in the data
english_stop_words = set(stopwords.words('english'))
french_stop_words = set(stopwords.words('french'))

def remove_stop_words(words):
    """Return the list `words` without English and French stop-words."""
    return [
        word for word in words
        if word not in english_stop_words and word not in french_stop_words
    ]

def lemmatize_and_filter(text):
    """Lemmatize the whitespace-separated words of `text` and drop stop-words.

    Stop-words are removed BEFORE lemmatizing as well as after it. Filtering
    only afterwards lets mangled stop-words through, because the lemmatizer
    rewrites some of them into tokens that are no longer in the stop-word
    lists (e.g. 'was' -> 'wa', 'has' -> 'ha'). The second pass drops words
    whose lemma is itself a stop-word.
    """
    words = remove_stop_words(text.split())
    words = lemmatize_verbs(lemmatize_nouns(words))
    return ' '.join(remove_stop_words(words))

# NOTE(review): the `contractions` package is imported at the top of the
# notebook but is not applied here.
df['body'] = df['body'].apply(lemmatize_and_filter)

# Leave `stop_words` bound to the French set, as it was at the end of this cell before
stop_words = french_stop_words

A sample of the text after all pre-processing steps is shown below.

In [578]:
# Show the pre-processed text of the row whose index label is 32
print(df['body'][32])

another showcase shortsighted stupidity enable mode transportation get car street people can't live without car actually beneficial many people possible drive traffic hard understand really…


In [579]:
def get_highest_freq(subreddit, df):
    """Return (word, count) pairs for one subreddit, most frequent word first.

    Counts every occurrence of each whitespace-separated word in the 'body'
    column of the rows whose 'subreddit' equals `subreddit`. Words with equal
    counts keep the order in which they were first seen.
    """
    is_selected = df['subreddit'] == subreddit
    occurrences = {}

    for text in df[is_selected]['body']:
        for token in text.split():
            occurrences[token] = occurrences.get(token, 0) + 1

    # sorted() is stable, so ties stay in first-seen order
    return sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)

# Ranked (word, count) list for each of the four subreddits
subreddits = ['Toronto', 'Montreal', 'Paris', 'London']
subdict = dict.fromkeys(subreddits)

for subreddit in subdict:
    subdict[subreddit] = get_highest_freq(subreddit, df)

In [580]:
import warnings
# Silence every warning category from this point on.
# NOTE(review): this also hides any pandas warnings raised by the code below —
# consider narrowing the filter to the specific warning that is expected.
warnings.filterwarnings('ignore')

def remove_uncommon_words(subreddit, data=None, min_fraction=0.01):
    """Drop the words of one subreddit that occur in too few of its samples.

    Parameters
    ----------
    subreddit : str
        Value of the 'subreddit' column selecting the samples to process.
    data : pandas.DataFrame, optional
        Frame with 'body' and 'subreddit' columns. Defaults to the notebook's
        global `df`.
    min_fraction : float, optional
        A word is kept only if it appears in at least this fraction of the
        subreddit's samples (default 0.01, i.e. 1%).

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows whose 'body' only contains kept words.
        `data` itself is not modified.
    """
    if data is None:
        data = df

    # Work on a copy so the column assignment below never targets a slice of `data`
    subreddit_df = data[data['subreddit'] == subreddit].copy()

    # Build a vocabulary of words and how many samples they appear in.
    # set() makes a word count once per sample, however often it is repeated in it.
    vocab = {}
    for row in subreddit_df['body']:
        for word in set(row.split()):
            vocab[word] = vocab.get(word, 0) + 1

    # Keep the words that appear in at least `min_fraction` of the samples
    min_samples = min_fraction * len(subreddit_df)
    kept_words = {word for word, n_samples in vocab.items() if n_samples >= min_samples}

    # Remove all words that are not in the vocabulary
    subreddit_df['body'] = subreddit_df['body'].apply(
        lambda text: ' '.join(word for word in text.split() if word in kept_words)
    )

    return subreddit_df

# Filter each subreddit separately, then stack the results into one frame
toronto_df, montreal_df, paris_df, london_df = map(
    remove_uncommon_words, ('Toronto', 'Montreal', 'Paris', 'London')
)

new_df = pd.concat([toronto_df, montreal_df, paris_df, london_df])

# Build vocabulary of words in new_df: every distinct word, in first-seen order.
# dict.fromkeys de-duplicates in linear time while keeping that order.
vocab = list(dict.fromkeys(
    word for row in new_df['body'] for word in row.split()
))

# For each subreddit, list its words from most to least frequent
# (get_highest_freq counts occurrences; only the resulting order is used here)
new_subdict = {}
for subreddit in subreddits:
    new_subdict[subreddit] = [item[0] for item in get_highest_freq(subreddit, new_df)]

# Per-subreddit lookup word -> rank, so ranks are not found by scanning the lists
rank_lookup = {
    key: {word: rank for rank, word in enumerate(ranked_words)}
    for key, ranked_words in new_subdict.items()
}

# Get aggregate index of words in the vocabulary: the sum of a word's ranks
# over all subreddits. A word missing from a subreddit gets the rank just past
# the end of that subreddit's list.
agg_index = {}
for word in vocab:
    agg_index[word] = sum(
        rank_lookup[key].get(word, len(new_subdict[key])) for key in new_subdict
    )

# Sort the dictionary by value (lowest aggregate index = common everywhere)
agg_index = dict(sorted(agg_index.items(), key=lambda item: item[1], reverse=False))

# Remove the most common words from new_df.
# Whole tokens are filtered out. The previous substring replacement of
# `word + ' '` also cut the word out of longer words ending in it and left the
# word in place when it was the last token of a sample.
N_MOST_COMMON = 250
most_common_words = set(list(agg_index)[:N_MOST_COMMON])
new_df['body'] = new_df['body'].apply(
    lambda sample: ' '.join(
        token for token in sample.split() if token not in most_common_words
    )
)

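The cleaned training sets are saved for future use: one with all pre-processing applied, and one in which uncommon and very common words have additionally been removed.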

In [581]:
# Save the dataframes to csv files (index=False leaves the row index out of the files):
#  - df:     the pre-processed text
#  - new_df: the per-subreddit filtered text built in the previous cell
df.to_csv('../Datasets/train_cleaned.csv', index=False)
new_df.to_csv('../Datasets/train_cleaned2.csv', index=False)