## Pre-Processing the Data

In [242]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK corpora used below. quiet=True silences the download log, and the
# trailing semicolon stops the notebook from echoing the last call's return value.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True);

True

First we import the training data from the given .csv file.

In [243]:
# Load the training set, name its two columns, and lower-case the comment text
df = (
    pd.read_csv('../Datasets/train.csv', encoding='cp1252')
    .set_axis(['body', 'subreddit'], axis=1)
    .assign(body=lambda frame: frame['body'].str.lower())
)

In [244]:
# Disabled: an earlier version split the data here, before any cleaning. The split
# is now done after pre-processing (see the train_test_split cell further down).
# Kept as real comments rather than a bare string literal, which the notebook
# evaluates as an expression and echoes back as cell output.
#
# from sklearn.model_selection import train_test_split
#
# # Split dataset into training and testing
# X_df = df['body']
# y_df = df['subreddit']
# y_df = y_df.map({'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3})
#
# X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, stratify=y_df, random_state=42)
#
# # Package X_test and y_test into a dataframe
# y_test = y_test.map({0: 'Toronto', 1: 'London', 2: 'Paris', 3: 'Montreal'})
# test_df = pd.concat([X_test, y_test], axis=1)
# test_df.to_csv('../Datasets/test_w_labels.csv', index=False)
#
# # Package X_train and y_train into a dataframe
# y_train = y_train.map({0: 'Toronto', 1: 'London', 2: 'Paris', 3: 'Montreal'})
# df = pd.concat([X_train, y_train], axis=1)
#
# # Convert all characters to lowercase
# df['body'] = df['body'].str.lower()

"from sklearn.model_selection import train_test_split\n\n# Split dataset into training and testing\nX_df = df['body']\ny_df = df['subreddit']\ny_df = y_df.map({'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3})\n\nX_train, X_test, y_train, y_test = train_test_split(X_df, y_df, stratify=y_df, random_state=42)\n\n# Package X_test and y_test into a dataframe\ny_test = y_test.map({0: 'Toronto', 1: 'London', 2: 'Paris', 3: 'Montreal'})\ntest_df = pd.concat([X_test, y_test], axis=1)\ntest_df.to_csv('../Datasets/test_w_labels.csv', index=False)\n\n# Package X_train and y_train into a dataframe\ny_train = y_train.map({0: 'Toronto', 1: 'London', 2: 'Paris', 3: 'Montreal'})\ndf = pd.concat([X_train, y_train], axis=1)\n\n# Convert all characters to lowercase\ndf['body'] = df['body'].str.lower()"

Before pre-processing the data, it can be helpful to identify the characters we are dealing with in the text.

In [245]:
# Find the frequency of each appearance of a character in the dataset
def find_frequency(df):
    """Count how often each character occurs in the ``body`` column of ``df``.

    Args:
        df: DataFrame with a ``body`` column holding text.

    Returns:
        dict mapping each character to its total number of occurrences,
        keyed in order of first appearance. Non-string cells are skipped.
    """
    frequency = {}
    # Iterate the column directly; iterrows() would build a Series per row for nothing
    for text in df['body']:
        # A missing body (e.g. NaN/None) is not a string and cannot be iterated
        if not isinstance(text, str):
            continue
        for character in text:
            frequency[character] = frequency.get(character, 0) + 1
    return frequency

# Turn a character -> count mapping into a ranked two-column table
def make_frequency_df(frequency):
    """Build a DataFrame of characters ordered from most to least frequent.

    Args:
        frequency: dict mapping a character to its count.

    Returns:
        DataFrame with columns ``character`` and ``frequency`` (in that order),
        sorted by ``frequency`` descending, with a fresh 0..n-1 index.
    """
    ranked = pd.DataFrame.from_dict(
        frequency, orient='index', columns=['frequency']
    ).sort_values(by=['frequency'], ascending=False)
    # The characters live in the index; copy them into a column, then drop the index
    ranked = ranked.assign(character=ranked.index).reset_index(drop=True)
    return ranked[['character', 'frequency']]


# Count the characters in the corpus and print them ranked from most to least frequent
freq = find_frequency(df)
freq_df = make_frequency_df(frequency=freq)
print(freq_df)

   character  frequency
0                 59253
1          e      34050
2          t      24093
3          a      21378
4          o      20234
..       ...        ...
88         œ          2
89         °          2
90         ã          1
91         á          1
92         ë          1

[93 rows x 2 columns]


We see a wide variety of characters here (93 distinct ones). We will try to keep as many as possible, while mapping characters such as apostrophes, quotation marks and dashes, which have several typographic variants, onto a single representation.

In [246]:
# Align encodings: map typographic quotes and dashes to ASCII and turn separators
# into spaces. The rules are applied in order, so the dashes normalised to '-'
# are then turned into spaces by the final rule.
replacements = [
    ('“', '"'),
    ('”', '"'),
    ('’', "'"),
    ('‘', "'"),
    ('—', '-'),
    ('–', '-'),
    ('\n', ' '),
    ('/', ' '),
    ('#x200b', ' '),
    ('-', ' '),
]
for old, new in replacements:
    # regex=False: these are literal substrings, independent of the pandas default
    df['body'] = df['body'].str.replace(old, new, regex=False)

# Remove basic punctuation and digits. The backslash is written as '\\';
# a lone backslash before '(' is an invalid escape sequence.
translator = str.maketrans('', '', '<>"°œ!\\()*+,.:;=?[]^_`{|}~1234567890')
df['body'] = df['body'].str.translate(translator)

# Replace accented characters with unaccented characters
translator = str.maketrans('àáâãäåçèéêëìíîïñòóôõöùúûüýÿ', 'aaaaaaceeeeiiiinooooouuuuyy')
df['body'] = df['body'].str.translate(translator)

Some preprocessing is helpful prior to tokenization. This includes lemmatization and removing stop words (e.g. a, an, the) in both English and French, since the subreddits cover both English- and French-speaking cities (Toronto, London, Paris, and Montreal).

In [247]:
# Replace words with their lemmas
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_verbs(text):
    """Lemmatize every word in ``text`` (a list of tokens) as a verb."""
    return [lemmatizer.lemmatize(word, pos='v') for word in text]

def lemmatize_nouns(text):
    """Lemmatize every word in ``text`` (a list of tokens) as a noun (the lemmatizer's default)."""
    return [lemmatizer.lemmatize(word) for word in text]

# English and French stop words in one set. The text has already had its accents
# stripped, so the accent-stripped spelling of every stop word is added as well;
# otherwise French entries such as 'été' could never match.
# `translator` is the accent table built in the previous cell.
stop_words = set(stopwords.words('english')) | set(stopwords.words('french'))
stop_words |= {word.translate(translator) for word in stop_words}

def clean_words(text):
    """Drop stop words from ``text``, lemmatize what is left, and re-join it.

    Args:
        text: a whitespace-separated string.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # Filter before lemmatizing: the noun pass rewrites some stop words
    # (e.g. 'was' -> 'wa', 'has' -> 'ha'), after which they no longer match
    words = [word for word in text.split() if word not in stop_words]
    words = lemmatize_verbs(lemmatize_nouns(words))
    # Filter again: a lemma can itself be a stop word
    return ' '.join(word for word in words if word not in stop_words)

df['body'] = df['body'].apply(clean_words)

In [248]:
from sklearn.model_selection import train_test_split

# Integer codes used while splitting, plus the reverse lookup to restore the names
label_to_code = {'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3}
code_to_label = {code: label for label, code in label_to_code.items()}

# Features are the comment text; targets are the integer-coded subreddit
X_df = df['body']
y_df = df['subreddit'].map(label_to_code)

# Stratified split with a fixed seed so each subreddit keeps its share in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y_df, stratify=y_df, random_state=42
)

# Turn the integer codes back into subreddit names
y_test = y_test.map(code_to_label)
y_train = y_train.map(code_to_label)

# Held-out rows are written to disk with their labels
test_df = pd.concat([X_test, y_test], axis=1)
test_df.to_csv('../Datasets/test_w_labels.csv', index=False)

# From here on `df` holds the training rows only
df = pd.concat([X_train, y_train], axis=1)

Additionally, we can do an in-depth analysis of the most common words shared between the subreddits and remove them from the dataset to make the classes easier to distinguish.

In [249]:
def get_highest_freq(subreddit, df):
    """Rank the words of one subreddit by how many times they occur.

    Args:
        subreddit: label used to select rows through the ``subreddit`` column.
        df: DataFrame with ``subreddit`` and ``body`` columns; ``body`` holds
            whitespace-separated text.

    Returns:
        List of ``(word, count)`` tuples, most frequent first. Words with equal
        counts stay in the order they were first seen.
    """
    bodies = df.loc[df['subreddit'] == subreddit, 'body']

    tally = {}
    for text in bodies:
        for token in text.split():
            tally[token] = tally.get(token, 0) + 1

    # Stable sort on the negated count, so ties keep their first-seen order
    return sorted(tally.items(), key=lambda pair: -pair[1])
    
def remove_uncommon_words(subreddit, data=None, min_fraction=0.01):
    """Strip rarely used words from the samples of one subreddit.

    A word is kept when it appears in at least ``min_fraction`` of that
    subreddit's samples; repeats inside one sample count once.

    Args:
        subreddit: label used to select rows through the ``subreddit`` column.
        data: DataFrame with ``subreddit`` and ``body`` columns. Defaults to
            the notebook-level ``df``.
        min_fraction: minimum share of the subreddit's samples a word must
            appear in to be kept.

    Returns:
        A new DataFrame holding only the selected subreddit's rows, with the
        uncommon words removed from ``body``. The input frame is not modified.
    """
    if data is None:
        data = df

    # Explicit copy: assigning into a filtered slice raises SettingWithCopyWarning
    subreddit_df = data[data['subreddit'] == subreddit].copy()

    # Build a vocabulary of words and how many samples they appear in
    vocab = {}
    for row in subreddit_df['body']:
        # set(): a word repeated within one sample still counts as one sample
        for word in set(row.split()):
            vocab[word] = vocab.get(word, 0) + 1

    # Keep the words that appear in enough samples
    min_samples = min_fraction * len(subreddit_df)
    frequent = {word for word, n_samples in vocab.items() if n_samples >= min_samples}

    # Remove all words that are not in the vocabulary
    subreddit_df['body'] = subreddit_df['body'].apply(
        lambda text: ' '.join(word for word in text.split() if word in frequent)
    )

    return subreddit_df

We first conduct an analysis to examine the most important words for each subreddit. Words that appear in fewer than 1% of a subreddit's samples are removed. This reduces the total vocabulary in the dataset by about half. Next, the 300 words that rank as most common across all subreddits are removed, as they muddy the waters when it comes to classification. By stripping these words, we make the classes more distinguishable from one another, as they have less in common.

In [250]:
import warnings
# NOTE: this silences every warning for the remainder of the session
warnings.filterwarnings('ignore')

subreddits = ['Toronto', 'London', 'Paris', 'Montreal']

# How many of the words ranked most common across all subreddits get stripped
N_COMMON_WORDS = 300

toronto_df = remove_uncommon_words('Toronto')
montreal_df = remove_uncommon_words('Montreal')
paris_df = remove_uncommon_words('Paris')
london_df = remove_uncommon_words('London')

new_df = pd.concat([toronto_df, london_df, paris_df, montreal_df])

# Build vocabulary of words in new_df: unique words, in first-seen order
vocab = list(dict.fromkeys(word for row in new_df['body'] for word in row.split()))

# For each subreddit, list its words from most to least frequent
subdict = {}
for subreddit in subreddits:
    subdict[subreddit] = [item[0] for item in get_highest_freq(subreddit, new_df)]

# Position of every word in each subreddit's ranking (0 = most frequent)
rank_in = {
    key: {word: rank for rank, word in enumerate(words)}
    for key, words in subdict.items()
}

# Get aggregate index of words in the vocabulary: the sum of a word's rank in
# every subreddit. A word absent from a subreddit is ranked after all of its words.
agg_index = {}
for word in vocab:
    agg_index[word] = sum(
        rank_in[key].get(word, len(subdict[key])) for key in subdict
    )

# Sort the dictionary by value (lowest aggregate rank = most common everywhere)
agg_index = dict(sorted(agg_index.items(), key=lambda item: item[1], reverse=False))

# Words that survive: in the new_df vocabulary, but not among the most common ones
most_common = set(list(agg_index)[:N_COMMON_WORDS])
kept_words = set(vocab) - most_common


def keep_known_words(text):
    """Return ``text`` with every word outside ``kept_words`` dropped.

    Works on whole whitespace-separated tokens, so a removed word is never
    cut out of the middle or end of a longer word, and the last word of a
    sample is handled like any other.
    """
    return ' '.join(word for word in text.split() if word in kept_words)


# Remove the most common words from new_df
new_df['body'] = new_df['body'].apply(keep_known_words)

# Remove words not in new_df vocab, and the most common words, from df
df['body'] = df['body'].apply(keep_known_words)

Analysis of each dataframe

In [251]:
# Rebuild each subreddit's word list from new_df, ordered from most to least frequent
subdict = {
    subreddit: [word for word, _count in get_highest_freq(subreddit, new_df)]
    for subreddit in subreddits
}

# The block below is disabled code kept inside a string literal; it is never executed
'''# Find words in subdict that appear in every subreddit
common_words = []
for word in subdict[subreddits[0]]:
    if word in subdict[subreddits[1]] and word in subdict[subreddits[2]] and word in subdict[subreddits[3]]:
        common_words.append(word)

# Get words that appear most frequently in Montreal and Toronto to reduce overlap
for word in subdict[subreddits[3]][:150]:
    if word in subdict[subreddits[0]][:150]:
        common_words.append(word)

# Get words that appear most frequently in Montreal and Paris to reduce overlap
for word in subdict[subreddits[3]][:150]:
    if word in subdict[subreddits[2]][:150]:
        common_words.append(word)

# Remove extra words from df
for word in common_words:
    for sample in df['body']:
        if word in sample.split():
            df['body'] = df['body'].replace(sample, sample.replace(word + ' ', ''))'''

# Write the pre-processed training frame to disk, without the index column
df.to_csv('../Datasets/train_prep.csv', index=False)