## Pre-Processing the Data

In [284]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK corpora quietly. The calls sit inside a loop so that the
# cell does not end on a bare expression; otherwise Jupyter echoes the
# boolean returned by the last nltk.download() call.
for corpus in ('stopwords', 'wordnet'):
    nltk.download(corpus, quiet=True)

True

First we import the training data from the given .csv file.

In [285]:
# Load the training set, decoding the file as Windows-1252 (cp1252),
# and label its two columns.
df = (
    pd.read_csv('../Datasets/train_prep.csv', encoding='cp1252')
    .set_axis(['body', 'subreddit'], axis=1)
)

Next, we compute the term frequency of every vocabulary word within each subreddit.

In [286]:
# Distinct class labels found in the 'subreddit' column
classes = df['subreddit'].unique()

# Vocabulary: every distinct whitespace-separated token found in any body
vocabulary = {token for text in df['body'] for token in text.split()}

def get_term_freq(subreddit):
    """Count how often each vocabulary word occurs in one subreddit's bodies.

    Reads the module-level ``df`` and ``vocabulary``.

    Parameters
    ----------
    subreddit
        Class label; only rows of ``df`` whose 'subreddit' column equals
        this value are counted.

    Returns
    -------
    dict
        Maps every word in ``vocabulary`` to its number of occurrences in
        the selected bodies (0 when the word never appears there).
    """
    # Start every vocabulary word at zero so each class gets the same keys
    counts = dict.fromkeys(vocabulary, 0)

    selected_bodies = df.loc[df['subreddit'] == subreddit, 'body']
    for text in selected_bodies:
        for token in text.split():
            counts[token] += 1

    return counts

# One {word: count} mapping per class label
term_freq = {label: get_term_freq(label) for label in classes}

# Tabulate the counts: one row per word, one column per class
term_freq_df = pd.DataFrame.from_dict(term_freq, orient='index').T

Now let's compute the mutual information (MI) score of each word with respect to the subreddit labels.

In [287]:
import numpy as np

# Term counts as a float array (rows: words, columns: classes, in the
# column order of term_freq_df)
term_freq_array = term_freq_df.to_numpy(dtype=float)

# Small smoothing constant so that log2 never receives an exact zero
eps = 1e-10

# Number of samples per class (same order as `classes`) and overall.
# The total is derived from the data instead of being hard-coded.
class_count = np.array(
    [(df['subreddit'] == subreddit).sum() for subreddit in classes],
    dtype=float,
)
total_samples = len(df)

# term_freq_array counts occurrences, not samples, so a word repeated within
# one body can exceed the class size. Cap it at the class size so that every
# probability below stays inside [0, 1]; without the cap the "absent" counts
# go negative and log2 yields NaN.
# NOTE(review): a per-sample (document-frequency) count would be more exact.
present = np.minimum(term_freq_array, class_count.reshape(1, -1))
absent = class_count.reshape(1, -1) - present

# Joint probabilities: P(term present, class) and P(term absent, class).
# Both are divided by the TOTAL sample count; dividing by the per-class count
# would give the conditional P(term | class), which does not fit the MI
# formula below.
P_TC = (present + eps) / total_samples
P_T_notC = (absent + eps) / total_samples  # name kept; holds P(not T, C)

# Marginals. The term marginals are summed from the smoothed joints so they
# are strictly positive and consistent with them.
P_T = P_TC.sum(axis=1)
P_notT = P_T_notC.sum(axis=1)
P_C = class_count / total_samples

# MI(T; C) = sum over classes and over term present/absent of
#            P(t, c) * log2( P(t, c) / (P(t) * P(c)) )
MI = np.sum(P_TC * np.log2(P_TC / (P_T[:, np.newaxis] * P_C)), axis=1) + \
     np.sum(P_T_notC * np.log2(P_T_notC / (P_notT[:, np.newaxis] * P_C)), axis=1)

# One row per word, highest MI first. Words come from the row index of
# term_freq_df so they are guaranteed to line up with the rows of MI.
MI_df = pd.DataFrame({'MI': MI, 'word': term_freq_df.index.tolist()})
MI_df = MI_df.sort_values(by=['MI'], ascending=False)

In [288]:
# Number of highest-MI words kept as the final vocabulary
TOP_K_WORDS = 4750

# Only keep the top words (MI_df is already sorted by descending MI)
MI_df_top = MI_df.head(TOP_K_WORDS)

# Ordered list of the kept words, highest MI first
top_words = MI_df_top['word'].tolist()

# Save the vocabulary to a text file, one word per line
with open('vocab.txt', 'w') as f:
    f.writelines(f"{word}\n" for word in top_words)

# Set copy of the vocabulary: constant-time membership tests while filtering,
# instead of scanning the whole list for every token
top_word_set = set(top_words)

# Create a new dataframe whose bodies contain only the top words
top_df = df.copy()
top_df['body'] = top_df['body'].apply(
    lambda x: ' '.join(word for word in x.split() if word in top_word_set)
)

# Remove samples with no words left after filtering
top_df = top_df[top_df['body'] != '']

# Get distribution of classes
class_dist = top_df['subreddit'].value_counts()
print(class_dist)

# Save the dataframe to a csv file
top_df.to_csv('../Datasets/train_cleaned.csv', index=False)

subreddit
Paris       135
Montreal    135
London      135
Toronto     134
Name: count, dtype: int64
