## Pre-Processing the Data

In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' and 'wordnet' resources. quiet=True silences the
# download log; the trailing semicolon keeps the notebook from echoing the
# boolean returned by the final call as the cell's output.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True);

True

First we import the training data from the given .csv file.

In [7]:
# Read the cp1252-encoded training CSV, then relabel its two columns with the
# names the rest of the notebook refers to.
df = pd.read_csv('../Datasets/train.csv', encoding='cp1252')
df = df.set_axis(['body', 'subreddit'], axis=1)

We preprocess the data and then immediately split it into training and test sets, so that the later, data-dependent steps cannot leak information from the test set.

In [8]:
from prep import prep_data
from sklearn.model_selection import train_test_split

# Run the project's custom cleaning over the whole frame before splitting.
df = prep_data(df)

# Integer code for each subreddit label, plus the inverse lookup used to
# turn the codes back into names after the split.
label_to_id = {'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3}
id_to_label = {code: name for name, code in label_to_id.items()}

# Features are the text bodies; targets are the integer-encoded labels.
X_df = df['body']
y_df = df['subreddit'].map(label_to_id)

# 80/20 split, stratified on the encoded label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    test_size=0.20,
    stratify=y_df,
    random_state=42,
)

# Decode both target series back to subreddit names.
y_train = y_train.map(id_to_label)
y_test = y_test.map(id_to_label)

# Held-out frame (body + subreddit) for later evaluation.
test_df = pd.concat([X_test, y_test], axis=1)

# From here on `df` holds only the training portion.
df = pd.concat([X_train, y_train], axis=1)

Words that appear in fewer than 1% of the samples are removed. This reduces the total vocabulary in the dataset by about half. Next, the words that are most common across all subreddits are removed, as they muddy the waters when it comes to classification.

In [9]:
import warnings

# Blanket-ignore all warnings from this point on in the session.
warnings.filterwarnings('ignore')

# NOTE(review): remove_uncommon_words and get_highest_freq are imported here
# but not called in this cell; build_vocab is used by a later cell.
from prep import (
    remove_uncommon_words,
    remove_common_words,
    get_highest_freq,
    build_vocab,
)

# The four class labels present in the dataset.
subreddits = [
    'Toronto',
    'London',
    'Paris',
    'Montreal',
]

# Apply the project's common-word removal to the training frame.
df = remove_common_words(df, subreddits)

Now we can run a mutual information (MI) study on the data.

In [10]:
from prep import get_term_freq, get_mutual_information

# Vocabulary of the training frame, as produced by the project's build_vocab.
vocab = build_vocab(df)

# Class labels in order of first appearance, with the per-class sample counts
# reordered to match. value_counts() sorts by descending frequency, so its raw
# order is not guaranteed to agree with unique(); reindexing keeps
# class_counts[i] paired with classes[i] (and hence with column i below).
classes = df['subreddit'].unique()
class_counts = df['subreddit'].value_counts().reindex(classes).to_numpy()

# Per-class term frequencies, keyed by class label.
term_freq = {}
for subreddit in classes:
    term_freq[subreddit] = get_term_freq(df, subreddit, vocab)

# Make a dataframe of the term frequencies: one column per class, in
# `classes` order, after the transpose.
term_freq_df = pd.DataFrame.from_dict(term_freq, orient='index')
term_freq_df = term_freq_df.transpose()

# Create a dataframe of the mutual information, sorted by score, highest first.
# NOTE(review): assumes get_mutual_information returns one score per vocab
# entry in the iteration order of `vocab`, and expects class_counts in the
# same order as term_freq_df's columns - confirm against prep.py.
MI = get_mutual_information(term_freq_df, class_counts)
MI_df = pd.DataFrame(MI, columns=['MI'])
MI_df['word'] = list(vocab)
MI_df = MI_df.sort_values(by=['MI'], ascending=False)

In [11]:
# Create a list of the top words based on MI (MI_df is already sorted by
# descending score, so head() takes the MI_N highest-scoring words).
MI_N = 3750
MI_df_top = MI_df.head(MI_N)
top_words = MI_df_top['word'].tolist()

# Set copy used only for membership tests: `word in <list>` rescans the list
# for every token, whereas a set lookup is constant time. The filtered text
# is identical either way.
top_word_set = set(top_words)

# Create a new dataframe with only the top words; the token order within each
# sample is preserved.
top_df = df.copy()
top_df['body'] = top_df['body'].apply(
    lambda x: ' '.join(word for word in x.split() if word in top_word_set)
)

# Remove samples with no words left after filtering
top_df = top_df[top_df['body'] != '']

# Save the dataframe to a csv file
top_df.to_csv('../Datasets/train_cleaned.csv', index=False)

Finally, we pre-process the test sets in the same way.

In [12]:
# Load Kaggle set and run it through the same prep_data cleaning step
kaggle_set = pd.read_csv('../Datasets/Kaggle/test.csv', encoding='cp1252')
kaggle_set.columns = ['id', 'body']
kaggle_set = prep_data(kaggle_set)

# Set copy of top_words for constant-time membership tests; testing against
# the list rescans it for every token.
top_word_set = set(top_words)


def keep_top_words(text):
    """Return `text` with every whitespace-separated token not in top_words dropped."""
    return ' '.join(word for word in text.split() if word in top_word_set)


# Clean test sets
test_df['body'] = test_df['body'].apply(keep_top_words)

# Kaggle rows with 5 or fewer tokens are passed through unfiltered.
# NOTE(review): this differs from test_df, which is always filtered -
# presumably so short Kaggle rows do not end up empty; confirm the intent.
kaggle_set['body'] = kaggle_set['body'].apply(
    lambda x: keep_top_words(x) if len(x.split()) > 5 else x
)

# Save the dataframes to csv files
test_df.to_csv('../Datasets/test_cleaned.csv', index=False)
kaggle_set.to_csv('../Datasets/Kaggle/kaggle_test_cleaned.csv', index=False)