## Reddit Text Classification

In [268]:
import pandas as pd
import nltk
import contractions
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus. quiet=True silences the downloader's log,
# and the trailing semicolon stops Jupyter from echoing the call's boolean
# return value (True/False) as the cell output.
nltk.download('stopwords', quiet=True);

True

First we import the training data from the given .csv file.

In [269]:
# Load the training data, decoding the file as Windows-1252 (cp1252), name the
# two columns, and normalise the text column to lowercase in one chain
df = (
    pd.read_csv('train.csv', encoding='cp1252')
    .set_axis(['body', 'subreddit'], axis=1)
    .assign(body=lambda frame: frame['body'].str.lower())
)

Before pre-processing the data, it can be helpful to identify the characters we are dealing with in the text.

In [270]:
# Find the frequency of each appearance of a character in the dataset
def find_frequency(df):
    """Count how often each character occurs across the 'body' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'body' column holding the text to analyse.

    Returns
    -------
    dict
        Maps each character to its number of occurrences, keyed in order of
        first appearance. Empty if there is no text.
    """
    from collections import Counter

    frequency = Counter()
    # Walk the column directly; iterrows() would build a full Series per row
    for text in df['body']:
        # Missing cells (NaN/None) are not iterable, so skip anything that is not text
        if isinstance(text, str):
            frequency.update(text)
    return dict(frequency)

# Build a pandas dataframe listing every character with its frequency
def make_frequency_df(frequency):
    """Turn a character -> count mapping into a table sorted by count.

    Parameters
    ----------
    frequency : dict
        Maps each character to the number of times it occurs.

    Returns
    -------
    pandas.DataFrame
        Columns 'character' and 'frequency', ordered from most to least
        frequent, with a fresh 0..n-1 index.
    """
    counts = pd.DataFrame.from_dict(frequency, orient='index', columns=['frequency'])
    ranked = counts.sort_values(by=['frequency'], ascending=False)
    # Name the index so that reset_index() exposes it as the 'character' column
    table = ranked.rename_axis('character').reset_index()
    return table[['character', 'frequency']]


# Count the characters in the lowercased text and display the ten most frequent
freq = find_frequency(df)
freq_df = make_frequency_df(freq)
freq_df.head(10)

Unnamed: 0,character,frequency
0,,59253
1,e,34050
2,t,24093
3,a,21378
4,o,20234
5,i,19135
6,n,19085
7,s,18262
8,r,16288
9,l,12045


We see a wide variety of characters here. We will keep as many of them as possible, but normalize characters such as apostrophes and quotation marks that have different representations in different encodings.

In [271]:
# Align encodings: map typographic quotes and dashes to their plain ASCII
# forms and turn newlines into spaces
char_aliases = {
    '“': '"',
    '”': '"',
    '’': "'",
    '‘': "'",
    '—': '-',
    '–': '-',
    '\n': ' ',
}
for fancy, plain in char_aliases.items():
    # regex=False: every pattern is a literal string, whatever the pandas version's default
    df['body'] = df['body'].str.replace(fancy, plain, regex=False)

# Remove basic punctuation and digits. In the literal below, `\\` is a single
# backslash; quotes, apostrophes and hyphens are deliberately not listed.
translator = str.maketrans('', '', '°œ!#$%&()*+,./:;=?@[\\]^_`{|}~1234567890')
df['body'] = df['body'].str.translate(translator)

Now we can re-examine the frequency of each character.

In [272]:
# Recount the characters after the replacements and removals applied to
# df['body'], and display the ten most frequent
freq_aligned = find_frequency(df)
freq_df_aligned = make_frequency_df(freq_aligned)
freq_df_aligned.head(10)

Unnamed: 0,character,frequency
0,,61428
1,e,34050
2,t,24093
3,a,21378
4,o,20234
5,i,19135
6,n,19085
7,s,18262
8,r,16288
9,l,12045


Some preprocessing is helpful prior to tokenization. This includes expanding contractions (e.g., "don't" → "do not") and removing English and French stop words (e.g., "a", "an", "the").

In [273]:
# Expand contractions in english on the full text. An expansion can produce
# several words (e.g. "don't" -> "do not"), so expanding before splitting
# ensures each resulting word is checked against the stop-word lists.
body_expanded = df['body'].apply(contractions.fix)

# Remove stopwords in english and french; a set gives constant-time lookups
stopwords_english = stopwords.words('english')
stopwords_french = stopwords.words('french')
stopwords_all = set(stopwords_english) | set(stopwords_french)

# Split into words, drop the stop words, and rejoin the rest into a string
df['body'] = body_expanded.apply(
    lambda text: ' '.join(word for word in text.split() if word not in stopwords_all)
)

Now we can tokenize the text in a variety of ways.

In [None]:
# WordPiece Tokenizer
from transformers import WordpieceTokenizer