In [3]:
import pandas as pd
import csv
from spacy.lang.en import English
from sklearn.model_selection import train_test_split

# Random seed handed to scikit-learn's train_test_split (as random_state) so the
# train/valid/test splits are reproducible
SEED = 416

# spaCy English pipeline object; calling it on a string tokenizes that string,
# which is how selftext lengths are measured in this notebook
nlp = English()

In [2]:
# Canonical (lowercased) flair text for each of the four verdict classes
YTA = 'asshole'          # poster is judged to be in the wrong
NTA = 'not the a-hole'   # poster is judged not to be in the wrong
ESH = 'everyone sucks'   # every party is judged to be in the wrong
NAH = 'no a-holes here'  # nobody is judged to be in the wrong
FLAIRS = [YTA, NTA, ESH, NAH]

# Column name -> dtype mapping passed to pd.read_csv so that every column of
# the raw csv is parsed as the intended type
DTYPES = dict(
    id=str,
    author=str,
    created_utc=float,
    num_comments=int,
    over_18=bool,
    selftext=str,
    title=str,
    link_flair_text=str,
)

In [None]:
# Load the raw dump with explicit dtypes so each column is parsed correctly,
# then discard every row that has a NaN in any column
submissions = (
    pd.read_csv('data/reddit_raw.csv', dtype=DTYPES)
    .dropna(axis=0, how='any')
)
submissions.head()

In [5]:
# Work on a lowercased copy of the flair column
flair_text = submissions['link_flair_text'].str.lower()

# Collapse the alternative spellings of a verdict onto its canonical label so
# that each kind of verdict is represented by exactly one flair
nah_replace_flairs = ['no assholes here', 'no a--holes here']
nta_replace_flair = 'not the asshole'
flair_text = flair_text.mask(flair_text.isin(nah_replace_flairs), NAH)
flair_text = flair_text.mask(flair_text == nta_replace_flair, NTA)

# Write the normalised flairs back into the dataframe
submissions['link_flair_text'] = flair_text

In [32]:
def print_class_distribution(df):
    """Print the submission count and percentage share of each verdict class.

    Rows are assigned to a class by exact match of their 'link_flair_text'
    value against the module-level labels YTA, NTA, ESH and NAH. Percentages
    are taken relative to the total number of rows in df, so they add up to
    less than 100% when df also contains other flairs.

    Args:
        df: dataframe with a 'link_flair_text' column. May be empty, in which
            case every class is reported as 0 submissions and 0.0%.

    Returns:
        None. The report is written to stdout, one line per class followed by
        a line with the total row count.

    Raises:
        KeyError: if df has no 'link_flair_text' column.
    """
    total = len(df)
    flairs = df['link_flair_text']
    for tag, label in (('YTA', YTA), ('NTA', NTA), ('ESH', ESH), ('NAH', NAH)):
        # int() turns the numpy integer from .sum() into a plain Python int
        count = int((flairs == label).sum())
        # An empty frame has no meaningful share; report 0.0% instead of
        # dividing by zero
        share = (count / total) * 100 if total else 0.0
        print(f'{tag}: {count: >6} submissions, {share:.1f}%')
    print(f'Total: {total} submissions')

In [33]:
# Keep only the submissions whose flair is one of the four verdicts of interest.
# .copy() detaches the result so later changes aren't made to the original df.
has_relevant_flair = submissions['link_flair_text'].isin(FLAIRS)
relevant_submissions = submissions[has_relevant_flair].copy()

# sanity check: only the four verdict labels should be left
print(pd.unique(relevant_submissions['link_flair_text']))

# class balance of the filtered dataset
print_class_distribution(relevant_submissions)

# Persist the filtered data so future runs can start from this file instead of
# the full raw dump
relevant_submissions.to_csv(
    'data/data_4_flairs.csv',
    index=False,
    sep=',',
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
)

['asshole' 'everyone sucks' 'not the a-hole' 'no a-holes here']
YTA:  42042 submissions, 20.8%
NTA: 125610 submissions, 62.1%
ESH:  11916 submissions, 5.9%
NAH:  22820 submissions, 11.3%
Total: 202388 submissions


In [29]:
# See how many submissions survive for several minimum-comment thresholds.
# r/AITA picks the verdict from the top comment, so a post needs a reasonable
# amount of activity for its flair to be trustworthy -- but a threshold that is
# too strict would shrink the dataset too much.
comment_thresholds = (15, 25, 50, 75, 100)
subsets_by_min_comments = {
    threshold: relevant_submissions.loc[
        relevant_submissions['num_comments'] >= threshold
    ].copy()
    for threshold in comment_thresholds
}

# expose each subset under its own name for the cells that follow
min_15_comments = subsets_by_min_comments[15]
min_25_comments = subsets_by_min_comments[25]
min_50_comments = subsets_by_min_comments[50]
min_75_comments = subsets_by_min_comments[75]
min_100_comments = subsets_by_min_comments[100]

for threshold, subset in subsets_by_min_comments.items():
    print(f'Min {threshold} comments: {len(subset)} submissions')

Min 15 comments: 130575 submissions
Min 25 comments: 80276 submissions
Min 50 comments: 38698 submissions
Min 75 comments: 26778 submissions
Min 100 comments: 21293 submissions


In [35]:
# A minimum of 15 comments was chosen based on the per-threshold submission
# counts printed by the previous cell.
# The class distribution stays approximately the same as in the full dataset;
# the proportion of YTA verdicts changes the most (in the recorded run it rises
# from 20.8% to 25.7%).
print('Min 15 comments:')
print_class_distribution(min_15_comments)

Min 15 comments:
YTA:  33532 submissions, 25.7%
NTA:  76407 submissions, 58.5%
ESH:   7506 submissions, 5.7%
NAH:  13130 submissions, 10.1%
Total: 130575 submissions


In [None]:
# Add a column with the length of each submission's selftext, measured as the
# number of tokens in the spaCy Doc produced for it (used as the word count)
selftext_docs = map(nlp, min_15_comments['selftext'].tolist())
min_15_comments['selftext_len'] = [len(doc) for doc in selftext_docs]
min_15_comments.head()

In [34]:
# Drop posts whose selftext is too short (selftext_len below 50).
# A few of these are genuine posts of only a couple of sentences, removed so
# the classifier has more text to work with; most are deleted posts whose
# selftext field was replaced with "[removed]".
long_enough = min_15_comments['selftext_len'] >= 50
min_50_words = min_15_comments[long_enough].copy()

# ~4000 submissions are filtered out, and the class distribution remains
# approximately the same as in the previous dataset
print_class_distribution(min_50_words)

YTA:  31782 submissions, 25.1%
NTA:  74740 submissions, 59.1%
ESH:   7214 submissions, 5.7%
NAH:  12808 submissions, 10.1%
Total: 126544 submissions


In [37]:
# First hold out a test set, then carve a validation set out of the remainder.
train_test_ratio = 0.9

# Stratify on the flair so every class keeps the same proportion in each split
df_train_full, df_test = train_test_split(
    min_50_words,
    train_size=train_test_ratio,
    stratify=min_50_words['link_flair_text'],
    random_state=SEED,
)
# NTS: an error such as
# >ValueError: The test_size = 2 should be greater or equal to the number of classes = 3
# means the data being split is so small that the test split (=2) cannot hold
# at least one item of every class (=3), which stratified splitting requires.
# This is unlikely to happen when working with the full data.

# Fraction of the remaining (non-test) data that is kept for training; the rest
# becomes the validation set
train_valid_ratio = 0.9

df_train, df_valid = train_test_split(
    df_train_full,
    train_size=train_valid_ratio,
    stratify=df_train_full['link_flair_text'],
    random_state=SEED,
)

# (heading printed before the report, split dataframe, output file) per split
splits = [
    ('Train:', df_train, 'data/train.csv'),
    ('Valid shape:', df_valid, 'data/valid.csv'),
    ('Test shape:', df_test, 'data/test.csv'),
]

# report the size of each split and confirm that the stratification held
for heading, split_df, csv_path in splits:
    print(heading)
    print_class_distribution(split_df)

# write every split to its own file
for heading, split_df, csv_path in splits:
    split_df.to_csv(csv_path, index=False, sep=',', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

Train:
YTA:  25744 submissions, 25.1%
NTA:  60539 submissions, 59.1%
ESH:   5843 submissions, 5.7%
NAH:  10374 submissions, 10.1%
Total: 102500 submissions
Valid shape:
YTA:   2860 submissions, 25.1%
NTA:   6727 submissions, 59.1%
ESH:    649 submissions, 5.7%
NAH:   1153 submissions, 10.1%
Total: 11389 submissions
Test shape:
YTA:   3178 submissions, 25.1%
NTA:   7474 submissions, 59.1%
ESH:    722 submissions, 5.7%
NAH:   1281 submissions, 10.1%
Total: 12655 submissions
