# Preprocessing

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn
import json
import urllib.parse

import spacy
import spacy.symbols

import graphviz

import os.path
import subprocess
import io
import tempfile

%matplotlib inline
#import gensim
#from gensim.corpora import Dictionary
#from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel

  return torch._C._cuda_getDeviceCount() > 0


### Getting data

In [7]:
comments_df = pd.read_csv('../data/top_100_post_comments_user_flair.txt', header=None, names=['username', 'flair_text', 'body'])

print(comments_df.shape)
comments_df.head(10)

(3623, 3)


Unnamed: 0,username,flair_text,body
0,Tungsten_,,Thanks to everyone who engaged in insightful a...
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus..."
3,bad-fengshui,,As with anything related to Asians in politics...
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...
5,suberry,,I just hated Affirmative Action as a distracti...
6,Puzzled-Painter3301,,My own feeling is that I was never in love wit...
7,e9967780,,Anti Asian racism whether against East Asians ...
8,,,Can we overturn legacy and athlete admissions ...
9,OkartoIceCream,,"I want to remind people that in California, on..."


In [4]:
# load spacy model
nlp = spacy.load('en_core_web_sm')

#### Drop rows with deleted text body

In [20]:
deleted_rows = comments_df[comments_df['body'].isin(['[deleted]','[removed]'])]

# drop rows
comments_df.drop(deleted_rows.index, inplace=True)
print(comments_df.shape)
comments_df.reset_index(drop=True)
comments_df.head(15)

(306,)
(3317, 3)


Unnamed: 0,username,flair_text,body
0,Tungsten_,,Thanks to everyone who engaged in insightful a...
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus..."
3,bad-fengshui,,As with anything related to Asians in politics...
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...
5,suberry,,I just hated Affirmative Action as a distracti...
6,Puzzled-Painter3301,,My own feeling is that I was never in love wit...
7,e9967780,,Anti Asian racism whether against East Asians ...
8,,,Can we overturn legacy and athlete admissions ...
9,OkartoIceCream,,"I want to remind people that in California, on..."


Note: 306 rows were of deleted comments and so dropped from dataframe

### Drop rows that have been filtered by the AutoModerator

In [39]:
moderated_rows = comments_df[comments_df['username']=='AutoModerator']
moderated_rows

comments_df.drop(moderated_rows.index, inplace=True)
print(comments_df.shape)
comments_df.reset_index(drop=True)
comments_df.head(15)

(3283, 7)


Unnamed: 0,username,flair_text,body,tokens_new,normalized_tokens,normalized_tokens_count,word_count
0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig...","[thank, engage, insightful, respectful, discou...",9,20
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta...","[prefer, process, take, account, poverty, inst...",52,103
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u/Tungsten_,, Thanks, for, creating, a, secti...","[u/tungsten_,, thank, create, section, discuss...",126,269
3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ...","[relate, asians, politic, m, see, lot, non, as...",25,59
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done...","[college, allow, alumnus, doner, easily, consi...",19,40
5,suberry,,I just hated Affirmative Action as a distracti...,"[I, just, hated, Affirmative, Action, as, a, d...","[hate, affirmative, action, distraction, banda...",78,171
6,Puzzled-Painter3301,,My own feeling is that I was never in love wit...,"[My, own, feeling, is, that, I, was, never, in...","[feeling, love, affirmative, action, possible,...",102,231
7,e9967780,,Anti Asian racism whether against East Asians ...,"[Anti, Asian, racism, whether, against, East, ...","[anti, asian, racism, east, asians, south, asi...",21,46
8,,,Can we overturn legacy and athlete admissions ...,"[Can, we, overturn, legacy, and, athlete, admi...","[overturn, legacy, athlete, admission, point, ...",15,29
9,OkartoIceCream,,"I want to remind people that in California, on...","[I, want, to, remind, people, that, in, Califo...","[want, remind, people, california, progressive...",104,200


### Tokenize

In [21]:
def tokenize_str(str):
    tokenized = []
    doc = nlp(str)
    for token in doc:
        if not token.is_punct and len(token.text.strip()) > 0:
            tokenized.append(token.text)
    return tokenized

In [22]:
# create new column of tokens
comments_df['tokens'] = comments_df['body'].apply(lambda x: tokenize_str(x))

In [6]:
comments_df.head(5)

Unnamed: 0,username,flair_text,body,tokens
0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig..."
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta..."
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u, Tungsten, Thanks, for, creating, a, sectio..."
3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ..."
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done..."


In [23]:
print(type(comments_df['tokens'][0]))

<class 'list'>


Problem with Reddit comments: We don't want to get split on (get rid of) forward slashes nor get rid of punctuations on usernames like "u/Tungsten_"

So, instead of using the native Spacy package, we use RedditScore which is built on Spacy but modified for Reddit/Twitter comments.

In [10]:
sample_comment = comments_df['body'][2]
print(sample_comment)

u/Tungsten_, Thanks for creating a section just to discuss this. When I read the news I immediately went searching for a forum where folks might have civil discourse on this topic.

Just had a few comments/questions:

1. Has anyone come across seemingly legitimate data sets on asians & college admission with respect to Affirmative Action (AA for short going forward)
2. As an Asian (not born in the US but pretty much assimilated here for 35+ years), I am conflicted. Research results like this one show: [https://www.pewresearch.org/race-ethnicity/2023/06/08/asian-americans-hold-mixed-views-around-affirmative-action/](https://www.pewresearch.org/race-ethnicity/2023/06/08/asian-americans-hold-mixed-views-around-affirmative-action/) that something like 53% Asians think AA is a good thing, and yet when you scroll down and look at the question of "Should colleges consider race/ethnicity in college admissions," the percentage of Asians that say yes are at 21%, no at 76%.

I am part of the 76%.

### RedditScore 
(not using anymore)

In [14]:
!pip install git+https://github.com/crazyfrogspb/RedditScore.git

Collecting git+https://github.com/crazyfrogspb/RedditScore.git
  Cloning https://github.com/crazyfrogspb/RedditScore.git to c:\users\danie\appdata\local\temp\pip-req-build-l3ypk4yu
Collecting tldextract>=2.1.0
  Downloading tldextract-4.0.0-py3-none-any.whl (97 kB)
Collecting eventlet>=0.22.1
  Downloading eventlet-0.35.2-py3-none-any.whl (359 kB)
Collecting dill
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
Collecting requests-file>=1.4
  Downloading requests_file-2.0.0-py2.py3-none-any.whl (4.2 kB)
Collecting dnspython>=1.15.0
  Downloading dnspython-2.3.0-py3-none-any.whl (283 kB)
Collecting greenlet>=1.0
  Downloading greenlet-3.0.3-cp37-cp37m-win_amd64.whl (291 kB)
Building wheels for collected packages: redditscore
  Building wheel for redditscore (setup.py): started
  Building wheel for redditscore (setup.py): finished with status 'done'
  Created wheel for redditscore: filename=redditscore-0.7.3-py3-none-any.whl size=7852150 sha256=affc20c003c412f8a666e5207c543b14bc9fd9152

  Running command git clone -q https://github.com/crazyfrogspb/RedditScore.git 'C:\Users\danie\AppData\Local\Temp\pip-req-build-l3ypk4yu'


In [24]:
from redditscore.tokenizer import CrazyTokenizer

tokenizer = CrazyTokenizer(hashtags=False, lowercase=True, reddit_usernames=True, normalize=False)

def tokenize_str_reddit(str):
    tokenized = []
    doc = tokenizer.tokenizer(str)
    for token in doc:
        if not token.is_punct and len(token.text.strip()) > 0:
            tokenized.append(token.text)
    return tokenized

TypeError: add() takes exactly 2 positional arguments (3 given)

In [22]:
comments_df['tokens'] = comments_df['body'].apply(lambda x: tokenize_str_reddit(x))

TypeError: add() takes exactly 2 positional arguments (3 given)

### Special case in Spacy 

'CrazyTokenizer' doesn't work bc the code is reliant on an old version of Spacy. So, we will create a special case in Spacy's tokenizer

In [6]:
from spacy.symbols import ORTH

doc = nlp(' Hi, u/Tungsten_, Thanks for creating a section')
print([w.text for w in doc])

special_case = [{'TEXT': {'REGEX': r'u\/([[:word:]]|-){3,23}'}}]
nlp.tokenizer.add_special_case('redditor', special_case)
doc = nlp(' Hi, u/Tungsten_, Thanks for creating a section')

print(print([w.text for w in doc]))

[' ', 'Hi', ',', 'u', '/', 'Tungsten', '_', ',', 'Thanks', 'for', 'creating', 'a', 'section']


KeyError: 65

In [24]:
import re
from spacy.tokenizer import Tokenizer

nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'u/{1}\w{3,23}').search)

In [9]:
doc = nlp(sample_comment)

print([w.text for w in doc])

NameError: name 'sample_comment' is not defined

In [10]:
print(comments_df['tokens'][2])

['u', 'Tungsten', 'Thanks', 'for', 'creating', 'a', 'section', 'just', 'to', 'discuss', 'this', 'When', 'I', 'read', 'the', 'news', 'I', 'immediately', 'went', 'searching', 'for', 'a', 'forum', 'where', 'folks', 'might', 'have', 'civil', 'discourse', 'on', 'this', 'topic', 'Just', 'had', 'a', 'few', 'comments', 'questions', '1', 'Has', 'anyone', 'come', 'across', 'seemingly', 'legitimate', 'data', 'sets', 'on', 'asians', 'college', 'admission', 'with', 'respect', 'to', 'Affirmative', 'Action', 'AA', 'for', 'short', 'going', 'forward', '2', 'As', 'an', 'Asian', 'not', 'born', 'in', 'the', 'US', 'but', 'pretty', 'much', 'assimilated', 'here', 'for', '35', '+', 'years', 'I', 'am', 'conflicted', 'Research', 'results', 'like', 'this', 'one', 'show', 'https://www.pewresearch.org/race-ethnicity/2023/06/08/asian-americans-hold-mixed-views-around-affirmative-action/](https://www.pewresearch.org/race-ethnicity/2023/06/08/asian-americans-hold-mixed-views-around-affirmative-action/', 'that', 'some

We can see from above that the tokenization got worse when we added the token_match criteria. We need to modify the English class-attribute before loading model.

In [25]:
import spacy
nlp = spacy.load('en_core_web_sm')

#nlp.tokenizer.token_match = None (run line if results are same)
doc1 = nlp(sample_comment)
print([w.text for w in doc1])

# add token_match to tokenizer
nlp.tokenizer.token_match = re.compile(r'u/{1}\w{3,23}').match
doc2 = nlp(sample_comment)
print([w.text for w in doc2])

NameError: name 'sample_comment' is not defined

For some reason, the reddit username includes the comma.
*IDEA: We can strip the string of any characters that are not valid in a username.*

Our tokenization of reddit usernames is not perfect, it may include trailing punctuation. But let's move on.

In [70]:
#redditor_regex = re.compile(r'u/{1}\w{3,23}')
#print(redditor_regex.search(sample_comment))
#redditor_regex.match(sample_comment)

<re.Match object; span=(0, 11), match='u/Tungsten_'>


<re.Match object; span=(0, 11), match='u/Tungsten_'>

In [15]:
comments_df.head(5)

Unnamed: 0,username,flair_text,body,tokens
0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig..."
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta..."
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u, Tungsten, Thanks, for, creating, a, sectio..."
3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ..."
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done..."


In [26]:
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer.token_match = re.compile(r'u/{1}\w{3,23}').match # match reddit usernames

# drop old tokens column from data frame
comments_df.drop(['tokens'], axis=1, inplace=True)

In [27]:
# add column to data frame
comments_df['tokens_new'] = comments_df['body'].apply(lambda x: tokenize_str(x))

In [28]:
comments_df.head(5)

Unnamed: 0,username,flair_text,body,tokens_new
0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig..."
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta..."
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u/Tungsten_,, Thanks, for, creating, a, secti..."
3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ..."
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done..."


### Filter and normalize text

1. Make all words lowercase (trivial)
2. Drop non-word tokens (may already be done in tokenize_str() function)
3. Remove stop-words (in a sophisticated manner)
4. Stem words to remove suffixes, prefixes, infixes OR Lemmatize tokens (intelligently)

#### 3. Remove stop-words

In [29]:
# Remove stop words using word counts
counts_dict = {}
for word in comments_df['tokens_new'].sum():
    word = word.lower()
    if word in counts_dict:
        counts_dict[word]+=1
    else:
        counts_dict[word] = 1

word_counts = sorted(counts_dict.items(), key = lambda x: x[1], reverse=True)
word_counts[:50]

[('the', 11177),
 ('to', 7709),
 ('of', 6362),
 ('and', 6105),
 ('a', 5681),
 ('that', 5142),
 ('in', 4681),
 ('i', 4373),
 ('is', 4354),
 ('it', 3855),
 ('you', 2845),
 ('are', 2787),
 ('for', 2679),
 ('not', 2189),
 ("'s", 2185),
 ('asian', 2062),
 ("n't", 2018),
 ('as', 1991),
 ('this', 1988),
 ('but', 1881),
 ('be', 1822),
 ('they', 1805),
 ('do', 1798),
 ('on', 1749),
 ('have', 1703),
 ('action', 1527),
 ('with', 1507),
 ('affirmative', 1474),
 ('we', 1417),
 ('asians', 1386),
 ('people', 1363),
 ('or', 1353),
 ('if', 1310),
 ('at', 1282),
 ('from', 1216),
 ('about', 1145),
 ('who', 1088),
 ('more', 1065),
 ('was', 1045),
 ('white', 1045),
 ('their', 990),
 ('there', 984),
 ('so', 953),
 ('other', 937),
 ('like', 914),
 ('because', 913),
 ('just', 908),
 ('an', 902),
 ('what', 881),
 ('all', 879)]

In [20]:
word_counts[:35]

[('the', 11177),
 ('to', 7709),
 ('of', 6362),
 ('and', 6105),
 ('a', 5681),
 ('that', 5142),
 ('in', 4681),
 ('i', 4373),
 ('is', 4354),
 ('it', 3855),
 ('you', 2845),
 ('are', 2787),
 ('for', 2679),
 ('not', 2189),
 ("'s", 2185),
 ('asian', 2062),
 ("n't", 2018),
 ('as', 1991),
 ('this', 1988),
 ('but', 1881),
 ('be', 1822),
 ('they', 1805),
 ('do', 1798),
 ('on', 1749),
 ('have', 1703),
 ('action', 1527),
 ('with', 1507),
 ('affirmative', 1474),
 ('we', 1417),
 ('asians', 1386),
 ('people', 1363),
 ('or', 1353),
 ('if', 1310),
 ('at', 1282),
 ('from', 1216)]

Mark/remove words as stop words that are more frequent than the first noun ('i').

In [30]:
stop_words_freq = []
for word, count in word_counts:
    if word == 'i':
        break
    else:
        stop_words_freq.append(word)

print(stop_words_freq)

['the', 'to', 'of', 'and', 'a', 'that', 'in']


Judging from the list, we can add 'is' and 'it' to list of stop words

In [116]:
#stop_words_freq += ['is', 'it']
#stop_words_freq

['the', 'to', 'of', 'and', 'a', 'that', 'in', 'is', 'it']

Function to normalize tokens:

In [31]:
def normalize_tokens(word_list, extra_stop_words=[]):
    normalized = []
    if type(word_list) == list and len(word_list) == 1:
        word_list = word_list[0]

    if type(word_list) == list:
        word_list = ' '.join([str(elem) for elem in word_list])

    doc = nlp(word_list.lower()) # lowercase words in word_list

    # add lexeme property of stopword to words considered as stopwords
    if len(extra_stop_words) > 0:
        for stopword in extra_stop_words:
            lexeme = nlp.vocab[stopword]
            lexeme.is_stop = True

    for w in doc:
        # if not stop word or punctuation, add it to normalized
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and len(w.text.strip()) > 0:
            # add lemmatized version of word
            normalized.append(str(w.lemma_))

    return normalized

In [32]:
comments_df['normalized_tokens'] = comments_df['tokens_new'].apply(lambda x: normalize_tokens(x, stop_words_freq))
comments_df['normalized_tokens_count'] = comments_df['normalized_tokens'].apply(lambda x: len(x))

comments_df.head(20)

Unnamed: 0,username,flair_text,body,tokens_new,normalized_tokens,normalized_tokens_count
0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig...","[thank, engage, insightful, respectful, discou...",9
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta...","[prefer, process, take, account, poverty, inst...",52
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u/Tungsten_,, Thanks, for, creating, a, secti...","[u/tungsten_,, thank, create, section, discuss...",126
3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ...","[relate, asians, politic, m, see, lot, non, as...",25
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done...","[college, allow, alumnus, doner, easily, consi...",19
5,suberry,,I just hated Affirmative Action as a distracti...,"[I, just, hated, Affirmative, Action, as, a, d...","[hate, affirmative, action, distraction, banda...",78
6,Puzzled-Painter3301,,My own feeling is that I was never in love wit...,"[My, own, feeling, is, that, I, was, never, in...","[feeling, love, affirmative, action, possible,...",102
7,e9967780,,Anti Asian racism whether against East Asians ...,"[Anti, Asian, racism, whether, against, East, ...","[anti, asian, racism, east, asians, south, asi...",21
8,,,Can we overturn legacy and athlete admissions ...,"[Can, we, overturn, legacy, and, athlete, admi...","[overturn, legacy, athlete, admission, point, ...",15
9,OkartoIceCream,,"I want to remind people that in California, on...","[I, want, to, remind, people, that, in, Califo...","[want, remind, people, california, progressive...",104


In [33]:
# add word count column
comments_df['word_count'] = comments_df['tokens_new'].apply(lambda x: len(x))

In [34]:
comments_df

Unnamed: 0,username,flair_text,body,tokens_new,normalized_tokens,normalized_tokens_count,word_count
0,Tungsten_,,Thanks to everyone who engaged in insightful a...,"[Thanks, to, everyone, who, engaged, in, insig...","[thank, engage, insightful, respectful, discou...",9,20
1,ProudBlackMatt,Chinese-American,I would prefer using a process that takes into...,"[I, would, prefer, using, a, process, that, ta...","[prefer, process, take, account, poverty, inst...",52,103
2,TomatoCanned,,"u/Tungsten_, Thanks for creating a section jus...","[u/Tungsten_,, Thanks, for, creating, a, secti...","[u/tungsten_,, thank, create, section, discuss...",126,269
3,bad-fengshui,,As with anything related to Asians in politics...,"[As, with, anything, related, to, Asians, in, ...","[relate, asians, politic, m, see, lot, non, as...",25,59
4,Pancake_muncher,,Yet colleges will allow alumni and doners in e...,"[Yet, colleges, will, allow, alumni, and, done...","[college, allow, alumnus, doner, easily, consi...",19,40
...,...,...,...,...,...,...,...
3618,aduogetsatastegouda,,But that's irrelevant. The right not to be dis...,"[But, that, 's, irrelevant, The, right, not, t...","[irrelevant, right, discriminate, base, race, ...",38,84
3619,rentonwong,Support Asian-American Media!,"Despite my dislike of AA, at least 2/3rds of A...","[Despite, my, dislike, of, AA, at, least, 2/3r...","[despite, dislike, aa, 2/3rds, asian, american...",19,32
3620,rentonwong,Support Asian-American Media!,> If 1/3 of a racial minority's members say th...,"[>, If, 1/3, of, a, racial, minority, 's, memb...","[>, racial, minority, member, want, discrimina...",27,61
3621,,,I'm just annoyed at how there's so much handwa...,"[I, 'm, just, annoyed, at, how, there, 's, so,...","[m, annoyed, handwaving, consequence, pro, aa,...",48,117


In [35]:
# check if normalized_tokens column is list or string - Result: list
print(type(comments_df['normalized_tokens'][0]))

<class 'list'>


Save above df as csv file.

In [36]:
comments_df.to_csv('../data/comments_df.csv')

In [6]:
input_file = 'comments.csv'
output_file = input_file[0:-4] + '_processed' + input_file[-4:]

print(output_file)

comments_processed.csv


### Exploratory Analysis

In [123]:
import nltk

In [124]:
comments_fq_dist = nltk.ConditionalFreqDist(((len(w), w) for w in comments_df['normalized_tokens'].sum()))

In [125]:
print(comments_fq_dist.N()) # number of total words

130955


In [135]:
comments_fq_dist[5].most_common(5) #most common 5-letter words

[('asian', 2442),
 ('white', 1321),
 ('think', 956),
 ('black', 787),
 ('group', 573)]