In [207]:
import pandas as pd
import re

# https://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
from w3lib.html import replace_entities

In [208]:
df = pd.read_csv('../data/raw_posts.csv')

In [209]:
# let's get some cool info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19977 entries, 0 to 19976
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        19977 non-null  object
 1   created_utc   19977 non-null  int64 
 2   id            19977 non-null  object
 3   num_comments  19977 non-null  int64 
 4   score         19977 non-null  int64 
 5   selftext      18489 non-null  object
 6   subreddit     19977 non-null  object
 7   title         19977 non-null  object
 8   period        19977 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.4+ MB


In [210]:
# well the null selftexts aren't great, but as long as there's at least a title there, we have something (maybe)
# so probably best to fill NaNs with empty string and then concatenate
df.fillna(value = '', inplace = True)

In [211]:
# Also - we know we saw some [deleted] and [removed] in there - let's see if they at least have titles
df[(df['selftext'] == '[deleted]') | (df['selftext'] == '[removed]')]

# looks like they do - so we should change those to empty strings too

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title,period
2,[deleted],1223963408,76zmj,2,2,[deleted],Jazz,anybody have that record of Charlie Christian ...,1
6,[deleted],1244717722,8rmly,2,2,[deleted],Jazz,What happened to urge2burge.wordpress.com? :(,2
7,[deleted],1250145588,9a6xy,34,11,[deleted],Jazz,Just starting to listen to Jazz. Recommendati...,2
8,[deleted],1250176578,9ab7s,0,1,[deleted],Jazz,Jazz pioneer Rashied Ali Has died.,2
9,[deleted],1252880400,9k6pw,0,0,[deleted],Jazz,The Bad Plus - Cheney Pinata ... groundbreaki...,2
...,...,...,...,...,...,...,...,...,...
19960,Heavy_Possibility_18,1650717493,ua4jee,1,1,[removed],classicalmusic,Need help!,20
19966,Well_Made_Legacy,1650747440,uaennx,1,1,[removed],classicalmusic,Clarinet and French horn duet pieces?,20
19967,LazyMaster42,1650748083,uaevrn,1,1,[removed],classicalmusic,I have a piece stuck in my head and I was wond...,20
19968,BananaZealousideal23,1650750783,uafrjw,1,1,[removed],classicalmusic,Music that sounds like Shame by Mitski,20


In [212]:
df.loc[(df['selftext'] == '[deleted]') | (df['selftext'] == '[removed]'), 'selftext'] = ''

In [213]:
# double-check
df[(df['selftext'] == '[deleted]') | (df['selftext'] == '[removed]')]

# good

Unnamed: 0,author,created_utc,id,num_comments,score,selftext,subreddit,title,period


In [214]:
# let's concatenate now
df['all_text'] = df['title'] + ' ' + df['selftext']

In [215]:
# let's get rid of unicode/emojis
# https://stackoverflow.com/questions/44010727/remove-unicode-code-uxxx-in-string-python
df['all_text'] = df['all_text'].map(lambda x: re.sub(r'[^\x00-\x7F]+', ' ', x))

In [216]:
# let's fix up the HTML entities - running this 2x because some of them are double-encoded
df['all_text'] = df['all_text'].map(lambda x: replace_entities(x))
df['all_text'] = df['all_text'].map(lambda x: replace_entities(x))

In [217]:
# and the literal escape characters
df['all_text'] = df['all_text'].map(lambda x: re.sub(r'\n|\r|\t', ' ', x))

In [218]:
# and the URLs
df['all_text'] = df['all_text'].map(lambda x: re.sub(r'\bhttp(s?)://.+\b', ' ', x))

In [219]:
# and the punctuation that isn't apostrophes
df['all_text'] = df['all_text'].map(lambda x: re.sub(r'[.,/;:!?~@#$%^&*()_+=\[\]|<>"-]', ' ', x))

In [220]:
# and numbers
df['all_text'] = df['all_text'].map(lambda x: re.sub(r'[0-9]', ' ', x))

In [221]:
# and double spaces
df['all_text'] = df['all_text'].map(lambda x: re.sub(r'\s+', ' ', x))

In [222]:
# and strip it
df['all_text'] = df['all_text'].map(lambda x: x.strip())

In [223]:
# let's get the character lengths - generally useful and also will let us know if we have any zeros now
df['all_text_len'] = df['all_text'].map(lambda x: len(str(x)))

In [224]:
df['all_text_len'].sort_values()
# yep we got zeros - they need to go

9451         0
2113         0
7150         0
2100         0
10527        0
         ...  
12308     9441
8358      9507
17160     9552
17570    13228
17361    13533
Name: all_text_len, Length: 19977, dtype: int64

In [225]:
# maybe let's see how much dropping the zero-length rows is gonna affect the respective classes
df[df['all_text_len'] == 0].groupby('subreddit').count()['all_text_len']

# not bad at all, we can stand to lose these

subreddit
Jazz              25
classicalmusic    13
Name: all_text_len, dtype: int64

In [226]:
df = df[df['all_text_len'] > 0]
df.shape

# looks good

(19939, 11)

In [227]:
# we should fix up the subreddits so they're all lowercase
df['subreddit'] = df['subreddit'].str.lower()

In [228]:
# let's dump out the results so we can do some EDAing!
df.to_csv('../data/clean_posts.csv', index = False)