## Imports

In [3]:
import pandas as pd
import time
import requests
import numpy as np
import re

# Quick links

- [Preliminary look](#Preliminary-look)
- [Data Cleaning](#Data-Cleaning)
- [Exports](#Exports)

#### PushShift API request

Set URL and check status

In [4]:
# Pushshift submission-search endpoint, filtered to a single subreddit.
url = 'https://api.pushshift.io/reddit/search/submission?subreddit=AmItheAsshole'

# requests applies no timeout by default; without one a stalled connection
# would block this cell (and the kernel) indefinitely.
res = requests.get(url, timeout=30)

# Shown as the cell output; 200 means the request succeeded.
res.status_code

200

Data extraction

In [3]:
# res.json()['data'] # sanity check

In [4]:
df = pd.DataFrame(res.json()['data'])

In [5]:
# check to see if request has only unique posts
df['id'].nunique() == len(df)

True

### Data collection process, **do not execute unless** trying to get more data.

In [1]:
# function provided by Devin Fay in lieu of Shift API changes
def get_reddit_submissions(subreddits: list, max_num: int, max_retries: int = 5,
                           retry_wait: float = 1.0, timeout: float = 30):
    """Page backwards through Pushshift submissions for each subreddit.

    For every subreddit the search endpoint is queried repeatedly, each time
    asking only for posts older than the oldest one already received, until
    at least ``max_num`` posts have been collected or a request returns no
    posts.

    Parameters
    ----------
    subreddits : list
        Subreddit names to scrape, in order.
    max_num : int
        Stop paging a subreddit once this many posts have been collected.
        Whole pages are kept, so the final count can exceed ``max_num``.
    max_retries : int, default 5
        Number of consecutive non-200 responses to retry before giving up on
        the current subreddit.
    retry_wait : float, default 1.0
        Base pause in seconds between retries; the pause grows linearly with
        the number of consecutive failures.
    timeout : float, default 30
        Timeout in seconds passed to each HTTP request.

    Returns
    -------
    pandas.DataFrame
        All retrieved pages concatenated in retrieval order (the per-page
        index is kept, so index values repeat). Empty if nothing was
        retrieved.
    """
    base_url = "https://api.pushshift.io/reddit/search/submission"
    all_posts = []

    for subreddit in subreddits:
        params = {
            'subreddit': subreddit,
            'size': 1000 #doesn't appear to be working
        }

        count = 0 #keep track of posts/subreddit
        failures = 0 #consecutive non-200 responses

        while count < max_num:
            res = requests.get(base_url, params=params, timeout=timeout)

            if res.status_code != 200:
                print(f'status: {res.status_code}')
                failures += 1
                if failures > max_retries:
                    # Give up on this subreddit instead of looping forever
                    # while the API keeps failing.
                    break
                time.sleep(retry_wait * failures)
                continue

            failures = 0
            posts = pd.DataFrame(res.json()['data'])

            if len(posts) == 0:
                break #break loop if request successful but nothing retrieved

            count += len(posts)
            all_posts.append(posts)

            #get sequential posts from most recent to least
            params['before'] = posts['created_utc'].min()
        print(f'scraped from {subreddit}: {count}')

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not all_posts:
        return pd.DataFrame()
    return pd.concat(all_posts)

## Preliminary look

In [7]:
df = get_reddit_submissions(['AmItheAsshole', 'TrueOffMyChest'], 30_000)

scraped from AmItheAsshole: 30995
scraped from TrueOffMyChest: 17537


In [8]:
# df.columns # sanity check

In [9]:
df.to_csv('./data/subreddits.csv', index=False)

In [10]:
df.shape # sanity check

(48532, 94)

In [3]:
# low_memory=False makes pandas read the file in one pass and infer each
# column's dtype from all rows, instead of guessing per chunk.
# NOTE(review): the warning previously echoed under this cell was presumably
# the mixed-type DtypeWarning - confirm it is gone after re-running.
subreddits = pd.read_csv('./data/subreddits.csv', low_memory=False)

  subreddits = pd.read_csv('./data/subreddits.csv')


In [4]:
subreddits.columns.to_list()

['subreddit',
 'selftext',
 'author_fullname',
 'gilded',
 'title',
 'link_flair_richtext',
 'subreddit_name_prefixed',
 'hidden',
 'pwls',
 'link_flair_css_class',
 'thumbnail_height',
 'top_awarded_type',
 'hide_score',
 'quarantine',
 'link_flair_text_color',
 'upvote_ratio',
 'author_flair_background_color',
 'subreddit_type',
 'total_awards_received',
 'media_embed',
 'thumbnail_width',
 'author_flair_template_id',
 'is_original_content',
 'secure_media',
 'is_reddit_media_domain',
 'is_meta',
 'category',
 'secure_media_embed',
 'link_flair_text',
 'score',
 'is_created_from_ads_ui',
 'author_premium',
 'thumbnail',
 'edited',
 'author_flair_css_class',
 'author_flair_richtext',
 'gildings',
 'content_categories',
 'is_self',
 'link_flair_type',
 'wls',
 'removed_by_category',
 'author_flair_type',
 'domain',
 'allow_live_comments',
 'suggested_sort',
 'view_count',
 'archived',
 'no_follow',
 'is_crosspostable',
 'pinned',
 'over_18',
 'all_awardings',
 'awarders',
 'media_only'

In [5]:
subreddits.head()

Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,...,updated_utc,utc_datetime_str,author_cakeday,post_hint,preview,link_flair_template_id,edited_on,event_start,event_end,event_is_live
0,AmItheAsshole,TA\n\nLong story short my mom lend my ski Jack...,t2_tyyt86d3,0,AITA for being upset bc my mom lend my ski gea...,[],r/AmItheAsshole,False,6,,...,1671064113,2022-12-15 00:28:15,,,,,,,,
1,AmItheAsshole,"Rachel, Katy and I were part of a big friend g...",t2_n2jeq1b6,0,AITA for not inviting my friend to a concert s...,[],r/AmItheAsshole,False,6,,...,1671064049,2022-12-15 00:27:12,,,,,,,,
2,AmItheAsshole,So me 17 and my mom 36 are in a fight and I ne...,t2_q8j6qqlx,0,AITA for telling my mom I won’t talk to her ag...,[],r/AmItheAsshole,False,6,,...,1671064041,2022-12-15 00:27:05,,,,,,,,
3,AmItheAsshole,"I (21,f) got a new roomate Mary (23,f)this yea...",t2_ecx8s39i,0,AITA for ignoring my roommate?,[],r/AmItheAsshole,False,6,,...,1671063985,2022-12-15 00:26:09,,,,,,,,
4,AmItheAsshole,"a couple months ago, me (18, F) and my ex boyf...",t2_uzh0mjhj,0,AITA for sending nudes to my ex boyfriend’s be...,[],r/AmItheAsshole,False,6,,...,1671063920,2022-12-15 00:25:07,,,,,,,,


In [6]:
subreddits[subreddits['selftext'] != '[removed]']

Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,...,updated_utc,utc_datetime_str,author_cakeday,post_hint,preview,link_flair_template_id,edited_on,event_start,event_end,event_is_live
0,AmItheAsshole,TA\n\nLong story short my mom lend my ski Jack...,t2_tyyt86d3,0,AITA for being upset bc my mom lend my ski gea...,[],r/AmItheAsshole,False,6,,...,1671064113,2022-12-15 00:28:15,,,,,,,,
1,AmItheAsshole,"Rachel, Katy and I were part of a big friend g...",t2_n2jeq1b6,0,AITA for not inviting my friend to a concert s...,[],r/AmItheAsshole,False,6,,...,1671064049,2022-12-15 00:27:12,,,,,,,,
2,AmItheAsshole,So me 17 and my mom 36 are in a fight and I ne...,t2_q8j6qqlx,0,AITA for telling my mom I won’t talk to her ag...,[],r/AmItheAsshole,False,6,,...,1671064041,2022-12-15 00:27:05,,,,,,,,
3,AmItheAsshole,"I (21,f) got a new roomate Mary (23,f)this yea...",t2_ecx8s39i,0,AITA for ignoring my roommate?,[],r/AmItheAsshole,False,6,,...,1671063985,2022-12-15 00:26:09,,,,,,,,
4,AmItheAsshole,"a couple months ago, me (18, F) and my ex boyf...",t2_uzh0mjhj,0,AITA for sending nudes to my ex boyfriend’s be...,[],r/AmItheAsshole,False,6,,...,1671063920,2022-12-15 00:25:07,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48527,TrueOffMyChest,My fiance is a grad student that teaches and I...,t2_igq4zdyg,0,Feel guilty for not being able to make my fian...,[],r/TrueOffMyChest,False,7,,...,1669821554,2022-11-15 00:42:05,,,,,,,,
48528,TrueOffMyChest,I have a 12 year old son with autism. And a 8 ...,t2_u9hkanax,0,My kids fight over food and eating foods that ...,[],r/TrueOffMyChest,False,7,,...,1669821554,2022-11-15 00:40:45,,,,,,,,
48529,TrueOffMyChest,I (15M) am sad all the time. I never go outsid...,t2_u7y9djlz,0,If people really knew,[],r/TrueOffMyChest,False,7,,...,1669821555,2022-11-15 00:36:48,,,,,,,,
48530,TrueOffMyChest,I'm currently dating a guy. We both love each ...,t2_ub7on4ft,0,I still miss my ex even though i have a new bo...,[],r/TrueOffMyChest,False,7,,...,1669821555,2022-11-15 00:35:42,,,,,,,,


Over half the posts from r/AmItheAsshole have the body removed. This would train my models to interpret a removed post as heavily leaning towards this subreddit. Thus I am dropping the rows where the selftext is '[removed]'. Leaving the values as empty or null would induce the same effect. 

## Data Cleaning

In [7]:
# Working copy to clean. .copy() is required for `subreddits` to act as a
# checkpoint: a plain assignment would only alias the same DataFrame, so any
# in-place change to clean_df would also change subreddits.
clean_df = subreddits.copy() # checkpoint in case I do anything irreparable.

In [8]:
clean_df['subreddit'].value_counts()

AmItheAsshole     30995
TrueOffMyChest    17537
Name: subreddit, dtype: int64

In [9]:
clean_df['selftext'].value_counts().sort_values(ascending=False).head(3) # Have to take care of rows where there is no post

[removed]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [10]:
clean_df = subreddits[(subreddits['selftext'] != '[removed]')]

In [11]:
clean_df = clean_df[clean_df['selftext'] != '[deleted]']

In [12]:
clean_df['subreddit'].value_counts().head(3) # Great news for me. Just about even, I shouldn't have to worry too much about my distribution

AmItheAsshole     14849
TrueOffMyChest    14014
Name: subreddit, dtype: int64

In [13]:
clean_df[clean_df['selftext'].notna()].shape, clean_df[clean_df['selftext'].isnull()].shape #dropping rows with nulls. I've got enough data to do so.

((28419, 94), (444, 94))

In [14]:
# Keep only rows that have a post body. .copy() makes clean_df an independent
# frame, so the column assignments in later cells write to it directly rather
# than to a filtered view (which triggers pandas' SettingWithCopyWarning).
clean_df = clean_df[clean_df['selftext'].notna()].copy()

In [15]:
clean_df.info() # There are many features with basically no information in them. Should definitely drop these and maybe anything with over half gone?

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28419 entries, 0 to 48531
Data columns (total 94 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   subreddit                      28419 non-null  object 
 1   selftext                       28419 non-null  object 
 2   author_fullname                28419 non-null  object 
 3   gilded                         28419 non-null  int64  
 4   title                          28419 non-null  object 
 5   link_flair_richtext            28419 non-null  object 
 6   subreddit_name_prefixed        28419 non-null  object 
 7   hidden                         28419 non-null  bool   
 8   pwls                           28419 non-null  int64  
 9   link_flair_css_class           1 non-null      object 
 10  thumbnail_height               0 non-null      float64
 11  top_awarded_type               0 non-null      float64
 12  hide_score                     28419 non-null 

##### There's a clear tag in r/AmItheAsshole that asks "AITA" or "WIBTAH" or "WIBTA" prior to the question. This tag would train the models to look for the tag to determine where to place it. Removing the tag to train a better model.

In [16]:
# Remove the r/AmItheAsshole tags from both text columns.
# - Literal (regex=False), case-sensitive replacement, applied in this order
#   so 'WIBTAH' is removed before its prefix 'WIBTA'.
# - The .str accessor passes missing values through as NaN, whereas calling
#   x.replace(...) inside a lambda raises on a non-string value.
for col in ('title', 'selftext'):
    for tag in ('AITA', 'WIBTAH', 'WIBTA'):
        clean_df[col] = clean_df[col].str.replace(tag, '', regex=False)

In [17]:
clean_df['title'] # sanity check

0         for being upset bc my mom lend my ski gear wi...
1         for not inviting my friend to a concert she c...
2         for telling my mom I won’t talk to her again ...
3                                for ignoring my roommate?
4         for sending nudes to my ex boyfriend’s best f...
                               ...                        
48527    Feel guilty for not being able to make my fian...
48528    My kids fight over food and eating foods that ...
48529                                If people really knew
48530    I still miss my ex even though i have a new bo...
48531                                     still not healed
Name: title, Length: 28419, dtype: object

##### There aren't many emojis in a majority of the posts. Removing them for the sake of the models

In [18]:
clean_df['selftext'] = (clean_df['selftext'].str.replace('(', ' ', regex=False)  # literal "(" -> space, so "word(details)" stays two words
    .str.replace(')', ' ', regex=False)              # literal ")" -> space
    .str.replace(r'[^\sA-Za-z0-9]', '', regex=True)  # regex must be explicit: drop everything except whitespace, ASCII letters and digits
    .str.replace('\n', ' ', regex=False))            # newline -> space

  clean_df['selftext'] = clean_df['selftext'].str.replace('(', ' ', flags=re.UNICODE)\
  clean_df['selftext'] = clean_df['selftext'].str.replace('(', ' ', flags=re.UNICODE)\


##### The code block above replaces parentheses with spaces: some posts contain "word(details)", and simply deleting the parentheses would fuse the two into a made-up word. Line 3 keeps whitespace, letters, and digits and removes every other character; this strips emojis, but also punctuation. Line 4 replaces the newline placeholder with a space.

In [19]:
clean_df['selftext']

0        TA  Long story short my mom lend my ski Jacket...
1        Rachel Katy and I were part of a big friend gr...
2        So me 17 and my mom 36 are in a fight and I ne...
3        I  21f  got a new roomate Mary  23f this year ...
4        a couple months ago me  18 F  and my ex boyfri...
                               ...                        
48527    My fiance is a grad student that teaches and I...
48528    I have a 12 year old son with autism And a 8 y...
48529    I  15M  am sad all the time I never go outside...
48530    Im currently dating a guy We both love each ot...
48531    Was with my ex for 3 years she was very narcis...
Name: selftext, Length: 28419, dtype: object

In [20]:
clean_df['title'] = (clean_df['title'].str.replace('(', ' ', regex=False)  # literal "(" -> space, so "word(details)" stays two words
    .str.replace(')', ' ', regex=False)              # literal ")" -> space
    .str.replace(r'[^\sA-Za-z0-9]', '', regex=True)  # regex must be explicit: drop everything except whitespace, ASCII letters and digits
    .str.replace('\n', ' ', regex=False))            # newline -> space

  clean_df['title'] = clean_df['title'].str.replace('(', ' ', flags=re.UNICODE)\
  clean_df['title'] = clean_df['title'].str.replace('(', ' ', flags=re.UNICODE)\


In [21]:
clean_df = clean_df[clean_df['selftext'] != 'deleted']

In [22]:
clean_df = clean_df[clean_df['selftext'].notna()]

In [23]:
clean_df['subreddit'].value_counts()

AmItheAsshole     14849
TrueOffMyChest    13570
Name: subreddit, dtype: int64

##### To address features with many null values, anything exceeding a certain threshold ought to be dropped.

In [24]:
# Count the nulls in every column in one pass, then keep the names of the
# columns whose null count exceeds 13,000 (about half of the rows).
# Column order is preserved.
null_counts = clean_df.isnull().sum()
feat_drop = null_counts[null_counts > 13_000].index.to_list()

feat_drop

['link_flair_css_class',
 'thumbnail_height',
 'top_awarded_type',
 'author_flair_background_color',
 'thumbnail_width',
 'author_flair_template_id',
 'secure_media',
 'category',
 'link_flair_text',
 'author_flair_css_class',
 'content_categories',
 'removed_by_category',
 'suggested_sort',
 'view_count',
 'author_flair_text',
 'removed_by',
 'distinguished',
 'link_flair_background_color',
 'discussion_type',
 'author_flair_text_color',
 'media',
 'author_cakeday',
 'post_hint',
 'preview',
 'link_flair_template_id',
 'edited_on',
 'event_start',
 'event_end',
 'event_is_live']

##### Is there any information that can be gleaned from these columns?

In [25]:
clean_df[feat_drop].head() # no not really.

Unnamed: 0,link_flair_css_class,thumbnail_height,top_awarded_type,author_flair_background_color,thumbnail_width,author_flair_template_id,secure_media,category,link_flair_text,author_flair_css_class,...,author_flair_text_color,media,author_cakeday,post_hint,preview,link_flair_template_id,edited_on,event_start,event_end,event_is_live
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


##### Let's explore the rest of the columns to see if there's anything useful.

Definitely Useful: subreddit, selftext, author, title

In [26]:
clean_df.drop(columns = feat_drop).iloc[:,:15].head()

Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,hide_score,quarantine,link_flair_text_color,upvote_ratio,subreddit_type,total_awards_received
0,AmItheAsshole,TA Long story short my mom lend my ski Jacket...,t2_tyyt86d3,0,for being upset bc my mom lend my ski gear wi...,[],r/AmItheAsshole,False,6,True,False,dark,1.0,public,0
1,AmItheAsshole,Rachel Katy and I were part of a big friend gr...,t2_n2jeq1b6,0,for not inviting my friend to a concert she c...,[],r/AmItheAsshole,False,6,True,False,dark,1.0,public,0
2,AmItheAsshole,So me 17 and my mom 36 are in a fight and I ne...,t2_q8j6qqlx,0,for telling my mom I wont talk to her again i...,[],r/AmItheAsshole,False,6,True,False,dark,1.0,public,0
3,AmItheAsshole,I 21f got a new roomate Mary 23f this year ...,t2_ecx8s39i,0,for ignoring my roommate,[],r/AmItheAsshole,False,6,True,False,dark,1.0,public,0
4,AmItheAsshole,a couple months ago me 18 F and my ex boyfri...,t2_uzh0mjhj,0,for sending nudes to my ex boyfriends best fr...,[],r/AmItheAsshole,False,6,True,False,dark,1.0,public,0


In [27]:
clean_df.drop(columns = feat_drop).iloc[:,15:30].head()
# maybe score

Unnamed: 0,media_embed,is_original_content,is_reddit_media_domain,is_meta,secure_media_embed,score,is_created_from_ads_ui,author_premium,thumbnail,edited,author_flair_richtext,gildings,is_self,link_flair_type,wls
0,{},False,False,False,{},1,False,False,self,False,[],{},True,text,6
1,{},False,False,False,{},1,False,False,self,False,[],{},True,text,6
2,{},False,False,False,{},1,False,False,self,False,[],{},True,text,6
3,{},False,False,False,{},1,False,False,self,False,[],{},True,text,6
4,{},False,False,False,{},1,False,False,self,False,[],{},True,text,6


In [28]:
clean_df.drop(columns = feat_drop).iloc[:,30:45].head()

Unnamed: 0,author_flair_type,domain,allow_live_comments,archived,no_follow,is_crosspostable,pinned,over_18,all_awardings,awarders,media_only,can_gild,spoiler,locked,treatment_tags
0,text,self.AmItheAsshole,False,False,False,True,False,False,[],[],False,True,False,False,[]
1,text,self.AmItheAsshole,False,False,True,True,False,False,[],[],False,True,False,False,[]
2,text,self.AmItheAsshole,False,False,True,True,False,False,[],[],False,True,False,False,[]
3,text,self.AmItheAsshole,False,False,True,True,False,False,[],[],False,True,False,False,[]
4,text,self.AmItheAsshole,False,False,False,True,False,False,[],[],False,True,False,False,[]


In [29]:
clean_df.drop(columns = feat_drop).iloc[:,45:].head()
# maybe created_utc

Unnamed: 0,subreddit_id,id,is_robot_indexable,author,num_comments,send_replies,whitelist_status,contest_mode,author_patreon_flair,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,is_video,retrieved_utc,updated_utc,utc_datetime_str
0,t5_2xhvq,zm72sb,True,Euphoric_Ad9171,1,True,all_ads,True,False,/r/AmItheAsshole/comments/zm72sb/aita_for_bein...,all_ads,False,https://www.reddit.com/r/AmItheAsshole/comment...,4893068,1671064095,0,False,1671064112,1671064113,2022-12-15 00:28:15
1,t5_2xhvq,zm7206,True,JLA_turtles,1,True,all_ads,True,False,/r/AmItheAsshole/comments/zm7206/aita_for_not_...,all_ads,False,https://www.reddit.com/r/AmItheAsshole/comment...,4893066,1671064032,0,False,1671064048,1671064049,2022-12-15 00:27:12
2,t5_2xhvq,zm71wg,True,throwaway291723,1,True,all_ads,True,False,/r/AmItheAsshole/comments/zm71wg/aita_for_tell...,all_ads,False,https://www.reddit.com/r/AmItheAsshole/comment...,4893065,1671064025,0,False,1671064041,1671064041,2022-12-15 00:27:05
3,t5_2xhvq,zm715w,True,Zealousideal-Big-358,1,True,all_ads,True,False,/r/AmItheAsshole/comments/zm715w/aita_for_igno...,all_ads,False,https://www.reddit.com/r/AmItheAsshole/comment...,4893060,1671063969,0,False,1671063984,1671063985,2022-12-15 00:26:09
4,t5_2xhvq,zm708l,True,breezyrenae23,1,True,all_ads,True,False,/r/AmItheAsshole/comments/zm708l/aita_for_send...,all_ads,False,https://www.reddit.com/r/AmItheAsshole/comment...,4893060,1671063907,0,False,1671063919,1671063920,2022-12-15 00:25:07


In [30]:
# what's score? 

clean_df['score'].value_counts(ascending = False)

# It may be related, let's keep it in.

1      27971
2        139
0        119
3         24
4         21
       ...  
257        1
44         1
132        1
120        1
79         1
Name: score, Length: 67, dtype: int64

## Exports

In [31]:
feats_to_keep = ['subreddit', 'selftext', 'title', 'author', 'score', 'created_utc']

In [32]:
selective_df = clean_df[feats_to_keep]

In [33]:
selective_df.head()

Unnamed: 0,subreddit,selftext,title,author,score,created_utc
0,AmItheAsshole,TA Long story short my mom lend my ski Jacket...,for being upset bc my mom lend my ski gear wi...,Euphoric_Ad9171,1,1671064095
1,AmItheAsshole,Rachel Katy and I were part of a big friend gr...,for not inviting my friend to a concert she c...,JLA_turtles,1,1671064032
2,AmItheAsshole,So me 17 and my mom 36 are in a fight and I ne...,for telling my mom I wont talk to her again i...,throwaway291723,1,1671064025
3,AmItheAsshole,I 21f got a new roomate Mary 23f this year ...,for ignoring my roommate,Zealousideal-Big-358,1,1671063969
4,AmItheAsshole,a couple months ago me 18 F and my ex boyfri...,for sending nudes to my ex boyfriends best fr...,breezyrenae23,1,1671063907


In [34]:
selective_df.to_csv("./data/cleaned_selective_data.csv", index = False)

In [35]:
clean_df.to_csv("./data/cleaned_data.csv", index = False)