# Download Data

In [1]:
import datetime
import os

import pandas as pd
import praw

In [1]:
# Log into my reddit account with PRAW API for data collection.
# Credentials are read from environment variables so they never appear in the
# notebook: set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_PASSWORD and
# REDDIT_USERNAME before starting the kernel (a missing one raises KeyError).
reddit = praw.Reddit(
    # personal use script
    client_id = os.environ['REDDIT_CLIENT_ID'],
    # secret
    client_secret = os.environ['REDDIT_CLIENT_SECRET'],
    # account pw
    password = os.environ['REDDIT_PASSWORD'],
    username = os.environ['REDDIT_USERNAME'],
    user_agent = 'project script'
)

In [3]:
# Confirm the login worked by printing the account PRAW is authenticated as
authenticated_user = reddit.user.me()
print(authenticated_user)

cds_serious


### Instantiate lists used to hold subreddit data from scrapes

In [4]:
# instantiate the lists: one independent, empty list per collected field
(ids, title, body, num_comments, upvotes,
 time_posted, time_now, time_delta, subreddits) = ([] for _ in range(9))

- subreddits: r/vegan   r/Cooking


# First Data Collection

**Approach**: *I am using the r/vegan and r/Cooking subreddits to analyze a question related to personal identity and values. The top posts and controversial posts will show what the posters on the subreddits are interested in promoting or arguing about, so much of the data will be from these posts. Due to the limited amount of data that can be collected at one time with PRAW and the fact that r/vegan has a limited amount of text data, I need to collect data from many of the time options available for controversial and top posts.*

In [5]:
# function for data collection. Append data to the lists created above.
def collect_data(selection):
    """Append one entry per submission to each of the module-level result lists.

    Parameters
    ----------
    selection : iterable
        Submissions to record. Each item must expose the attributes ``id``,
        ``title``, ``selftext``, ``num_comments``, ``ups``, ``created_utc``
        and ``subreddit``.

    Returns
    -------
    None
        The data is appended to the global lists ``ids``, ``title``, ``body``,
        ``num_comments``, ``upvotes``, ``time_posted``, ``time_now``,
        ``time_delta`` and ``subreddits``; all nine grow by one per submission.
    """
    utc = datetime.timezone.utc
    # loop through reddit posts
    for submission in selection:
        # Read the clock once per submission so that time_now and time_delta
        # are derived from the same instant. Both datetimes are naive UTC.
        scraped_at = datetime.datetime.now(utc).replace(tzinfo=None)
        posted_at = datetime.datetime.fromtimestamp(
            submission.created_utc, utc).replace(tzinfo=None)

        # id
        ids.append(submission.id)
        # title appearing on the front page
        title.append(submission.title)
        # the submission's body - an empty str if a link post
        body.append(submission.selftext)
        # number of comments
        num_comments.append(submission.num_comments)
        # number of upvotes
        upvotes.append(submission.ups)
        # time the post was created (raw value of created_utc)
        time_posted.append(submission.created_utc)
        # time that the post was scraped
        time_now.append(scraped_at)
        # time elapsed between posting and scraping
        time_delta.append(scraped_at - posted_at)
        # subreddit. Target variable. Stored as plain text so the column holds
        # strings both before and after a round trip through CSV.
        subreddits.append(str(submission.subreddit))

In [7]:
# test function on r/vegan's all-time controversial posts
# (whatever it returns is appended to the lists above and stays there)
vegan_controversial_all = reddit.subreddit('vegan').controversial('all', limit = 5000)
collect_data(vegan_controversial_all)

In [8]:
# The lists must all have the same length before they can become DataFrame columns.
# Printing every length confirms the function appended the same amount of data to each list.
collected = (ids, title, body, subreddits, num_comments,
             upvotes, time_posted, time_now, time_delta)
print(*map(len, collected))

999 999 999 999 999 999 999 999 999


**Consideration of the server**: Information about running multiple PRAW requests is found at <https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html>. It is unclear how many requests can be run at once, and I want to be careful. The documentation says "PRAW, as of version 4, performs rate limiting dynamically based on the HTTP response headers from Reddit. As a result you can safely run a handful of PRAW instances without any additional configuration." I will only put a few data collection requests in each cell and will wait a little while between running cells.

In [9]:
# Time options for controversial posts are taken from
# https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html
# r/vegan is only moderately large and active. Requesting data for shorter than a month will not add new data.
for time_filter in ('year', 'month'):
    collect_data(reddit.subreddit('vegan').controversial(time_filter, limit = None))

In [10]:
# Time options for top posts are taken from
# https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html
# r/vegan is only moderately large and active. Requesting data for shorter than a month will not add new data.
for time_filter in ('all', 'year', 'month'):
    collect_data(reddit.subreddit('vegan').top(time_filter, limit = None))

In [11]:
# Collect the same controversial-post windows for r/Cooking so that the two
# subreddits can be compared more accurately
for time_filter in ('all', 'year', 'month'):
    collect_data(reddit.subreddit('Cooking').controversial(time_filter, limit = None))

In [12]:
# Top posts of r/Cooking for the same time windows
for time_filter in ('all', 'year', 'month'):
    collect_data(reddit.subreddit('Cooking').top(time_filter, limit = None))

In [13]:
# To maximize the amount of data, also collect new posts from both of the subreddits
for subreddit_name in ('vegan', 'Cooking'):
    collect_data(reddit.subreddit(subreddit_name).new(limit = None))

## Basic processing and analysis of the first batch of data

In [14]:
# Put into a DataFrame: each collected list becomes one column
columns = {
    'id': ids,
    'title': title,
    'body': body,
    'num_comments': num_comments,
    'upvotes': upvotes,
    'time_posted': time_posted,
    'time_now': time_now,
    'time_delta': time_delta,
    'subreddit': subreddits,
}
df = pd.DataFrame(columns)
df.head()

Unnamed: 0,id,title,body,num_comments,upvotes,time_posted,time_now,time_delta,subreddit
0,n3vd33,"""But We Always Ate Meat,""",,1178,0,1620047000.0,2021-05-16 11:36:28.594675,12 days 22:37:48.594680,vegan
1,aydjra,😎,,769,938,1551971000.0,2021-05-16 11:36:28.594689,800 days 20:31:30.594689,vegan
2,eqdom7,Hypocrites,,1330,919,1579334000.0,2021-05-16 11:36:28.594693,484 days 03:39:40.594694,vegan
3,au03xj,I hope this fits here,,946,2672,1550956000.0,2021-05-16 11:36:28.594697,812 days 14:31:36.594697,vegan
4,1ahdew,was told this went here.,,208,0,1363553000.0,2021-05-16 11:36:28.594701,2981 days 15:00:14.594702,vegan


In [15]:
# create a new column for the length of the body text, counted as the number of
# pieces obtained by splitting on single spaces. An empty body gives 1, not 0,
# because ''.split(' ') returns [''].
# NOTE(review): a body consisting of a single word also gives 1 - confirm this is intended.
word_counts = []
for text in df.body:
    word_counts.append(len(text.split(' ')))
df['text_length'] = word_counts

In [16]:
# Remove the rows with an empty body, which are the posts with links and images
# (a text_length of 1 marks them)
has_body_text = df['text_length'] != 1
df = df[has_body_text]
df.head()

Unnamed: 0,id,title,body,num_comments,upvotes,time_posted,time_now,time_delta,subreddit,text_length
19,jvq4pp,"PSA: If you hate PETA, that's because a right ...",A huge smear campaign started by \n\nPeta Kill...,901,2829,1605605000.0,2021-05-16 11:36:28.594755,180 days 02:07:37.594756,vegan,285
21,114zvn,I was a Vegan for several months until this pa...,They said that the goats ate only fresh goat f...,239,0,1349707000.0,2021-05-16 11:36:28.594762,3141 days 21:06:08.594763,vegan,325
29,lenglo,why are vegan subs so cowardly when it comes t...,~~I’m pretty sure I was just quietly banned~~ ...,75,3,1612708000.0,2021-05-16 11:36:28.594795,97 days 21:07:42.594795,vegan,299
32,ixiq4p,White Veganism and Why it's Problematic,"Personally, I'm a person of color and it frust...",64,0,1600759000.0,2021-05-16 11:36:28.594806,236 days 04:18:37.594806,vegan,383
37,giowc5,tired of cigarette smokers calling themselves ...,you literally can’t smoke cigarettes and be a ...,14,1,1589333000.0,2021-05-16 11:36:28.594824,368 days 10:20:15.594824,vegan,89


In [17]:
# Check how many rows I got in the initial data collection
# (rows left after the posts without body text were removed)
df.shape

(8365, 10)

In [18]:
# Remove duplicate posts in the dataset and check the number of rows.
# The result is reassigned instead of using inplace=True: df is a filtered
# selection of a larger frame, and modifying such a selection in place can
# trigger pandas' SettingWithCopyWarning.
df = df.drop_duplicates(subset = ['title', 'body', 'subreddit'])
df.shape

(6336, 10)

In [21]:
# Check the distribution between subreddits (number of posts per subreddit)
df['subreddit'].value_counts()

Cooking    4443
vegan      1893
Name: subreddit, dtype: int64

**Conclusion**: There is enough data for NLP analysis. I will collect more over the coming days though.

In [22]:
# Same distribution expressed as a fraction of all posts
df['subreddit'].value_counts(normalize=True)

Cooking    0.701231
vegan      0.298769
Name: subreddit, dtype: float64

**Conclusion**: There is a 70-30 split between classes, which is imbalanced but not severely so.

In [24]:
# Save the initial batch of data.
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('./saved_data', exist_ok=True)
df.to_csv('./saved_data/first_data.csv', index=False)

# Second Data Collection

In [6]:
# Collect data from new posts.
# Empty the shared lists first so this batch holds only the second collection,
# even when the first collection ran earlier in the same kernel session.
for collected in (ids, title, body, num_comments, upvotes,
                  time_posted, time_now, time_delta, subreddits):
    collected.clear()
collect_data(reddit.subreddit('vegan').new(limit = None))
collect_data(reddit.subreddit('Cooking').new(limit = None))

In [7]:
# Put into a DataFrame: each collected list becomes one column
columns = {
    'id': ids,
    'title': title,
    'body': body,
    'num_comments': num_comments,
    'upvotes': upvotes,
    'time_posted': time_posted,
    'time_now': time_now,
    'time_delta': time_delta,
    'subreddit': subreddits,
}
df = pd.DataFrame(columns)
df.head()

Unnamed: 0,id,title,body,num_comments,upvotes,time_posted,time_now,time_delta,subreddit
0,nfsvk0,The Search for the Vegan Violin,,0,1,1621390000.0,2021-05-19 02:02:44.293838,0 days 00:03:06.293843,vegan
1,nfsc5l,Worried about my parents eating habits...,hi everybody! I am a vegan college student and...,2,2,1621388000.0,2021-05-19 02:02:44.293852,0 days 00:28:17.293852,vegan
2,nfs98w,“We should have a choice”,I’m just wondering how people respond to this ...,7,3,1621388000.0,2021-05-19 02:02:44.293856,0 days 00:32:10.293857,vegan
3,nfr5ro,Prepackaged depression meal but make it ~healthy~,,1,12,1621385000.0,2021-05-19 02:02:44.293860,0 days 01:23:43.293860,vegan
4,nfr8sw,Duo Lingo Knows,,0,6,1621385000.0,2021-05-19 02:02:44.293864,0 days 01:19:52.293864,vegan


In [8]:
# create a new column for the length of the body text, counted as the number of
# pieces obtained by splitting on single spaces. An empty body gives 1, not 0,
# because ''.split(' ') returns [''].
# NOTE(review): a body consisting of a single word also gives 1 - confirm this is intended.
word_counts = []
for text in df.body:
    word_counts.append(len(text.split(' ')))
df['text_length'] = word_counts

In [9]:
# Remove the rows with an empty body, which are the posts with links and images
# (a text_length of 1 marks them)
has_body_text = df['text_length'] != 1
df = df[has_body_text]
df.head()

Unnamed: 0,id,title,body,num_comments,upvotes,time_posted,time_now,time_delta,subreddit,text_length
1,nfsc5l,Worried about my parents eating habits...,hi everybody! I am a vegan college student and...,2,2,1621388000.0,2021-05-19 02:02:44.293852,0 days 00:28:17.293852,vegan,124
2,nfs98w,“We should have a choice”,I’m just wondering how people respond to this ...,7,3,1621388000.0,2021-05-19 02:02:44.293856,0 days 00:32:10.293857,vegan,96
8,nfr87k,"I’m considering going Vegan, but I have a ques...",How much of the meat industry is cruel? Especi...,12,3,1621385000.0,2021-05-19 02:02:44.293882,0 days 01:20:36.293882,vegan,115
9,nfqxqq,TOFU,Hey.. so I’m terrified of tofu. I’ve always av...,5,0,1621384000.0,2021-05-19 02:02:44.293886,0 days 01:33:56.293886,vegan,65
10,nfqs3w,Has anyone seen the new Impossible Burger 6-pa...,According to [this article](https://www.onegre...,4,3,1621384000.0,2021-05-19 02:02:44.293889,0 days 01:41:13.293890,vegan,41


In [10]:
# Check how many rows I got in the second data collection
# (rows left after the posts without body text were removed)
df.shape

(1331, 10)

In [11]:
# Remove duplicate posts in the dataset and check the number of rows.
# The result is reassigned instead of using inplace=True: df is a filtered
# selection of a larger frame, and modifying such a selection in place can
# trigger pandas' SettingWithCopyWarning.
df = df.drop_duplicates(subset = ['title', 'body', 'subreddit'])
df.shape

(1330, 10)

In [12]:
# Check the distribution between subreddits (number of posts per subreddit)
df['subreddit'].value_counts()

Cooking    923
vegan      407
Name: subreddit, dtype: int64

In [13]:
# Same distribution expressed as a fraction of all posts
df['subreddit'].value_counts(normalize=True)

Cooking    0.693985
vegan      0.306015
Name: subreddit, dtype: float64

**Conclusion**: we get a similar 70-30 split

In [14]:
# Load the first batch of data that was saved to disk earlier
df_old = pd.read_csv('./saved_data/first_data.csv')

In [15]:
# Check the size of the reloaded first batch
df_old.shape

(6336, 10)

In [16]:
# Create the full dataset.
# The same post can be returned by both collections, so keep a single row per
# post id (the first occurrence, i.e. the earlier scrape). ignore_index gives the
# combined frame a fresh 0..n-1 index instead of repeating the labels of each batch.
df_current = (
    pd.concat([df_old, df], ignore_index=True)
      .drop_duplicates(subset = ['id'])
)
df_current.shape

(7666, 10)

In [17]:
# Write the combined dataset to disk, without the index column
full_data_path = './saved_data/second_data.csv'
df_current.to_csv(full_data_path, index=False)