# Auto Script for running at night or unattended

The new script goes to r/funny, takes the first 100 posts in the 'new' section, and follows those 100 posts for 24 hours, collecting upvotes, number of comments, and subreddit activity every 30 minutes.

If the automated script below fails, go to the 'Manual Script' section.

There are 2 cells in the 'Manual Script' section. The first one lets you rerun the automation script and continue where you left off until 48 data points per post are stored (since there are 48 30-minute intervals in 24 hours). Alternatively, if you just want to collect data manually, run the second cell once every 30 minutes.

# Automation Script

In [None]:
import time
from datetime import datetime
import praw
import pandas as pd

# Reddit API client used by everything in this cell.
# NOTE(review): the client id/secret are hard-coded in this file; consider
# loading them from the environment or a praw.ini instead of committing them.
reddit = praw.Reddit(
    client_id='eOqj6Q2BiQQyKQ',
    client_secret='1hqeAwOvlkY1PYw51ewhbrItd5k',
    user_agent='Reddit WebScrapping',
)

sub_title = 'funny'  # name of the subreddit to scrape

time2sleep = 1800  # seconds to wait between two samples (30 minutes)
hours = 48  # number of 30-minute intervals to run for (despite the name)
count = 0  # index of the interval currently being sampled

# The output files are named after the weekday on which this cell is started.
dt_object = datetime.now()
weekday = dt_object.strftime('%A')

fname_votes = './{}_votes.csv'.format(weekday)
fname_comments = './{}_comments.csv'.format(weekday)
fname_online = './{}_online_users.csv'.format(weekday)

# Per-post series keyed by post id; the sampling loop appends one value per
# interval (net votes = ups - downs, and the comment count).
votes = {}
totalcomments = {}
# Subreddit-wide series: time of each sample and the active-user count.
online = {'timestamp': [], 'users_online': []}

subred = reddit.subreddit(sub_title)

# The 100 newest submissions are the posts tracked for the rest of the run.
sub_posts = subred.new(limit=100)

# Register an empty series for every tracked post.
for post in sub_posts:
    pid = str(post.id)
    if pid in votes:
        continue  # the listing returned this post twice; keep the first entry
    votes[pid] = []
    totalcomments[pid] = []

# Sampling loop: one round per 30-minute interval, for counts 0..hours
# inclusive (i.e. hours + 1 samples per post in total).
#
# Rounds are scheduled against a running deadline instead of a fixed sleep.
# Every round fetches each tracked post individually, which takes time; with a
# fixed sleep that time would be added on top of every interval and the later
# samples would drift further and further past their 30-minute mark.
next_sample = time.time()

while count <= hours:

    print('Starting 30min interval {}'.format(count))

    # A new Subreddit object is created for every round and the active-user
    # count is read from it immediately.
    # NOTE(review): presumably this avoids re-reading a value cached on the
    # previous object -- confirm against the PRAW documentation.
    subred = reddit.subreddit(sub_title)
    online['timestamp'].append(time.time())
    online['users_online'].append(subred.active_user_count)

    for id_val in votes:
        result = reddit.submission(id=id_val)  # look the post up again
        votes[id_val].append(result.ups - result.downs)  # net votes
        totalcomments[id_val].append(result.num_comments)

    # Rewrite all three CSVs after every round so that a failure part-way
    # through the run loses at most the round in progress.
    # Layout: one row per series, one column per sample.
    upvotes = pd.DataFrame(list(votes.values()), index=list(votes))
    upvotes.to_csv(fname_votes, index_label='id')

    comments = pd.DataFrame(list(totalcomments.values()), index=list(totalcomments))
    comments.to_csv(fname_comments, index_label='id')

    on = pd.DataFrame(list(online.values()), index=list(online))
    on.to_csv(fname_online, index_label='timestamp')

    if count == hours:
        print('finished collecting data')
        break

    # Sleep only for what is left of the current interval; if a round overran
    # its interval the next one starts immediately.
    next_sample += time2sleep
    pause = max(0.0, next_sample - time.time())
    print('sleeping for {:.0f} seconds...'.format(pause))
    time.sleep(pause)
    count += 1

# Manual Script

### Cell 1: Continue where you left off

Run this cell to continue the automated data collection

In [None]:
import time
from datetime import datetime
import praw
import pandas as pd

time2sleep = 1800  # seconds to wait between two samples (30 minutes)
# NOTE(review): the client id/secret are hard-coded in this file; consider
# loading them from the environment or a praw.ini instead of committing them.
reddit = praw.Reddit(client_id='eOqj6Q2BiQQyKQ', client_secret='1hqeAwOvlkY1PYw51ewhbrItd5k', user_agent='Reddit WebScrapping')

sub_title = 'funny'  # name of the subreddit to scrape

# The CSV files are named after the weekday on which the run was STARTED.
# A 24-hour run always crosses midnight, so when no votes file exists for
# today the run being resumed is assumed to have started yesterday. The
# comments and online-users files are then taken from that same weekday.
now = time.time()
dt_object = datetime.fromtimestamp(now)
weekday = dt_object.strftime('%A')

fname_votes = './{}_votes.csv'.format(weekday)
try:
    df_votes = pd.read_csv(fname_votes)
except FileNotFoundError:
    dt_object = datetime.fromtimestamp(now - 24 * 60 * 60)
    weekday = dt_object.strftime('%A')
    fname_votes = './{}_votes.csv'.format(weekday)
    # If this file is missing as well there is nothing to resume, and the
    # FileNotFoundError is allowed to propagate.
    df_votes = pd.read_csv(fname_votes)
    print('no files found for today, resuming the run started on {}'.format(weekday))

fname_comments = './{}_comments.csv'.format(weekday)
df_comments = pd.read_csv(fname_comments)

fname_online = './{}_online_users.csv'.format(weekday)
df_online = pd.read_csv(fname_online)

# 'id' / 'timestamp' are deliberately left as ordinary columns here: the
# loop that follows counts the columns of df_votes before indexing the frames.

# Fresh per-post series for the samples collected from now on
# (net votes = ups - downs, and the comment count), keyed by post id.
votes = {}
totalcomments = {}
# Subreddit-wide series: time of each sample and the active-user count.
online = {'timestamp': [], 'users_online': []}

# Track exactly the posts that are already present in the saved votes file.
for pid in df_votes['id']:
    if pid not in votes:  # ignore a repeated id, keep the first entry
        votes[pid] = []
        totalcomments[pid] = []
# df_votes still holds 'id' as an ordinary column at this point, so shape[1]
# is (samples already stored + 1). Combined with the inclusive
# `count <= datapoints` test below, the loop runs
# 48 - (samples already stored) times, and not at all once 48 are stored.
datapoints = 48 - df_votes.shape[1]
count = 0

# Index the previously saved frames once, up front, so they line up with the
# freshly collected frames (which are indexed by post id / series name).
df_votes.set_index('id', inplace=True)
df_comments.set_index('id', inplace=True)
df_online.set_index('timestamp', inplace=True)

# Rounds are scheduled against a running deadline instead of a fixed sleep so
# that the time spent fetching every post does not accumulate as drift.
next_sample = time.time()

while count <= datapoints:
    # A new Subreddit object is created for every round and the active-user
    # count is read from it immediately.
    # NOTE(review): presumably this avoids re-reading a value cached on the
    # previous object -- confirm against the PRAW documentation.
    subred = reddit.subreddit(sub_title)
    online['timestamp'].append(time.time())
    online['users_online'].append(subred.active_user_count)

    for id_val in votes:
        result = reddit.submission(id=id_val)  # look the post up again
        votes[id_val].append(result.ups - result.downs)  # net votes
        totalcomments[id_val].append(result.num_comments)

    # Append ALL samples collected in this session to the frames loaded from
    # disk and rewrite the CSVs after every round.
    #
    # The loaded columns are the strings '0', '1', ... read back from the CSV
    # header, while the new frames use the ints 0, 1, ...; written as they are
    # the header would repeat labels, which pandas renames ('0.1', ...) on the
    # next read. Renumbering keeps the header a clean 0..n-1 sequence.
    temp = pd.DataFrame(list(votes.values()), index=list(votes))
    upvotes = pd.concat([df_votes, temp], axis=1)
    upvotes.columns = range(upvotes.shape[1])
    upvotes.to_csv(fname_votes, index_label='id')

    temp = pd.DataFrame(list(totalcomments.values()), index=list(totalcomments))
    comments = pd.concat([df_comments, temp], axis=1)
    comments.columns = range(comments.shape[1])
    comments.to_csv(fname_comments, index_label='id')

    temp = pd.DataFrame(list(online.values()), index=list(online))
    on = pd.concat([df_online, temp], axis=1)
    on.columns = range(on.shape[1])
    on.to_csv(fname_online, index_label='timestamp')

    if count == datapoints:
        print('done')
        break

    print('sleeping...{}'.format(count))
    count += 1
    # Sleep only for what is left of the current interval; if a round overran
    # its interval the next one starts immediately.
    next_sample += time2sleep
    time.sleep(max(0.0, next_sample - time.time()))

### Cell 2: Manual data collection every 30min

Run the cell below manually every 30 minutes to collect data.

In [None]:
import time
from datetime import datetime
import praw
import pandas as pd

# NOTE(review): the client id/secret are hard-coded in this file; consider
# loading them from the environment or a praw.ini instead of committing them.
reddit = praw.Reddit(client_id='eOqj6Q2BiQQyKQ', client_secret='1hqeAwOvlkY1PYw51ewhbrItd5k', user_agent='Reddit WebScrapping')

sub_title = 'funny'  # name of the subreddit to scrape

# The CSV files are named after the weekday on which the run was STARTED.
# Collecting for 24 hours always crosses midnight, so when no votes file
# exists for today the run is assumed to have started yesterday. The comments
# and online-users files are then taken from that same weekday.
now = time.time()
dt_object = datetime.fromtimestamp(now)
weekday = dt_object.strftime('%A')

fname_votes = './{}_votes.csv'.format(weekday)
try:
    df_votes = pd.read_csv(fname_votes)
except FileNotFoundError:
    dt_object = datetime.fromtimestamp(now - 24 * 60 * 60)
    weekday = dt_object.strftime('%A')
    fname_votes = './{}_votes.csv'.format(weekday)
    # If this file is missing as well there is nothing to append to, and the
    # FileNotFoundError is allowed to propagate.
    df_votes = pd.read_csv(fname_votes)
    print('no files found for today, using the run started on {}'.format(weekday))

fname_comments = './{}_comments.csv'.format(weekday)
df_comments = pd.read_csv(fname_comments)

fname_online = './{}_online_users.csv'.format(weekday)
df_online = pd.read_csv(fname_online)

# 'id' / 'timestamp' are left as ordinary columns here; the code that follows
# turns them into the index right before merging.

# Fresh per-post series for the single sample taken by this cell
# (net votes = ups - downs, and the comment count), keyed by post id.
votes = {}
totalcomments = {}
# Subreddit-wide series: time of the sample and the active-user count.
online = {'timestamp': [], 'users_online': []}

subred = reddit.subreddit(sub_title)

# Track exactly the posts that are already present in the saved votes file.
for pid in df_votes['id']:
    if pid not in votes:  # ignore a repeated id, keep the first entry
        votes[pid] = []
        totalcomments[pid] = []

# Take one sample of the subreddit-wide activity.
online['timestamp'].append(time.time())
online['users_online'].append(subred.active_user_count)

# Take one sample of every tracked post.
for id_val in votes:
    result = reddit.submission(id=id_val)  # look the post up again
    votes[id_val].append(result.ups - result.downs)  # net votes
    totalcomments[id_val].append(result.num_comments)

# Append the new sample as one extra column to each saved frame and write the
# CSVs back.
#
# The loaded columns are the strings '0', '1', ... read back from the CSV
# header, while the new frames use the int 0; written as they are the header
# would repeat labels, which pandas renames ('0.1', ...) on the next read.
# Renumbering keeps the header a clean 0..n-1 sequence.
temp = pd.DataFrame(list(votes.values()), index=list(votes))
df_votes.set_index('id', inplace=True)
upvotes = pd.concat([df_votes, temp], axis=1)
upvotes.columns = range(upvotes.shape[1])
upvotes.to_csv(fname_votes, index_label='id')

temp = pd.DataFrame(list(totalcomments.values()), index=list(totalcomments))
df_comments.set_index('id', inplace=True)
comments = pd.concat([df_comments, temp], axis=1)
comments.columns = range(comments.shape[1])
comments.to_csv(fname_comments, index_label='id')

# The online-users file stores one row per series ('timestamp' and
# 'users_online'), with the series name in a column labelled 'timestamp'.
temp = pd.DataFrame(list(online.values()), index=list(online))
df_online.set_index('timestamp', inplace=True)
on = pd.concat([df_online, temp], axis=1)
on.columns = range(on.shape[1])
on.to_csv(fname_online, index_label='timestamp')