In [37]:
import requests
import pandas as pd
import time
import numpy as np
pd.options.display.max_colwidth = 400 # setting to display up to 400 characters in a column

In [38]:
## define function get post thru looping

def data_collection_post_check(before='',
                               after='', 
                               subreddit='compsci', #default is startrek because we are analyst from startrek
                               no_of_posts=100):

    url = "https://api.pushshift.io/reddit/search/submission/" # target web page
    
    loop = 1          # initialize with loop 1 for easier tracking in the loop later
    error_count = 0   # initialize with variable for error count checking to break from while loop
    
    
    # initialize the dataframe using list which would be substituted in the while loop
    all_df = []
    

    while len(all_df) < no_of_posts: # to get the number of entries
        print(f"Loop #{loop}") 
    
        params = {
            'subreddit': subreddit,
            'size': 100,
            'before': before,  # would be substituted with min epoch, so that later loops would get earlier posts
            'after': after, 
           # 'fields': identified_columns, # these are the identified useful columns
            'selftext:not': "[removed]|[deleted]" #this is to eliminate blank selftext field 
        }
        print("=== Retrieving... ========")
        res = requests.get(url, params) # establish connection to the web page  
        print(f"Status Code: {res.status_code}")
    
        #error checking to re-retrive data for 1 more time
        if res.status_code == 200:
            print("=== Success! =============")
            data = res.json() # store the json data (dict) into "data"
            posts = data['data'] # retrieve the posts from the dictionary
            posts_df = pd.DataFrame(posts) # convert to dataframe
            
            if len(posts_df) == 0: 
                print("No more posts to collect! \nTry adjusting before/after epoch time!")
                break
            
            before = posts_df.created_utc.min() # get the earliest utc in this loop

            if loop == 1:
                all_df = posts_df
                latest_epoch = posts_df.created_utc.max()
                latest_post = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(latest_epoch)) # get the date/time of latest post
            else:
                all_df = pd.concat([all_df, posts_df], axis=0)

            print(f"{len(all_df)*100/no_of_posts}% of data has been added to the dataframe! \n")

            # provide short summary at the end
            if len(all_df) >= no_of_posts:
                earliest_epoch = posts_df.created_utc.min()
                earliest_post = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(earliest_epoch))
                print("=== Summary ==============")
                print(f"Subreddit: {subreddit}")
                print(f"No of Posts: {len(all_df)}")
                print(f"Start Date: {earliest_post}")
                print(f"End Date: {latest_post}")
                print(f"Start Epoch Time: {earliest_epoch}")
                print(f"End Epoch Time: {latest_epoch}")

            else: # loop is still active
#                 time.sleep(np.random.randint(20, 35)) #provide a random time (seconds) for code to sleep
                loop += 1

        else: # handle error where break out of while loop when there are 3 or more connection error.
            loop += 1
            error_count += 1
            print("=== Error! ===============")
            print(f"Error Count: {error_count}\n")
            if error_count > 2:
                print("=== Break ================")
                print(f"Detected more than 2 errors.")
                break
#             time.sleep(np.random.randint(20, 35))
            continue
    return all_df

In [39]:
conservative_check = data_collection_post_check(no_of_posts=15000)

Loop #1
Status Code: 200
0.6666666666666666% of data has been added to the dataframe! 

Loop #2
Status Code: 200
1.3333333333333333% of data has been added to the dataframe! 

Loop #3
Status Code: 200
2.0% of data has been added to the dataframe! 

Loop #4
Status Code: 200
2.6666666666666665% of data has been added to the dataframe! 

Loop #5
Status Code: 200
3.3333333333333335% of data has been added to the dataframe! 

Loop #6
Status Code: 200
4.0% of data has been added to the dataframe! 

Loop #7
Status Code: 200
4.666666666666667% of data has been added to the dataframe! 

Loop #8
Status Code: 200
5.326666666666667% of data has been added to the dataframe! 

Loop #9
Status Code: 200
5.986666666666666% of data has been added to the dataframe! 

Loop #10
Status Code: 200
6.653333333333333% of data has been added to the dataframe! 

Loop #11
Status Code: 200
7.3133333333333335% of data has been added to the dataframe! 

Loop #12
Status Code: 200
7.973333333333334% of data has been ad

Status Code: 200
39.946666666666665% of data has been added to the dataframe! 

Loop #61
Status Code: 200
40.61333333333334% of data has been added to the dataframe! 

Loop #62
Status Code: 200
41.28% of data has been added to the dataframe! 

Loop #63
Status Code: 200
41.946666666666665% of data has been added to the dataframe! 

Loop #64
Status Code: 200
42.61333333333334% of data has been added to the dataframe! 

Loop #65
Status Code: 200
43.28% of data has been added to the dataframe! 

Loop #66
Status Code: 200
43.946666666666665% of data has been added to the dataframe! 

Loop #67
Status Code: 200
44.61333333333334% of data has been added to the dataframe! 

Loop #68
Status Code: 200
45.28% of data has been added to the dataframe! 

Loop #69
Status Code: 200
45.946666666666665% of data has been added to the dataframe! 

Loop #70
Status Code: 200
46.61333333333334% of data has been added to the dataframe! 

Loop #71
Status Code: 200
47.28% of data has been added to the dataframe!

Status Code: 200
79.93333333333334% of data has been added to the dataframe! 

Loop #121
Status Code: 200
80.59333333333333% of data has been added to the dataframe! 

Loop #122
Status Code: 200
81.26% of data has been added to the dataframe! 

Loop #123
Status Code: 200
81.92666666666666% of data has been added to the dataframe! 

Loop #124
Status Code: 200
82.59333333333333% of data has been added to the dataframe! 

Loop #125
Status Code: 200
83.26% of data has been added to the dataframe! 

Loop #126
Status Code: 200
83.92666666666666% of data has been added to the dataframe! 

Loop #127
Status Code: 200
84.58666666666667% of data has been added to the dataframe! 

Loop #128
Status Code: 200
85.25333333333333% of data has been added to the dataframe! 

Loop #129
Status Code: 200
85.92% of data has been added to the dataframe! 

Loop #130
Status Code: 200
86.58666666666667% of data has been added to the dataframe! 

Loop #131
Status Code: 200
87.25333333333333% of data has been adde

In [4]:
# conservative_check.to_csv('compsci.csv', index=False)

In [5]:
#starwars 500/1000
#conservative 89/10_000
#libertarian 846/2000

In [40]:
columns_to_keep = ['author','author_fullname','created_utc',
                    'full_link','id','num_comments',
                    'score','selftext','subreddit',
                     'subreddit_id','subreddit_subscribers','title',
                  'upvote_ratio','url','whitelist_status',
                   'url_overridden_by_dest']

In [41]:
compsci_raw = conservative_check[['author','author_fullname','created_utc',
                    'full_link','id','num_comments',
                    'score','selftext','subreddit',
                     'subreddit_id','subreddit_subscribers','title',
                  'upvote_ratio','url','whitelist_status',
                   'url_overridden_by_dest']].copy()

In [42]:
compsci_raw.shape

(15080, 16)

In [43]:
compsci_raw.isna().sum()

author                        0
author_fullname             303
created_utc                   0
full_link                     0
id                            0
num_comments                  0
score                         0
selftext                     71
subreddit                     0
subreddit_id                  0
subreddit_subscribers         0
title                         0
upvote_ratio               5206
url                           0
whitelist_status              0
url_overridden_by_dest    10352
dtype: int64

In [44]:
conservative_check.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair',
       ...
       'hidden', 'link_flair_css_class', 'link_flair_template_id',
       'link_flair_text', 'quarantine', 'removal_reason',
       'subreddit_name_prefixed', 'suggested_sort', 'rte_mode', 'author_id'],
      dtype='object', length=103)

In [45]:
compsci_raw.to_csv('compsci_raw.csv', index=False)

In [36]:
# compsci_raw.to_excel('compsci_raw.xlsx', index=False)

In [30]:
compsci_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10091 entries, 0 to 99
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   author                  10091 non-null  object 
 1   author_fullname         10023 non-null  object 
 2   created_utc             10091 non-null  int64  
 3   full_link               10091 non-null  object 
 4   id                      10091 non-null  object 
 5   num_comments            10091 non-null  int64  
 6   score                   10091 non-null  int64  
 7   selftext                10025 non-null  object 
 8   subreddit               10091 non-null  object 
 9   subreddit_id            10091 non-null  object 
 10  subreddit_subscribers   10091 non-null  int64  
 11  title                   10091 non-null  object 
 12  upvote_ratio            9751 non-null   float64
 13  url                     10091 non-null  object 
 14  whitelist_status        10091 non-null  o

In [34]:
compsci_raw['selftext'][5000]

KeyError: 5000