In [27]:
import requests
import pandas as pd
import time
import numpy as np
pd.options.display.max_colwidth = 400 # setting to display up to 400 characters in a column

In [28]:
## define function get post thru looping

def data_collection_post_check(before='',
                               after='', 
                               subreddit='datascience', #default is startrek because we are analyst from startrek
                               no_of_posts=100):

    url = "https://api.pushshift.io/reddit/search/submission/" # target web page
    
    loop = 1          # initialize with loop 1 for easier tracking in the loop later
    error_count = 0   # initialize with variable for error count checking to break from while loop
    
    
    # initialize the dataframe using list which would be substituted in the while loop
    all_df = []
    

    while len(all_df) < no_of_posts: # to get the number of entries
        print(f"Loop #{loop}") 
    
        params = {
            'subreddit': subreddit,
            'size': 100,
            'before': before,  # would be substituted with min epoch, so that later loops would get earlier posts
            'after': after, 
           # 'fields': identified_columns, # these are the identified useful columns
            'selftext:not': "[removed]|[deleted]" #this is to eliminate blank selftext field 
        }
        print("=== Retrieving... ========")
        res = requests.get(url, params) # establish connection to the web page  
        print(f"Status Code: {res.status_code}")
    
        #error checking to re-retrive data for 1 more time
        if res.status_code == 200:
            print("=== Success! =============")
            data = res.json() # store the json data (dict) into "data"
            posts = data['data'] # retrieve the posts from the dictionary
            posts_df = pd.DataFrame(posts) # convert to dataframe
            
            if len(posts_df) == 0: 
                print("No more posts to collect! \nTry adjusting before/after epoch time!")
                break
            
            before = posts_df.created_utc.min() # get the earliest utc in this loop

            if loop == 1:
                all_df = posts_df
                latest_epoch = posts_df.created_utc.max()
                latest_post = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(latest_epoch)) # get the date/time of latest post
            else:
                all_df = pd.concat([all_df, posts_df], axis=0)

            print(f"{len(all_df)*100/no_of_posts}% of data has been added to the dataframe! \n")

            # provide short summary at the end
            if len(all_df) >= no_of_posts:
                earliest_epoch = posts_df.created_utc.min()
                earliest_post = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(earliest_epoch))
                print("=== Summary ==============")
                print(f"Subreddit: {subreddit}")
                print(f"No of Posts: {len(all_df)}")
                print(f"Start Date: {earliest_post}")
                print(f"End Date: {latest_post}")
                print(f"Start Epoch Time: {earliest_epoch}")
                print(f"End Epoch Time: {latest_epoch}")

            else: # loop is still active
#                 time.sleep(np.random.randint(20, 35)) #provide a random time (seconds) for code to sleep
                loop += 1

        else: # handle error where break out of while loop when there are 3 or more connection error.
            loop += 1
            error_count += 1
            print("=== Error! ===============")
            print(f"Error Count: {error_count}\n")
            if error_count > 2:
                print("=== Break ================")
                print(f"Detected more than 2 errors.")
                break
#             time.sleep(np.random.randint(20, 35))
            continue
    return all_df

In [37]:
data_science_raw = data_collection_post_check(no_of_posts=10000)

Loop #1
Status Code: 200
1.0% of data has been added to the dataframe! 

Loop #2
Status Code: 200
2.0% of data has been added to the dataframe! 

Loop #3
Status Code: 200
3.0% of data has been added to the dataframe! 

Loop #4
Status Code: 200
4.0% of data has been added to the dataframe! 

Loop #5
Status Code: 200
5.0% of data has been added to the dataframe! 

Loop #6
Status Code: 200
6.0% of data has been added to the dataframe! 

Loop #7
Status Code: 200
6.99% of data has been added to the dataframe! 

Loop #8
Status Code: 200
7.99% of data has been added to the dataframe! 

Loop #9
Status Code: 200
8.99% of data has been added to the dataframe! 

Loop #10
Status Code: 200
9.99% of data has been added to the dataframe! 

Loop #11
Status Code: 200
10.99% of data has been added to the dataframe! 

Loop #12
Status Code: 200
11.99% of data has been added to the dataframe! 

Loop #13
Status Code: 200
12.99% of data has been added to the dataframe! 

Loop #14
Status Code: 200
13.99% of d

Status Code: 200
63.99% of data has been added to the dataframe! 

Loop #65
Status Code: 200
64.99% of data has been added to the dataframe! 

Loop #66
Status Code: 200
65.99% of data has been added to the dataframe! 

Loop #67
Status Code: 200
66.99% of data has been added to the dataframe! 

Loop #68
Status Code: 200
67.99% of data has been added to the dataframe! 

Loop #69
Status Code: 200
68.98% of data has been added to the dataframe! 

Loop #70
Status Code: 200
69.98% of data has been added to the dataframe! 

Loop #71
Status Code: 200
70.98% of data has been added to the dataframe! 

Loop #72
Status Code: 200
71.98% of data has been added to the dataframe! 

Loop #73
Status Code: 200
72.98% of data has been added to the dataframe! 

Loop #74
Status Code: 200
73.98% of data has been added to the dataframe! 

Loop #75
Status Code: 200
74.98% of data has been added to the dataframe! 

Loop #76
Status Code: 200
75.98% of data has been added to the dataframe! 

Loop #77
Status Code:

In [30]:
# data_science_raw.to_csv('data_science.csv', index=False)

In [38]:
columns_to_keep = ['author','author_fullname','created_utc',
                    'full_link','id','num_comments',
                    'score','selftext','subreddit',
                     'subreddit_id','subreddit_subscribers','title',
                  'upvote_ratio','url','whitelist_status',
                   'url_overridden_by_dest']

In [45]:
ds_raw = data_science_raw[['author','author_fullname','created_utc',
                    'full_link','id','num_comments',
                    'score','selftext','subreddit',
                     'subreddit_id','subreddit_subscribers','title',
                  'upvote_ratio','url','whitelist_status',
                   'url_overridden_by_dest']].copy()
ds_raw.shape

(10097, 16)

In [44]:
ds_raw.isnull().sum()

author                       0
author_fullname              4
created_utc                  0
full_link                    0
id                           0
num_comments                 0
score                        0
selftext                     4
subreddit                    0
subreddit_id                 0
subreddit_subscribers        0
title                        0
upvote_ratio                 0
url                          0
whitelist_status             0
url_overridden_by_dest    8634
dtype: int64

In [41]:
ds_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10097 entries, 0 to 99
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   author                  10097 non-null  object 
 1   author_fullname         10093 non-null  object 
 2   created_utc             10097 non-null  int64  
 3   full_link               10097 non-null  object 
 4   id                      10097 non-null  object 
 5   num_comments            10097 non-null  int64  
 6   score                   10097 non-null  int64  
 7   selftext                10093 non-null  object 
 8   subreddit               10097 non-null  object 
 9   subreddit_id            10097 non-null  object 
 10  subreddit_subscribers   10097 non-null  int64  
 11  title                   10097 non-null  object 
 12  upvote_ratio            10097 non-null  float64
 13  url                     10097 non-null  object 
 14  whitelist_status        10097 non-null  o

In [42]:
ds_raw.columns

Index(['author', 'author_fullname', 'created_utc', 'full_link', 'id',
       'num_comments', 'score', 'selftext', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'title', 'upvote_ratio', 'url',
       'whitelist_status', 'url_overridden_by_dest'],
      dtype='object')

In [46]:
ds_raw.to_csv('ds_raw10k.csv', index=False)

In [47]:
ds_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10097 entries, 0 to 99
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   author                  10097 non-null  object 
 1   author_fullname         10093 non-null  object 
 2   created_utc             10097 non-null  int64  
 3   full_link               10097 non-null  object 
 4   id                      10097 non-null  object 
 5   num_comments            10097 non-null  int64  
 6   score                   10097 non-null  int64  
 7   selftext                10093 non-null  object 
 8   subreddit               10097 non-null  object 
 9   subreddit_id            10097 non-null  object 
 10  subreddit_subscribers   10097 non-null  int64  
 11  title                   10097 non-null  object 
 12  upvote_ratio            10097 non-null  float64
 13  url                     10097 non-null  object 
 14  whitelist_status        10097 non-null  o