In [1]:
import requests
import calendar
import time
import pandas as pd

In [2]:
def get_posts(num_posts, req_params, before):
    """Get specified number of subreddit posts using Pushshift API
    
    This function will request the number of posts at a time specified
    in the arguments (max 100), up to the total number of posts also 
    specified, with a configurable timeout between requests.  It will
    start at the provided epoch time and work backwards.  Also, it will
    end and return the posts before the specified number of posts is
    reached if the request fails more than 10 times in a row, or 0 
    posts is returned in a request.
    
    Args:
        num_posts (int): Total number of posts desired
        req_params (dict): Dictionary of request parameters
            key (string): url, value (string): Base API url
            key (string): subreddit, value (string): subreddit name
            key (string): size, value (int): number of posts per request
            key (string): fields, value (list): list of subreddit fields
            key (string): secs, value (int): seconds between requests
        before (int): epoch time to start requests from
        
    Return:
        Dataframe of results
    """
    # instantiate list for storing all posts
    all_posts = []
    
    # make requests until required number of posts is reached
    while(len(all_posts) < num_posts):
        params = {
            'subreddit': req_params['subreddit'],
            'size': req_params['size'],
            'fields': req_params['fields'],
            'before': before
        }
        res = requests.get(req_params['url'], params)
        
        # counter for failed requests
        # resets with a successful request
        # breaks loop if unsuccessful requests is greater than 10
        num_failed = 0
        if res.status_code == 200:
            num_failed = 0
            posts = res.json()['data']
            if len(posts) > 0:
                all_posts.append(posts)
                before = posts[-1]['created_utc']
            else:
                print('No more posts')
                break
        elif num_failed > 10:
            print('requests unsuccessful')
            break
        else:
            num_failed += 1
            
        # wait time between requests
        time.sleep(req_params['secs'])
        
    print('Complete')    
    return pd.DataFrame(all_posts)

## Fitness

In [35]:
#instantiate dataframe before first run only
#fitness_df = pd.DataFrame()

In [20]:
#read in existing data to start where left off
fitness_df = pd.read_csv('../Data/fitness.csv', index_col=0)
fitness_df.head(2)

Unnamed: 0,author,created_utc,num_comments,num_crossposts,score,selftext,subreddit,title
0,bigjungus11,1610566951,1,0,1,[removed],Fitness,"Skinnyfat, should I eat more? Less?"
1,Oz390,1610566886,0,0,1,[removed],Fitness,Starting a new fitness programme


In [5]:
#parameters for request
req_params = {
    'url': 'https://api.pushshift.io/reddit/search/submission',
    'subreddit': 'Fitness',
    'size': 100,
    'fields': ['subreddit','author','title','selftext','num_comments','num_crossposts','score','created_utc'],
    'secs': 10
}

In [36]:
#current time for first run
now = calendar.timegm(time.gmtime())

In [39]:
#time for starting from where left off
before = fitness_df.iloc[-1]['created_utc']

In [40]:
fitness_df = pd.concat([fitness_df, get_posts(4000, req_params, before)])

Complete


In [41]:
len(fitness_df)

26000

In [44]:
fitness_df.head()

Unnamed: 0,author,created_utc,num_comments,num_crossposts,score,selftext,subreddit,title
0,bigjungus11,1610566951,1,0,1,[removed],Fitness,"Skinnyfat, should I eat more? Less?"
1,Oz390,1610566886,0,0,1,[removed],Fitness,Starting a new fitness programme
2,Bbxcs273,1610566718,1,0,1,[removed],Fitness,Lifting for someone with a history of disorder...
3,Standard_Carrot_3820,1610566637,1,0,1,[removed],Fitness,Calisthenics doesn't build muscle. Is there a ...
4,AppropriatePiccolo86,1610566123,1,0,1,[removed],Fitness,"Used to be a truck driver, help with leg"


In [45]:
fitness_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26000 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          26000 non-null  object
 1   created_utc     26000 non-null  int64 
 2   num_comments    26000 non-null  int64 
 3   num_crossposts  26000 non-null  int64 
 4   score           26000 non-null  int64 
 5   selftext        25622 non-null  object
 6   subreddit       26000 non-null  object
 7   title           26000 non-null  object
dtypes: int64(4), object(4)
memory usage: 1.8+ MB


In [43]:
#save to file
fitness_df.to_csv('../Data/fitness.csv')

## Bodyweight Fitness

In [58]:
#instantiate dataframe before first run only
#bodyweight_df = pd.DataFrame()

In [10]:
#read in existing data to start where left off
bodyweight_df = pd.read_csv('../Data/bodyweight.csv', index_col=0)
bodyweight_df.head(2)

Unnamed: 0,author,created_utc,num_comments,num_crossposts,score,selftext,subreddit,title
0,Luciferswife4life,1610570566,0,0,1,[removed],bodyweightfitness,What lower body excercises can I do with bad k...
1,J22Charles,1610570222,0,0,1,Been doing cardio and resistance for year swit...,bodyweightfitness,Looking for supplement for lean muscle


In [68]:
#parameters for request
req_params = {
    'url': 'https://api.pushshift.io/reddit/search/submission',
    'subreddit': 'bodyweightfitness',
    'size': 100,
    'fields': ['subreddit','author','title','selftext','num_comments','num_crossposts','score','created_utc'],
    'secs': 10
}

In [62]:
#current time for first run
now = calendar.timegm(time.gmtime())

In [72]:
#time for starting from where left off
before = bodyweight_df.iloc[-1]['created_utc']

In [74]:
bodyweight_df = pd.concat([bodyweight_df, get_posts(9500, req_params, before)])

Complete


In [11]:
len(bodyweight_df)

10000

In [76]:
bodyweight_df.head()

Unnamed: 0,author,created_utc,num_comments,num_crossposts,score,selftext,subreddit,title
0,Luciferswife4life,1610570566,0,0,1,[removed],bodyweightfitness,What lower body excercises can I do with bad k...
1,J22Charles,1610570222,0,0,1,Been doing cardio and resistance for year swit...,bodyweightfitness,Looking for supplement for lean muscle
2,theboxv6,1610569660,0,0,2,Late last year I began slacking on my workouts...,bodyweightfitness,"Unilateral exercises got old fast, help needed"
3,MailmanTom69,1610569504,0,0,1,I was doing pull-ups a lot and became very goo...,bodyweightfitness,Pull-ups suddenly became more difficult?
4,Easteregg316,1610569373,1,0,1,[removed],bodyweightfitness,Stomach pains when doing reverse hyper extension?


In [77]:
bodyweight_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   author          10000 non-null  object
 1   created_utc     10000 non-null  int64 
 2   num_comments    10000 non-null  int64 
 3   num_crossposts  10000 non-null  int64 
 4   score           10000 non-null  int64 
 5   selftext        9961 non-null   object
 6   subreddit       10000 non-null  object
 7   title           10000 non-null  object
dtypes: int64(4), object(4)
memory usage: 703.1+ KB


In [78]:
#save to file
bodyweight_df.to_csv('../Data/bodyweight.csv')