In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from psaw import PushshiftAPI

# Getting all submissions with more than a certain upvote score

In [2]:
# Initialize API
api = PushshiftAPI()

# Set up generator to make API request.
api_request_generator = api.search_submissions(subreddit='news', score = ">2000")

# Make the request and collect results into a pd.DataFrame
news_submissions = pd.DataFrame([submission.d_ for submission in api_request_generator])

# Shape (n_results, n_features)
news_submissions.shape

(14878, 98)

In [19]:
# Available features
news_submissions.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'edited', 'full_link', 'gilded', 'gildings',
       'id', 'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_received',
       'treatment_ta

In [17]:
# Transform to datetime
news_submissions['date'] = pd.to_datetime(jokes_submissions['created_utc'], utc = True, unit = 's')
news_submissions[['date','title', 'score', 'num_comments', 'selftext']].sample(20)

Unnamed: 0,date,title,score,num_comments,selftext
747,2019-07-24 03:28:46+00:00,"My best friend called me and said ""An evil wiz...",12514,117,I drove all the way to his house just to find ...
822,2019-07-10 09:43:25+00:00,I went to the liquor store on my bicycle and b...,6395,93,\n\n...'cause I fell 7 times on the way home...
161,2020-04-15 14:42:22+00:00,Husband doing crossword with his wife,2528,132,"\n\nHusband: Emphatic no, five letters.\n\nWi..."
1312,2019-04-18 20:13:24+00:00,The girl with no arms and legs laying by the pool,9047,238,There’s a girl with no arms and legs laying by...
5600,2016-12-21 09:12:29+00:00,Break ups are the worst in China...,3047,180,You see her face everywhere.
2864,2018-04-05 16:00:16+00:00,I asked my girlfriend to describe me in 5 words.,34538,537,"She said I'm mature, I'm moral, I'm pure, I'm ..."
5254,2017-02-18 22:10:34+00:00,Two blind pilots enter a plane,4247,100,They have sunglasses and white sticks. As the ...
2569,2018-05-19 15:06:42+00:00,Don't ever underestimate a Scottish police off...,5240,144,A London lawyer runs a stop sign and gets pull...
3232,2018-02-12 05:04:36+00:00,Afternoon Sex,36740,663,"The only way to pull off a Sunday afternoon ""q..."
711,2019-07-30 11:44:48+00:00,I once dated a girl who had a twin.,22951,356,People kept asking me how I could tell them ap...


# Getting submission based on search keyword

Searching comments is done in the same way, but using api.search_comments instead of api.search_submissions

In [28]:
# Set up generator to make API request.
api_request_generator = api.search_submissions(q='(jobs | employment)', score = '>2000')

q_jobs_submissions = pd.DataFrame([submission.d_ for submission in api_request_generator])

q_jobs_submissions['date'] = pd.to_datetime(q_jobs_submissions['created_utc'], utc=True, unit='s')

q_jobs_submissions.shape

(4645, 102)

In [29]:
q_jobs_submissions[['date','title', 'score', 'num_comments', 'selftext']].sample(20)

Unnamed: 0,date,title,score,num_comments,selftext
115,2020-07-21 18:38:33+00:00,Aita for calling out my sister after she shame...,12490,1036,I’m 28 my sister is 26. She got married three ...
2041,2018-03-05 16:21:16+00:00,TIL Before he was a famous musician Johnny Cas...,32066,499,
3571,2016-10-09 18:06:11+00:00,My boyfriend and I started a business out of h...,3612,2867,Hi Reddit! I’m Monique. Two years ago I was wo...
3687,2016-07-24 12:55:42+00:00,EMSK the most popular resumes are either chron...,2039,54,[deleted]
3202,2017-01-26 12:45:34+00:00,Keystone pipeline will create just 35 permanen...,31452,3040,
577,2019-08-15 07:08:55+00:00,Video Game Developer Insight on EA's Relations...,2581,460,I've been a video game developer for near thre...
1973,2018-03-23 15:14:04+00:00,I work as a companion to a man who has Down sy...,3045,251,His Individual Support Plan allows for no alon...
687,2019-07-19 10:47:41+00:00,Fine! But you're stealing their jobs!,3068,163,I love Target. I'm also a stress eater who hap...
3446,2016-11-30 00:42:18+00:00,You are only a Temp!,3459,253,So in my time out of tech support from the lig...
3683,2016-07-26 04:35:07+00:00,Ken M On Six-Figure Jobs,3944,163,


In [30]:
max(q_jobs_submissions['date'])

Timestamp('2021-06-14 02:29:15+0000', tz='UTC')

In [None]:
# Collect submissions/comments within a certain period

In [3]:
start_time = int(datetime(2020, 10, 23).timestamp())
end_time = int(datetime(2020, 10, 26).timestamp())

api_request_generator = api.search_submissions(q='(jobs | employment)',
                                              after = start_time,
                                              before = end_time)

q_jobs_submissions = pd.DataFrame([submission.d_ for submission in api_request_generator])
q_jobs_submissions.shape

(6205, 99)

In [None]:
start_time = int(datetime(2020, 10, 23).timestamp())
end_time = int(datetime(2020, 10, 26).timestamp())

api_request_generator2 = api.search_submissions(q='(jobs OR employment)',
                                              after = start_time,
                                              before = end_time)

q_jobs_submissions2 = pd.DataFrame([submission.d_ for submission in api_request_generator2])
q_jobs_submissions2.shape

In [47]:
q_jobs_submissions['date'] = pd.to_datetime(q_jobs_submissions['created_utc'], utc=True, unit='s')
q_jobs_submissions[['date','title', 'score', 'num_comments', 'selftext']].sample(20)

Unnamed: 0,date,title,score,num_comments,selftext
2942,2020-10-24 12:39:29+00:00,What to expect When you subscribe? ⚡️Sexy dail...,1,3,
417,2020-10-25 19:29:58+00:00,My [21M] Girlfriend [27M] was abused for a lon...,1,5,I met my current girlfriend in May. We're both...
1283,2020-10-25 08:00:45+00:00,What are some non-repetitive CS jobs to look for?,1,7,I did an internship at a big corporate firm in...
4583,2020-10-23 17:16:23+00:00,"""The Promise"" Public Release 0.57",1,0,"**Features planned for the ""The Promise"" Trilo..."
3269,2020-10-24 07:31:14+00:00,"[Health] - Disney workers lose jobs, free coll...",1,0,
677,2020-10-25 17:04:23+00:00,Employees act as if I’m stealing??,1,79,So to set the stage - I’m biracial. Half white...
6045,2020-10-23 00:39:45+00:00,Health risks from wildland fire fighting?,1,5,"Hey y'all, I'm new to this Reddit, but first o..."
3014,2020-10-24 11:31:55+00:00,Doug Ford defends closure of indoor restaurant...,1,0,
3165,2020-10-24 09:11:49+00:00,Doug Ford defends closure of indoor restaurant...,1,0,
2097,2020-10-24 21:16:49+00:00,Agencies to Have Wide Latitude In Deciding Whi...,1,0,


In [48]:
start_time = int(datetime(2020, 10, 23).timestamp())
end_time = int(datetime(2020, 10, 26).timestamp())

api_request_generator = api.search_comments(q='(jobs | employment)',
                                              after = start_time,
                                              before = end_time)

q_jobs_comments = pd.DataFrame([submission.d_ for submission in api_request_generator])
q_jobs_comments['date'] = pd.to_datetime(q_jobs_comments['created_utc'], utc=True, unit='s')
q_jobs_comments.shape

(35706, 39)

In [51]:
q_jobs_comments.columns

Index(['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'comment_type',
       'created_utc', 'gildings', 'id', 'is_submitter', 'link_id', 'locked',
       'no_follow', 'parent_id', 'permalink', 'retrieved_on', 'score',
       'send_replies', 'stickied', 'subreddit', 'subreddit_id',
       'top_awarded_type', 'total_awards_received', 'treatment_tags',
       'created', 'distinguished', 'edited', 'author_cakeday', 'date'],
      dtype='object')

In [60]:
q_jobs_comments[['date', 'score', 'body', 'parent_id']].sample(20)

Unnamed: 0,date,score,body,parent_id
18046,2020-10-24 10:36:36+00:00,1,In the Netherlands *all* jobs offer one month ...,t3_jh4uhu
1150,2020-10-25 21:51:53+00:00,2,&gt; and my point is that there are some views...,t1_ga3e634
16630,2020-10-24 14:24:39+00:00,1,I work in schools and if we intervene and some...,t1_g9wup54
35442,2020-10-22 23:27:08+00:00,2,I'd love to see an installment or side-story f...,t3_jg0q8a
22224,2020-10-23 23:20:51+00:00,2,They're going to create so many jobs that we c...,t1_g9sbmvs
1108,2020-10-25 21:56:26+00:00,0,"Love my specialist job, but loved the night sh...",t3_jhz36c
28336,2020-10-23 14:28:54+00:00,1,Your going to enjoy it for now till all the go...,t1_g9r5c16
20371,2020-10-24 02:56:57+00:00,1,Bootlickers when police are constantly caught ...,t1_g9s7ige
20561,2020-10-24 02:33:13+00:00,1,Socialism is common universal ownership of the...,t3_jgvjwe
1336,2020-10-25 21:33:17+00:00,1,"Hi /u/jadedthistle, welcome to /r/narcissistic...",t3_ji1oex
