# Web Scraping 

For this project I am scraping data from the subreddits [*r/Horror*](https://www.reddit.com/r/horror/) and [*r/Fantasy*](https://www.reddit.com/r/Fantasy/). Please refer to the Jupyter Lab notebook 'primary-notebook.ipynb' for more information on the data. I'm using the Pushshift API to complete my web scraping and referencing code found here: https://www.youtube.com/watch?v=AcrjEWsMi_E.

In [1]:
#Importing libraries
import pandas as pd

import requests
from bs4 import BeautifulSoup

In [2]:
#Setting our base URL: the Pushshift submission-search endpoint.
#Every request below hits this same URL and only varies the query params.
url = "https://api.pushshift.io/reddit/search/submission"

---

In [3]:
#Query parameters for the first request: 100 submissions from r/horror.
params = dict(subreddit='horror', size=100)

In [4]:
#Requesting the first batch of submissions and saving the response as res.
#params is passed by keyword for clarity, and the timeout makes the cell
#fail fast instead of hanging indefinitely if the API never answers.
res = requests.get(url, params=params, timeout=30)

In [5]:
#Checking status code
res.status_code

200

In [6]:
#Using .json() to parse the JSON response body into Python objects.
data=res.json()

In [7]:
#Saving the part of the response we're interested in (the submissions stored under the 'data' key) to a variable 'posts'.
posts=data['data']

In [687]:
##Commenting-out this cell to prevent the dataset from being written over.
#Using 'posts' to create a dataframe.
#df=pd.DataFrame(posts)

---

In [10]:
#Getting the oldest submission to get the created_utc for the next part.
posts[-1]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'throwawayanonwhatevr',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_6gkjq9e1',
 'author_patreon_flair': False,
 'author_premium': True,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1624847322,
 'domain': 'self.horror',
 'full_link': 'https://www.reddit.com/r/horror/comments/o9b2m4/tw_horror_movies_where_side_is_relevant/',
 'gildings': {},
 'id': 'o9b2m4',
 'is_created_from_ads_ui': False,
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_css_class': 'discussion',
 'link_flair_richtext': [{'e': 'text', 't': 'Discussion'}],
 'link_flair_template_id': '3a1cd228-8f0a-11e1-988a-12313d051e91',
 'link_flair_text': 'Discu

In [11]:
#Same query as before plus a 'before' cutoff: the epoch time taken from the
#'created_utc' field of the oldest submission in the previous batch.
#The batch this returns gets appended to the dataframe created above.
params = dict(subreddit='horror', size=100, before=1624847322)

In [12]:
#Requesting the next batch with the updated params, then displaying the status code.
#The timeout keeps the cell from hanging indefinitely if the API stops responding.
res = requests.get(url, params=params, timeout=30)
res.status_code

200

In [13]:
#Parsing the JSON response body into Python objects.
data=res.json()

In [14]:
#Keeping only the submissions stored under the 'data' key.
posts=data['data']

In [693]:
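##Commenting-out this cell to prevent the dataframe from being written over.
#Adding this new data to our existing dataframe using .append() and setting ignore_index=True.
#Note: DataFrame.append() was removed in pandas 2.0; on newer versions the equivalent is
#df=pd.concat([df, pd.DataFrame(posts)], ignore_index=True)
#df=df.append(posts, ignore_index=True)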

---

In [17]:
#Pulling the oldest submission from this new batch of data to get the new created_utc.
posts[-1]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'That_creepy_bitch',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_cymqn5o1',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1624768846,
 'domain': 'self.horror',
 'full_link': 'https://www.reddit.com/r/horror/comments/o8q7um/hey_im_new/',
 'gildings': {},
 'id': 'o8q7um',
 'is_created_from_ads_ui': False,
 'is_crosspostable': False,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': False,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 1,
 'num_crossposts': 0,
 'over_18': False,
 'parent_w

In [18]:
#Moving the 'before' cutoff back to the created_utc of the oldest submission
#in the latest batch so the next request returns older submissions.
params = dict(subreddit='horror', size=100, before=1624768846)

In [19]:
#Creating a new requests.get() with the updated params and checking the status_code.
#The timeout keeps the cell from hanging indefinitely if the API stops responding.
res = requests.get(url, params=params, timeout=30)
res.status_code

200

In [20]:
#Parsing the JSON response body into Python objects and saving the submissions
#(stored under the 'data' key) to a variable.
data=res.json()
posts=data['data']

In [688]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Appending our dataframe with the new batch of submissions.
#df=df.append(posts, ignore_index=True)

---

In [22]:
#Getting created_utc from oldest submission in this batch.
posts[-1]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'gucci_jawline',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_3rggjjab',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1624697326,
 'domain': 'self.horror',
 'full_link': 'https://www.reddit.com/r/horror/comments/o872it/found_a_cool_little_short_horror_film_one_day/',
 'gildings': {},
 'id': 'o872it',
 'is_created_from_ads_ui': False,
 'is_crosspostable': False,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': False,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 1,
 'num_crossposts': 0

In [23]:
#Moving the 'before' cutoff back again, to the created_utc of the oldest
#submission in the batch we just pulled.
params = dict(subreddit='horror', size=100, before=1624697326)

In [24]:
#Creating a new requests.get() and checking the status_code.
#The timeout keeps the cell from hanging indefinitely if the API stops responding.
res = requests.get(url, params=params, timeout=30)
res.status_code

200

In [25]:
#Parsing the JSON response body and saving the submissions stored under the 'data' key.
data=res.json()
posts=data['data']

In [689]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Appending our dataframe.
#df=df.append(posts, ignore_index=True)

In [27]:
df.shape

(400, 78)

In [694]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Saving what we have so far.
#df.to_csv('./data/reddit_submissions')

---

To reduce the number of repetitive cells, I'll be updating the cells below in place to collect the remaining r/horror data I need, and regularly re-saving the dataset to avoid needing to re-run everything. Doing it this way will also naturally create a delay between each request so I'm not bombarding the server.

In [312]:
#Getting created_utc from oldest submission in this batch.
posts[-1]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'suzaman',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_3vdwzdgu',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1621446669,
 'domain': 'self.horror',
 'full_link': 'https://www.reddit.com/r/horror/comments/ngcct1/1981_possession_is_more_proof_that_sam_neill_was/',
 'gildings': {},
 'id': 'ngcct1',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_css_class': 'discussion',
 'link_flair_richtext': [{'e': 'text', 't': 'Discussion'}],
 'link_flair_template_id': '3a1cd228-8f0a-11e1-988a-12313d051e91',
 'link_flair_text': 'Discussion',
 'link_flair_text_color': 'da

In [313]:
#Setting the 'before' cutoff to the created_utc of the oldest submission
#collected so far; this cell is edited and re-run for each new batch.
params = dict(subreddit='horror', size=100, before=1621446669)

In [314]:
#Using .get to pull our url with the updated params and checking the status code.
#The timeout keeps the cell from hanging indefinitely if the API stops responding.
res = requests.get(url, params=params, timeout=30)
res.status_code

200

In [315]:
#Parsing the JSON response body with .json() and selecting the submissions stored under the 'data' key.
data=res.json()
posts=data['data']

In [690]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Appending it to our dataframe.
#df=df.append(posts, ignore_index=True)

In [317]:
#Using .shape to see how many rows we have now.
df.shape

(5000, 80)

In [695]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Saving the dataframe.
#df.to_csv('./data/reddit_submissions')

---

Now that we have 5000 submissions from the r/horror subreddit, I'll use the same technique to collect 5000 submissions from r/Fantasy.

In [370]:
#Query parameters for the first r/Fantasy request: 100 submissions, no cutoff yet.
params = dict(subreddit='Fantasy', size=100)

In [365]:
#Using .get to pull our url with the r/Fantasy params and checking the status code.
#The timeout keeps the cell from hanging indefinitely if the API stops responding.
res = requests.get(url, params=params, timeout=30)
res.status_code

200

In [366]:
#Parsing the JSON response body with .json() and selecting the submissions stored under the 'data' key.
data=res.json()
posts=data['data']

In [691]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Appending it to our dataframe.
#df=df.append(posts, ignore_index=True)

---

In [680]:
#Getting created_utc from oldest submission in this batch.
posts[-1]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'generalamitt',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_xiy1u',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1594664374,
 'domain': 'self.Fantasy',
 'full_link': 'https://www.reddit.com/r/Fantasy/comments/hqkmqn/brandon_sanderson_supposedly_crafts/',
 'gildings': {},
 'id': 'hqkmqn',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': True,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 19,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whitelist_status'

In [681]:
#Setting the 'before' cutoff to the created_utc of the oldest r/Fantasy
#submission collected so far; this cell is edited and re-run for each new batch.
params = dict(subreddit='Fantasy', size=100, before=1594664374)

In [682]:
#Using .get to pull our url with the updated params and checking the status code.
#The timeout keeps the cell from hanging indefinitely if the API stops responding.
res = requests.get(url, params=params, timeout=30)
res.status_code

200

In [683]:
#Parsing the JSON response body with .json() and selecting the submissions stored under the 'data' key.
data=res.json()
posts=data['data']

In [692]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Appending it to our dataframe.
#df=df.append(posts, ignore_index=True)

In [685]:
#Using .shape to see how many rows we have now.
df.shape

(10000, 87)

In [698]:
##Commenting-out this cell to prevent the dataframe from being written over.
#Saving the dataframe.
#df.to_csv('./data/reddit_submissions')

---