# Reddit 20 for 20 Scraping Workshop Code (for Google Colab Only)
## Presented By: Graham Schuckman | gschuckm@terpmail.umd.edu
Documentation: https://praw.readthedocs.io/en/latest/getting_started/quick_start.html
GitHub Repository: https://github.com/grahamschuckman/20for20

In [None]:
# Colab does not come with the praw library by default, so we need to install it
# The leading "!" hands this line to the notebook's shell instead of the Python interpreter
!pip install praw

In [None]:
"""
This script accepts system arguments for specific subreddits.
The last system argument should be the number of posts requested.
Ex: python reddit_api.py funny memes 100
"""

# We need to import the following libraries to be able to properly run our code
import praw
import pandas as pd
import json
from datetime import datetime
import sys

# Collect the Reddit API credentials in one place, then build the client from them
credentials = {
    'client_id': '',  # Put the string under "personal use script" here
    'client_secret': '',  # Put the secret here
    'user_agent': '',  # Put the name of your app here
    'username': '',  # Put your reddit username here
    'password': '',  # Put your reddit password here
}
reddit = praw.Reddit(**credentials)

In [None]:
# Check to make sure we successfully created the client
# NOTE(review): presumably this contacts Reddit with the credentials entered above, so a bad
# login should surface here -- confirm against the PRAW documentation
print(reddit.user.me())

### Note: PRAW recommends using Async PRAW in asynchronous environments so that its sleep calls do not block other work. Since we are not running a sophisticated bot that is constantly posting and expecting results back, we can continue with the regular PRAW. Plus, we avoid having to write `await` in front of our calls: https://github.com/praw-dev/asyncpraw

In [None]:
# Since we are testing this in Colab, we need to simulate entering in command line arguments
# We can test this by using the example command given in the docstring, but we will only ask for 5 messages
# Calling split() with no separator splits on any run of whitespace, so doubled or trailing
# spaces in the typed command do not turn into empty-string arguments
sys.argv = input('Please enter the command to run your program: ').split()

# Our arguments will be read in as a list, just like it would be from the command line
print(sys.argv, type(sys.argv))

In [None]:
# On a real command line the word 'python' never shows up in sys.argv, so we drop the first item
sys.argv = sys.argv[1:]
print(sys.argv)

In [None]:
# sys.argv still starts with the script name, so the number of real arguments is one less than its length
argument_total = len(sys.argv)
arguments = argument_total - 1
print(arguments)

In [None]:
# If the user forgets to enter in any system arguments, we should exit the program and inform them of the error
# The count is -1 (not 0) when even the script name is missing, so we test for "less than 1"
if arguments < 1:
    sys.exit("Please provide at least one valid subreddit name. Ex: python reddit_api.py funny memes 100")

In [None]:
# Assign the last system argument to be the number of posts retrieved from each subreddit
last_argument = len(sys.argv) - 1

# Exit the script if no number of posts system argument was provided or it was not provided as an integer
# We only catch the two failures we expect: int() raises ValueError for text that is not a whole
# number, and indexing raises IndexError when the argument list is empty. Anything else
# (for example pressing stop in the notebook) is left alone instead of being hidden.
try:
    num_messages = int(sys.argv[last_argument])
except (ValueError, IndexError):
    sys.exit("Enter the last argument as the number of posts to retrieve. Ex: python reddit_api.py funny memes 100")

print(num_messages)

In [None]:
# Now we are ready to begin scraping data from the Reddit API
# We can test our scraper by manually providing an argument before we convert the code into a loop
# We will use the first subreddit argument which is at position 1 (remember, lists are 0-based in Python)
position = 1

# Let us make sure we can pass this argument to our Reddit client and get some results
try:
    # Choose to examine a given subreddit
    subreddit_name = sys.argv[position]
    subreddit = reddit.subreddit(subreddit_name)

    # PRAW objects are lazy: creating the subreddit object and the listing below does not contact
    # Reddit yet. Reading an attribute that has to be loaded forces the lookup now, so an invalid
    # name fails inside this try block instead of later on in the notebook.
    subreddit_id = subreddit.id

    # Returns a list-like object with the top (newest = subreddit.new) n submissions to the subreddit (limit of 1000)
    top_subreddit = subreddit.top(limit=num_messages)

# If we pass in an invalid subreddit name like asjhasdasdiugqwdb, we want to handle that error appropriately
# Catching Exception (rather than everything) still lets us stop the cell by hand
except Exception:
    sys.exit("Please confirm that all subreddit names are valid. Ex: python reddit_api.py funny memes 100")

In [None]:
# Let's take a look at how the data is passed to us from Reddit
# Note that this prints the listing object itself, not the posts inside of it
print(top_subreddit)

In [None]:
# Reddit hands the posts back inside a ListingGenerator, which is an object we can loop over
# Printing an item from that loop directly would only show us its ID
# To see everything a post carries, we ask for its attributes with vars() and pretty-print them
import pprint

single_post_listing = subreddit.top(limit=1)
for submission in single_post_listing:
    attributes = vars(submission)
    pprint.pprint(attributes)

In [None]:
# The output above shows attributes like author, title, created, etc., that we can pull
# We will collect the ones we want in a dictionary: one key per attribute we scrape,
# each starting out with its own empty list that the scraped values get appended to
column_names = ("author", "title", "score", "id", "url", "comms_num",
                "created", "body", "subscribers", "subreddit")
reddit_dict = {column: [] for column in column_names}
print(reddit_dict)

In [None]:
# When you loop through a ListingGenerator, it expires. We need to recreate it so that we can finish scraping.
# The assignments below repeat the earlier lookup so that this cell leaves a fresh listing behind
# Choose to examine a given subreddit
subreddit_name = sys.argv[position]
subreddit = reddit.subreddit(subreddit_name)

# Returns a list-like object with the top (newest = subreddit.new) n submissions to the subreddit (limit of 1000)
# num_messages is the post count that was read from the last command line argument
top_subreddit = subreddit.top(limit=num_messages)
print(top_subreddit)

In [None]:
# We can iterate through the top subreddit posts and append to the dictionary
# Every pass adds exactly one value to each list, so the lists stay the same length
for submission in top_subreddit:
    # The author must be converted to a string because of the way it is contained in the Redditor class
    reddit_dict["author"].append(str(submission.author))
    reddit_dict["title"].append(submission.title)
    reddit_dict["score"].append(submission.score)
    reddit_dict["id"].append(submission.id)
    reddit_dict["url"].append(submission.url)
    reddit_dict["comms_num"].append(submission.num_comments)
    # created_utc is the documented creation time in UNIX/UTC seconds. We later label this
    # column as UTC, so we read the UTC field rather than the undocumented "created" one.
    reddit_dict["created"].append(submission.created_utc)
    reddit_dict["body"].append(submission.selftext)
    reddit_dict["subscribers"].append(submission.subreddit_subscribers)
    # Every row also records which subreddit it was scraped from
    reddit_dict["subreddit"].append(subreddit_name)

In [None]:
# Let's take a look at how our dictionary is currently formatted
# Each key should now hold a list with one entry per scraped post
print(reddit_dict)

In [None]:
# The raw dictionary holds all of our data, but it is hard to read in that form
# Loading it into a dataframe gives us one row per post and one column per attribute
reddit_data = pd.DataFrame(data=reddit_dict)
reddit_data.head(n=5)

In [None]:
# The formatting of the created column is not very intelligible. We can change it to ISO from UNIX/UTC
def get_date(created):
    """Convert a UNIX timestamp into an ISO 8601 string in UTC.

    Args:
        created: Seconds since the UNIX epoch, as an int or a float.

    Returns:
        The matching UTC date and time as text ending in '+00:00',
        for example '1970-01-01T00:00:00+00:00'.
    """
    # Imported here so the function brings the one extra name it needs into scope itself
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated and returns a value with no timezone attached.
    # Asking for a UTC-aware value lets isoformat() write the '+00:00' offset for us.
    return datetime.fromtimestamp(created, tz=timezone.utc).isoformat()

# Run every raw value in the "created" column through get_date and store the readable result
readable_dates = reddit_data["created"].apply(get_date)
reddit_data["created"] = readable_dates

# Give the converted column a clearer name; the rename is done on the existing dataframe
column_renames = {"created" : "timestamp"}
reddit_data.rename(columns=column_renames, inplace=True)

# Display the dataframe so we can confirm that our data is structured the way we want
reddit_data

In [None]:
# Turn the dataframe back into plain Python data for rapid exporting to JSON (dataframes are slow)
# The 'records' orientation gives us a list with one dictionary per row
reddit_dict = reddit_data.to_dict(orient='records')

# Displaying it shows a series of key-value pairs that looks very similar to JSON
reddit_dict

In [None]:
# Colab's files helper is what sends a file from the notebook back to our desktops
from google.colab import files

# Name the file after the subreddit, write the scraped data to it as JSON, then download it
output_name = subreddit_name + "_subreddit.json"
with open(output_name, 'w+') as f:
    json.dump(reddit_dict, f)

files.download(output_name)
print("All data has been successfully exported to a JSON file.")

## Display JSON Content Cleanly
http://jsonviewer.stack.hu/

## Below is the completed script with looping (3 extra lines)

In [None]:
"""
This script accepts system arguments for specific subreddits.
The last system argument should be the number of posts requested.
Ex: python reddit_api.py funny memes 100
"""

# We need to import the following libraries to be able to properly run our code
import praw
import pandas as pd
import json
from datetime import datetime
import sys

# Collect the Reddit API credentials in one place, then build the client from them
credentials = {
    'client_id': '',  # Put the string under "personal use script" here
    'client_secret': '',  # Put the secret here
    'user_agent': '',  # Put the name of your app here
    'username': '',  # Put your reddit username here
    'password': '',  # Put your reddit password here
}
reddit = praw.Reddit(**credentials)

# Since we are testing this in Colab, we need to simulate entering in command line arguments
# We can test this by using the example command given in the docstring, but we will only ask for 5 messages
# Calling split() with no separator splits on any run of whitespace, so doubled or trailing
# spaces in the typed command do not turn into empty-string arguments
sys.argv = input('Please enter the command to run your program: ').split()

# When running from the command line, the 'python' argument does not get included, so we need to remove it
sys.argv = sys.argv[1:]

# Count the system arguments (subreddit names) and subtract 1 since the script name will always be included
arguments = len(sys.argv) - 1

# If the user forgets to enter in any system arguments, we should exit the program and inform them of the error
# The count is -1 (not 0) when even the script name is missing, so we test for "less than 1"
if arguments < 1:
    sys.exit("Please provide at least one valid subreddit name. Ex: python reddit_api.py funny memes 100")

# Assign the last system argument to be the number of posts retrieved from each subreddit
last_argument = len(sys.argv) - 1

# Exit the script if no number of posts system argument was provided or it was not provided as an integer
# Only the failure we expect is caught: int() raises ValueError for text that is not a whole number
try:
    num_messages = int(sys.argv[last_argument])
except ValueError:
    sys.exit("Enter the last argument as the number of posts to retrieve. Ex: python reddit_api.py funny memes 100")

print(num_messages)

# A number with no subreddit in front of it would otherwise let the script finish without doing anything
if arguments < 2:
    sys.exit("Please provide at least one valid subreddit name. Ex: python reddit_api.py funny memes 100")

# Now we are ready to begin scraping data from the Reddit API
# These two imports are only needed from here on, and we bring them in once instead of on every pass
from datetime import timezone

# Colab's files helper is what sends a file from the notebook back to our desktops
from google.colab import files


# The formatting of the created column is not very intelligible. We can change it to ISO from UNIX/UTC
def get_date(created):
    """Convert a UNIX timestamp into an ISO 8601 string in UTC.

    Args:
        created: Seconds since the UNIX epoch, as an int or a float.

    Returns:
        The matching UTC date and time as text ending in '+00:00',
        for example '1970-01-01T00:00:00+00:00'.
    """
    # datetime.utcfromtimestamp() is deprecated and returns a value with no timezone attached.
    # Asking for a UTC-aware value lets isoformat() write the '+00:00' offset for us.
    return datetime.fromtimestamp(created, tz=timezone.utc).isoformat()


# We need to loop through each subreddit argument and get the requested number of posts for each subreddit
# sys.argv[0] is the script name and sys.argv[arguments] is the number of posts,
# so the subreddit names are everything in between (remember, lists are 0-based in Python)
for subreddit_name in sys.argv[1:arguments]:
    print(subreddit_name)

    try:
        # Choose to examine a given subreddit
        subreddit = reddit.subreddit(subreddit_name)

        # Asks for the top (newest = subreddit.new) n submissions to the subreddit (limit of 1000)
        # PRAW is lazy and only contacts Reddit once the listing is read, so we read all of it
        # into a list right here. That way an invalid name fails inside this try block.
        top_subreddit = list(subreddit.top(limit=num_messages))

    # If we pass in an invalid subreddit name like asjhasdasdiugqwdb, we want to handle that error appropriately
    # Catching Exception (rather than everything) still lets us stop the script by hand
    except Exception as error:
        # Show the underlying reason too, since a network problem would end up here as well
        print("Could not read r/" + subreddit_name + ": " + repr(error))
        sys.exit("Please confirm that all subreddit names are valid. Ex: python reddit_api.py funny memes 100")

    # Create a dictionary to store data
    reddit_dict = {"author": [],
                   "title": [],
                   "score": [],
                   "id": [],
                   "url": [],
                   "comms_num": [],
                   "created": [],
                   "body": [],
                   "subscribers": [],
                   "subreddit": []}

    # Iterate through the top subreddit posts and append to the dictionary
    for submission in top_subreddit:
        # The author must be converted to a string because of the way it is contained in the Redditor class
        reddit_dict["author"].append(str(submission.author))
        reddit_dict["title"].append(submission.title)
        reddit_dict["score"].append(submission.score)
        reddit_dict["id"].append(submission.id)
        reddit_dict["url"].append(submission.url)
        reddit_dict["comms_num"].append(submission.num_comments)
        # created_utc is the documented creation time in UNIX/UTC seconds, which is what get_date expects
        reddit_dict["created"].append(submission.created_utc)
        reddit_dict["body"].append(submission.selftext)
        reddit_dict["subscribers"].append(submission.subreddit_subscribers)
        reddit_dict["subreddit"].append(subreddit_name)

    # Pass the dictionary into a dataframe so the whole "created" column can be converted at once
    reddit_data = pd.DataFrame(reddit_dict)

    # Apply get_date to the "created" column, and then rename it to timestamp for easier understanding
    reddit_data["created"] = reddit_data["created"].apply(get_date)
    reddit_data.rename(columns={"created": "timestamp"}, inplace=True)

    # Convert the dataframe back to a dictionary for rapid exporting to JSON
    reddit_dict = reddit_data.to_dict('records')

    # Export reddit data to JSON and name the files appropriately (one file per subreddit)
    file_name = subreddit_name + "_subreddit.json"
    with open(file_name, 'w') as f:
        json.dump(reddit_dict, f)

    files.download(file_name)
    print("All data has been successfully exported to a JSON file.")