In [1]:
import pandas as pd
import requests
import time


### Authorizing

In order to make requests to Reddit's API, we're going to have to authenticate ourselves via OAuth2. Unfortunately we're going to need to do several things before we get to the point of receiving our authorization token though.

1. Create a [Reddit](https://www.reddit.com) account.
    - Be sure to remember both your username and password
2. Once you're signed in [create an application](https://www.reddit.com/prefs/apps) to generate the credentials needed to request an authorization token.
    - Scroll all the way down and click `create another app...`
    - Select `script`
    - Enter a name for your application and enter `http://localhost:8080` as your redirect uri
    - Click `create_app`
3. Fill out the information below

In [2]:
client_id = 'OWpV-MSPnpZUzW4bGAjvJg'
client_secret =  'ADKiyrd3kqLqxxxr6jT9M8WrWk72uQ'
user_agent =  'MSDC'
username =  '*********password =  '*********'

Now we're on our way to retrieving our access token; we'll use the basic authentication framework to get there.

In [3]:
auth = requests.auth.HTTPBasicAuth(client_id, client_secret)

data = {
    'grant_type': 'password',
    'username': username,
    'password': password
}

In [4]:
#create an informative header for your application
headers = {'User-Agent': 'namehere/0.0.1'}

res = requests.post(
    'https://www.reddit.com/api/v1/access_token',
    auth=auth,
    data=data,
    headers=headers)



In [5]:
res.json()

{'access_token': 'eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNjkxMDAxMjY3LjUxNzQxNCwiaWF0IjoxNjkwOTE0ODY3LjUxNzQxNCwianRpIjoidGhpNmVKQVhMdlhHeWNqN2paandJcXlkd01FOHNnIiwiY2lkIjoiT1dwVi1NU1BucFpVelc0YkdBanZKZyIsImxpZCI6InQyXzZxODZmYWpzIiwiYWlkIjoidDJfNnE4NmZhanMiLCJsY2EiOjE1OTE4MTMyNDU3NjUsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.G6folaueJBblcgRxhJmpyz21rGbtEJVMpAbXhbm5rNtKmZp6j9Ra2gNntlzHBG0ZznQ5LaAj9AA1xotPNUeMUDvlGIyCzYpY5ye606kO9kYmJvpyJODpvx4wY5cQ1_lgjZMuh3F-s3EVzAUWD7wdokCKAVP2wSYISKO1uKPZuy9kyVoNdY7szG6jw_gdRp-xLTnQyENW7up_EnCa3FRVg-LSkSm9uRQHA-81V_2JBEwXczHJtsYnFNCOf6m5X3om4sjGcdhPHUXhU9hzMQ3HCcCmEYkoHXth20X_LA7rjNUJHpP-3i-tKjargDBlWJNDAlnLdi8iLv7IQsVn0XkVXA',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Hopefully upon running the above, you received a successful response code and can save your token. These should last for about two hours by default.

In [6]:
#retrieve access token
token = res.json()['access_token']

Now let's add your access token to the headers and verify that you can successfully submit a call to the api.

In [7]:
headers['Authorization'] = f'bearer {token}'

requests.get('https://oauth.reddit.com/api/v1/me', headers=headers).status_code == 200

True

If all went correctly, we can finally create a simple request.

In [8]:
base_url = 'https://oauth.reddit.com/r/'
subreddit = 'marvelstudios'
subreddit2 = "DCcomics"
res = requests.get(base_url+subreddit, headers=headers)

In [9]:
res.json()['data']['children']

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'marvelstudios',
   'selftext': "Here, you can discuss your general thoughts on the entire first season of Secret Invasion.\n\nPlease refrain from submitting any individual threads, especially ones touching on points that have already been discussed a million times the last few days since the finale.\n\nUnless an individual Secret Invasion thread brings up a very unique point of discussion which hasn't been touched on before at all, said thread will be removed and OP will be redirected to this pinned thread. \n\nWhat's more, on u/Illigard's suggestion, we have decided to incorporate a poll in these Season-Wide Discussion Threads from now on in order to gauge our user-base's average opinion on the show.\n\nPlease remain respectful of other people's opinions and discuss in a civil and kind manner.\n\nThis thread will contain spoilers for the entire season of Secret Invasion!\n\n[View Poll](https://www.reddit.com/poll/15d

Explore the response object. Where is our submission data? How many posts were retrieved by default?

In [10]:
#check out response object

In [11]:
posts = res.json()['data']

In [12]:
len(posts)

6

In [13]:
# Function to fetch posts and return post data in a list
def fetch_posts(url, headers, params):
    res = requests.get(url, headers=headers, params=params)
    if res.status_code == 200:
        return res.json()

In [14]:
post = res.json()['data']['children'][0]
[i for i in post['data'].keys()]

['approved_at_utc',
 'subreddit',
 'selftext',
 'author_fullname',
 'saved',
 'mod_reason_title',
 'gilded',
 'clicked',
 'title',
 'link_flair_richtext',
 'subreddit_name_prefixed',
 'hidden',
 'pwls',
 'link_flair_css_class',
 'downs',
 'thumbnail_height',
 'top_awarded_type',
 'hide_score',
 'name',
 'quarantine',
 'link_flair_text_color',
 'upvote_ratio',
 'author_flair_background_color',
 'subreddit_type',
 'ups',
 'total_awards_received',
 'media_embed',
 'thumbnail_width',
 'author_flair_template_id',
 'is_original_content',
 'user_reports',
 'secure_media',
 'is_reddit_media_domain',
 'is_meta',
 'category',
 'secure_media_embed',
 'link_flair_text',
 'can_mod_post',
 'score',
 'approved_by',
 'is_created_from_ads_ui',
 'author_premium',
 'thumbnail',
 'edited',
 'author_flair_css_class',
 'author_flair_richtext',
 'gildings',
 'content_categories',
 'is_self',
 'mod_note',
 'created',
 'link_flair_type',
 'wls',
 'removed_by_category',
 'banned_by',
 'author_flair_type',
 'dom

Let's now make use of the fact that we can pass a parameters dictionary to increase the size of our request then create a dataframe of our submissions.

In [15]:
#modify request
params = {
    'limit': 100
}

res = requests.get(base_url+subreddit,
                   headers=headers,
                  params=params)

In [16]:
# all_post_data = []
# post_count = 1000
# subreddits = ["marvelstudios", "DCcomics"]  

# # While loop to fetch posts from both subreddits
# for subreddit in subreddits:
#     while len(all_post_data) < post_count:
#         data = fetch_posts(base_url + subreddit, headers=headers, params=params)
#         if not data:
#             break

#         posts_count = len(data['data']['children'])
#         print("Number of posts retrieved in the current response:", posts_count)

        
#         posts = data['data']['children']
#         for post in posts:
#             all_post_data.append({
#                 'Title': post['data']['title'],
#                 'Author': post['data']['author'],
#                 'Score': post['data']['score'],
#                 'URL': post['data']['url'],
#                 'Created': post['data']['created_utc'],
#                 'Selftext': post['data']['selftext']
#             })

#         params = {'limit': 100, 'after': posts[-1]['data']['name']}

#         # If there are no more posts, break out of the loop
#         if len(posts) < 100:
#             break

In [16]:
#Marvel one 
all_post_data = []
post_count = 1000
#While loop in a for loop 
while len(all_post_data) < post_count:
    data = fetch_posts(base_url + subreddit, headers=headers, params=params)
    if not data:
        break


    posts_count = len(data['data']['children'])
    print("Number of posts retrieved in the current response:", posts_count)

    # Extract relevant information from each post
    posts = data['data']['children']
    for post in posts:
        all_post_data.append({
            'Title': post['data']['title'],
            'Author': post['data']['author'],
            'Score': post['data']['score'],
            'URL': post['data']['url'],
            'Created': post['data']['created_utc'],
            'Selftext': post['data']['selftext']


        })


    params = {'limit': 100, 'after': posts[-1]['data']['name']}

    # If there are no more posts, break out of the loop
    if len(posts) < 100:
        break

Number of posts retrieved in the current response: 102
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 50


In [16]:
#DcComics one 
all_post_data = []
post_count = 1000
#While loop in a for loop 
while len(all_post_data) < post_count:
    data = fetch_posts(base_url + subreddit2, headers=headers, params=params)
    if not data:
        break


    posts_count = len(data['data']['children'])
    print("Number of posts retrieved in the current response:", posts_count)

    # Extract relevant information from each post
    posts = data['data']['children']
    for post in posts:
        all_post_data.append({
            'Title': post['data']['title'],
            'Author': post['data']['author'],
            'Score': post['data']['score'],
            'URL': post['data']['url'],
            'Created': post['data']['created_utc'],
            'Selftext': post['data']['selftext']


        })


    params = {'limit': 100, 'after': posts[-1]['data']['name']}

    # If there are no more posts, break out of the loop
    if len(posts) < 100:
        break

Number of posts retrieved in the current response: 102
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 100
Number of posts retrieved in the current response: 31


In [18]:
#check status code
res.status_code

200

In [17]:
#create a dataframe of your submissions
df = pd.DataFrame(all_post_data)

**Exercise**: write a loop to retrieve the 1000 most recent submissions. What parameters of the submissions endpoint will be most helpful for you here? [To the docs!](https://www.reddit.com/dev/api/)

In [20]:
df.nunique()

Title       846
Author      688
Score       282
URL         848
Created     846
Selftext    668
dtype: int64

In [20]:
df.head()

Unnamed: 0,Title,Author,Score,URL,Created,Selftext
0,"Weekly Discussion Thread: Comics, TV, and More...",beary_neutral,16,https://www.reddit.com/r/DCcomics/comments/157...,1690163000.0,Hey there honorary Justice League members - it...
1,[Character of the Month Voting] Not On This Earth,Predaplant,6,https://www.reddit.com/r/DCcomics/comments/15b...,1690553000.0,"Another month is nearing its end, so you know ..."
2,[Cover] G'nort's Illustrated Swimsuit Edition ...,TyranusWrex,1709,https://i.redd.it/pupwmw4t7peb1.jpg,1690547000.0,
3,[Discussion] Roughly how old do you prefer Bil...,nightwing612,213,https://i.redd.it/2wuppx45mreb1.jpg,1690576000.0,
4,[Cover] G'nort's Illustrated Swimsuit Edition ...,ravager27,749,https://i.redd.it/7adzwk3rdpeb1.jpg,1690549000.0,


In [18]:
# Save DataFrame to a CSV file
df.to_csv('reddit_submissions_DC5.csv', index=False)

In [20]:
import os

In [23]:
def concatenate_csv_files(input_files, output_file):
    dfs = []

    for file_path in input_files:
        df = pd.read_csv(file_path)
        dfs.append(df)

    concatenated_df = pd.concat(dfs, ignore_index=True)

    concatenated_df.to_csv(output_file, index=False)

if __name__ == "__main__":
    input_files = [
        "reddit_submissions_DC.csv",
        "reddit_submissions_DC2.csv",
        "reddit_submissions_DC3.csv",
        "reddit_submissions_DC4.csv",
    ]
    output_file = "combined.csv"
    concatenate_csv_files(input_files, output_file)

In [27]:
def concatenate_csv_files(input_files, output_file):
    
    dfs = []
   
    for file_path in input_files:
        df = pd.read_csv(file_path)
        dfs.append(df)

    concatenated_df = pd.concat(dfs, ignore_index=True)

  
    concatenated_df.to_csv(output_file, index=False)

if __name__ == "__main__":
    input_files = [
        "reddit_submissions_MS.csv",
        "reddit_submissions_MS2.csv",
        "reddit_submissions_MS3.csv",
        "reddit_submissions_MS4.csv",
    ]
    output_file = "combined_MS.csv"
    concatenate_csv_files(input_files, output_file)

In [28]:
def concatenate_csv_files(input_files, output_file):
    
    dfs = []
   
    for file_path in input_files:
        df = pd.read_csv(file_path)
        dfs.append(df)

    concatenated_df = pd.concat(dfs, ignore_index=True)

  
    concatenated_df.to_csv(output_file, index=False)

if __name__ == "__main__":
    input_files = [
        "combined_MS.csv",
        "combined_DC.csv"
    ]
    output_file = "combined_FINAL.csv"
    concatenate_csv_files(input_files, output_file)