The script does the following in this order:
Connects to the Reddit API
Loops through each subreddit in the subreddit list
Creates empty dictionaries for the posts and comments in each subreddit
Searches ALL of the “top” posts within each subreddit for posts that contain the input keyword


In [12]:
# importing packages
import praw
import pandas as pd
import datetime as dt
import glob

In [14]:
def _as_list(value):
    """Return *value* as a list, wrapping a bare string in a one-item list."""
    # Without this, a caller passing "python" instead of ["python"] would have
    # the string iterated one character at a time.
    return [value] if isinstance(value, str) else list(value)


def _collect_posts_and_comments(subreddit, keyword):
    """Search one subreddit for one keyword and gather its posts and comments.

    Args:
        subreddit: the object returned by ``reddit.subreddit(name)``.
        keyword: the search string handed to ``subreddit.search``.

    Returns:
        A ``(post_dict, comment_dict)`` tuple of column-name -> list mappings,
        ready to be passed to ``pd.DataFrame``.
    """
    post_dict = {
        "title": [],  # title of the post
        "score": [],  # score of the post
        "id": [],  # id of the post
        "url": [],  # url of the post
        "comms_num": [],  # number of comments on the post
        "created": [],  # creation time as a UNIX timestamp
        "body": [],  # self-text of the post
    }
    comment_dict = {
        "comment_id": [],  # id of the comment
        "comment_parent_id": [],  # the comment's parent_id
        "comment_body": [],  # body of the comment
        "comment_score": [],  # score of the comment
        "comment_link_id": [],  # the comment's link_id (prefixed post id)
    }

    # sort can be hot, new, rising, top, or controversial; limit=None asks
    # PRAW for as many results as the API will hand back.
    for submission in subreddit.search(keyword, sort="top", limit=None):
        post_dict["title"].append(submission.title)
        post_dict["score"].append(submission.score)
        post_dict["id"].append(submission.id)
        post_dict["url"].append(submission.url)
        post_dict["comms_num"].append(submission.num_comments)
        post_dict["created"].append(submission.created)
        post_dict["body"].append(submission.selftext)

        # replace_more(limit=None) expands every "load more comments" stub so
        # that .list() yields the whole comment forest.
        # https://praw.readthedocs.io/en/latest/tutorials/comments.html
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            comment_dict["comment_id"].append(comment.id)
            comment_dict["comment_parent_id"].append(comment.parent_id)
            comment_dict["comment_body"].append(comment.body)
            comment_dict["comment_score"].append(comment.score)
            comment_dict["comment_link_id"].append(comment.link_id)

    return post_dict, comment_dict


def _combine_csvs(pattern):
    """Read every CSV matching the glob *pattern* into one DataFrame.

    Returns an empty DataFrame when nothing matches, instead of letting
    ``pd.concat`` raise on an empty list.
    """
    frames = [pd.read_csv(name, index_col=None) for name in sorted(glob.glob(pattern))]
    if not frames:
        return pd.DataFrame()
    combined = pd.concat(frames).reset_index(drop=True)
    # "Unnamed: 0" is the index column that to_csv wrote; tolerate files
    # that were saved without it.
    return combined.drop(columns="Unnamed: 0", errors="ignore")


def scrape_session(sub, query, path):
    """Scrape posts and comments from Reddit and collect them in a workbook.

    For every subreddit/keyword pair the "top"-sorted search results and all
    of their comments are written to ``<path>/<sub>_subreddit_<kw>_posts.csv``
    and ``<path>/<sub>_subreddit_<kw>_comments.csv``. Afterwards *every*
    ``*_posts.csv`` / ``*_comments.csv`` file found in ``path`` (including any
    left over from earlier runs) is merged and written to the ``Posts`` and
    ``Comments`` sheets of ``<keywords>_posts_and_comments.xlsx`` in the
    current working directory.

    Args:
        sub: list of subreddit names (a single name as a string is accepted).
        query: list of search keywords (a single string is accepted).
        path: directory where the per-search CSV files are saved and from
            which they are read back; created if it does not exist.

    Returns:
        None. The results are the files written to disk.
    """
    import os  # function-scope import: only needed for path handling here

    subreddit_names = _as_list(sub)
    keywords = _as_list(query)

    # The CSVs must land in the same directory the merge step globs from.
    if path:
        os.makedirs(path, exist_ok=True)

    # Python Reddit API Wrapper (PRAW): connect with the app credentials.
    reddit = praw.Reddit(client_id='_14_Chars_Personal_Use_Script_',
                         client_secret='_27_Chars_Secret_Key_',
                         user_agent='_Your_User_Agent_')

    # https://praw.readthedocs.io/en/latest/code_overview/reddit_instance.html?highlight=reddit
    for s in subreddit_names:
        subreddit = reddit.subreddit(s)

        for item in keywords:
            # search for this single keyword, not the whole keyword list
            post_dict, comment_dict = _collect_posts_and_comments(subreddit, item)

            comment_data = pd.DataFrame(comment_dict)
            comment_data.to_csv(
                os.path.join(path, s + "_subreddit_" + item + "_comments.csv")
            )

            post_data = pd.DataFrame(post_dict)
            # add a 'timestamp' column in datetime format derived from the
            # UNIX-format 'created' column
            post_data = post_data.assign(
                timestamp=post_data["created"].apply(dt.datetime.fromtimestamp)
            )
            post_data.to_csv(
                os.path.join(path, s + "_subreddit_" + item + "_posts.csv")
            )

    # glob.escape keeps characters such as "[" in the directory name literal.
    csv_dir = glob.escape(path)
    posts_df = _combine_csvs(os.path.join(csv_dir, "*_posts.csv"))
    comment_df = _combine_csvs(os.path.join(csv_dir, "*_comments.csv"))

    # dropping the first 3 characters from the 'comment_link_id' string so the
    # result can be matched against the 'id' column of the posts
    if "comment_link_id" in comment_df.columns:
        comment_df['id'] = comment_df['comment_link_id'].str[3:]

    # writing dataframes to two different worksheets in an excel workbook;
    # the keywords are joined so the file name does not contain a list repr
    workbook_name = "_".join(keywords) + "_posts_and_comments.xlsx"
    with pd.ExcelWriter(workbook_name, engine="xlsxwriter") as writer:
        posts_df.to_excel(writer, sheet_name='Posts')
        comment_df.to_excel(writer, sheet_name='Comments')

In [17]:
 # make a list of subreddits you want to scrape the data from
sub = [
    "X",
    "Y",
    "Z",
]
# keywords to search each subreddit for (a single keyword is recommended)
query = ["A"]
# location passed to scrape_session as its path argument
path = "file_path_to_save_to"

In [16]:
# kick off the scrape with the subreddit list, keyword list and path set above
scrape_session(sub=sub, query=query, path=path)