In [11]:
import os
from datetime import date, datetime, timedelta, timezone

from dotenv import dotenv_values, load_dotenv
from praw import Reddit
from praw.models import Submission, Subreddit

In [18]:
import praw
from datetime import datetime
from praw.models import Subreddit, Submission

class RedditLoader:
    """Collect company-related comments from a subreddit through PRAW.

    The loader keeps the subreddit and submission it is currently working on
    in ``self.subreddit`` and ``self.submission``; ``get_comments`` reads the
    latter, so a submission must be selected before it is called.
    """

    def __init__(self, reddit):
        """Initialize the RedditLoader with a PRAW Reddit instance."""
        self.reddit = reddit
        # Populated by get_subreddit / get_submission / collect_comments.
        self.subreddit = None
        self.submission = None

    def get_subreddit(self, subreddit_name):
        """Create a subreddit instance, store it on the loader and return it."""
        self.subreddit = Subreddit(reddit=self.reddit, display_name=subreddit_name)
        return self.subreddit

    def get_submission(self, submission_id):
        """Create a submission instance, store it on the loader and return it."""
        self.submission = Submission(reddit=self.reddit, id=submission_id)
        return self.submission

    @staticmethod
    def _created_at(submission):
        """Return the submission's creation time as a timezone-aware UTC datetime."""
        # ``created_utc`` is an epoch timestamp; converting it without a tz
        # would silently use the machine's local zone and make the date
        # filter give different answers on different machines.
        return datetime.fromtimestamp(int(submission.created_utc), tz=timezone.utc)

    @staticmethod
    def _author_name(item):
        """Return ``item.author.name``, or None when the author is missing."""
        return getattr(getattr(item, "author", None), "name", None)

    @staticmethod
    def _mentions_any(text, lowered_terms):
        """Return True if ``text`` contains any of the already-lowercased terms."""
        haystack = text.lower()
        return any(term in haystack for term in lowered_terms)

    def get_comments(self, start_date, company_list):
        """Collect the matching comments of the current submission.

        A submission contributes rows only if it is not stickied, was created
        on ``start_date`` (UTC calendar date) and its title mentions at least
        one company. Every comment in the tree, at any depth, is then checked
        on its own: a reply is kept when it mentions a company even if its
        parent comment does not.

        Args:
            start_date (date): UTC date the submission must have been created on.
            company_list (list): company names; matching is a case-insensitive
                substring test.

        Returns:
            list: list of dicts, one per matching comment, with the keys
                ``post_title``, ``comment_text``, ``comment_author``,
                ``submission_id``, ``submission_author`` and
                ``submission_created_time`` (timezone-aware UTC datetime).
                Author fields are None when the author is unavailable.
        """
        submission = self.submission
        if submission.stickied:
            return []

        created_time = self._created_at(submission)
        if created_time.date() != start_date:
            return []

        # Lowercase the search terms once instead of once per comment.
        lowered_terms = [company.lower() for company in company_list]
        if not self._mentions_any(submission.title, lowered_terms):
            return []

        submission_author = self._author_name(submission)

        # Resolve every "load more comments" placeholder so the flattened
        # list below contains only real comments.
        submission.comments.replace_more(limit=None)

        submission_data = []
        for comment in submission.comments.list():
            if not self._mentions_any(comment.body, lowered_terms):
                continue
            submission_data.append({
                'post_title': submission.title,
                'comment_text': comment.body,
                'comment_author': self._author_name(comment),
                'submission_id': submission.id,
                'submission_author': submission_author,
                'submission_created_time': created_time,
            })
        return submission_data

    def collect_comments(self, subreddit_name, start_date, company_list, limit=2000):
        """Collect matching comments for the selected date across a subreddit.

        Walks the subreddit's newest submissions and stops at the first one
        created before ``start_date``.

        Args:
            subreddit_name (str): the name of subreddit.
            start_date (date): UTC date for which to collect comments.
            company_list (list): list of companies to filter submissions/comments.
            limit (int): maximum number of submissions requested from the
                "new" listing. Defaults to 2000.

        Returns:
            list: list of dictionaries, each dictionary is a row of data.
                Could be used to create a dataframe.
        """
        subreddit_data = []
        self.get_subreddit(subreddit_name)

        for submission in self.subreddit.new(limit=limit):
            # Reuse the object returned by the listing rather than building a
            # new lazy Submission from its id, which would need an extra
            # fetch just to read ``created_utc``.
            self.submission = submission
            if self._created_at(submission).date() < start_date:
                break

            subreddit_data.extend(self.get_comments(start_date, company_list))

        return subreddit_data


In [19]:
if __name__ == "__main__":
    # Reddit API credentials are read from a local .env file so that no
    # secret is hardcoded in the notebook.
    config = dotenv_values(".env")

    # Fail early with a message that names what is missing instead of a bare
    # KeyError raised from the middle of the praw.Reddit(...) call.
    required_keys = ("CLIENT_ID", "CLIENT_SECRET", "REDIRECT_URL", "USER_AGENT")
    missing_keys = [key for key in required_keys if key not in config]
    if missing_keys:
        raise KeyError(f"Missing settings in .env: {', '.join(missing_keys)}")

    reddit = praw.Reddit(
        client_id=config["CLIENT_ID"],
        client_secret=config["CLIENT_SECRET"],
        redirect_uri=config["REDIRECT_URL"],
        user_agent=config["USER_AGENT"],
    )

    # Everything below depends on `loader`, so it lives under the same guard;
    # previously these statements sat outside it and failed with NameError
    # whenever the guard was false.
    loader = RedditLoader(reddit)
    start_date = datetime(2024, 11, 30).date()
    company_list = ['Apple', 'Google', 'Microsoft']
    filtered_comments = loader.collect_comments('technology', start_date, company_list)

    for comment in filtered_comments:
        print(f"Title: {comment['post_title']}")
        print(f"Comment: {comment['comment_text']}\n")
