In [1]:
import praw
import pandas as pd
from datetime import datetime as dt
from dotenv import load_dotenv
from os import getenv

import warnings
warnings.filterwarnings('ignore') # We can suppress the warnings

In [2]:
# Load from the .env
load_dotenv()

# Get the environmental variables from the .env file
APP_NAME = getenv('APP_NAME')
APP_ID = getenv("APP_ID")
APP_SECRET = getenv("APP_SECRET")
USERNAME = getenv('REDDIT_NAME')
PASSWORD = getenv('REDDIT_PASSWORD')

Confirm that the credentials have been loaded

In [3]:
if APP_NAME and APP_ID and APP_SECRET and USERNAME and PASSWORD:
    print("Credentials loaded")
else:
    print("ERROR: Credentials not loaded!!")

Credentials loaded


# PRAW

Using the API to access Reddit's posts and comments

In [4]:
reddit = praw.Reddit(
    client_id=getenv("CLIENT_ID"), # Right below 'personal use script'
    client_secret=getenv("CLIENT_SECRET"), # secret
    user_agent=f"pda-2023 u/{getenv('USERNAME')}", # app-name u/username
)


Confirm read only access, cannot post/write

In [5]:
print(reddit.read_only)

True


Go to the Farming subreddit

In [6]:
subreddit = reddit.subreddit("Farming")

Create a loop to go through the categories of the subreddit and save the post title and the first comment
The number of posts was limited as any higher return a 429 error

In [8]:
num_posts = 500
categories = ['hot', 'new', 'top', 'rising', 'controversial']

# Initialize empty lists to store post titles and comments
postTitles = []
postComments = []

# for post in subreddit.new(limit=num_posts):
#     postTitles.append(post.title)
#     postComments.append(post.comments[0].body)

for category in categories:
    for post in getattr(subreddit, category)(limit=num_posts):
        postTitles.append(post.title)
        if post.comments:
            postComments.append(post.comments[0].body)
        else:
            postComments.append("No comments")


Save the title and comments to a dataframe

In [9]:
df = pd.DataFrame({
    "PostTitle": postTitles,
    "PostComments": postComments
})

Check how many titles and comments were gathered

In [10]:
df.shape

(2020, 2)

Create a list of keywords relevant to the farming industry of interest

In [11]:
keywords = ['dairy', 'beef', 'livestock','sheep','bovine','pig','pork','price','feed', 'cheese', 'crop', 'wheat'
            ,'barley','chicken','poultry','vegetable','yogurt', 'cow', 'goat', 'cost'
           ]

Use a lambda function to filter the dataframe based on the list of keywords

In [12]:
# Search for posts containing the keywords
title = []
comments = []
filtered_df = df[df.apply(lambda row: any(keyword in row['PostTitle'] or keyword in row['PostComments'] for keyword in keywords), axis=1)]
#Check how many rows are remaining
filtered_df.shape

(503, 2)

Save the filtered dataframe to a CSV for sentiment analysis - commented to stop the file being overwritten

In [13]:
#filtered_df.to_csv("Reddit_Python_Posts.csv", index=False)