In [1]:
import praw
import datetime
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load tokenizer and model for text preprocessing
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_text(text):
    """Cleans and tokenizes text for sentiment analysis."""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text

def get_reddit_posts_and_comments(subreddit_name, limit=50, comment_limit=10):
    """Fetch recent posts and top comments from a subreddit using Reddit API (PRAW)."""
    # Set up Reddit API credentials (replace with your own keys)
    reddit = praw.Reddit(
        client_id="client_id", 
        client_secret="client_secret", 
        user_agent="Project_name/1.0 (by /u/Name)" 
    )
    
    subreddit = reddit.subreddit(subreddit_name)
    posts = []
    
    for post in subreddit.new(limit=limit):
        comments = []
        post.comments.replace_more(limit=0)  # Load all top-level comments
        for comment in post.comments.list()[:comment_limit]:
            comments.append(preprocess_text(comment.body))
        
        posts.append({
            "title": preprocess_text(post.title),
            "text": preprocess_text(post.selftext),
            "upvotes": post.score,
            "created_utc": datetime.datetime.fromtimestamp(post.created_utc),
            "num_comments": post.num_comments,
            "comments": comments,
            "permalink": f"https://www.reddit.com{post.permalink}"
        })
    
    return posts

# Example usage:
subreddits = ["stocks", "investing", "wallstreetbets"]
all_posts = []
for sub in subreddits:
    all_posts.extend(get_reddit_posts_and_comments(sub))

print(all_posts[:3])  # Print sample data


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


[{'title': 'trading options vs stock equity', 'text': 'hey guys ive been looking through this subreddit and it looks like a lot of you trade call and put options on stocks instead of the actual stock equity itself \n\nive read a few books on stock options trading when i considered doing it not considering anymore and it always surprised me why people would do it because it seems way more risky and more importantly much much more difficult in stock options you often have to deal with not just the risk of the stocks price move but also implied volatility time decay and delta and on top of that you risk losing the entire premium paid for the option if the stocks price doesnt go overunder your strike price and these premiums can get very expensive if you lose a couple of bets in a row\n\nobviously theres multiple different ways you could combine these options to lower or amplify your risk but to me that just gives more points of riskfailure of losing your investment if you dont consider an

In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import numpy as np

nltk.download("vader_lexicon")

sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Returns the compound sentiment score of a given text using VADER."""
    if not text.strip():
        return 0.0
    return sia.polarity_scores(text)["compound"]

sentiment_scores = []
for post in all_posts:
    title_sentiment = analyze_sentiment(post["title"])
    text_sentiment = analyze_sentiment(post["text"])
    comments_sentiment = [analyze_sentiment(comment) for comment in post["comments"]]

    total_sentiments = [title_sentiment, text_sentiment] + comments_sentiment
    post_avg_sentiment = np.mean(total_sentiments) if total_sentiments else 0.0

    post["title_sentiment"] = title_sentiment
    post["text_sentiment"] = text_sentiment
    post["comments_sentiment"] = comments_sentiment
    post["post_sentiment_score"] = post_avg_sentiment
    sentiment_scores.append(post_avg_sentiment)

market_sentiment_score = np.mean(sentiment_scores) if sentiment_scores else 0.0

if market_sentiment_score > 0.05:
    market_trend = "Bullish 📈"
elif market_sentiment_score < -0.05:
    market_trend = "Bearish 📉"
else:
    market_trend = "Neutral ⚖️"

print(f"Market Sentiment Score: {market_sentiment_score:.3f}")
print(f"Current Market Trend: {market_trend}")




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\amish\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Market Sentiment Score: 0.146
Current Market Trend: Bullish 📈


In [7]:
import json
import pandas as pd

for post in all_posts:
    post["created_utc"] = post["created_utc"].isoformat()

with open("reddit_sentiment_data.json", "w") as f:
    json.dump(all_posts, f, indent=4)

df = pd.DataFrame(all_posts)

df.to_csv("reddit_sentiment_data.csv", index=False)

print("Data saved successfully! 🚀")



Data saved successfully! 🚀


In [None]:
with open("reddit_sentiment_data.json", "r") as f:
    all_posts = json.load(f)

for post in all_posts:
    post["created_utc"] = datetime.datetime.fromisoformat(post["created_utc"])

df = pd.read_csv("reddit_sentiment_data.csv")
all_posts = df.to_dict(orient="records")

print("Data loaded successfully! ✅")
