# Reddit API Data Collection & Sentiment Analysis

## Part 1: Reddit API Setup & Data Collection

In [2]:
import praw
from credentials import client_id, client_secret, user_agent

# Create Reddit instance (read-only access)
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# Test the connection.
# PRAW objects are lazy: reddit.subreddit("test") and its display_name are
# built locally from the argument and never contact Reddit, so on their own
# they would report success even with invalid credentials. Pulling a single
# item from a listing forces a real authenticated request.
try:
    subreddit = reddit.subreddit("test")
    next(iter(subreddit.hot(limit=1)), None)
    print("Successfully connected to Reddit API!")
    print(f"Testing with subreddit: {subreddit.display_name}")
    print("Reddit API connection successful!")
except Exception as e:
    # Best-effort check: report the failure instead of aborting the notebook
    print(f"Connection failed: {e}")

Successfully connected to Reddit API!
Testing with subreddit: test
Reddit API connection successful!


## Part 2: Data Collection and Storage

In [3]:
import pandas as pd
from datetime import datetime

# Target subreddits as specified in assignment
target_subreddits = ['politics', 'PoliticalDiscussion', 'worldnews']
posts_per_subreddit = 20

# Column layout of df_posts. Passed explicitly to the DataFrame constructor so
# the frame still has these columns (and the summary below still works) when
# every request fails and no rows were collected.
post_columns = [
    'subreddit', 'title', 'score', 'num_comments',
    'id', 'url', 'created_utc', 'author',
]

print("="*60)
print("COLLECTING REDDIT DATA FOR SENTIMENT ANALYSIS")
print("="*60)

# Initialize list to store all posts data (one dict per post)
all_posts_data = []

# Collect data from each subreddit
for subreddit_name in target_subreddits:
    print(f"\n📊 Collecting data from r/{subreddit_name}...")

    try:
        # Access the subreddit
        subreddit = reddit.subreddit(subreddit_name)

        # Get hot posts (you can change to .top() if needed).
        # list() drains the listing here so that request errors are raised
        # inside this try block rather than later during iteration.
        posts = list(subreddit.hot(limit=posts_per_subreddit))

        print(f"   ✅ Successfully retrieved {len(posts)} posts")

        # Extract required data for each post
        for post in posts:
            post_data = {
                'subreddit': subreddit_name,
                'title': post.title,
                'score': post.score,
                'num_comments': post.num_comments,
                'id': post.id,
                'url': post.url,
                # post.created_utc is a Unix timestamp in seconds. Converting
                # with unit='s' yields a timezone-naive timestamp expressed in
                # UTC, matching the column name. datetime.fromtimestamp()
                # would silently convert to the machine's local time instead.
                'created_utc': pd.to_datetime(post.created_utc, unit='s'),
                # post.author is falsy when the account no longer exists
                'author': str(post.author) if post.author else '[deleted]'
            }
            all_posts_data.append(post_data)

    except Exception as e:
        # Best-effort: report the failure and continue with the next subreddit
        print(f"   ❌ Error collecting from r/{subreddit_name}: {e}")

# Create DataFrame for storage
df_posts = pd.DataFrame(all_posts_data, columns=post_columns)

print("\n📈 DATA COLLECTION SUMMARY:")
print(f"   Total posts collected: {len(df_posts)}")
print("   Posts per subreddit:")
for subreddit in target_subreddits:
    count = len(df_posts[df_posts['subreddit'] == subreddit])
    print(f"     r/{subreddit}: {count} posts")

COLLECTING REDDIT DATA FOR SENTIMENT ANALYSIS

📊 Collecting data from r/politics...
   ✅ Successfully retrieved 20 posts

📊 Collecting data from r/PoliticalDiscussion...
   ✅ Successfully retrieved 20 posts

📊 Collecting data from r/PoliticalDiscussion...
   ✅ Successfully retrieved 20 posts

📊 Collecting data from r/worldnews...
   ✅ Successfully retrieved 20 posts

📊 Collecting data from r/worldnews...
   ✅ Successfully retrieved 20 posts

📈 DATA COLLECTION SUMMARY:
   Total posts collected: 60
   Posts per subreddit:
     r/politics: 20 posts
     r/PoliticalDiscussion: 20 posts
     r/worldnews: 20 posts
   ✅ Successfully retrieved 20 posts

📈 DATA COLLECTION SUMMARY:
   Total posts collected: 60
   Posts per subreddit:
     r/politics: 20 posts
     r/PoliticalDiscussion: 20 posts
     r/worldnews: 20 posts


In [4]:
# Persist the collected posts to disk; the positional row index is not written
csv_filename = "reddit_posts_data.csv"
df_posts.to_csv(path_or_buf=csv_filename, index=False)

## Part 3: Comment Collection from Most Relevant Posts

**Objective**: Collect 5 comments per post from the 10 most relevant posts, ranked by an engagement score (score + 2 × number of comments)

In [5]:
# STEP 1: Identify most relevant posts based on engagement metrics
print("="*60)
print("COLLECTING COMMENTS FROM MOST RELEVANT POSTS")
print("="*60)

# Calculate engagement score (combination of score and comments).
# Note: this adds an 'engagement_score' column to df_posts in place.
df_posts['engagement_score'] = df_posts['score'] + (df_posts['num_comments'] * 2)

# Sort by engagement score and select top posts for comment collection
# Let's take top 10 most relevant posts across all subreddits
top_posts = df_posts.nlargest(10, 'engagement_score')

print(f"📊 Selected {len(top_posts)} most relevant posts for comment collection:")
print("   Selection criteria: score + (num_comments × 2)")
print("\nTop Posts Selected:")
for idx, (_, post) in enumerate(top_posts.iterrows(), 1):
    print(f"   {idx}. r/{post['subreddit']}: {post['title'][:50]}... (Score: {post['score']}, Comments: {post['num_comments']})")

# STEP 2: Collect comments from these top posts
comments_per_post = 5
all_comments_data = []

# Column layout of df_comments. Passed explicitly to the DataFrame constructor
# so the frame keeps these columns (and the summary below still works) when no
# comment could be collected at all.
comment_columns = [
    'post_id', 'post_title', 'subreddit', 'comment_id',
    'body', 'score', 'author', 'created_utc',
]

print(f"\n🔍 Collecting {comments_per_post} comments per post...")

for idx, (_, post) in enumerate(top_posts.iterrows(), 1):
    print(f"\n   📝 Processing post {idx}/{len(top_posts)}: r/{post['subreddit']}")

    try:
        # Get the Reddit submission object using the post ID
        submission = reddit.submission(id=post['id'])

        # limit=0 strips every "MoreComments" placeholder from the tree
        # without fetching what it stands for, so only the comments already
        # loaded remain and each one has score/body attributes.
        submission.comments.replace_more(limit=0)

        # Top-level comments only, highest score first
        top_comments = sorted(submission.comments, key=lambda x: x.score, reverse=True)

        # Collect up to comments_per_post comments per post
        comments_collected = 0
        for comment in top_comments:
            if comments_collected >= comments_per_post:
                break

            # Skip deleted/removed comments (they do not count toward the quota)
            if hasattr(comment, 'body') and comment.body not in ['[deleted]', '[removed]']:
                comment_data = {
                    # post_id links each comment back to its row in df_posts
                    'post_id': post['id'],
                    'post_title': post['title'],
                    'subreddit': post['subreddit'],
                    'comment_id': comment.id,
                    'body': comment.body,
                    'score': comment.score,
                    'author': str(comment.author) if comment.author else '[deleted]',
                    # comment.created_utc is a Unix timestamp in seconds.
                    # unit='s' gives a timezone-naive timestamp expressed in
                    # UTC, matching the column name. datetime.fromtimestamp()
                    # would convert to the machine's local time instead.
                    'created_utc': pd.to_datetime(comment.created_utc, unit='s')
                }
                all_comments_data.append(comment_data)
                comments_collected += 1

        print(f"      ✅ Collected {comments_collected} comments")

    except Exception as e:
        # Best-effort: report the failure and continue with the next post
        print(f"      ❌ Error collecting comments from post {post['id']}: {e}")

# STEP 3: Create DataFrame from the collected comments
df_comments = pd.DataFrame(all_comments_data, columns=comment_columns)

print("\n📈 COMMENT COLLECTION SUMMARY:")
print(f"   Total comments collected: {len(df_comments)}")
print("   Comments per subreddit:")
for subreddit in df_comments['subreddit'].unique():
    count = len(df_comments[df_comments['subreddit'] == subreddit])
    print(f"     r/{subreddit}: {count} comments")

# Display sample of collected comments
print("\n📋 Sample of collected comments:")
print(df_comments[['subreddit', 'score', 'body']].head(3).to_string(max_colwidth=50))

COLLECTING COMMENTS FROM MOST RELEVANT POSTS
📊 Selected 10 most relevant posts for comment collection:
   Selection criteria: score + (num_comments × 2)

Top Posts Selected:
   1. r/worldnews: Zelenskyy points out that Trump’s “two weeks” give... (Score: 36367, Comments: 681)
   2. r/politics: Trump faces returning $100bn in tariffs after cour... (Score: 29351, Comments: 1176)
   3. r/politics: Bernie Sanders breaks with Democrats and endorses ... (Score: 21670, Comments: 676)
   4. r/worldnews: EU head's plane hit by suspected Russian GPS inter... (Score: 19752, Comments: 665)
   5. r/politics: Donald Trump posting week-old photo raises eyebrow... (Score: 15378, Comments: 1434)
   6. r/PoliticalDiscussion: Casual Questions Thread... (Score: 89, Comments: 8311)
   7. r/worldnews: To defend against Russian tanks, Finland and Polan... (Score: 7082, Comments: 240)
   8. r/politics: Donald Trump is weaker than he looks... (Score: 6050, Comments: 537)
   9. r/worldnews: All UN Security Coun

In [6]:
# STEP 4: Save comments data; the 'post_id' column links each comment to its parent post
comments_csv_filename = "reddit_comments_data.csv"
df_comments.to_csv(comments_csv_filename, index=False)

print("\n💾 STORAGE COMPLETED:")
print(f"   Comments saved to: {comments_csv_filename}")
print(f"   Total records: {len(df_comments)}")
print(f"   Columns: {list(df_comments.columns)}")

# Summarise the comment-to-post linkage: how many distinct parent posts the
# comments reference. An empty frame built from no records has no 'post_id'
# column, so that case is treated as zero linked posts.
linked_posts = df_comments['post_id'].nunique() if 'post_id' in df_comments.columns else 0
# Guard the division so a run that collected no comments does not crash here
avg_comments_per_post = len(df_comments) / linked_posts if linked_posts else 0.0
print("\n🔗 DATA INTEGRITY CHECK:")
print(f"   Comments are linked to {linked_posts} unique posts")
print(f"   Average comments per post: {avg_comments_per_post:.1f}")

# Show data structure for verification.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\n📊 DATA STRUCTURE PREVIEW:")
df_comments.info()

print("\n✅ Comment collection and storage completed successfully!")
print("   Each comment is properly linked to its parent post via 'post_id'")
print("   Ready for sentiment analysis in next steps")


💾 STORAGE COMPLETED:
   Comments saved to: reddit_comments_data.csv
   Total records: 50
   Columns: ['post_id', 'post_title', 'subreddit', 'comment_id', 'body', 'score', 'author', 'created_utc']

🔗 DATA INTEGRITY CHECK:
   Comments are linked to 10 unique posts
   Average comments per post: 5.0

📊 DATA STRUCTURE PREVIEW:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   post_id      50 non-null     object        
 1   post_title   50 non-null     object        
 2   subreddit    50 non-null     object        
 3   comment_id   50 non-null     object        
 4   body         50 non-null     object        
 5   score        50 non-null     int64         
 6   author       50 non-null     object        
 7   created_utc  50 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 3.3+ KB
None

✅ Commen