### Imports and setup for PRAW

In [None]:
import os
import re
import time

import pandas as pd
import praw
from nltk.corpus import stopwords

# Reddit API credentials are read from the environment so that secrets are not
# stored in the notebook or its version history.
# NOTE(review): the client id/secret that used to be hard-coded here were
# committed in plain text and should be revoked and regenerated.
# Unset variables yield None; presumably the Reddit client then refuses to
# start -- confirm against the PRAW configuration docs.
client_id = os.environ.get('REDDIT_CLIENT_ID')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET')
user_agent = os.environ.get('REDDIT_USER_AGENT', 'MyRedditBot:v1.0 (by u/Healthy-Pollution929)')

### Get the data

In [None]:
# Build the shared PRAW client used by the fetch helpers below, passing the
# credentials and user agent defined earlier in the notebook.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)

def fetch_comments(subreddit_name, search_query, time_filter='year', post_limit=200, comment_limit=10, sleep_time=1):
    """
    Collect the leading 'top'-sorted comments from posts matching a search query in a subreddit.

    For every post returned by the subreddit search, the comment sort order is
    set to 'top' and the first `comment_limit` entries of `post.comments` with a
    non-blank body are recorded.

    Parameters:
        subreddit_name (str): Name of the subreddit to search.
        search_query (str): The search term to filter posts.
        time_filter (str): Time range for the search ('day', 'week', 'month', 'year', 'all').
        post_limit (int): Maximum number of posts to retrieve.
        comment_limit (int): Maximum number of comments to retrieve per post.
        sleep_time (int): Time in seconds to wait after processing each post.

    Returns:
        pd.DataFrame: One row per collected comment with the columns
            created_date, subreddit_id, search_query, post_id, comment_id,
            post_title, comment_text, upvotes, post_url, comment_url.
            If nothing was collected (no matches, or an error before the first
            comment) the frame is empty but still has these columns.
            If an error interrupts the search, the comments gathered so far are
            returned.
    """
    column_order = ["created_date", "subreddit_id", "search_query", "post_id", "comment_id",
                    "post_title", "comment_text", "upvotes", "post_url", "comment_url"]
    comments_data = []
    subreddit = reddit.subreddit(subreddit_name)
    print(f"Collecting comments from posts in r/{subreddit_name} related to '{search_query}'...")

    try:
        # Search for relevant posts
        for post in subreddit.search(search_query, time_filter=time_filter, limit=post_limit):
            # Set comment sort order to "top" to get the highest upvoted comments
            post.comment_sort = 'top'

            # Get the post permalink to construct URLs
            post_url = f"https://www.reddit.com{post.permalink}"

            # Drop the "load more comments" placeholders so only real comments remain
            post.comments.replace_more(limit=0)
            for comment in post.comments[:comment_limit]:
                # Only include non-empty comments
                if not comment.body.strip():
                    continue

                comments_data.append({
                    "created_date": pd.to_datetime(comment.created_utc, unit='s'),
                    "subreddit_id": subreddit_name,
                    "search_query": search_query,
                    "post_id": post.id,
                    "comment_id": comment.id,
                    "post_title": post.title,
                    "comment_text": comment.body,
                    "upvotes": comment.ups,
                    "post_url": post_url,
                    # Direct URL to the comment: post URL followed by the comment id
                    "comment_url": f"{post_url}{comment.id}",
                })

            # Sleep to prevent hitting Reddit's rate limit
            time.sleep(sleep_time)

    # Handle API exceptions
    except praw.exceptions.RedditAPIException as api_error:
        print(f"Rate limit or other API error: {api_error}")
        # NOTE(review): nothing is retried after this pause; presumably it is a
        # back-off before the caller issues its next request.
        time.sleep(60)

    # Handle other exceptions
    except Exception as e:
        print(f"Unexpected error occurred: {e}")
        time.sleep(10)

    # Passing `columns` fixes the column order and keeps the schema intact even
    # when no comments were collected (selecting columns from a column-less
    # frame would raise KeyError).
    return pd.DataFrame(comments_data, columns=column_order)

# Define parameters
subreddit_name = 'Israel'  # Specify one subreddit
search_query = 'Palestine'  # Single topic

# Fetch comments for one subreddit and topic
df_comments = fetch_comments(subreddit_name=subreddit_name, search_query=search_query)

# Optional: Save the data to a CSV file (create the output folder first so the
# write does not fail on a fresh checkout)
os.makedirs('data', exist_ok=True)
df_comments.to_csv(f'data/{subreddit_name}_{search_query}_comments.csv', index=False)

# Report on the frame that was just fetched
print(f"Collected {len(df_comments)} comments on '{search_query}' from r/{subreddit_name}.")


### Data processing

In [None]:
# Read one CSV export per (subreddit, search query) combination, in the same
# order the files were originally listed.
_subreddits = ('politics', 'worldnews', 'Israel', 'Palestine')
_queries = ('Israel', 'Palestine')
_frames = {
    (sub, query): pd.read_csv(f'data/{sub}_{query}_comments.csv')
    for sub in _subreddits
    for query in _queries
}

# Keep the individual frames available under their usual names
df_politics_Israel = _frames[('politics', 'Israel')]
df_politics_Palestine = _frames[('politics', 'Palestine')]
df_worldnews_Israel = _frames[('worldnews', 'Israel')]
df_worldnews_Palestine = _frames[('worldnews', 'Palestine')]
df_Israel_Israel = _frames[('Israel', 'Israel')]
df_Israel_Palestine = _frames[('Israel', 'Palestine')]
df_Palestine_Israel = _frames[('Palestine', 'Israel')]
df_Palestine_Palestine = _frames[('Palestine', 'Palestine')]

# Stack all frames into a single DataFrame (dicts preserve insertion order)
df_combined = pd.concat(list(_frames.values()))

# Report the size before any cleaning
print(f"Combined DataFrame shape before processing: {df_combined.shape}")

# Store original text in a new column for reference
df_combined['comment_original'] = df_combined['comment_text']

# Drop rows whose text was parsed as missing: by default read_csv turns cells
# such as "NA" or "null" into NaN, which would break the string processing below
df_combined = df_combined.dropna(subset=['comment_text'])

# Remove duplicate comments based on 'comment_text'
df_combined = df_combined.drop_duplicates(subset='comment_text')

# Remove comments with 1 or fewer upvotes; copy so the column assignments below
# write to an independent frame instead of a filtered slice
df_combined = df_combined[df_combined['upvotes'] > 1].copy()

# Text preprocessing steps applied to 'comment_text'
# Remove URLs (the dot after "www" is escaped so it only matches a literal dot)
df_combined['comment_text'] = df_combined['comment_text'].str.replace(r'http\S+|www\.\S+', '', regex=True)

# Lowercase conversion
df_combined['comment_text'] = df_combined['comment_text'].str.lower()

# Remove special characters (everything except letters and whitespace)
df_combined['comment_text'] = df_combined['comment_text'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# Remove stop words (make sure nltk stopwords are downloaded)
# NOTE(review): apostrophes were stripped in the previous step, so contractions
# such as "dont" no longer match the stop-word entry "don't" -- confirm whether
# that is intended.
stop_words = set(stopwords.words('english'))
df_combined['comment_text'] = df_combined['comment_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

# Renaming
df_combined = df_combined.rename(columns={'comment_text': 'comment_processed'})

# Reorder columns
df_combined = df_combined[['created_date', 'subreddit_id', 'search_query', 'post_id', 'comment_id',
                           'post_title', 'comment_original', 'comment_processed', 'upvotes',
                           'post_url', 'comment_url']]

# Reset index
df_combined = df_combined.reset_index(drop=True)

# Print shape after processing
print(f"Combined DataFrame shape after processing: {df_combined.shape}")

# Display final DataFrame
df_combined


### Example: compare an original comment with its processed version

In [None]:
# Show the URL, the raw text and the cleaned text of the first comment
first_comment = df_combined.iloc[0]
for column in ('comment_url', 'comment_original', 'comment_processed'):
    print(first_comment[column])

### Sentiment Analysis

In [None]:

from afinn import Afinn
from collections import defaultdict

# Copy the AFINN word -> score table into a defaultdict so that words absent
# from the lexicon score 0 (int() returns 0) instead of raising KeyError
sent_lexicon = defaultdict(int, Afinn()._dict)

# Function to calculate sentiment score for a single text
def calculate_sentiment(text, lexicon=None):
    """
    Compute the average lexicon sentiment score per word of a text.

    Parameters:
        text (str or iterable of str): Either a whitespace-separated string
            (it is split into words) or an iterable of already tokenised
            words. None is treated as an empty text.
        lexicon (mapping, optional): Word -> score mapping. Defaults to the
            module-level `sent_lexicon`.

    Returns:
        float: Sum of the word scores divided by the number of words, or 0.0
            when the text contains no words. Words missing from the lexicon
            count as 0.
    """
    if lexicon is None:
        lexicon = sent_lexicon

    # A string must be split first: iterating it directly would look up single
    # characters instead of words and normalise by the character count.
    if text is None:
        words = []
    elif isinstance(text, str):
        words = text.split()
    else:
        words = list(text)

    if not words:
        return 0.0

    # .get() keeps unknown words from being inserted into the lexicon, which a
    # defaultdict would do on plain indexing.
    sentiment_score = sum(lexicon.get(word, 0) for word in words)

    # Normalize by the number of words (average sentiment per word)
    return sentiment_score / len(words)