In [1]:
import praw
import pandas as pd
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import networkx as nx
from collections import Counter
import creds
import time


In [2]:
#Uncomment these to have full output on jupyter
#pd.set_option('display.max_colwidth', None)
#pd.set_option('display.max_rows', None)
#pd.set_option('display.width', None)  # This one is important for terminal-like output
#pd.set_option('display.expand_frame_repr', False) # Prevents line wrapping.


In [None]:
# https://www.reddit.com/r/pushshift/comments/1itme1k/separate_dump_files_for_the_top_40k_subreddits/

In [3]:
import sys
# !{sys.executable} -m pip install spacy
# !{sys.executable} -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")


In [4]:
"""Main function to orchestrate the process."""
reddit = praw.Reddit(
    client_id=creds.CLIENT_ID,
    client_secret=creds.CLIENT_SECRET,
    user_agent=creds.USER_AGENTS
)

In [5]:
# def fetch_reddit_posts(keywords, subreddits, days, limit):
#     end_time = datetime.datetime.now(datetime.UTC)
#     start_time = end_time - datetime.timedelta(days=days)

#     posts = []
#     for subreddit in subreddits:
#         current_end_time = end_time.timestamp()
#         while current_end_time > start_time.timestamp() and len(posts) < limit:
#             print(f"Fetching posts before: {datetime.datetime.fromtimestamp(current_end_time, datetime.UTC)}")
#             for submission in reddit.subreddit(subreddit).search(
#                 " OR ".join(keywords),
#                 sort="new",
#                 limit=min(100, limit - len(posts)),  # Fetch up to 100 at a time, respecting the overall limit
#                 params={'before': int(current_end_time)}  # Pass 'before' in the params dictionary
#             ):
#                 if submission.created_utc < start_time.timestamp():
#                     break  # Stop if we've gone past the start time

#                 posts.append({
#                     "subreddit": subreddit,
#                     "title": submission.title,
#                     "text": submission.selftext,
#                     "created_utc": submission.created_utc
#                 })
#             if posts:
#                 current_end_time = posts[-1]['created_utc']
#             else:
#                 break # No more posts found for this time range

#     return pd.DataFrame(posts)

# keywords = ["lazy", "lousy", "dirty", "bad", "terrible", "horrible", "unreliable"]
# subreddits = ["all"]
# #subreddits = ["airlines", "Comcast", "healthcare", "railways"]
# df = fetch_reddit_posts(keywords, subreddits, days=1000, limit=10000)
# df["created_date"] = pd.to_datetime(df["created_utc"], unit="s")
# print("DF Head---------------------------------")
# print(df.head())
# print("DF Tail---------------------------------")
# print(df.tail())
# print(f"Total posts fetched: {len(df)}")

In [None]:
#Same as above but limits post collection to one every X minute timespan
def fetch_reddit_posts_fast_intervals(keywords, subreddits, days, limit, interval_minutes=10):
    """
    Fetches Reddit posts, finding the first matching post in each interval and skipping to the next.

    Args:
        keywords (list): List of keywords to search for.
        subreddits (list): List of subreddits to search in.
        days (int): Number of days to search back.
        limit (int): Maximum number of posts to retrieve.
        interval_minutes (int, optional): Time interval in minutes. Defaults to 10.

    Returns:
        pd.DataFrame: DataFrame containing fetched posts.
    """

    end_time = datetime.datetime.now(datetime.UTC)
    start_time = end_time - datetime.timedelta(days=days)
    interval_seconds = interval_minutes * 60

    posts = []
    current_time = end_time
    added_post_ids = set()  # Track post IDs to prevent duplicates

    while current_time > start_time and len(posts) < limit:
        interval_start = current_time - datetime.timedelta(minutes=interval_minutes)
        interval_end = current_time
        print(f"End Time of Interval being checked: {interval_end}")
        
        found_post = False
        for subreddit in subreddits:
            if found_post:
                break #if we found a post, move on to the next subreddit.
            for submission in reddit.subreddit(subreddit).search(
                " OR ".join(keywords),
                sort="new",
                time_filter="all",
                params={'before': int(interval_end.timestamp()), 'after': int(interval_start.timestamp())}
            ):
                if submission.id not in added_post_ids: #check if the post was already added.
                    #Print something so we see it running
                    post_text = (submission.selftext or "")  # Handle cases where selftext is None
                    post_text_preview = post_text[:30]
                    print(f"Adding Post - ID: {submission.id}, UTC: {submission.created_utc}, Title: {submission.title}, Text: {post_text_preview}...")

                    posts.append({
                        "subreddit": submission.subreddit.display_name,
                        "title": submission.title,
                        "text": submission.selftext,
                        "created_utc": submission.created_utc
                    })
                added_post_ids.add(submission.id) #add post id to set.
                found_post = True
                break #we found a post, move on to the next subreddit.

        current_time = interval_start

    return pd.DataFrame(posts)

keywords = ["lazy", "lousy", "dirty", "bad", "terrible", "horrible", "unreliable", "wrong", "hate", "sucks"]
subreddits = ["all"]
df = fetch_reddit_posts_fast_intervals(keywords, subreddits, days=10000, limit=1000, interval_minutes=60)
df["created_date"] = pd.to_datetime(df["created_utc"], unit="s")
print("DF Head---------------------------------")
print(df.head())
print("DF Tail---------------------------------")
print(df.tail())
print(f"Total posts fetched: {len(df)}")

Interval End time: 2025-03-31 03:36:24.079975+00:00
Adding Post - ID: 1jnu4uw, UTC: 1743391649.0, Title: AITA for being in touch with every guy I ever talked to?, Text: I am 23 and I have talked to 4...
Interval End time: 2025-03-31 02:36:24.079975+00:00
Interval End time: 2025-03-31 01:36:24.079975+00:00
Interval End time: 2025-03-31 00:36:24.079975+00:00
Interval End time: 2025-03-30 23:36:24.079975+00:00
Interval End time: 2025-03-30 22:36:24.079975+00:00
Interval End time: 2025-03-30 21:36:24.079975+00:00
Interval End time: 2025-03-30 20:36:24.079975+00:00
Interval End time: 2025-03-30 19:36:24.079975+00:00
Interval End time: 2025-03-30 18:36:24.079975+00:00
Interval End time: 2025-03-30 17:36:24.079975+00:00
Interval End time: 2025-03-30 16:36:24.079975+00:00
Interval End time: 2025-03-30 15:36:24.079975+00:00
Interval End time: 2025-03-30 14:36:24.079975+00:00
Interval End time: 2025-03-30 13:36:24.079975+00:00
Interval End time: 2025-03-30 12:36:24.079975+00:00
Interval End time

In [None]:
df.columns
df.to_csv("df.csv", index=False)

In [None]:
# nlp = spacy.load("en_core_web_sm")

# def extract_entities(text):
#     doc = nlp(text)
#     entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "GPE", "PRODUCT"]]
#     return entities if entities else None

# df["entities"] = df["text"].apply(lambda x: extract_entities(x) if isinstance(x, str) else [])
# df = df.explode("entities").dropna().reset_index(drop=True)  # Explode list of entities
# print(df.head())


In [None]:
#Same function as above cell but instead prints out a single entity that is most frequent of the post.
# import spacy
# import pandas as pd
# from collections import Counter

# nlp = spacy.load("en_core_web_sm")

def get_most_frequent_entity(text):
    """
    Extracts entities (ORG, GPE, PRODUCT) and returns the most frequent one.
    """
    if not isinstance(text, str) or not text:
        return None

    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "GPE", "PRODUCT"]]

    if not entities:
        return None

    entity_counts = Counter(entities)
    return entity_counts.most_common(1)[0][0]  # Return the most frequent entity

# Example usage (assuming 'df' is your DataFrame):

# df["most_frequent_entity"] = df["text"].apply(get_most_frequent_entity)
# df = df.dropna(subset=['most_frequent_entity']).reset_index(drop=True)
df["entity"] = df["text"].apply(get_most_frequent_entity)
df = df.dropna(subset=['entity']).reset_index(drop=True)
print(df.head())

In [None]:
df.columns
df.to_csv("df_w_entity.csv", index=False)

In [None]:
def analyze_top_entities(df):
    """
    Counts the frequency of entities in a DataFrame column and filters the DataFrame
    to include only rows containing the top 5 most frequent entities.

    Args:
        df (pd.DataFrame): DataFrame containing a column named 'entities'
                                     (which should be a single entity per row).

    Returns:
        tuple: A tuple containing:
            - list: A list of the top 5 most frequent entities.
            - pd.DataFrame: A DataFrame filtered to include only rows where
                              the 'entities' column contains one of the top 5 entities.
    """
    # Count the frequency of each entity
    top_entities = df["entity"].value_counts().head(5).index.tolist()

    # Filter dataset to only include these top entities
    df_filtered = df[df["entity"].isin(top_entities)].reset_index(drop=True)

    print("Top 5 Entities:", top_entities)
    print(df_filtered.head())

    return top_entities, df_filtered

# Example usage (assuming 'df' is your DataFrame):

top_entities_result, df_filtered_result = analyze_top_entities(df.copy())

In [None]:
df_filtered_result.to_csv("df_filtered.csv", index=False)

In [None]:
def plot_entity_frequency_over_time(df):
    """
    Plots the frequency of entities over time (created_utc).

    Args:
        df (pd.DataFrame): DataFrame with columns 'entity' and 'created_utc'.
    """

    # Convert 'created_utc' to datetime if it's not already
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

    # Group by 'created_utc' and 'entity', then count occurrences
    entity_time_counts = df.groupby([pd.Grouper(key='created_utc', freq='Y'), 'entity']).size().reset_index(name='count')
    #D is daily, W Weekly, M Monthly

    # Pivot the data for plotting
    entity_time_pivot = entity_time_counts.pivot(index='created_utc', columns='entity', values='count').fillna(0)

    # Plot the data
    plt.figure(figsize=(15, 7))
    for entity in entity_time_pivot.columns:
        plt.plot(entity_time_pivot.index, entity_time_pivot[entity], label=entity)

    plt.title('Entity Frequency Over Time')
    plt.xlabel('Date')
    plt.ylabel('Frequency')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

plot_entity_frequency_over_time(df_filtered_result)

In [None]:
# def analyze_top_entities(df):
#     """
#     Counts the frequency of entities in a DataFrame column and filters the DataFrame
#     to include only rows containing the top 5 most frequent entities.

#     Args:
#         df (pd.DataFrame): DataFrame containing a column named 'entities'
#                            (which should be a list of entities per row).

#     Returns:
#         tuple: A tuple containing:
#             - list: A list of the top 5 most frequent entities.
#             - pd.DataFrame: A DataFrame filtered to include only rows where
#                             the 'entities' column contains one of the top 5 entities.
#     """
#     # Explode the 'entities' column to count individual entity occurrences
#     df_exploded = df.explode("entities").dropna().reset_index(drop=True)

#     # Count the frequency of each entity
#     top_entities = df_exploded["entities"].value_counts().head(5).index.tolist()

#     # Filter dataset to only include these top entities
#     df_filtered = df_exploded[df_exploded["entities"].isin(top_entities)].reset_index(drop=True)

#     print("Top 5 Entities:", top_entities)
#     print(df_filtered.head())

#     return top_entities, df_filtered

# top_entities_result, df_filtered_result = analyze_top_entities(df.copy())


In [None]:
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    return score["compound"]

df_filtered_result["sentiment"] = df_filtered_result["text"].apply(lambda x: get_sentiment(x) if isinstance(x, str) else 0)
print(df_filtered_result.head())


In [None]:
import matplotlib.pyplot as plt

def plot_entity_sentiment_over_time(df):
    """
    Plots the sentiment value of entities over time (created_utc).

    Args:
        df (pd.DataFrame): DataFrame with columns 'entity', 'created_utc', and 'sentiment'.
    """

    # Convert 'created_utc' to datetime
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')

    # Group by time and entity, then calculate mean sentiment
    entity_time_sentiment = df.groupby([pd.Grouper(key='created_utc', freq='D'), 'entity'])['sentiment'].mean().reset_index()

    # Pivot the data for plotting
    entity_time_pivot = entity_time_sentiment.pivot(index='created_utc', columns='entity', values='sentiment').fillna(0)

    # Plot the data
    plt.figure(figsize=(15, 7))
    for entity in entity_time_pivot.columns:
        plt.plot(entity_time_pivot.index, entity_time_pivot[entity], label=entity, marker='o', markersize=5)

    plt.title('Entity Sentiment Over Time')
    plt.xlabel('Date')
    plt.ylabel('Sentiment')
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Call the function
plot_entity_sentiment_over_time(df_filtered_result)