In [1]:
import praw
import pandas as pd
from datetime import datetime
import reddit_credentials as rc

# Authenticate against the Reddit API. The credentials are read from the
# local `reddit_credentials` module (imported as `rc`); supply your own there.
reddit = praw.Reddit(
    client_id=rc.client_id,
    client_secret=rc.client_secret,
    user_agent=rc.user_agent,
)

# Communities that are searched for the query below
subreddits = ["moving", "florida", "realestate", "jobs", "tax"]

# Collect one [created_utc, title, selftext, subreddit] row per search hit
posts = []
for sub in subreddits:
    hits = reddit.subreddit(sub).search("moving to Florida", limit=1000)
    posts.extend([hit.created_utc, hit.title, hit.selftext, sub] for hit in hits)

# Build the frame in one chain:
#   * turn the epoch-second timestamps into datetimes,
#   * join title and body into a single `text` column for classification,
#   * keep only the columns the later cells use.
df = (
    pd.DataFrame(posts, columns=["created_utc", "title", "selftext", "subreddit"])
    .assign(
        created_utc=lambda frame: pd.to_datetime(frame["created_utc"], unit="s"),
        text=lambda frame: frame["title"] + " " + frame["selftext"],
    )
    .loc[:, ["created_utc", "text", "subreddit"]]
)

  from pandas.core import (


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Features are the merged post text; the label is the subreddit the post came from
features, labels = df["text"], df["subreddit"]

# Hold out 20% of the posts for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# TF-IDF features (English stop words removed, at most 5000 terms) feeding a
# logistic-regression classifier with default settings
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
classifier = LogisticRegression()
pipeline = Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])

# Fit on the training split, then label the held-out posts
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Share of held-out posts whose subreddit was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.88


In [3]:
from bertopic import BERTopic

# Train topic model on the merged post text. `fit_transform` returns the topic
# assignments plus a second value that is not needed here.
topic_model = BERTopic()
topics, _ = topic_model.fit_transform(df["text"])

# Add topics to DataFrame (one topic id per post)
df["topic"] = topics

# Aggregate topics over time: number of posts per (calendar month, topic)
df["month"] = df["created_utc"].dt.to_period("M").astype(str)
df_topic_trends = df.groupby(["month", "topic"]).size().reset_index(name="post_count")

# Convert to wide format for forecasting: one row per month, one column per
# topic; (month, topic) pairs without posts become 0
df_pivot = df_topic_trends.pivot(index="month", columns="topic", values="post_count").fillna(0)
df_pivot.index = pd.to_datetime(df_pivot.index)

# The pivot only contains months in which at least one post exists. A month
# with no posts at all would otherwise be a hole in the time series instead
# of an observed zero, so re-index onto the complete month-start range and
# fill the inserted months with 0. The index keeps the name "month".
if not df_pivot.empty:
    all_months = pd.date_range(
        df_pivot.index.min(), df_pivot.index.max(), freq="MS", name="month"
    )
    df_pivot = df_pivot.reindex(all_months, fill_value=0)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
from prophet import Prophet

# Forecast each topic: fit one Prophet model per topic column of `df_pivot`
# and keep its predictions keyed by topic id
topic_forecasts = {}

for topic in df_pivot.columns:
    # Prophet expects a frame with a `ds` (date) and a `y` (value) column.
    # Naming the series and its index directly yields exactly those two
    # columns without an in-place rename and without relying on the index
    # carrying a particular name.
    df_topic = df_pivot[topic].rename("y").rename_axis("ds").reset_index()

    # Train Prophet model
    model = Prophet()
    model.fit(df_topic)

    # Predict the next 12 months. The monthly index is parsed from "YYYY-MM"
    # strings, i.e. it sits on the first day of each month, so the future
    # dates must use the month-start frequency ("MS"). The month-end alias
    # "M" would place the forecast on a different day-of-month grid than the
    # history (and is deprecated in recent pandas releases).
    future = model.make_future_dataframe(periods=12, freq="MS")
    forecast = model.predict(future)

    # Store predictions (history + forecast horizon) for this topic
    topic_forecasts[topic] = forecast[["ds", "yhat"]]


18:55:07 - cmdstanpy - INFO - Chain [1] start processing
18:55:08 - cmdstanpy - INFO - Chain [1] done processing
18:55:08 - cmdstanpy - INFO - Chain [1] start processing
18:55:08 - cmdstanpy - INFO - Chain [1] done processing
18:55:08 - cmdstanpy - INFO - Chain [1] start processing
18:55:08 - cmdstanpy - INFO - Chain [1] done processing
18:55:09 - cmdstanpy - INFO - Chain [1] start processing
18:55:09 - cmdstanpy - INFO - Chain [1] done processing
18:55:09 - cmdstanpy - INFO - Chain [1] start processing
18:55:09 - cmdstanpy - INFO - Chain [1] done processing


In [6]:
# Example: Generate synthetic future discussion topics
# Map every topic id to its forecast value at the end of the forecast horizon
future_topics = {topic: topic_forecasts[topic]["yhat"].values[-1] for topic in topic_forecasts}

# Simulate a new post about the topic with the highest forecast volume.
# BERTopic uses the id -1 for outlier documents that belong to no real topic;
# that bucket is skipped (unless it is the only one available) so the sample
# text is built from an actual topic rather than from whichever key happens
# to come first in the dict.
candidate_topics = {topic: yhat for topic, yhat in future_topics.items() if topic != -1}
if not candidate_topics:
    candidate_topics = future_topics
topic_id = max(candidate_topics, key=candidate_topics.get)

# Use the topic's representative words as the text of the synthetic post
sample_text = " ".join(word for word, _ in topic_model.get_topic(topic_id))

# Predict the subreddit
predicted_subreddit = pipeline.predict([sample_text])[0]
print(f"Predicted subreddit: {predicted_subreddit}")


Predicted subreddit: florida
