# Text Analysis of Subreddits - Collaborative Project

The purpose of this project is to analyse the differences and similarities between subreddits that revolve around a common topic. First, we collect Reddit data, analyse TF-IDF scores, and attempt to classify the subreddits using k-means and Naive Bayes algorithms. Then, we introduce networks to visualise connections between threaded comments and users.

In [None]:
# Set up IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell is executed, so edits to the project's own modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# To regenerate README.md from this notebook:
#   1. pip3 install nbconvert
#   2. jupyter nbconvert --execute --to markdown RedditTextAnalysis.ipynb
#   3. rename the resulting Markdown file to README.md

## I/ Collecting Reddit Data

In [None]:
# Import necessary modules
import os
import pickle
import json
import pandas as pd
from models.reddit_scraper_monthly import RedditScraper_monthly
from config.settings import USER_AGENT
from utils.analysis import *
from datetime import datetime

In [None]:
# Subreddits to compare. These names are passed to the scraper, matched
# (case-sensitively) as filename prefixes when the cached JSON files are
# loaded, and used as keys of the resulting `dataframes` dictionary.
subs_of_interest = ['islam', 'Christianity', 'atheism', 'Buddhism']

------

### Code for extracting monthly posts

In [None]:
# Monthly scraping windows for 2024. The range is hard-coded to January
# through November; extend it to range(1, 13) to include December.
start_dates = [datetime(2024, month, 1) for month in range(1, 12)]

# Each window ends on the first day of the following month. The arithmetic
# rolls over to January of the next year for a December start, so the list
# stays correct if the range above is extended.
# NOTE(review): whether `before` is treated as inclusive or exclusive depends
# on RedditScraper_monthly -- confirm.
end_dates = [
    datetime(first_day.year + first_day.month // 12, first_day.month % 12 + 1, 1)
    for first_day in start_dates
]

scraper = RedditScraper_monthly(USER_AGENT)

# Dictionary to store posts for each subreddit across all months
all_posts = {sub: [] for sub in subs_of_interest}

# Loop through each subreddit and each monthly window to collect posts
for sub in subs_of_interest:
    for start, end in zip(start_dates, end_dates):
        month_name = start.strftime("%B")
        print(f"Scraping {sub} for {month_name} 2024...")

        # Scrape posts for the specific month
        # (presumably cache=True also stores the result in the cache
        # directory read by the loading cell -- verify in the scraper)
        posts = scraper.get_subreddit_posts(sub, limit=100, cache=True, after=start, before=end)

        # Append the monthly posts to the subreddit-specific list
        all_posts[sub].extend(posts)

In [None]:
# Combine the cached monthly JSON files into one DataFrame per subreddit
# Directory where JSON files are stored
cache_dir = 'cache'

# Dictionary to store DataFrames, keyed by subreddit name
dataframes = {}

# Loop through each subreddit to combine its JSON files into one DataFrame
for sub in subs_of_interest:
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the combined DataFrame is reproducible between runs.
    # Files are matched by (case-sensitive) subreddit-name prefix.
    json_files = sorted(
        entry for entry in os.listdir(cache_dir)
        if entry.startswith(sub) and entry.endswith('.json')
    )

    # Fail with a clear message instead of pandas' "No objects to concatenate"
    if not json_files:
        raise ValueError(
            f"No cached JSON files found for subreddit '{sub}' in '{cache_dir}'"
        )

    # List to collect individual DataFrames for each month
    monthly_data = []

    for json_file in json_files:
        # Explicit encoding so reading does not depend on the platform default
        with open(os.path.join(cache_dir, json_file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        df = pd.DataFrame(data)
        monthly_data.append(df)

    # Concatenate all monthly data into a single DataFrame for the subreddit
    dataframes[sub] = pd.concat(monthly_data, ignore_index=True)

# Clean up by removing variables that aren't needed any more.
# Because subs_of_interest is deleted here, re-running this cell requires
# re-running the cell that defines it first.
del cache_dir, subs_of_interest, sub, json_files, monthly_data, json_file, f, data, df


In [None]:
# Give every subreddit's DataFrame its own top-level name for easier access.
# The names must keep the "df_" prefix: the cell that adds the 'datetime'
# column finds the frames by looking for globals that start with "df_".
df_islam, df_christianity, df_atheism, df_buddhism = (
    dataframes[key]
    for key in ('islam', 'Christianity', 'atheism', 'Buddhism')
)

In [None]:
# Run the TF-IDF analysis once per subreddit on its posts from all months
for sub, posts in all_posts.items():
    print(f"\nConducting TFIDF analysis on all posts for {sub} in 2024...")

    # DataFrame view of the aggregated posts
    # NOTE(review): posts_df is not used below -- the analysis receives the
    # raw `posts`; confirm whether it was meant to receive posts_df instead.
    posts_df = create_posts_dataframe(posts)

    # TF-IDF matrix and vocabulary, reduced to per-term mean scores
    tfidf_results = tfidf_analyze_subreddit(posts)
    tf_idf_scores = get_mean_tfidf(
        return_df=True,
        feature_names=tfidf_results['feature_names'],
        tfidf_matrix=tfidf_results['tfidf_matrix'],
    )

    # Report the ten top terms, one per line, then show the first rows of scores
    top_terms = get_top_terms(tf_idf_scores, n_terms=10)
    print(f"The top 10 terms for {sub} in 2024 are:\n" + "\n".join(top_terms))
    display(tf_idf_scores.head().style.format("{:.3f}"))

In [None]:
def add_datetime_columns(namespace):
    """Add a 'datetime' column to every DataFrame whose name starts with 'df_'.

    Args:
        namespace: Mapping of variable names to objects (e.g. ``globals()``).
            Entries whose name starts with ``'df_'`` and whose value is a
            pandas DataFrame are modified in place.

    For each matching DataFrame, the 'created_utc' column is interpreted as
    seconds since the Unix epoch and converted into a new 'datetime' column.
    Matching DataFrames without a 'created_utc' column are skipped with a
    notice instead of raising KeyError.
    """
    # Iterate over a snapshot so the mapping may change while we loop
    for name, frame in list(namespace.items()):
        if not (name.startswith('df_') and isinstance(frame, pd.DataFrame)):
            continue
        if 'created_utc' not in frame.columns:
            print(f"Skipping {name}: no 'created_utc' column")
            continue
        frame['datetime'] = pd.to_datetime(frame['created_utc'], unit='s')


# Create a 'datetime' column for each global DataFrame whose name starts with
# 'df_' (so newly added subreddit DataFrames are picked up automatically)
add_datetime_columns(globals())