# Text Analysis of Subreddits - Collaborative Project

The purpose of this project is to analyse the differences and similarities between subreddits that revolve around a common topic. First, we collect Reddit data, analyse TF-IDF scores, and attempt to classify the subreddits using k-means and Naive Bayes algorithms. Then, we introduce networks to visualise connections between threaded comments and users.

In [None]:
# Set up IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to the project's own modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# How to regenerate README.md from this notebook:
#   1. pip3 install nbconvert
#   2. jupyter nbconvert --execute --to markdown RedditTextAnalysis.ipynb
#   3. rename the generated markdown file to README.md

## Part I/ Collecting Reddit Data

In [1]:
# Import necessary modules
import os
import pickle
import json
import pandas as pd
from config.settings import USER_AGENT
from models.reddit_scraper import RedditScraper
from utils.analysis import *
from datetime import datetime

In [13]:
# Subreddits compared throughout the notebook. The spelling is case-sensitive:
# the names are used as keys into `results` and as cache-file prefixes.
subs_of_interest = [
    "islam",
    "Christianity",
    "atheism",
    "Buddhism",
]

In [None]:
# scraper = RedditScraper(USER_AGENT)

# subs_of_interest = ['islam', 'atheism', 'Christianity', 'Buddhism']

# results = {} 

# for sub in subs_of_interest:    
#     posts = scraper.get_subreddit_posts(sub, limit=1000, cache=True)
#     posts_df = create_posts_dataframe(posts)
    
#     tfidf_results = tfidf_analyze_subreddit(posts, include_selftext=True)
#     # tfidf_results = tfidf_analyze_subreddit(posts)
#     tf_idf_scores = get_mean_tfidf(
#         tfidf_matrix=tfidf_results['tfidf_matrix'],
#         feature_names=tfidf_results['feature_names'],
#         return_df=True
#     )
#     results[sub] = {"posts_df":posts_df,
#                     "tfidf_results":tfidf_results,
#                     "tf_idf_scores":tf_idf_scores}

# # Ensure the data directory exists
# os.makedirs("data", exist_ok=True)

# # Just a backup of all the files, not used in the analysis directly
# with open("data/results.pkl", "wb") as f:
#     pickle.dump(results, f)

In [14]:
# Restore the per-subreddit results from the local pickle backup.
# NOTE: only unpickle files you created yourself; pickle can execute code.
with open("data/results.pkl", "rb") as pickle_file:
    results = pickle.load(pickle_file)

# Quick sanity checks: the top-level keys, the keys stored for 'islam',
# the post body stored under label 1, and the frequency of each post body.
islam_selftext = results['islam']['posts_df']['selftext']
print(results.keys())
print(results['islam'].keys())
print(islam_selftext[1])
print(islam_selftext.value_counts())


dict_keys(['islam', 'atheism', 'Christianity', 'Buddhism'])
dict_keys(['posts_df', 'tfidf_results', 'tf_idf_scores'])
I could be in the worst of the worst situations and I read maghrib then POOF its all gone, people who genuinely follow the religion are the best people unlike most people this gen vaping and going to parties thinking its cool.
selftext
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

## Part II/ Initial Similarity Analysis 

### Jaccard Similarity Analysis

In [None]:
# Compare the TF-IDF vocabularies of the subreddits through set overlap.
from itertools import combinations

# Vocabulary (set of TF-IDF feature names) of each subreddit
vocabularies = {
    sub: set(results[sub]['tfidf_results']['feature_names'])
    for sub in subs_of_interest
}

# Terms that occur in every subreddit's vocabulary
# (empty set when there are no subreddits at all)
common_vocab = set.intersection(*vocabularies.values()) if vocabularies else set()

# Report analytics; terms are sorted so the preview is reproducible between runs
print(f"Number of common terms across all subreddits: {len(common_vocab)}")
print(f"Common terms: {', '.join(sorted(common_vocab)[:10])}...")  # Display first 10 common terms

# Jaccard similarity |A ∩ B| / |A ∪ B| for every unordered pair of subreddits.
# combinations() yields each pair exactly once, independent of how the
# subreddit names compare as strings.
for sub1, sub2 in combinations(subs_of_interest, 2):
    intersection = vocabularies[sub1] & vocabularies[sub2]
    union = vocabularies[sub1] | vocabularies[sub2]
    # Two empty vocabularies have no overlap to measure; report 0 instead of dividing by zero
    jaccard_similarity = len(intersection) / len(union) if union else 0.0
    print(f"Jaccard similarity between {sub1} and {sub2}: {jaccard_similarity:.3f}")

# Report the terms that are unique to each subreddit. A term only counts as
# unique when no other subreddit uses it, so the union of all *other*
# vocabularies is subtracted (subtracting `common_vocab` alone would keep
# terms that are shared with some, but not all, of the other subreddits).
for sub in subs_of_interest:
    other_terms = set().union(
        *(vocabularies[other] for other in subs_of_interest if other != sub)
    )
    unique_terms = vocabularies[sub] - other_terms
    print(f"Number of unique terms in {sub}: {len(unique_terms)}")
    print(f"Unique terms in {sub}: {', '.join(sorted(unique_terms)[:10])}...")  # Display first 10 unique terms

### Plotting Word Similarities #1: MDS

In [None]:
from utils.analysis import plot_word_similarities_mds

# Draw one MDS word-similarity figure per subreddit from its stored TF-IDF results.
# NOTE(review): `plt` is not imported in the visible cells; presumably it comes
# in through `from utils.analysis import *` — confirm.
for sub in subs_of_interest:
    tfidf = results[sub]['tfidf_results']
    plot_word_similarities_mds(
        tfidf['tfidf_matrix'],
        tfidf['feature_names'],
        n_terms=20,  # presumably the number of terms plotted — confirm in utils.analysis
        title=sub,
    )
    plt.show()


### Plotting Word Similarities #2: t-SNE

In [None]:
from utils.analysis import plot_word_similarities_tsne

# Draw one t-SNE word-similarity figure per subreddit from its stored TF-IDF results.
# NOTE(review): `plt` is not imported in the visible cells; presumably it comes
# in through `from utils.analysis import *` — confirm.
for sub in subs_of_interest:
    tfidf = results[sub]['tfidf_results']
    fig, ax = plot_word_similarities_tsne(
        tfidf['tfidf_matrix'],
        tfidf['feature_names'],
        n_highlight=20,  # presumably the number of highlighted terms — confirm in utils.analysis
        title=sub,
    )
    plt.show()

## Part III/ Believing vs. Knowing 

------
------

### Code for extracting monthly posts

In [None]:
# Scraping windows for 2024. `month_starts` holds the first day of every month
# (January to December); consecutive entries delimit one window each, so the
# windows run from January through November and the last one ends on 1 December.
# NOTE(review): December itself is not scraped — extend the boundaries to
# 1 January 2025 if it should be.
month_starts = [datetime(2024, month, 1) for month in range(1, 13)]
start_dates = month_starts[:-1]
end_dates = month_starts[1:]

# NOTE(review): `RedditScraper_monthly` is not imported in the visible cells
# (only `RedditScraper` is); presumably it arrives via `from utils.analysis import *`
# or needs its own import — confirm.
scraper = RedditScraper_monthly(USER_AGENT)

# Posts collected for each subreddit, accumulated across all monthly windows
all_posts = {sub: [] for sub in subs_of_interest}

# Collect the posts window by window for every subreddit
for sub in subs_of_interest:
    for start, end in zip(start_dates, end_dates):
        month_name = start.strftime("%B")
        print(f"Scraping {sub} for {month_name} 2024...")

        # Request the posts of this window; `after`/`before` carry the window
        # boundaries (how inclusive they are depends on the scraper — confirm)
        posts = scraper.get_subreddit_posts(sub, limit=100, cache=True, after=start, before=end)

        # Add this window's posts to the subreddit's running list
        all_posts[sub].extend(posts)

In [None]:
# Combine the cached JSON files into one DataFrame per subreddit.
# All working variables live inside the helper, so nothing has to be deleted
# afterwards and `subs_of_interest` stays available to the other cells.


def _load_cached_posts(subreddit, cache_dir='cache'):
    """Concatenate every cached JSON file of one subreddit into a DataFrame.

    Args:
        subreddit: Subreddit name; cache files are matched by this
            case-sensitive filename prefix.
        cache_dir: Directory that holds the cached ``.json`` files.

    Returns:
        One DataFrame containing the rows of all matching files, with a
        fresh 0-based index.

    Raises:
        FileNotFoundError: If ``cache_dir`` does not exist or contains no
            JSON file for ``subreddit``.
    """
    # Sort the names: os.listdir() returns entries in arbitrary order, and a
    # stable file order keeps the row order reproducible between runs.
    json_files = sorted(
        name for name in os.listdir(cache_dir)
        if name.startswith(subreddit) and name.endswith('.json')
    )
    if not json_files:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate"
        raise FileNotFoundError(
            f"No cached JSON files for subreddit '{subreddit}' in '{cache_dir}'"
        )

    monthly_data = []
    for json_file in json_files:
        # Explicit encoding so the result does not depend on the platform default
        with open(os.path.join(cache_dir, json_file), 'r', encoding='utf-8') as f:
            monthly_data.append(pd.DataFrame(json.load(f)))

    return pd.concat(monthly_data, ignore_index=True)


# Dictionary mapping each subreddit name to its combined DataFrame
dataframes = {sub: _load_cached_posts(sub) for sub in subs_of_interest}


In [None]:
# Give each subreddit's DataFrame its own name for a better overview.
# The names must start with "df_": a later cell finds these frames by scanning
# globals() for that prefix.
df_islam, df_christianity, df_atheism, df_buddhism = (
    dataframes[key] for key in ('islam', 'Christianity', 'atheism', 'Buddhism')
)

In [None]:
# Run the TF-IDF pipeline once per subreddit on its aggregated posts
for sub, posts in all_posts.items():
    print(f"\nConducting TFIDF analysis on all posts for {sub} in 2024...")

    # Tabular view of the aggregated posts (not used further in this cell)
    posts_df = create_posts_dataframe(posts)

    # TF-IDF on the raw posts, then the mean score per term as a DataFrame
    tfidf_results = tfidf_analyze_subreddit(posts)
    tfidf_matrix = tfidf_results['tfidf_matrix']
    feature_names = tfidf_results['feature_names']
    tf_idf_scores = get_mean_tfidf(
        tfidf_matrix=tfidf_matrix,
        feature_names=feature_names,
        return_df=True,
    )

    # Show the ten top terms, followed by the head of the score table
    top_terms = get_top_terms(tf_idf_scores, n_terms=10)
    print(f"The top 10 terms for {sub} in 2024 are:\n", "\n".join(top_terms), sep="")
    styled_head = tf_idf_scores.head().style.format("{:.3f}")
    display(styled_head)

In [None]:
# Add a 'datetime' column to every global DataFrame whose name starts with
# 'df_' (so newly added df_* frames are handled automatically). The column is
# derived from 'created_utc', interpreted as seconds since the Unix epoch.
# globals() is copied into a list first because the loop itself binds the
# globals `name` and `df`.
for name, df in list(globals().items()):
    if not name.startswith('df_'):
        continue
    if not isinstance(df, pd.DataFrame):
        continue
    df['datetime'] = pd.to_datetime(df['created_utc'], unit='s')