# Network Analysis

In [1]:
from zst_reader import read_lines_zst, write_line_zst
import json

In [2]:
subreddits = ["Conservative", "progressive",
              "democrats", "Republican",
              "NeutralPolitics", "PoliticalDiscussion", "politics"]

In [13]:
# Errors that mean "this field is not a usable number": the key is missing
# (KeyError), the value is null or of an unsupported type (TypeError), the
# text cannot be parsed (ValueError), or the value is infinite/too large
# (OverflowError).
_BAD_NUMBER_ERRORS = (KeyError, TypeError, ValueError, OverflowError)


def _numeric_field(obj, key, cast):
    """Return ``cast(obj[key])``, or 0 when the field is missing or not numeric."""
    try:
        return cast(obj[key])
    except _BAD_NUMBER_ERRORS:
        return 0


# Function to count the number of submissions per author
def count_submissions_per_author(input_path, output_path):
    """Aggregate per-author submission statistics and save them as JSON.

    Every line read from ``input_path`` via ``read_lines_zst`` is parsed as a
    JSON object. Objects whose ``author`` is empty or ``"[deleted]"`` are
    skipped. For every other author the function accumulates the number of
    submissions and the sums of ``ups``, ``downs``, ``score``, ``gilded`` and
    ``controversiality``; a field that is missing or not numeric counts as 0.

    The result, a dict mapping author name to its totals, is written to
    ``output_path`` as JSON. Each entry also carries a ``"comments"`` key
    that is initialised to 0 and never incremented by this function.

    Args:
        input_path: Path of the zst file passed to ``read_lines_zst``.
        output_path: Path of the JSON file to write.

    Returns:
        None.
    """
    # (field name, conversion) pairs summed for each author
    summed_fields = (
        ("ups", int),
        ("downs", int),
        ("score", int),
        ("gilded", float),
        ("controversiality", int),
    )

    author_counts = {}

    # read_lines_zst yields pairs; only the line itself is needed here
    for line, _ in read_lines_zst(input_path):
        obj = json.loads(line)

        # Skip if author is empty or deleted
        author = obj.get("author", "")
        if author == "" or author == "[deleted]":
            continue

        # Create the author's entry on first sight
        if author not in author_counts:
            author_counts[author] = {"comments": 0, "submissions": 0, "ups": 0,
                                     "downs": 0, "score": 0, "gilded": 0,
                                     "controversiality": 0}
        totals = author_counts[author]

        # Count the submission and add up its numeric fields
        totals["submissions"] += 1
        for key, cast in summed_fields:
            totals[key] += _numeric_field(obj, key, cast)

    # Save the data to json file
    with open(output_path, mode="w", newline="") as file:
        json.dump(author_counts, file)

In [14]:
# Build the per-author statistics file for every subreddit in the list
for s in subreddits:
    # Classified submissions to read
    input_submission = f"data/{s}/{s}_submissions_classified.zst"
    # JSON file that receives the per-author totals
    output_submission = f"analysis/network/{s}_author_counts.json"
    count_submissions_per_author(input_path=input_submission,
                                 output_path=output_submission)

    

In [63]:
# Find the top 20 authors by number of submissions per subreddit

# Accounts dropped before ranking (presumably bot/moderator accounts -- confirm)
EXCLUDED_AUTHORS = ("PoliticsModeratorBot", "AutoModerator", "optimalg", "IBiteYou")

# How many authors to keep per subreddit
TOP_N = 20

top_authors = {}        # subreddit -> list of (author, submissions), best first
total_submissions = {}  # subreddit -> submissions summed over the kept authors
for s in subreddits:

    submission_count = f"analysis/network/{s}_author_counts.json"

    with open(submission_count, mode="r", newline="") as file:
        author_counts = json.load(file)

    # Remove the excluded accounts; pop() with a default ignores absent names
    for excluded in EXCLUDED_AUTHORS:
        author_counts.pop(excluded, None)

    # Get the total number of submissions (computed after the exclusions)
    total_submissions[s] = sum(stats["submissions"] for stats in author_counts.values())

    # Pair each author with their submission count, most submissions first.
    # sorted() is stable, so tied authors keep the order they have in the file.
    authors_sorted = sorted(
        ((author, stats["submissions"]) for author, stats in author_counts.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Get the top authors
    top_authors[s] = authors_sorted[:TOP_N]


In [64]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [65]:
# For each subreddit, plot the share of submissions made by its top authors
for s in subreddits:

    # Calculate each top author's share of the subreddit's submissions
    df = pd.DataFrame(top_authors[s], columns=["author", "submissions"])
    df["frequency"] = df["submissions"] / total_submissions[s]

    # Plot the proportions as horizontal bars. Use a gradient of blue
    sns.barplot(x="frequency", y="author", data=df, palette="Blues_d")
    plt.xticks(rotation=90)
    # "r/<name>" is written with a forward slash; "\{" is an invalid escape
    # sequence and would put a literal backslash in the title
    plt.title(f"Proportion of submissions by top 20 authors in r/{s}")

    # Save the plot, then close the figure so the next subreddit starts clean
    plt.savefig(f"analysis/network/{s}_top_authors.png", bbox_inches="tight")
    plt.close()