In [1]:
# 🚀 Install Dependencies
!pip install praw pandas networkx matplotlib python-louvain scikit-learn

# 📥 Import Libraries
import praw
import pandas as pd
import time
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
from community import community_louvain
from sklearn.metrics import normalized_mutual_info_score
from networkx.algorithms.cuts import conductance
from networkx.algorithms.community import k_clique_communities, asyn_lpa_communities


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


In [None]:
# ✅ Initialize Reddit API
reddit = praw.Reddit(
    client_id="J_AsaAXgfnC9i6qmdoopYQ",
    client_secret="wiRiW0_9LU3M3SQ710vcEMrZZkmbJA",
    user_agent="DataScienceScraper by AdPutrid8574",
    check_for_async=False
)
# Configuration
subreddit = reddit.subreddit("datascience")
TOP_FLAIRS = ["Discussion", "Projects", "ML", "AI", "Coding"]
MIN_USERS_PER_FLAIR = 50
START_TIMESTAMP = int(time.mktime(time.strptime("2025-01-01", "%Y-%m-%d")))
END_TIMESTAMP = int(time.mktime(time.strptime("2025-02-01", "%Y-%m-%d")))

# Data Structures
flair_activity = defaultdict(lambda: {"total_comments": 0, "users": defaultdict(int)})
user_flair_map = defaultdict(set)
user_posts = defaultdict(set)  # Track posts a user commented on

# Fetch and process posts
print("Fetching comments...")
post_counter = comment_counter = 0

for post in subreddit.new(limit=None):
    if not (START_TIMESTAMP <= post.created_utc < END_TIMESTAMP):
        continue

    flair = post.link_flair_text or "No Flair"

    if flair in TOP_FLAIRS:
        post_counter += 1
        post.comments.replace_more(limit=None)

        for comment in post.comments.list():
            if comment.author and (START_TIMESTAMP <= comment.created_utc < END_TIMESTAMP):
                username = comment.author.name
                flair_activity[flair]["users"][username] += 1
                flair_activity[flair]["total_comments"] += 1
                user_flair_map[username].add(flair)
                user_posts[username].add(post.id)  # Track post ID
                comment_counter += 1

        if post_counter % 100 == 0:
            print(f"Processed {post_counter} posts, {comment_counter} comments...")
        time.sleep(0.5)

print(f"Processed {post_counter} posts and {comment_counter} comments.")

# Select top users per flair
final_user_selection = set()
final_user_flairs = defaultdict(set)

for flair in TOP_FLAIRS:
    if flair in flair_activity:
        sorted_users = sorted(flair_activity[flair]["users"].items(), key=lambda x: x[1], reverse=True)
        top_users = sorted_users[:MIN_USERS_PER_FLAIR]

        for user, count in top_users:
            final_user_selection.add(user)
            final_user_flairs[user].add(flair)

        print(f"Flair: {flair} → Users: {len(top_users)}")

# Build nodes and edges
nodes = []
edges = set()

# Flair nodes
for flair, data in flair_activity.items():
    nodes.append((flair, flair, "flair", data["total_comments"], "Flair"))

# User nodes
for user in final_user_selection:
    flairs = final_user_flairs[user]
    flair_label = list(flairs)[0] if len(flairs) == 1 else "Multiple"
    total_comments = sum(flair_activity[f]["users"].get(user, 0) for f in flairs)
    num_posts = len(user_posts[user])  # Number of unique posts

    nodes.append((user, user, "user", total_comments, num_posts, flair_label))

    for flair in flairs:
        edges.add((user, flair))

# Save to CSV
nodes_df = pd.DataFrame(nodes, columns=["Id", "Label", "Type", "Comments", "Posts", "Flair_Interaction"])
edges_df = pd.DataFrame(edges, columns=["Source", "Target"])

nodes_df.to_csv("comment_network_nodes.csv", index=False)
edges_df.to_csv("comment_network_edges.csv", index=False)
print("Comment network files saved.")

Fetching comments...
Processed 100 posts, 3034 comments...
Processed 104 posts and 3253 comments.
Flair: Discussion → Users: 50
Flair: Projects → Users: 50
Flair: ML → Users: 50
Flair: AI → Users: 50
Flair: Coding → Users: 50
Comment network files saved.


In [2]:
# ✅ Initialize Reddit API

reddit = praw.Reddit(
    client_id="J_AsaAXgfnC9i6qmdoopYQ",
    client_secret="wiRiW0_9LU3M3SQ710vcEMrZZkmbJA",
    user_agent="DataScienceScraper by AdPutrid8574",
    check_for_async=False
)

# Configuration
subreddit = reddit.subreddit("datascience")
TOP_FLAIRS = ["Discussion", "Projects", "ML", "AI", "Coding"]
MIN_USERS_PER_FLAIR = 50
START_TIMESTAMP = int(time.mktime(time.strptime("2025-01-01", "%Y-%m-%d")))
END_TIMESTAMP = int(time.mktime(time.strptime("2025-02-01", "%Y-%m-%d")))

# Data Structures
flair_activity = defaultdict(lambda: {
    "post_upvotes": 0,
    "comment_upvotes": 0,
    "users": defaultdict(lambda: {"post_upvotes": 0, "comment_upvotes": 0})
})

# Track post/comment upvotes per user globally
user_post_upvotes = defaultdict(int)    # Total post upvotes per user
user_comment_upvotes = defaultdict(int) # Total comment upvotes per user
user_flair_map = defaultdict(set)       # Flairs the user received upvotes in

# Fetch and process posts
print("Fetching upvotes...")
post_counter = comment_counter = 0

for post in subreddit.new(limit=None):
    if not (START_TIMESTAMP <= post.created_utc < END_TIMESTAMP):
        continue

    flair = post.link_flair_text or "No Flair"

    if flair in TOP_FLAIRS:
        post_counter += 1

        # Track POST upvotes
        if post.author:
            post_author = post.author.name
            post_upvotes = post.score

            flair_activity[flair]["post_upvotes"] += post_upvotes
            flair_activity[flair]["users"][post_author]["post_upvotes"] += post_upvotes
            user_post_upvotes[post_author] += post_upvotes  # Global tracker
            user_flair_map[post_author].add(flair)

        # Track COMMENT upvotes
        post.comments.replace_more(limit=None)
        for comment in post.comments.list():
            if comment.author and (START_TIMESTAMP <= comment.created_utc < END_TIMESTAMP):
                comment_author = comment.author.name
                comment_upvotes = comment.score

                flair_activity[flair]["comment_upvotes"] += comment_upvotes
                flair_activity[flair]["users"][comment_author]["comment_upvotes"] += comment_upvotes
                user_comment_upvotes[comment_author] += comment_upvotes  # Global tracker
                user_flair_map[comment_author].add(flair)
                comment_counter += 1

        if post_counter % 100 == 0:
            print(f"Processed {post_counter} posts, {comment_counter} comments...")
        time.sleep(0.5)

print(f"Processed {post_counter} posts and {comment_counter} comments.")

# Select top users per flair based on TOTAL upvotes (posts + comments)
final_user_selection = set()
final_user_flairs = defaultdict(set)

for flair in TOP_FLAIRS:
    if flair in flair_activity:
        # Calculate total upvotes (posts + comments) per user in this flair
        user_total_upvotes = {
            user: (data["post_upvotes"] + data["comment_upvotes"])
            for user, data in flair_activity[flair]["users"].items()
        }

        sorted_users = sorted(user_total_upvotes.items(), key=lambda x: x[1], reverse=True)
        top_users = sorted_users[:MIN_USERS_PER_FLAIR]

        for user, _ in top_users:
            final_user_selection.add(user)
            final_user_flairs[user].add(flair)

        print(f"Flair: {flair} → Users: {len(top_users)}")

# Build nodes and edges
nodes = []
edges = set()

# Flair nodes (show total post/comment upvotes)
for flair, data in flair_activity.items():
    nodes.append((
        flair,
        flair,
        "flair",
        data["post_upvotes"],
        data["comment_upvotes"],
        data["post_upvotes"] + data["comment_upvotes"],
        "Flair"
    ))

# User nodes
for user in final_user_selection:
    flairs = final_user_flairs[user]
    flair_label = list(flairs)[0] if len(flairs) == 1 else "Multiple"

    total_post_upvotes = user_post_upvotes[user]
    total_comment_upvotes = user_comment_upvotes[user]
    total_upvotes = total_post_upvotes + total_comment_upvotes

    nodes.append((
        user,
        user,
        "user",
        total_post_upvotes,
        total_comment_upvotes,
        total_upvotes,
        flair_label
    ))

    for flair in flairs:
        edges.add((user, flair))

# Save to CSV
nodes_df = pd.DataFrame(nodes, columns=[
    "Id",
    "Label",
    "Type",
    "Post_Upvotes",
    "Comment_Upvotes",
    "Total_Upvotes",
    "Flair_Interaction"
])

edges_df = pd.DataFrame(edges, columns=["Source", "Target"])

nodes_df.to_csv("upvote_network_nodes.csv", index=False)
edges_df.to_csv("upvote_network_edges.csv", index=False)
print("Upvote network files saved with detailed metrics!")

Fetching upvotes...
Processed 100 posts, 3034 comments...
Processed 104 posts and 3253 comments.
Flair: Discussion → Users: 50
Flair: Projects → Users: 50
Flair: ML → Users: 50
Flair: AI → Users: 50
Flair: Coding → Users: 50
Upvote network files saved with detailed metrics!


In [4]:
# Function to analyze networks
def analyze_network(nodes_file, edges_file, network_name):
    # Load data
    nodes = pd.read_csv(nodes_file)
    edges = pd.read_csv(edges_file)

    # Create an undirected graph
    G = nx.Graph()

    # Add nodes with attributes
    for _, row in nodes.iterrows():
        G.add_node(row['Id'], **row.to_dict())

    # Add edges
    for _, row in edges.iterrows():
        G.add_edge(row['Source'], row['Target'])

    # Basic network statistics
    print(f"\n🔹 {network_name} Network Analysis")
    print("=" * 50)
    print(f"📌 Number of Nodes: {G.number_of_nodes()}")
    print(f"📌 Number of Edges: {G.number_of_edges()}")
    avg_degree = sum(dict(G.degree()).values()) / G.number_of_nodes()
    print(f"📌 Average Degree: {avg_degree:.3f}")
    print(f"📌 Density: {nx.density(G):.4f}")
    print(f"📌 Connected Components: {nx.number_connected_components(G)}")

    # 🔹 Community Detection using Louvain
    partition_louvain = community_louvain.best_partition(G)
    modularity = community_louvain.modularity(partition_louvain, G)
    print(f"\n🔹 Modularity Score (Louvain): {modularity:.3f} (Higher = Stronger Community Structure)")

    # 🔹 Community Detection using Label Propagation Algorithm (LPA)
    communities_lpa = list(asyn_lpa_communities(G))
    num_lpa_communities = len(communities_lpa)
    print(f"🔹 Label Propagation Algorithm (LPA) Communities Detected: {num_lpa_communities}")

    # NMI Score - Compare Detected Communities vs Flair Interaction
    if 'Flair_Interaction' in nodes.columns:
        flair_partition = nodes.set_index('Id')['Flair_Interaction'].to_dict()
        nmi = normalized_mutual_info_score(list(partition_louvain.values()), list(flair_partition.values()))
        print(f"🔹 Normalized Mutual Information (NMI): {nmi:.3f} (How well detected communities match flairs)")

    # 🔹 Conductance (How well-separated communities are)
    detected_communities = {}
    for node, community in partition_louvain.items():
        detected_communities.setdefault(community, []).append(node)

    conductance_scores = [
        conductance(G, comm_nodes) for comm_nodes in detected_communities.values() if len(comm_nodes) > 1
    ]
    avg_conductance = sum(conductance_scores) / len(conductance_scores) if conductance_scores else 0
    print(f"🔹 Average Conductance: {avg_conductance:.3f} (Lower = Stronger Communities)")

    # 🔹 Assortativity (Do users interact within their flair?)
    if "Flair_Interaction" in nodes.columns:
        assortativity = nx.attribute_assortativity_coefficient(G, "Flair_Interaction")
        print(f"🔹 Assortativity: {assortativity:.3f} (1 = Strong flair-based communities, 0 = Random)")

    # 🔹 Centrality Measures
    print("\n🔹 Top 5 Central Nodes (Influential Users)")
    betweenness = nx.betweenness_centrality(G)
    top_betweenness = sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:5]
    print("🔸 Betweenness Centrality (Users Connecting Different Groups):")
    for user, score in top_betweenness:
        print(f"   - {user}: {score:.4f}")

    degree = dict(G.degree())
    top_degree = sorted(degree.items(), key=lambda x: x[1], reverse=True)[:5]
    print("🔸 Degree Centrality (Most Connected Users):")
    for user, score in top_degree:
        print(f"   - {user}: {score}")

# Upload files
from google.colab import files
uploaded = files.upload()

# Analyze both networks
analyze_network('comment_network_nodes.csv', 'comment_network_edges.csv', 'Comment Network')
analyze_network('upvote_network_nodes.csv', 'upvote_network_edges.csv', 'Upvote Network')




Saving comment_network_edges.csv to comment_network_edges (3).csv
Saving comment_network_nodes.csv to comment_network_nodes (3).csv
Saving upvote_network_edges.csv to upvote_network_edges (3).csv
Saving upvote_network_nodes.csv to upvote_network_nodes (3).csv

🔹 Comment Network Network Analysis
📌 Number of Nodes: 222
📌 Number of Edges: 250
📌 Average Degree: 2.252
📌 Density: 0.0102
📌 Connected Components: 1

🔹 Modularity Score (Louvain): 0.668 (Higher = Stronger Community Structure)
🔹 Label Propagation Algorithm (LPA) Communities Detected: 5
🔹 Normalized Mutual Information (NMI): 0.845 (How well detected communities match flairs)
🔹 Average Conductance: 0.132 (Lower = Stronger Communities)
🔹 Assortativity: 0.720 (1 = Strong flair-based communities, 0 = Random)

🔹 Top 5 Central Nodes (Influential Users)
🔸 Betweenness Centrality (Users Connecting Different Groups):
   - Projects: 0.3815
   - AI: 0.3645
   - Discussion: 0.3590
   - ML: 0.3533
   - Coding: 0.3263
🔸 Degree Centrality (Most Co