CSS - Final Project

In [None]:
import json, re, glob
import pandas as pd
import networkx as nx
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import re
import glob, json, os

In [32]:
data_dir = "congresstweets-master/data"

print("Looking in:", data_dir)
print("Contents:", os.listdir(data_dir)[:5])

json_paths = glob.glob(os.path.join(data_dir, "*.json"))
print(f"Found {len(json_paths)} JSON files")

tweets = []
for path in json_paths:
    with open(path, "r", encoding="utf-8") as f:
        tweets.extend(json.load(f))

df = pd.DataFrame(tweets)
print(f"Loaded {len(df)} tweets")
df.head()


Looking in: congresstweets-master/data
Contents: ['2017-06-21.json', '2017-06-22.json', '2017-06-23.json', '2017-06-24.json', '2017-06-25.json']
Found 2197 JSON files
Loaded 5068352 tweets


Unnamed: 0,id,screen_name,time,link,text,source,user_id,yesterday,today
0,877527850420776961,RepErikPaulsen,2017-06-21T10:05:17-04:00,https://www.twitter.com/CRN_Supplements/status...,RT @CRN_Supplements Thank you @RepErikPaulsen ...,Twitter Web Client,17513304,,
1,877628169028632576,RepTedBudd,2017-06-21T16:43:55-04:00,https://www.twitter.com/RepTedBudd/statuses/87...,Congrats to our Congressional Award Gold Medal...,Twitter Web Client,817138492614524928,,
2,877580122785685504,SenatorWicker,2017-06-21T13:33:00-04:00,https://www.twitter.com/SenatorWicker/statuses...,ICYMI: I chaired a hearing to explore expandin...,TweetDeck,264219447,,
3,877655119638048770,BernieSanders,2017-06-21T18:31:01-04:00,https://www.twitter.com/BernieSanders/statuses...,The Affordable Care Act should be improved. Bu...,TweetDeck,216776631,,
4,877632313550553089,RepDonBacon,2017-06-21T17:00:23-04:00,https://www.twitter.com/RepDonBacon/statuses/8...,#TaxReform will create a low tax rate just for...,Hootsuite,818975124460335106,,


In [33]:
SAMPLE_SIZE = 300_000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)
print(f"Working on {len(df_sample):,} tweets (out of {len(df):,})")

mention_re = re.compile(r"@([A-Za-z0-9_]+)")

G = nx.DiGraph()
for screen_name, text in zip(df_sample["screen_name"], df_sample["text"]):
    mentions = set(m.lower() for m in mention_re.findall(text))
    src = screen_name.lower()
    for tgt in mentions:
        if tgt == src:
            continue
        if G.has_edge(src, tgt):
            G[src][tgt]["weight"] += 1
        else:
            G.add_edge(src, tgt, weight=1)

num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
in_degs  = np.array([d for _, d in G.in_degree()])
out_degs = np.array([d for _, d in G.out_degree()])

print(f"Nodes: {num_nodes:,}")
print(f"Edges: {num_edges:,}")
print(f"In‑degree → mean: {in_degs.mean():.2f},  std: {in_degs.std():.2f}")
print(f"Out‑degree → mean: {out_degs.mean():.2f}, std: {out_degs.std():.2f}")

low_weight = [(u, v) for u, v, d in G.edges(data=True) if d["weight"] < 3]
G.remove_edges_from(low_weight)
isolates = list(nx.isolates(G))
G.remove_nodes_from(isolates)

print(f"After pruning: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")


Working on 300,000 tweets (out of 5,068,352)
Nodes: 57,043
Edges: 177,540
In‑degree → mean: 3.11,  std: 12.71
Out‑degree → mean: 3.11, std: 31.57
After pruning: 5,307 nodes, 17,773 edges


In [None]:
analyzer = SentimentIntensityAnalyzer()
sent_series = df_sample["text"].apply(analyzer.polarity_scores).apply(pd.Series)
df_sample = pd.concat([df_sample, sent_series], axis=1)


user_stats = (
    df_sample
    .groupby("screen_name")["compound"]
    .agg(["mean","median","std","count"])
    .reset_index()
)

user_stats.columns = [
    "screen_name",
    "avg_compound",
    "med_compound",
    "std_compound",
    "n_tweets"
]

print("Sample of per‐user sentiment stats:")
print(user_stats.head())


mapping_avg   = dict(zip(user_stats["screen_name"].str.lower(), user_stats["avg_compound"]))
mapping_count = dict(zip(user_stats["screen_name"].str.lower(), user_stats["n_tweets"]))

nx.set_node_attributes(G, mapping_avg,   "avg_sentiment")
nx.set_node_attributes(G, mapping_count, "n_tweets")


top5 = user_stats.nlargest(5, "avg_compound")[["screen_name","avg_compound","n_tweets"]]
bot5 = user_stats.nsmallest(5, "avg_compound")[["screen_name","avg_compound","n_tweets"]]
print("\nTop 5 happiest accounts:\n", top5.to_string(index=False))
print("\nTop 5 saddest accounts:\n", bot5.to_string(index=False))


ValueError: Length mismatch: Expected axis has 17 elements, new values have 5 elements