In [10]:
import os
import re
import pickle
import pandas as pd
import statsmodels.api as sm
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def compute_negative_word_distribution(event_folder, top_k_words=20):
    """
    Derive the `radical_keywords` list from the contents of one event.

    Reads <event_folder>/network/authors.txt and
    <event_folder>/cslasl-pre/contents.txt, joins all content lines,
    lower-cases them, replaces every character that is not a-z or
    whitespace with a space, and tokenizes the result with NLTK.
    Tokens present in the VADER lexicon with a valence below -0.8 are counted.

    Returns the `top_k_words` most frequent such tokens (most frequent
    first) with tokens of length <= 3 removed. The length filter is applied
    AFTER the top-k cut, so the list may hold fewer than `top_k_words` items.

    Raises ValueError when the number of non-blank author lines differs
    from the number of content lines.
    """
    authors_path = os.path.join(event_folder, "network", "authors.txt")
    contents_path = os.path.join(event_folder, "cslasl-pre", "contents.txt")

    # Authors are only read to validate that both files describe the same rows.
    with open(authors_path, "r", encoding="utf-8") as fh:
        author_names = [name for name in map(str.strip, fh) if name]
    with open(contents_path, "r", encoding="utf-8") as fh:
        posts = fh.readlines()
    if len(author_names) != len(posts):
        raise ValueError("authors/contents length mismatch")

    corpus = re.sub(r'[^a-z\s]', ' ', " ".join(posts).lower())
    tokens = word_tokenize(corpus)
    lexicon = SentimentIntensityAnalyzer().lexicon

    # Tally strongly negative tokens in order of first appearance.
    negative_counts = Counter()
    for token in tokens:
        if token in lexicon and lexicon[token] < -0.8:
            negative_counts[token] += 1

    ranked = negative_counts.most_common(top_k_words)
    return [word for word, _count in ranked if len(word) > 3]

def compute_radical_labels(event_folder, radical_keywords):
    """
    Label every content row of an event as radicalized (1) or not (0).

    Reads <event_folder>/network/authors.txt (blank lines dropped) and
    <event_folder>/cslasl-pre/contents.txt and pairs them by position.
    Each content line gets rad_score = (#keyword hits) / (#words), where
    words are counted after lower-casing and replacing every character
    that is not a-z or whitespace with a space, and hits are whole-word
    matches of each keyword in the lower-cased line. A line with no words
    scores 0.0. A row is labelled 1 when its score is strictly above the
    median score over all rows.

    Scoring is per row: rows sharing an author are not aggregated here.

    Returns a DataFrame [author, radicalized].
    Raises ValueError when the two files have different row counts.
    """
    def read_lines(*relative_parts):
        # One text line per list item, newline characters kept.
        with open(os.path.join(event_folder, *relative_parts), "r", encoding="utf-8") as fh:
            return fh.readlines()

    author_names = [
        name
        for name in (raw.strip() for raw in read_lines("network", "authors.txt"))
        if name
    ]
    posts = read_lines("cslasl-pre", "contents.txt")
    if len(author_names) != len(posts):
        raise ValueError("authors/contents length mismatch")

    def keyword_density(raw_text):
        lowered = raw_text.lower()
        n_words = len(re.sub(r'[^a-z\s]', ' ', lowered).split())
        if n_words == 0:
            return 0.0
        hits = 0
        for kw in radical_keywords:
            # Keywords are matched against the lower-cased (not cleaned) text.
            hits += len(re.findall(r'\b' + re.escape(kw) + r'\b', lowered))
        return hits / n_words

    table = pd.DataFrame({"author": author_names, "content": posts})
    table["rad_score"] = table["content"].apply(keyword_density)
    threshold = table["rad_score"].median()
    table["radicalized"] = table["rad_score"].gt(threshold).astype(int)
    return table[["author", "radicalized"]]

def load_CRC(event_folder):
    """
    Read <event_folder>/CRC_values.pkl and return it as a DataFrame [author, CRC].

    The pickle is expected to hold a mapping; each (key, value) item becomes
    one row, in the mapping's iteration order.
    """
    crc_path = os.path.join(event_folder, "CRC_values.pkl")
    # NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
    with open(crc_path, "rb") as handle:
        crc_by_author = pickle.load(handle)
    rows = list(crc_by_author.items())
    return pd.DataFrame(rows, columns=["author", "CRC"])

def fit_logit(df):
    """
    Fit the logistic regression radicalized ~ const + CRC.

    Rows with a missing value in "CRC" or "radicalized" are dropped before
    fitting. Returns a tuple (coefficient of CRC, p-value of CRC).
    """
    complete = df.dropna(subset=["CRC", "radicalized"])
    design = sm.add_constant(complete["CRC"])
    outcome = complete["radicalized"]
    # disp=False silences the optimizer's convergence printout.
    result = sm.Logit(outcome, design).fit(disp=False)
    crc_coef = result.params["CRC"]
    crc_pvalue = result.pvalues["CRC"]
    return crc_coef, crc_pvalue

def main_hypothesis4(event_folder):
    """
    Run the hypothesis-4 pipeline for one event and report the CRC coefficient.

    Steps: derive the radical keywords from the event's contents, label the
    content rows as radicalized or not, load the precomputed CRC values,
    inner-join both tables on "author", and fit Logit(radicalized ~ CRC).

    `event_folder` is the event directory (str or os.PathLike); its last
    path component is used as the event name.

    Prints a LaTeX table of the result and returns it as a one-row
    DataFrame with columns "Event", "Alpha (CRC coef)" and "p-value".
    The returned DataFrame keeps the raw event name; only the printed
    LaTeX is escaped.
    """
    # 1) extract event_name
    # normpath removes any trailing separator ("/" or the platform's own),
    # so basename is not empty for inputs such as "data/2008_elections/".
    event_name = os.path.basename(os.path.normpath(event_folder))
    # 2) find radical keywords
    radical_keywords = compute_negative_word_distribution(event_folder)
    # 3) compute labels
    df_labels = compute_radical_labels(event_folder, radical_keywords)
    # 4) load CRC
    df_crc = load_CRC(event_folder)
    # 5) merge (inner join: only authors present in both tables are kept)
    df = pd.merge(df_crc, df_labels, on="author", how="inner")
    # 6) fit
    alpha, pval = fit_logit(df)
    # 7) present result as one‐row DataFrame
    df_res = pd.DataFrame([{
        "Event": event_name,
        "Alpha (CRC coef)": alpha,
        "p-value": pval
    }])
    # 8) print LaTeX
    # escape=True is required because event names contain "_" (e.g.
    # "2008_elections"), which is not valid in LaTeX text mode unescaped.
    print("LaTeX table of CRC‐coefficients:")
    print(df_res.to_latex(index=False, float_format="%.3f", escape=True))
    return df_res

# Entry point: run the hypothesis-4 pipeline on the 2008 elections event.
# main_hypothesis4 prints the LaTeX table; the one-row result is kept in `res`.
if __name__=="__main__":
    res = main_hypothesis4("data/2008_elections")

LaTeX table of CRC‐coefficients:
\begin{tabular}{lrr}
\toprule
Event & Alpha (CRC coef) & p-value \\
\midrule
2008_elections & 9.013 & 0.000 \\
\bottomrule
\end{tabular}



In [11]:
# Hypothesis-4 run for the 2008 elections event (same folder as the
# __main__ run); prints the LaTeX table and stores the one-row result.
event_folder = "data/2008_elections"
metrics_df = main_hypothesis4(event_folder)

LaTeX table of CRC‐coefficients:
\begin{tabular}{lrr}
\toprule
Event & Alpha (CRC coef) & p-value \\
\midrule
2008_elections & 9.013 & 0.000 \\
\bottomrule
\end{tabular}



In [13]:
# Hypothesis-4 run for the 2011 Wall Street event; prints the LaTeX table
# and overwrites `metrics_df` with this event's one-row result.
event_folder = "data/2011_wallstreet"
metrics_df = main_hypothesis4(event_folder)

LaTeX table of CRC‐coefficients:
\begin{tabular}{lrr}
\toprule
Event & Alpha (CRC coef) & p-value \\
\midrule
2011_wallstreet & 9.443 & 0.000 \\
\bottomrule
\end{tabular}



In [14]:
# Hypothesis-4 run for the 2016 elections event; prints the LaTeX table
# and overwrites `metrics_df` with this event's one-row result.
event_folder = "data/2016_elections"
metrics_df = main_hypothesis4(event_folder)

LaTeX table of CRC‐coefficients:
\begin{tabular}{lrr}
\toprule
Event & Alpha (CRC coef) & p-value \\
\midrule
2016_elections & 8.720 & 0.000 \\
\bottomrule
\end{tabular}



In [15]:
# Hypothesis-4 run for the 2017 rally event; prints the LaTeX table
# and overwrites `metrics_df` with this event's one-row result.
event_folder = "data/2017_rally"
metrics_df = main_hypothesis4(event_folder)

LaTeX table of CRC‐coefficients:
\begin{tabular}{lrr}
\toprule
Event & Alpha (CRC coef) & p-value \\
\midrule
2017_rally & 6.363 & 0.000 \\
\bottomrule
\end{tabular}



In [16]:
# Hypothesis-4 run for the 2021 riot event; prints the LaTeX table
# and overwrites `metrics_df` with this event's one-row result.
event_folder = "data/2021_riot"
metrics_df = main_hypothesis4(event_folder)

LaTeX table of CRC‐coefficients:
\begin{tabular}{lrr}
\toprule
Event & Alpha (CRC coef) & p-value \\
\midrule
2021_riot & 6.930 & 0.000 \\
\bottomrule
\end{tabular}

