In [324]:
import os
import numpy as np
from igraph import Graph
import pickle
import re
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/shared/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [325]:
import os
import pickle
import numpy as np

def compute_strength_stream(edges_file, authors_set):
    """
    Compute the weighted degree (strength) of every node by streaming an edge list.

    Each edge contributes log(1 + weight) -- a concave transformation that dampens
    extreme weights -- to the strength of both of its endpoints.

    Parameters:
        edges_file (str): Path to a UTF-8 text file with one edge per line in the
            format ``node1;node2;weight``.
        authors_set (iterable of str): Known nodes. Each one starts at strength 0.0,
            so nodes without any edge still appear in the result.

    Returns:
        dict: Mapping node -> accumulated strength. Edge endpoints that are not in
        ``authors_set`` are added to the mapping the first time they are seen.

    Notes:
        Lines that do not have exactly three ``;``-separated fields, or whose weight
        field cannot be parsed as a number (e.g. a header row), are skipped.
    """
    # Initialize strengths for nodes known from the authors file.
    strengths = {node: 0.0 for node in authors_set}
    with open(edges_file, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(";")
            if len(parts) != 3:
                continue
            u, v, raw_weight = parts
            try:
                w = float(raw_weight)
            except ValueError:
                # Non-numeric weight (header or corrupt row): treat like any other malformed line.
                continue
            # log1p(w) == log(1 + w), but stays accurate for very small weights.
            transformed_w = np.log1p(w)
            # An undirected edge adds to the strength of both endpoints.
            strengths[u] = strengths.get(u, 0.0) + transformed_w
            strengths[v] = strengths.get(v, 0.0) + transformed_w
    return strengths

def compute_crc(event_name, base_folder="data"):
    """
    Compute the Composite Reinforced Centrality (CRC) for an event using a streaming approach.

    For every author listed in ``<base_folder>/<event_name>/network/authors.txt`` the
    log-dampened strength is computed in each of the four layers ("uil", "csl", "tdl",
    "asl") from ``network/<layer>/edges.txt``. The effective influence of a node in a
    layer is

        (phi_layer + omega * (phi_total - phi_layer)) / (n - 1)

    where ``phi_total`` is the node's strength summed over all layers and ``n`` is the
    number of distinct authors. CRC is the product over layers of
    ``1 + beta[layer] * effective_influence``.

    Parameters:
        event_name (str): Name of the event sub-folder (e.g. "2008_elections").
        base_folder (str): Folder that contains the event folders. Defaults to "data".

    Returns:
        dict: Mapping author -> CRC value. Only authors from authors.txt are included.

    Side effects:
        Writes the result to ``<base_folder>/<event_name>/CRC_values.pkl`` and prints
        the path. A missing edges file for a layer prints a warning and that layer
        contributes zero strength.
    """
    # Parameters
    omega = 1.0  # Uniform interlayer coupling weight
    layers_list = ["uil", "csl", "tdl", "asl"]
    beta = {layer: 1.0 for layer in layers_list}

    # Folder paths
    event_folder = os.path.join(base_folder, event_name)
    network_folder = os.path.join(event_folder, "network")

    # Load nodes from authors.txt (UTF-8, matching how the analysis functions read it).
    nodes_file = os.path.join(network_folder, "authors.txt")
    with open(nodes_file, "r", encoding="utf-8") as f:
        all_nodes_set = {line.strip() for line in f if line.strip()}
    n = len(all_nodes_set)  # Total number of nodes

    # Maximum possible number of neighbours; clamped to 1 so that a single-author
    # event does not divide by zero.
    norm = max(n - 1, 1)

    # Compute strength (with concave transformation) for each layer using file streaming
    phi = {}
    for layer in layers_list:
        edges_file = os.path.join(network_folder, layer, "edges.txt")
        if os.path.exists(edges_file):
            strengths = compute_strength_stream(edges_file, all_nodes_set)
        else:
            print(f"Warning: No edges file found for layer {layer}. Using 0 strength for all nodes.")
            strengths = {node: 0.0 for node in all_nodes_set}
        phi[layer] = strengths

    # Combine the per-layer effective influence into the CRC product.
    # Note: with omega == 1.0 the numerator reduces to total_phi for every layer.
    CRC = {}
    for node in all_nodes_set:
        total_phi = sum(phi[layer].get(node, 0.0) for layer in layers_list)
        prod = 1.0
        for layer in layers_list:
            phi_val = phi[layer].get(node, 0.0)
            effective = (phi_val + omega * (total_phi - phi_val)) / norm
            prod *= (1 + beta[layer] * effective)
        CRC[node] = prod

    # Save CRC values to disk
    crc_file = os.path.join(event_folder, "CRC_values.pkl")
    with open(crc_file, "wb") as f:
        pickle.dump(CRC, f)
    print("CRC values saved to:", crc_file)

    return CRC

# Example usage:
# crc = compute_crc("2008_elections")

In [326]:
import os
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

def analyze_crc(crc, event_folder, radical_keywords):
    """
    Relate CRC values to a keyword-based radicalization label for one event.

    Steps:
      1. Load "network/authors.txt" and "cslasl-pre/contents.txt" from the event folder
         (line i of one file belongs to line i of the other).
      2. Score each author as (number of whole-word keyword matches) / (number of
         whitespace-separated words) in their lower-cased content.
      3. Label an author as radical (1) when the score is strictly above the median
         score of all loaded authors, otherwise 0.
      4. Inner-join the labels with the CRC values on author.
      5. Save a histogram and a boxplot of CRC by label.
      6. Fit a logistic regression of the label on CRC (with intercept).
      7. Compute the Pearson correlation between CRC and the binary label.
      8. Print the regression summary, the correlation, selected CRC quantiles and the
         predicted probability of the radical label at those quantiles.

    Parameters:
        crc (dict): Dictionary mapping each author (str) to their CRC value (float).
        event_folder (str): Path to the event folder (e.g., "data/2008_elections/").
        radical_keywords (iterable of str): Keywords counted as radical; matched
            case-insensitively on word boundaries.

    Returns:
        merged_df (DataFrame): Columns 'author', 'CRC', 'content', 'rad_score', 'radical',
            'intercept', 'scaled_CRC' and 'scaled_rad_score'.
        logit_result: Fitted logistic regression result (statsmodels object).
        correlation (float): Pearson correlation between CRC and the binary radicalization label.

    Raises:
        ValueError: If authors.txt and contents.txt have a different number of entries,
            or if no author in ``crc`` appears in authors.txt.

    Side effects:
        Writes "CRC_distribution_radical.png", "CRC_boxplot_radical.png" and
        "CRC_radicalization_analysis.csv" into ``event_folder``.
    """
    # --------------------------
    # Load authors and contents
    # --------------------------
    authors_file = os.path.join(event_folder, "network/authors.txt")
    contents_file = os.path.join(event_folder, "cslasl-pre/contents.txt")

    with open(authors_file, "r", encoding="utf-8") as f:
        authors = [line.strip() for line in f if line.strip()]

    # Every line is kept, including blank ones: a line read from a file always
    # contains at least its newline, so filtering on truthiness never dropped anything.
    with open(contents_file, "r", encoding="utf-8") as f:
        contents = list(f)

    if len(authors) != len(contents):
        raise ValueError("Mismatch between number of authors and contents.")

    rad_df = pd.DataFrame({"author": authors, "content": contents})

    # --------------------------
    # Compute radicalization score for each author
    # --------------------------
    # Build the whole-word patterns once instead of once per text.
    keyword_patterns = [re.compile(r'\b' + re.escape(kw) + r'\b') for kw in radical_keywords]

    def compute_radical_score(text):
        text_lower = text.lower()
        words = text_lower.split()
        if not words:
            return 0.0
        hits = sum(len(pattern.findall(text_lower)) for pattern in keyword_patterns)
        return hits / len(words)

    rad_df["rad_score"] = rad_df["content"].apply(compute_radical_score)

    # --------------------------
    # Create binary radicalization label
    # --------------------------
    # The median is taken over all loaded authors, before the merge with CRC.
    median_score = rad_df["rad_score"].median()
    rad_df["radical"] = (rad_df["rad_score"] > median_score).astype(int)

    # --------------------------
    # Merge CRC values with radicalization data
    # --------------------------
    crc_df = pd.DataFrame(list(crc.items()), columns=["author", "CRC"])
    merged_df = pd.merge(crc_df, rad_df, on="author", how="inner")
    if merged_df.empty:
        # Fail early with a clear message rather than deep inside plotting/model fitting.
        raise ValueError("No authors in common between the CRC values and authors.txt.")

    # Ensure that the CRC column is numeric
    merged_df["CRC"] = pd.to_numeric(merged_df["CRC"], errors="coerce")
    merged_df["radical"] = merged_df["radical"].astype(int)

    # --------------------------
    # Visualization: Distribution of CRC by radicalization status
    # --------------------------
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=merged_df, x="CRC", hue="radical", kde=True, bins=30, ax=ax)
    ax.set_title("Distribution of CRC Values by Radicalization Status")
    ax.set_xlabel("Composite Reinforced Centrality (CRC)")
    ax.set_ylabel("Frequency")
    hist_path = os.path.join(event_folder, "CRC_distribution_radical.png")
    fig.savefig(hist_path)
    plt.close(fig)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x="radical", y="CRC", data=merged_df, ax=ax)
    ax.set_title("CRC by Radicalization Status")
    ax.set_xlabel("Radicalization Status (0 = Non-Radicalized, 1 = Radicalized)")
    ax.set_ylabel("Composite Reinforced Centrality (CRC)")
    boxplot_path = os.path.join(event_folder, "CRC_boxplot_radical.png")
    fig.savefig(boxplot_path)
    plt.close(fig)

    # --------------------------
    # Logistic Regression Analysis
    # --------------------------
    merged_df["intercept"] = 1.0
    logit_model = sm.Logit(merged_df["radical"], merged_df[["intercept", "CRC"]])
    logit_result = logit_model.fit(disp=False)

    # --------------------------
    # Pearson Correlation
    # --------------------------
    # The standardized columns are only stored in the saved CSV; the correlation
    # itself is computed on the raw CRC values and the binary label.
    scaler = StandardScaler()
    merged_df[['scaled_CRC', 'scaled_rad_score']] = scaler.fit_transform(merged_df[['CRC', 'rad_score']])
    correlation = merged_df["CRC"].corr(merged_df["radical"])

    # Save merged DataFrame for further analysis.
    merged_df.to_csv(os.path.join(event_folder, "CRC_radicalization_analysis.csv"), index=False)

    print("Logistic Regression Summary:")
    print(logit_result.summary())
    print("Pearson correlation between CRC and Radicalization:", correlation)

    # Get the regression coefficients
    intercept_value = logit_result.params["intercept"]
    crc_coef = logit_result.params["CRC"]

    # CRC at the minimum, the three quartiles and the 90th percentile.
    quartiles = merged_df["CRC"].quantile([0, 0.25, 0.50, 0.75, 0.90])
    print("CRC Quartiles:")
    print(quartiles)

    # Logistic (sigmoid) of the fitted log-odds at each of those CRC values.
    log_odds = intercept_value + crc_coef * quartiles
    pred_probs = 1 / (1 + np.exp(-log_odds))
    print("\nPredicted Probabilities at CRC Quartiles:")
    print(pred_probs)

    return merged_df, logit_result, correlation

# Example usage (radical_keywords is required, e.g. from compute_negative_word_distribution):
# merged_df, logit_result, corr = analyze_crc(crc_2008, "data/2008_elections/", radical_keywords)

In [339]:
import os
import re
from collections import Counter
import nltk
import pandas as pd
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def compute_negative_word_distribution(crc, event_folder, authors_perc=0.1, top_k_words=20,
                                       neg_threshold=-0.8, min_word_len=4):
    """
    For the top users by CRC, aggregate their content and compute the frequency distribution of
    strongly negative words. A word counts as negative when it is in VADER's lexicon with a
    sentiment score below ``neg_threshold``.

    Parameters:
        crc (dict): Dictionary mapping each author (str) to their CRC value (float).
        event_folder (str): Path to the event folder containing "network/authors.txt" and "cslasl-pre/contents.txt".
        authors_perc (float): Fraction of the authors in authors.txt to keep, ranked by CRC
            (the count is truncated towards zero, so small inputs can select no author).
        top_k_words (int): Number of most frequent negative words to return.
        neg_threshold (float): Lexicon score a word must be strictly below. Defaults to -0.8.
        min_word_len (int): Minimum length of a word kept as a keyword. Defaults to 4.

    Returns:
        neg_word_freq (list): A list of tuples (word, frequency) for the top negative words.
        radical_keywords (list): The words of neg_word_freq that have at least
            ``min_word_len`` characters, in the same order (so at most top_k_words entries).

    Raises:
        ValueError: If authors.txt and contents.txt have a different number of entries.
    """
    # Paths for authors and contents (adjust according to your folder structure)
    authors_file = os.path.join(event_folder, "network/authors.txt")
    contents_file = os.path.join(event_folder, "cslasl-pre/contents.txt")

    # Load authors and contents
    with open(authors_file, "r", encoding="utf-8") as f:
        authors_list = [line.strip() for line in f if line.strip()]
    # Every line is kept, including blank ones: a line read from a file always
    # contains at least its newline, so filtering on truthiness never dropped anything.
    with open(contents_file, "r", encoding="utf-8") as f:
        contents_list = list(f)

    if len(authors_list) != len(contents_list):
        raise ValueError("The number of authors and contents do not match.")

    # Create a DataFrame mapping authors to their content (each line corresponds)
    data_df = pd.DataFrame({"author": authors_list, "content": contents_list})
    top_n_authors = int(len(authors_list) * authors_perc)

    # Sort authors by CRC in descending order and select the top_n_authors.
    sorted_authors = sorted(crc.items(), key=lambda x: x[1], reverse=True)
    top_authors = [author for author, _ in sorted_authors[:top_n_authors]]

    # Aggregate content for the top authors.
    top_content = data_df.loc[data_df["author"].isin(top_authors), "content"]
    aggregated_text = " ".join(top_content.tolist())

    # Preprocess the text: lowercase and replace everything except a-z and whitespace.
    aggregated_text = aggregated_text.lower()
    aggregated_text = re.sub(r'[^a-z\s]', ' ', aggregated_text)

    # Tokenize the aggregated text.
    tokens = word_tokenize(aggregated_text)

    # Initialize VADER sentiment analyzer and get its lexicon.
    sia = SentimentIntensityAnalyzer()
    vader_lexicon = sia.lexicon  # Dictionary mapping word -> sentiment score.

    # Keep only tokens that are in the lexicon with a score below the threshold.
    negative_tokens = [
        token for token in tokens
        if token in vader_lexicon and vader_lexicon[token] < neg_threshold
    ]

    # Count the frequency of negative tokens.
    freq_counter = Counter(negative_tokens)

    # Get the top_k_words most common negative words.
    neg_word_freq = freq_counter.most_common(top_k_words)

    # Drop very short words from the keyword list (the frequency list keeps them).
    radical_keywords = [word for word, _ in neg_word_freq if len(word) >= min_word_len]

    return neg_word_freq, radical_keywords

[nltk_data] Downloading package punkt to /home/shared/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [340]:
def get_regression(event, authors_perc=0.1, top_k_words=40):
    """
    Run the keyword extraction and CRC-vs-radicalization analysis for one event.

    Loads the pickled CRC values from ``data/<event>/CRC_values.pkl``, derives the
    radical keywords from the content of the top ``authors_perc`` authors by CRC,
    prints them, and passes them to ``analyze_crc``.

    Parameters:
        event (str): Name of the event folder under "data" (e.g. "2008_elections").
        authors_perc (float): Fraction of authors (by CRC) used to extract keywords.
        top_k_words (int): Number of most frequent negative words considered as keywords.

    Returns:
        None. Results are printed and written to the event folder by ``analyze_crc``.
    """
    event_folder = os.path.join("data", event)
    crc_file = os.path.join(event_folder, "CRC_values.pkl")
    # NOTE: pickle.load can execute arbitrary code; only load CRC files that were
    # written by compute_crc from a trusted source.
    with open(crc_file, "rb") as f:
        crc_dict = pickle.load(f)

    _, radical_keywords = compute_negative_word_distribution(
        crc_dict, event_folder, authors_perc=authors_perc, top_k_words=top_k_words
    )
    print(f"KEYWORDS: {radical_keywords}")

    # The returned frame/model/correlation are not needed here; analyze_crc prints and saves them.
    analyze_crc(crc_dict, event_folder, radical_keywords)

In [313]:
# Compute CRC for the 2008 elections event; this also writes
# data/2008_elections/CRC_values.pkl, which get_regression() loads.
crc_2008 = compute_crc("2008_elections")

CRC values saved to: data/2008_elections/CRC_values.pkl


In [319]:
# Keywords come from the top 10% of authors by CRC (40 most frequent negative words
# considered); then the radicalization label is regressed on CRC.
get_regression('2008_elections', authors_perc=0.1, top_k_words=40)

KEYWORDS: ['lies', 'wrong', 'shit', 'fuck', 'fraud', 'crisis', 'stop', 'stupid', 'hate', 'problem', 'anti', 'terrorist', 'attack', 'racist', 'crap', 'lose', 'poor', 'spammer', 'hell', 'fear', 'lost', 'attacks', 'bullshit', 'worse', 'argument', 'negative', 'racism', 'kill', 'problems', 'damn', 'crazy', 'dead', 'illegal']
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:                radical   No. Observations:                13184
Model:                          Logit   Df Residuals:                    13182
Method:                           MLE   Df Model:                            1
Date:                Wed, 19 Mar 2025   Pseudo R-squ.:                  0.1603
Time:                        14:32:51   Log-Likelihood:                -7526.5
converged:                       True   LL-Null:                       -8963.3
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 

In [345]:
# Compute CRC for the 2011 Wall Street event; this also writes
# data/2011_wallstreet/CRC_values.pkl, which get_regression() loads.
crc_2011 = compute_crc("2011_wallstreet")

CRC values saved to: data/2011_wallstreet/CRC_values.pkl


In [346]:
# Keywords come from the top 10% of authors by CRC (40 most frequent negative words
# considered); then the radicalization label is regressed on CRC.
get_regression('2011_wallstreet', authors_perc=0.1, top_k_words=40)

KEYWORDS: ['protest', 'protesters', 'protests', 'protesting', 'problem', 'shit', 'wrong', 'stop', 'fuck', 'arrested', 'violence', 'violent', 'problems', 'stupid', 'anti', 'arrest', 'poor', 'fight', 'bullshit', 'hate', 'argument', 'debt', 'hell', 'blame', 'riot', 'greed', 'illegal', 'disagree', 'angry', 'lack', 'brutality', 'fail', 'fighting', 'worse', 'evil', 'lose']
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:                radical   No. Observations:                31627
Model:                          Logit   Df Residuals:                    31625
Method:                           MLE   Df Model:                            1
Date:                Fri, 28 Mar 2025   Pseudo R-squ.:                  0.1585
Time:                        08:09:28   Log-Likelihood:                -18447.
converged:                       True   LL-Null:                       -21922.
Covariance Type:            nonrobust   LLR p-va

In [342]:
# Compute CRC for the 2016 elections event; this also writes
# data/2016_elections/CRC_values.pkl, which get_regression() loads.
crc_2016 = compute_crc("2016_elections")

CRC values saved to: data/2016_elections/CRC_values.pkl


In [343]:
# Keywords come from the top 10% of authors by CRC (40 most frequent negative words
# considered); then the radicalization label is regressed on CRC.
get_regression('2016_elections', authors_perc=0.1, top_k_words=40)

KEYWORDS: ['fake', 'shit', 'fuck', 'wrong', 'stop', 'lost', 'racist', 'hate', 'problem', 'anti', 'stupid', 'bullshit', 'rapist', 'worse', 'illegal', 'hell', 'lose', 'argument', 'blame', 'rape', 'damn', 'crazy', 'fraud', 'attack', 'conspiracy', 'rigged', 'lies', 'poor', 'lying', 'fucked', 'worst', 'doubt', 'fight', 'propaganda']
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:                radical   No. Observations:               295829
Model:                          Logit   Df Residuals:                   295827
Method:                           MLE   Df Model:                            1
Date:                Thu, 27 Mar 2025   Pseudo R-squ.:                  0.1762
Time:                        21:22:53   Log-Likelihood:            -1.6892e+05
converged:                       True   LL-Null:                   -2.0505e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
         

In [320]:
# Compute CRC for the 2017 rally event; this also writes
# data/2017_rally/CRC_values.pkl, which get_regression() loads.
crc_2017 = compute_crc("2017_rally")

CRC values saved to: data/2017_rally/CRC_values.pkl


In [321]:
# Keywords come from the top 10% of authors by CRC (40 most frequent negative words
# considered); then the radicalization label is regressed on CRC.
get_regression('2017_rally', authors_perc=0.1, top_k_words=40)

KEYWORDS: ['racist', 'shit', 'violence', 'hate', 'fuck', 'racism', 'wrong', 'supremacists', 'stop', 'anti', 'violent', 'problem', 'protest', 'stupid', 'fake', 'bullshit', 'racists', 'attack', 'protesters', 'killed', 'argument', 'blame', 'worse', 'hell', 'fight', 'death', 'crime', 'fascist', 'terrorist', 'kill', 'lost', 'evil', 'murder', 'illegal', 'fire', 'terrorism']
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:                radical   No. Observations:               104125
Model:                          Logit   Df Residuals:                   104123
Method:                           MLE   Df Model:                            1
Date:                Wed, 19 Mar 2025   Pseudo R-squ.:                  0.1395
Time:                        14:55:40   Log-Likelihood:                -62104.
converged:                       True   LL-Null:                       -72174.
Covariance Type:            nonrobust   LLR p-v

In [329]:
# Compute CRC for the 2020 COVID-19 event; this also writes
# data/2020_covid19/CRC_values.pkl, which get_regression() loads.
crc_2020 = compute_crc("2020_covid19")

CRC values saved to: data/2020_covid19/CRC_values.pkl


In [341]:
# Keywords come from the top 10% of authors by CRC (40 most frequent negative words
# considered); then the radicalization label is regressed on CRC.
get_regression('2020_covid19', authors_perc=0.1, top_k_words=40)

KEYWORDS: ['infected', 'death', 'sick', 'shit', 'risk', 'stop', 'fuck', 'worse', 'problem', 'wrong', 'died', 'dead', 'panic', 'fear', 'stupid', 'worst', 'kill', 'severe', 'conspiracy', 'lower', 'negative', 'crisis', 'avoid', 'crazy', 'emergency', 'hell', 'poor', 'lack', 'worried', 'worry', 'alone', 'damn', 'lost', 'fake']
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:                radical   No. Observations:               371104
Model:                          Logit   Df Residuals:                   371102
Method:                           MLE   Df Model:                            1
Date:                Thu, 20 Mar 2025   Pseudo R-squ.:               4.782e-09
Time:                        18:13:29   Log-Likelihood:            -2.5516e+05
converged:                       True   LL-Null:                   -2.5516e+05
Covariance Type:            nonrobust   LLR p-value:                    0.9606
               

In [322]:
# Compute CRC for the 2021 riot event; this also writes
# data/2021_riot/CRC_values.pkl, which get_regression() loads.
crc_2021 = compute_crc("2021_riot")

CRC values saved to: data/2021_riot/CRC_values.pkl


In [323]:
# Keywords come from the top 10% of authors by CRC (40 most frequent negative words
# considered); then the radicalization label is regressed on CRC.
get_regression('2021_riot', authors_perc=0.1, top_k_words=40)

KEYWORDS: ['shit', 'fuck', 'stop', 'terrorists', 'violence', 'riot', 'wrong', 'stupid', 'bullshit', 'attack', 'lost', 'hate', 'killed', 'terrorist', 'death', 'riots', 'protests', 'fraud', 'worse', 'conspiracy', 'violent', 'hell', 'crazy', 'arrested', 'protest', 'problem', 'damn', 'died', 'fight', 'lies', 'prison', 'fire', 'murder', 'dead', 'argument', 'crime']
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:                radical   No. Observations:               254480
Model:                          Logit   Df Residuals:                   254478
Method:                           MLE   Df Model:                            1
Date:                Wed, 19 Mar 2025   Pseudo R-squ.:                  0.1206
Time:                        17:30:57   Log-Likelihood:            -1.5323e+05
converged:                       True   LL-Null:                   -1.7424e+05
Covariance Type:            nonrobust   LLR p-value:   