In [1]:
import numpy as np
import pandas as pd
import math
import json
import glob, os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def binary_entropy(p, tol=1e-12):
    """
    Exponentiated binary entropy of a probability, on a 0-to-1 scale.

    Computes 2**H(p) - 1, where H(p) = -(p*log2(p) + (1-p)*log2(1-p)) is the
    binary Shannon entropy in bits. Note that this is NOT H(p) itself: the
    result is 0 at p = 0 or p = 1 and 1 at p = 0.5.

    p:   probability, expected to lie in [0, 1].
    tol: how far outside [0, 1] p may fall (floating-point drift, e.g. from a
         running sum of proportions) and still be treated as the nearest
         endpoint.

    Returns the value as a float.
    Raises ValueError if p lies outside [0, 1] by more than tol.
    """
    if p < -tol or p > 1 + tol:
        raise ValueError(f"p must lie in [0, 1], got {p!r}")
    # At (or marginally beyond) the endpoints the entropy is 0 by convention;
    # returning early also avoids log2(0) and log2 of a negative number.
    if p <= 0 or p >= 1:
        return 0.0
    q = 1 - p
    entropy_bits = -(p * math.log2(p) + q * math.log2(q))
    return math.pow(2, entropy_bits) - 1

In [3]:
def compute_ec(frequencies):
    """
    frequencies: list of counts for each of the k Likert categories (ordered 1 to k).
    Returns the Ec (entropy-based polarization measure): the mean of
    binary_entropy(S_j) over the k-1 cut points j = 1..k-1, where S_j is the
    cumulative proportion of responses in categories 1..j.

    Returns -1 when Ec is undefined: all counts are zero (no responses), or
    fewer than two categories were supplied (no cut point exists).
    """
    counts = list(frequencies)
    k = len(counts)
    total = sum(counts)
    if total == 0 or k < 2:
        return -1

    # Accumulate the raw counts and divide once per cut point. Summing
    # already-rounded proportions lets rounding error build up, so a cut point
    # that should be exactly 1.0 can come out as 0.9999999999999999 and produce
    # a spurious non-zero entropy.
    running = 0
    entropy_sum = 0.0
    for count in counts[:-1]:  # the last cumulative share is always 1; skip it
        running += count
        S_j = min(running / total, 1.0)
        entropy_sum += binary_entropy(S_j)

    # Ec is the average of the computed binary entropy values
    return entropy_sum / (k - 1)

In [4]:
def plot_ec_by_state(results, topic):
    """
    Draw a bar chart of tweet counts per state, annotate each bar with its
    count and Ec value, and save the figure as a PNG.

    results: dict keyed by state; each value maps column names to values and
             must include 'Total Frequencies' and 'Ec'.
    topic:   label used in the chart title and in the output file name.

    The image is written to ../data/7_Categories_EC_graphs/EC_graph_<topic>.png
    (the directory is created if needed) and the figure is closed afterwards.
    Nothing is returned.
    """
    count_col = 'Total Frequencies'

    # One row per state, largest count first
    table = pd.DataFrame.from_dict(results, orient='index')
    table = table.reset_index().rename(columns={'index':'state'})
    table = table.sort_values(count_col, ascending=False).reset_index(drop=True)

    # Canvas grows with the number of bars, but is never narrower than 12 inches
    fig_width = max(12, len(table)*0.4)
    fig = plt.figure(figsize=(fig_width, 8))
    ax = sns.barplot(
        data=table, x='state', y=count_col,
        hue='state', palette='viridis'
    )
    ax.set_title(f"Tweet Counts & Ec by State for '{topic}'", pad=16)
    ax.set_xlabel('State')
    ax.set_ylabel('Number of Tweets')
    plt.xticks(rotation=90)

    # Gap between the top of a bar and its Ec label: 2% of the tallest bar
    gap = table[count_col].max() * 0.02

    # Bars and table rows are paired positionally
    for patch, total, ec_value in zip(ax.patches, table[count_col], table['Ec']):
        x_mid = patch.get_x() + patch.get_width()/2
        top = patch.get_height()

        # Tweet count, centred inside the bar
        ax.text(
            x_mid, top * 0.5, f"{int(total)}",
            ha='center', va='center', fontsize=9, color='white'
        )
        # Ec value, sitting just above the bar
        ax.text(
            x_mid, top + gap, f"{ec_value:.2f}",
            ha='center', va='bottom', fontsize=9, color='black'
        )

    plt.tight_layout()

    # Write the PNG, creating the target folder on first use
    out_dir = "../data/7_Categories_EC_graphs"
    os.makedirs(out_dir, exist_ok=True)
    out = os.path.join(out_dir, f"EC_graph_{topic}.png")
    fig.savefig(out, bbox_inches='tight', dpi=100)
    plt.close(fig)

    print(f"Saved combined freq+Ec histogram to {out}")

In [5]:
def compute_state_ecs(state_stance_counts, topic, out_dir="../data/7_Categories_EC_json"):
    """
    Compute Ec for every state and for the US as a whole, plot the results
    and save them as JSON.

    state_stance_counts: dict mapping each state to a dict of
                         {stance label: tweet count}. Missing stance labels
                         count as 0.
    topic:   topic name, passed to plot_ec_by_state and used in the output
             file name.
    out_dir: directory that receives state_ec_results_<topic>.txt (created if
             it does not exist).

    Returns the results dict: one entry per state ordered by total count
    (descending), followed by a "US" entry holding the national totals. Each
    entry contains "Total Frequencies", the seven per-stance counts and "Ec".

    "Total Frequencies" is the sum of the seven recognised stance counts, so
    it always agrees with the per-stance columns, with the frequencies Ec is
    computed from, and with the "US" row. Counts stored under any other label
    are ignored.
    """
    # Likert categories in scale order; compute_ec relies on this ordering.
    stances = (
        "Strongly Opposes", "Opposes", "Weakly Opposes", "Neutral",
        "Weakly Supports", "Supports", "Strongly Supports",
    )

    def summarize(counts):
        # Build one result row from a {stance: count} mapping.
        per_stance = {stance: counts.get(stance, 0) for stance in stances}
        frequencies = list(per_stance.values())
        return {
            "Total Frequencies": sum(frequencies),
            **per_stance,
            "Ec": compute_ec(frequencies),
        }

    state_rows = {
        state: summarize(counts)
        for state, counts in state_stance_counts.items()
    }

    # Largest states first; sorted() is stable, so ties keep their input order.
    results = dict(
        sorted(
            state_rows.items(),
            key=lambda item: item[1]["Total Frequencies"],
            reverse=True,
        )
    )

    # National figures: per-stance sums over all states.
    national_counts = {
        stance: sum(row[stance] for row in state_rows.values())
        for stance in stances
    }
    results["US"] = summarize(national_counts)

    plot_ec_by_state(results, topic)

    # Save the results as JSON text (the file keeps its historical .txt extension).
    os.makedirs(out_dir, exist_ok=True)
    out = os.path.join(out_dir, f"state_ec_results_{topic}.txt")
    with open(out, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Saved EC results to {out}")

    return results

In [6]:
# Folder containing the CSV files, and the fixed parts of their file names.
csv_folder = "../data/7_Categories_Sentiment_Analysis_Direction"
file_prefix = "cleaned_tweets_with_labels_v_b64_usa_"
file_suffix = ".csv"
pattern = os.path.join(csv_folder, f"{file_prefix}*{file_suffix}")

# glob returns matches in arbitrary order; sort so every run processes the
# topics in the same sequence.
csv_files = sorted(glob.glob(pattern))
if not csv_files:
    print(f"No CSV files matched {pattern}")

for file in csv_files:
    base = os.path.basename(file)
    # The topic is whatever sits between the fixed prefix and the extension.
    # Slicing (rather than splitting on "usa_") stays correct even if the
    # topic name itself contains "usa_" or ".csv".
    topic = base[len(file_prefix):-len(file_suffix)]
    print(f"Processing topic: {topic}")

    # Load the CSV. Only '\n' is treated as a row terminator.
    df = pd.read_csv(file, lineterminator='\n', parse_dates=True)

    # Group by state_code and count tweets per stance:
    # {state_code: {stance: count}}, with 0 for combinations that never occur.
    state_stance_counts = (
        df.groupby(['state_code', 'stance'])
          .size()
          .unstack(fill_value=0)
          .to_dict(orient='index')
    )

    compute_state_ecs(state_stance_counts, topic)

Processing topic: Abortion
Saved combined freq+Ec histogram to ../data/7_Categories_EC_graphs\EC_graph_Abortion.png
Saved EC results to ../data/7_Categories_EC_json\state_ec_results_Abortion.txt
Processing topic: Taxes
Saved combined freq+Ec histogram to ../data/7_Categories_EC_graphs\EC_graph_Taxes.png
Saved EC results to ../data/7_Categories_EC_json\state_ec_results_Taxes.txt
