# Community Sentiment Analysis

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Apply seaborn's 'whitegrid' plot style.
sns.set_style(style='whitegrid')

# Ignore only warnings whose message starts with this text; every other
# warning is still shown.
warnings.filterwarnings(
    action='ignore',
    message='Precision loss occurred in moment calculation',
)

In [None]:
# Read the cleaned community survey into `df`, report the number of rows,
# and leave a preview as the cell's final expression.
survey_csv = '../../1_data_collection/data/cleaned/community_survey_clean.csv'
df = pd.read_csv(survey_csv)
print(f"Total community members: n={len(df)}")
df.head()

## Descriptive Statistics

In [None]:
# Tabulate the answers in `voted_for_zakho`, then report the 'yes' and 'no'
# shares as a percentage of all rows in `df`.
banner = "=" * 70
print(banner)
print("VOTING BEHAVIOR")
print(banner)

vote_counts = df['voted_for_zakho'].value_counts()
print(vote_counts)

# The denominator is every row of `df`, so rows whose answer is neither
# 'yes' nor 'no' still count toward the total.
total_rows = len(df)
yes_votes = (df['voted_for_zakho'] == 'yes').sum()
no_votes = (df['voted_for_zakho'] == 'no').sum()
print(f"\nVoted 'yes': {yes_votes / total_rows * 100:.1f}%")
print(f"Voted 'no': {no_votes / total_rows * 100:.1f}%")

In [None]:
banner = "=" * 70
print(banner)
print("SENTIMENT SCORES (1-5 SCALE)")
print(banner)

# Survey columns and their display labels, kept in matching order. The
# comparison cells further down iterate over these same two lists.
sentiment_vars = ['feel_support_zakho', 'football_stress_relief', 'proud_when_team_plays']
sentiment_labels = ['Support for Zakho', 'Football Stress Relief', 'Pride When Team Plays']

for var, label in zip(sentiment_vars, sentiment_labels):
    # Entries that cannot be parsed as numbers become NaN and are dropped.
    data = pd.to_numeric(df[var], errors='coerce').dropna()

    print(f"\n{label}:")

    mean_score = data.mean()
    sd_score = data.std(ddof=1)
    print(f"  M={mean_score:.2f}, SD={sd_score:.2f}")

    lowest, highest = data.min(), data.max()
    print(f"  Range: {lowest:.0f}-{highest:.0f}")

    # Share of valid answers that are 4 or higher.
    high_share = (data >= 4).sum() / len(data) * 100
    print(f"  High scores (4-5): {high_share:.1f}%")

In [None]:
banner = "=" * 70
print(banner)
print("DEMOGRAPHIC DISTRIBUTION")
print(banner)

# Gender and residence status are listed in value_counts() order
# (most frequent first).
for heading, column in (("Gender", 'gender'), ("Residence Status", 'residence_status')):
    print(f"\n{heading}:")
    print(df[column].value_counts())

# Age groups are listed in index order rather than by frequency.
print("\nAge Groups:")
age_counts = df['age_group_comm'].value_counts()
print(age_counts.sort_index())

## Gender Comparisons

In [None]:
def cohens_d(group1, group2):
    """Return Cohen's d for two samples using the pooled standard deviation.

    The result is ``mean(group1) - mean(group2)`` divided by the pooled
    standard deviation, where each group's variance is the sample variance
    (``ddof=1``) weighted by its degrees of freedom.
    """
    size1 = len(group1)
    size2 = len(group2)

    # (n - 1) * sample variance is each group's sum of squared deviations.
    weighted1 = (size1 - 1) * np.var(group1, ddof=1)
    weighted2 = (size2 - 1) * np.var(group2, ddof=1)
    pooled_variance = (weighted1 + weighted2) / (size1 + size2 - 2)

    mean_difference = np.mean(group1) - np.mean(group2)
    return mean_difference / np.sqrt(pooled_variance)

def cohens_d_ci(group1, group2, confidence=0.95):
    """Return the ``(lower, upper)`` confidence interval for Cohen's d.

    The variance of d is approximated as
    ``(n1 + n2) / (n1 * n2) + d**2 / (2 * (n1 + n2))`` and the critical
    value is taken from a t distribution with ``n1 + n2 - 2`` degrees of
    freedom. ``confidence`` is the two-sided coverage level (default 0.95).
    """
    from scipy.stats import t as student_t

    size1, size2 = len(group1), len(group2)
    total = size1 + size2
    effect = cohens_d(group1, group2)

    std_error = np.sqrt(total / (size1 * size2) + effect**2 / (2 * total))

    # Two-sided interval: half of (1 - confidence) goes in each tail.
    tail = (1 - confidence) / 2
    margin = student_t.ppf(1 - tail, total - 2) * std_error

    return effect - margin, effect + margin

In [None]:
# Compare each sentiment score between male and female respondents.
# For every variable: pick a two-sample test from a normality check, then
# report group summaries, the test result, and Cohen's d with its 95% CI.
print("="*70)
print("GENDER COMPARISONS (MALE VS FEMALE)")
print("="*70)

for var, label in zip(sentiment_vars, sentiment_labels):
    # Entries that cannot be parsed as numbers become NaN and are dropped.
    male = pd.to_numeric(df.loc[df['gender'] == 'male', var], errors='coerce').dropna()
    female = pd.to_numeric(df.loc[df['gender'] == 'female', var], errors='coerce').dropna()

    # Shapiro-Wilk is not defined for fewer than 3 observations, and the
    # effect size needs variance in both groups. Skip the variable instead
    # of failing, using the same n >= 3 rule as the residence and age cells.
    if len(male) < 3 or len(female) < 3:
        print(f"\n{label}: Insufficient data for comparison")
        continue

    # Check normality
    _, p_male = stats.shapiro(male)
    _, p_female = stats.shapiro(female)

    # Use t-test if both normal, otherwise Mann-Whitney U
    if p_male > 0.05 and p_female > 0.05:
        test_stat, p_val = stats.ttest_ind(male, female)
        test_name = "t-test"
    else:
        test_stat, p_val = stats.mannwhitneyu(male, female, alternative='two-sided')
        test_name = "Mann-Whitney U"

    # The effect size is reported whichever test was selected.
    d = cohens_d(male, female)
    ci_lower, ci_upper = cohens_d_ci(male, female)

    print(f"\n{label}")
    print("-" * 70)
    print(f"Male:   M={male.mean():.2f}, SD={male.std(ddof=1):.2f}, n={len(male)}")
    print(f"Female: M={female.mean():.2f}, SD={female.std(ddof=1):.2f}, n={len(female)}")
    print(f"{test_name}: stat={test_stat:.3f}, p={p_val:.4f}")
    print(f"Cohen's d={d:.3f}, 95% CI [{ci_lower:.3f}, {ci_upper:.3f}]")

In [None]:
# Chi-square test of independence between gender and voting behavior,
# with Cramér's V as the effect size.
banner = "=" * 70
print(banner)
print("VOTING BEHAVIOR BY GENDER")
print(banner)

# Rows are gender values, columns are the vote answers.
contingency = pd.crosstab(df['gender'], df['voted_for_zakho'])
print("\nContingency table:")
print(contingency)

test_result = stats.chi2_contingency(contingency)
chi2, p_val, dof, expected = test_result
print(f"\nChi-square test: χ²({dof})={chi2:.3f}, p={p_val:.4f}")

# Cramér's V = sqrt(chi2 / (n * (k - 1))), where n is the number of cross-
# tabulated respondents and k is the smaller of the table's two dimensions.
n = contingency.sum().sum()
smaller_dim = min(contingency.shape)
cramers_v = np.sqrt(chi2 / (n * (smaller_dim - 1)))
print(f"Cramér's V={cramers_v:.3f}")

## Residence Status Comparisons

In [None]:
# Compare each sentiment score across residence-status groups, choosing
# between one-way ANOVA and Kruskal-Wallis from a per-group normality check.
banner = "=" * 70
print(banner)
print("RESIDENCE STATUS COMPARISONS")
print(banner)

for var, label in zip(sentiment_vars, sentiment_labels):
    groups = []
    group_names = []

    # Keep only the statuses that have at least 3 usable numeric answers.
    for status in df['residence_status'].unique():
        in_status = df['residence_status'] == status
        group_data = pd.to_numeric(df.loc[in_status, var], errors='coerce').dropna()
        if len(group_data) < 3:
            continue
        groups.append(group_data)
        group_names.append(status)

    if len(groups) < 2:
        print(f"\n{label}: Insufficient data for comparison")
        continue

    # Treat the data as normal only if no group fails Shapiro-Wilk at 0.05.
    # Evaluation stops at the first group that fails.
    normal = not any(not (stats.shapiro(g)[1] > 0.05) for g in groups)

    # Kruskal-Wallis unless every group looked normal, in which case ANOVA.
    if not normal:
        f_stat, p_val = stats.kruskal(*groups)
        test_name, stat_name = "Kruskal-Wallis", "H"
    else:
        f_stat, p_val = stats.f_oneway(*groups)
        test_name, stat_name = "ANOVA", "F"

    print(f"\n{label}")
    print("-" * 70)
    for name, group in zip(group_names, groups):
        group_mean = group.mean()
        group_sd = group.std(ddof=1)
        print(f"{name}: M={group_mean:.2f}, SD={group_sd:.2f}, n={len(group)}")
    print(f"{test_name}: {stat_name}={f_stat:.3f}, p={p_val:.4f}")

In [None]:
# Chi-square test of independence between residence status and voting
# behavior, with Cramér's V as the effect size.
banner = "=" * 70
print(banner)
print("VOTING BEHAVIOR BY RESIDENCE STATUS")
print(banner)

# Rows are residence statuses, columns are the vote answers.
contingency = pd.crosstab(df['residence_status'], df['voted_for_zakho'])
print("\nContingency table:")
print(contingency)

test_result = stats.chi2_contingency(contingency)
chi2, p_val, dof, expected = test_result
print(f"\nChi-square test: χ²({dof})={chi2:.3f}, p={p_val:.4f}")

# Cramér's V = sqrt(chi2 / (n * (k - 1))), where n is the number of cross-
# tabulated respondents and k is the smaller of the table's two dimensions.
n = contingency.sum().sum()
smaller_dim = min(contingency.shape)
cramers_v = np.sqrt(chi2 / (n * (smaller_dim - 1)))
print(f"Cramér's V={cramers_v:.3f}")

## Age Group Comparisons

In [None]:
# Compare each sentiment score across age groups with a Kruskal-Wallis test.
print("="*70)
print("AGE GROUP COMPARISONS")
print("="*70)

# Drop missing age groups before sorting: unique() keeps NaN, and sorting a
# mix of NaN and string labels raises TypeError. Rows without an age group
# could never form a usable group anyway. Computed once, since the labels do
# not depend on the sentiment variable.
age_groups = sorted(df['age_group_comm'].dropna().unique())

for var, label in zip(sentiment_vars, sentiment_labels):
    groups = []
    group_names = []

    # Keep only the age groups that have at least 3 usable numeric answers.
    for age in age_groups:
        group_data = pd.to_numeric(df.loc[df['age_group_comm'] == age, var], errors='coerce').dropna()
        if len(group_data) >= 3:
            groups.append(group_data)
            group_names.append(age)

    if len(groups) < 2:
        print(f"\n{label}: Insufficient data for comparison")
        continue

    # Use Kruskal-Wallis for age groups (ordinal data)
    h_stat, p_val = stats.kruskal(*groups)

    print(f"\n{label}")
    print("-" * 70)
    for name, group in zip(group_names, groups):
        print(f"{name}: M={group.mean():.2f}, SD={group.std(ddof=1):.2f}, n={len(group)}")
    print(f"Kruskal-Wallis: H={h_stat:.3f}, p={p_val:.4f}")