In [1]:
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway, kruskal

In [3]:
# Load the data
anonymized_survey_data = pd.read_excel('data\\anonymized_data.xlsx')
election_results = pd.read_excel("data\\public_data_resultsT.xlsx")

In [4]:
# Create a DataFrame from the result data
result_df = pd.DataFrame(election_results)

In [5]:
# Analysis A:
party_counts = anonymized_survey_data['party'].value_counts()

# Calculate the percentage of Red and Green preferences in the survey data
survey_green_percentage = party_counts.get('Party A', 0) / len(anonymized_survey_data) * 100
survey_red_percentage = party_counts.get('Party B', 0) / len(anonymized_survey_data) * 100

# Calculate the percentage of Red and Green preferences in the result data
result_red_percentage = (result_df['Red'].sum() / result_df['Total'].sum()) * 100
result_green_percentage = (result_df['Green'].sum() / result_df['Total'].sum()) * 100



# Chi-squared test of independence
observed = [[party_counts.get('Party A', 0), party_counts.get('Party B', 0)],
            [result_df['Red'].sum(), result_df['Green'].sum()]]

chi2, p, _, _ = chi2_contingency(observed)

# Compare survey data with election results
print("Analysis A - Red vs. Green Preferences:")
print(f"Survey Red Percentage: {survey_red_percentage:.2f}%")
print(f"Survey Green Percentage: {survey_green_percentage:.2f}%")
print(f"Result Red Percentage: {result_red_percentage:.2f}%")
print(f"Result Green Percentage: {result_green_percentage:.2f}%")

# Print the result of the chi-squared test
print("\nChi-squared test of independence:")
print(f"Chi-squared statistic: {chi2:.2f}")
print(f"P-value: {p:.4f}")

# Set the significance level (alpha)
alpha = 0.05

# Determine if the result is statistically significant
if p < alpha:
    print("There is a significant difference between the political preferences in the survey and the election results.")
else:
    print("There is no significant difference between the political preferences in the survey and the election results.")

Analysis A - Red vs. Green Preferences:
Survey Red Percentage: 30.50%
Survey Green Percentage: 66.50%
Result Red Percentage: 35.62%
Result Green Percentage: 62.56%

Chi-squared test of independence:
Chi-squared statistic: 76.61
P-value: 0.0000
There is a significant difference between the political preferences in the survey and the election results.


In [6]:
# Analysis B:
# Perform chi-squared test
categorical_vars = ['sex', 'education', 'citizenship', 'marital_status']
for var in categorical_vars:
    crosstab = pd.crosstab(anonymized_survey_data['party'], anonymized_survey_data[var])
    chi2, p, _, _ = chi2_contingency(crosstab)
    if p < 0.05:
        significance = 'significant'
    else:
        significance = 'not significant'
    
    print(f'Chi-squared test for {var} and party: chi2 = {chi2}, p = {p} ({significance})')
    print(f'Chi-squared test for {var} and party: chi2 = {chi2}, p = {p}')

# Create a mapping from age_range to numeric values
age_range_mapping = {
    '18-30': 1,
    '31-50': 2,
    '51+': 3
}

# Map the 'age_range' column to numeric values
anonymized_survey_data['age_range_numeric'] = anonymized_survey_data['age_range'].map(age_range_mapping)

# Perform ANOVA (or Kruskal-Wallis) test for age_range_numeric and party
groups = [group['age_range_numeric'] for _, group in anonymized_survey_data.groupby('party')]
f_statistic, p_value = f_oneway(*groups)  # For ANOVA
if p_value < 0.05:
    significance = 'significant'
else:
    significance = 'not significant'

print(f'ANOVA (or Kruskal-Wallis) for age_range_numeric and party: F-statistic = {f_statistic}, p = {p_value} ({significance})')

Chi-squared test for sex and party: chi2 = 8.63231850117096, p = 0.0133510632837391 (significant)
Chi-squared test for sex and party: chi2 = 8.63231850117096, p = 0.0133510632837391
Chi-squared test for education and party: chi2 = 28.045911547794567, p = 9.2110368960829e-05 (significant)
Chi-squared test for education and party: chi2 = 28.045911547794567, p = 9.2110368960829e-05
Chi-squared test for citizenship and party: chi2 = 2.737946870820038, p = 0.2543679506987592 (not significant)
Chi-squared test for citizenship and party: chi2 = 2.737946870820038, p = 0.2543679506987592
Chi-squared test for marital_status and party: chi2 = 4.405981757631282, p = 0.11047225463642474 (not significant)
Chi-squared test for marital_status and party: chi2 = 4.405981757631282, p = 0.11047225463642474
ANOVA (or Kruskal-Wallis) for age_range_numeric and party: F-statistic = 12.363148439529745, p = 8.748401484965074e-06 (significant)


In [8]:
# Analysis B:
# Use chi-squared test 
# Define the list of demographic attributes to consider
demographic_attributes = ['age_range', 'education', 'citizenship', 'marital_status']

# Create a contingency table to compare demographic attributes and voting channel choice
for attribute in demographic_attributes:
    contingency_table = pd.crosstab(anonymized_survey_data['evote'], anonymized_survey_data[attribute])

# Perform the chi-squared test
    chi2, p, _, _ = chi2_contingency(contingency_table)
    if p < 0.05:
        print(f"Significant difference in evote choice based on {attribute}")
    else:
        print(f"No significant difference in evote channel choice based on {attribute}")

Significant difference in evote choice based on age_range
No significant difference in evote channel choice based on education
No significant difference in evote channel choice based on citizenship
No significant difference in evote channel choice based on marital_status
