In [56]:
import pandas as pd
from datetime import datetime
from scipy.stats import fisher_exact, chi2_contingency


# Load the dataset

df= pd.read_excel(r'data\private_dataT.xlsx')
survey_data = pd.DataFrame(df)
election_results = pd.read_excel("data\\public_data_resultsT.xlsx")
election_df = pd.DataFrame(election_results)

In [57]:
# Calculate age from date of birth and create an 'age_range' column
survey_data['dob'] = pd.to_datetime(survey_data['dob'], format='%m/%d/%Y')
current_year = datetime.now().year
survey_data['age'] = current_year - survey_data['dob'].dt.year
survey_data.drop(columns=['dob'], inplace=True)

(A) Is there a significant difference between the political preferences as expressed in the survey and the election results for both electronic and polling station votes?

In [58]:
party_counts = survey_data['party'].value_counts()

observed = [[party_counts.get('Green', 0), party_counts.get('Red', 0)],
            [election_df['Red'][5], election_df['Green'][5]]] # Total votes for each party
print(f"Observed votes (contingency table):  {observed}")

odds_ratio, p = fisher_exact(observed)

if p < 0.05:
    print(f"p-value: {p} - unlikely to be due to chance")
else:
    print(f"p-value: {p} - likely to be due to chance")
print(f"Odds ratio: {odds_ratio}")

Observed votes (contingency table):  [[133, 61], [390, 685]]
p-value: 7.430799425311284e-17 - unlikely to be due to chance
Odds ratio: 3.8295502311895753


(B) Is there a significant difference between political preferences of the voters depending on their demographic attributes recorded in the survey (that is, age, gender, education level…)?

In [59]:
categorical_vars = ['sex', 'education', 'marital_status','citizenship','age']
for var in categorical_vars:
    contingency_table = pd.crosstab(survey_data['party'], survey_data[var])
    chi2, p, _, _ = chi2_contingency(contingency_table)

    # Check p-value to determine significance
    alpha = 0.05
    print(f'Chi-square value for {var}: {chi2}')
    print(f'P-value for {var}: {p}')
    
    if p < alpha:
        print(f'There is a significant difference between political preferences and {var}.\n')
    else:
        print(f'There is no significant difference between political preferences and {var}.\n')



Chi-square value for sex: 8.63231850117096
P-value for sex: 0.0133510632837391
There is a significant difference between political preferences and sex.

Chi-square value for education: 43.183089892918744
P-value for education: 0.000262247357553198
There is a significant difference between political preferences and education.

Chi-square value for marital_status: 13.501960801671222
P-value for marital_status: 0.03572227613017426
There is a significant difference between political preferences and marital_status.

Chi-square value for citizenship: 53.37047080360282
P-value for citizenship: 0.03118765943603746
There is a significant difference between political preferences and citizenship.

Chi-square value for age: 126.05009792240162
P-value for age: 0.3824528119573126
There is no significant difference between political preferences and age.



(C) Is there a significant difference between voter’s choice of the voting channel (that is, if they decide to vote either online or in person) depending on their demographic attributes recorded in the survey?

In [60]:
for var in categorical_vars:
    contingency_table = pd.crosstab(survey_data['evote'], survey_data[var])
    chi2, p, _, _ = chi2_contingency(contingency_table)

    # Check p-value to determine significance
    alpha = 0.05
    print(f'Chi-square value for {var}: {chi2}')
    print(f'P-value for {var}: {p}')
    
    if p < alpha:
        print(f'There is a significant difference between voter’s choice of the voting channel and {var}.\n')
    else:
        print(f'There is no significant difference between voter’s choice of the voting channel and {var}.\n')



Chi-square value for sex: 11.347061347061347
P-value for sex: 0.0007556744673725127
There is a significant difference between voter’s choice of the voting channel and sex.

Chi-square value for education: 7.796412305195842
P-value for education: 0.4536058221305468
There is no significant difference between voter’s choice of the voting channel and education.

Chi-square value for marital_status: 3.4580674521896917
P-value for marital_status: 0.32624149280058357
There is no significant difference between voter’s choice of the voting channel and marital_status.

Chi-square value for citizenship: 17.41265074598408
P-value for citizenship: 0.4949269492708155
There is no significant difference between voter’s choice of the voting channel and citizenship.

Chi-square value for age: 70.29887029887033
P-value for age: 0.19428387578541634
There is no significant difference between voter’s choice of the voting channel and age.

