In [4]:
import pandas as pd
import numpy as np

# Load the raw results export.
df = pd.read_json('results-1.json')

# Only responses attached to these two ads (Alps and skiing) are analysed;
# rows carrying any other adRelatedId are skipped.
KEPT_AD_IDS = ('JCfAhKUJGlLUAHze9ECo', 'Saci51fTJ9rFAqzizDMm')

# Flatten the nested questionnaire answers into one record per
# (row, question) pair, keeping integer-valued answers only.
data = []
for index, row in df.iterrows():
    # The ad filter depends on the row alone, so test it once per row
    # instead of once per answer.
    if row['adRelatedId'] not in KEPT_AD_IDS:
        continue

    answers = row['questionnaireAnswers']
    if not isinstance(answers, list):
        # Rows without a list of answers contribute nothing.
        continue

    for i, answer in enumerate(answers):
        answer_value = answer['answer']

        # Numeric answers may arrive as strings. isdecimal() (unlike
        # isdigit()) only accepts characters that int() can parse, so
        # values such as '²' are skipped instead of raising ValueError.
        if isinstance(answer_value, str) and answer_value.isdecimal():
            answer_value = int(answer_value)

        # Anything that is not an integer by now (free text, None/NaN,
        # floats, list-valued answers, ...) is skipped. No separate
        # pd.isna() test is needed: missing-value sentinels are never ints,
        # and pd.isna() on a list-like answer returns an array whose truth
        # value raises ValueError.
        # NOTE(review): bool is a subclass of int, so a JSON true/false
        # answer would be kept as 1/0 — confirm whether that is intended.
        if not isinstance(answer_value, int):
            continue

        data.append({
            'country': row['country'],
            'questionIndex': i,
            'answer': answer_value
        })

# Pin the column names so the expected columns exist even when no answer
# qualifies (an empty list would otherwise yield a frame with no columns).
df_normalized = pd.DataFrame(data, columns=['country', 'questionIndex', 'answer'])



In [5]:
def _answers_and_stats(country_code):
    """Return (answers, stats) for one country code.

    answers: the rows of df_normalized whose 'country' equals country_code.
    stats:   per-questionIndex mean and std of 'answer' for those rows,
             with the country code attached as a 'country' column.
    """
    subset = df_normalized[df_normalized['country'] == country_code]
    per_question = subset.groupby('questionIndex')['answer'].agg(['mean', 'std'])
    stats = per_question.reset_index().assign(country=country_code)
    return subset, stats


# Same computation for both countries; the *_data frames are reused by the
# t-test cell further down.
colombia_data, colombia_stats = _answers_and_stats('CO')
austria_data, austria_stats = _answers_and_stats('AT')

print(colombia_stats)
print(austria_stats)

   questionIndex      mean       std country
0              0  3.333333  1.414214      CO
1              1  2.444444  1.012739      CO
2              2  3.259259  1.288786      CO
3              3  2.703704  1.137298      CO
4              4  3.115385  1.275207      CO
5              5  2.884615  1.243444      CO
6              6  2.307692  1.319674      CO
7              7  3.269231  1.150919      CO
8             10  4.692308  1.086986      CO
   questionIndex      mean       std country
0              0  3.142857  1.099450      AT
1              1  2.846154  1.214232      AT
2              2  1.692308  0.630425      AT
3              3  2.769231  1.235168      AT
4              4  1.923077  0.759555      AT
5              5  2.615385  1.043908      AT
6              6  2.230769  1.012739      AT
7              7  2.230769  0.725011      AT
8             10  4.692308  1.109400      AT


In [6]:
from scipy.stats import ttest_ind

def _compare_question(q_index):
    """Run the Colombia-vs-Austria t-test for one question.

    Returns a dict with the question index, the t statistic and the p-value.
    """
    colombia_mask = colombia_data['questionIndex'] == q_index
    austria_mask = austria_data['questionIndex'] == q_index

    # NOTE(review): ttest_ind is called with its defaults, i.e. an
    # independent two-sample test assuming equal variances — confirm this is
    # the intended test given the differing per-country standard deviations.
    outcome = ttest_ind(
        colombia_data.loc[colombia_mask, 'answer'],
        austria_data.loc[austria_mask, 'answer'],
    )
    return {
        'questionIndex': q_index,
        't_stat': outcome.statistic,
        'p_value': outcome.pvalue,
    }


# The question list comes from the Colombian stats table, so only questions
# that have Colombian answers are tested.
t_test_results = [
    _compare_question(q_index)
    for q_index in colombia_stats['questionIndex'].unique()
]

t_test_results_df = pd.DataFrame(t_test_results)

print(t_test_results_df)


   questionIndex    t_stat   p_value
0              0  0.438921  0.663141
1              1 -1.101379  0.277660
2              2  4.131965  0.000191
3              3 -0.166034  0.869011
4              4  3.095402  0.003735
5              5  0.670312  0.506821
6              6  0.184326  0.854764
7              7  2.961707  0.005318
8             10  0.000000  1.000000
