In [216]:
import pandas as pd
import subprocess
from pathlib import Path
import os
import numpy as np
from scipy import stats

In [217]:
def get_git_root():
    try:
        git_root = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'],
                                           stderr=subprocess.STDOUT).decode().strip()
        return Path(git_root)
    except subprocess.CalledProcessError:
        print("Warning: Not in a git repository. Using current working directory.")
        return Path.cwd()

In [218]:
filename = 'Evaluating a Fact-Checking Process For Journalism.csv'
output_dir = get_git_root() / 'user_study_effort'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")
else:
    print(f"Directory already exists: {output_dir}")
df = pd.read_csv(filename)
print(f'Number of answers pre filtering: {len(df)}')

Directory already exists: /Users/sergiopinto/Desktop/MemeFact/user_study_effort
Number of answers pre filtering: 113


In [219]:
# filter invalid submissions on prolific (TIME TAKEN <= 10 minutes)

rejected_prolific_ids = ['676e1a07c8eaac68bfb02b3d','6701920d56778252df2b1a49','6735e4faf027b9361e838666', '5c8d01cd2edaac00169007e6', '61092e5621c9bede90eb43b1', '67701371beadf0bff4672b32', '6786b7d4ca3e75e900edb6a5', '60690d928069052871e34f25', '667583f2ad1a0accaa279c25', '66aa9bfa8baa2b88248c0ed2', '673f8b0d3dd0d3cc8ca0fe32'
]
df = df[~df['Please enter your prolific ID.'].isin(rejected_prolific_ids)] 
print(f'Number of answers post rejection filtering: {len(df)}')

Number of answers post rejection filtering: 103


In [220]:
# filter duplicate answers

df = df.drop_duplicates(subset=['Please enter your prolific ID.'])
print(f'Number of answers post duplicate filtering: {len(df)}')

Number of answers post duplicate filtering: 103


In [221]:
print(f'Number of columns: {len(df)}')
new_col_names = {
    'Timestamp': 'timestamp',
    'I hereby confirm that I have read the Data Collection & Privacy Information and consent to take part in this study by selecting the \'I agree\' option below:': 'auth',
    'Please enter your prolific ID.': 'prolific_id'
}
df = df.rename(columns=new_col_names)

Number of columns: 103


In [222]:
# participant private data

new_col_names = {
    'Please indicate your age group.': 'age_group',
    'Please indicate your proficiency in English language comprehension.': 'english_level',
    'Please indicate your highest completed level of education.': 'education_level',
    'Please indicate your political orientation.': 'political_orientation',
    'Please indicate your years of experience in journalistic fact checking and verification work.': 'fc_years_of_experience',
    'Please provide your email address if you\'d like to be contacted for future studies.': 'email'
}
columns_to_select = ['prolific_id'] + list(new_col_names.keys())
participants_data = df[columns_to_select].rename(columns=new_col_names)
output_path = f"{output_dir}/participants_data.csv"
participants_data.to_csv(output_path, index=False)

In [223]:
# participants feedback

new_col_names = {
    'Optional: Please share any additional thoughts or suggestions about the fact-checking process presented or any other feedback you may want to disclose.': 'process_feedback',
    'Optional: Please share any additional observations, suggestions, or concerns about the questions and their justifications. Your feedback will help improve the fact-checking process.': 'claim_decomposition_feedback',
    'Optional: Please share any additional observations, suggestions, or concerns about the question explanations and verdicts. Your feedback will help improve the fact-checking process.': 'evidence_synthesis_feedback',
    'Optional: Please share any additional observations, suggestions, or concerns about the summary explanation and claim\'s verdict. Your feedback will help improve the fact-checking process.': 'final_conclusion_feedback',
    'Do you have any comments or suggestions about this survey? Your feedback will help us improve future studies.': 'user_study_feedback'
}
columns_to_select = ['prolific_id'] + list(new_col_names.keys())
participants_feedback = df[columns_to_select].rename(columns=new_col_names)
output_path = f"{output_dir}/participants_feedback.csv"
participants_feedback.to_csv(output_path, index=False)

In [224]:
# process evaluation

new_col_names = {
    'How would you assess the explainability of the fact-checking process\'s intermediate steps in terms of understanding how claims are broken down and analyzed to reach verdicts and generate explanations?': 'process_explainability',
    'How would you assess the transparency of the process in demonstrating the progression from input claim to verdict and explanation summary?': 'process_transparency',
    'How would you assess the transparency in how sources are selected and validated?': 'sources_transparency',
    'How would you assess the level of trust that this fact-checking process inspires?': 'process_level_of_trust',
    'How would you assess the credibility of this fact-checking process compared to existing fact-checking approaches you are familiar with? ': 'process_credibility'
}
columns_to_select = ['prolific_id'] + list(new_col_names.keys())
process_evaluation = df[columns_to_select].rename(columns=new_col_names)
output_path = f"{output_dir}/process_evaluation.csv"
process_evaluation.to_csv(output_path, index=False)

In [225]:
# cd artifacts evaluation

new_col_names = {
    'How thoroughly do the decomposing questions cover all aspects of the claim that need verification?': 'questions_coverage',
    'How relevant are the additional aspects introduced by the decomposing questions for verifying the claim\'s accuracy?': 'questions_relevance',
    'How well are the questions formulated to allow for clear "Yes", "No", or "Unverified" verdicts based on available evidence?': 'questions_formulation',
    'How well do the justifications explain the relevance of the questions for verifying the claim\'s accuracy?': 'justifications_explainability'
}

rename_mapping = {}

for old_col, new_base in new_col_names.items():
    rename_mapping[old_col] = f"{new_base}_claim1"

for i in range(1, 5):
    for old_col, new_base in new_col_names.items():
        old_col_with_suffix = f"{old_col}.{i}"
        rename_mapping[old_col_with_suffix] = f"{new_base}_claim{i+1}"

columns_to_select = ['prolific_id'] + list(rename_mapping.keys())
cd_artifacts_evaluation = df[columns_to_select].rename(columns=rename_mapping)

output_path = f"{output_dir}/cd_artifacts_evaluation.csv"
cd_artifacts_evaluation.to_csv(output_path, index=False)

In [226]:
# es artifacts evaluation

new_col_names = {
   'How relevant are the explanations to answering their respective questions?': 'explanations_relevance',
   'How effective are the explanations at reaching conclusions that respond to the questions?': 'explanations_effectiveness', 
   'How logically do the verdicts follow from the evidence in the explanations given the questions?': 'verdicts_logical_connection'
}

rename_mapping = {}

for old_col, new_base in new_col_names.items():
   rename_mapping[old_col] = f"{new_base}_claim1"

for i in range(1, 5):
   for old_col, new_base in new_col_names.items():
       old_col_with_suffix = f"{old_col}.{i}"
       rename_mapping[old_col_with_suffix] = f"{new_base}_claim{i+1}"

columns_to_select = ['prolific_id'] + list(rename_mapping.keys())
es_artifacts_evaluation = df[columns_to_select].rename(columns=rename_mapping)

output_path = f"{output_dir}/es_artifacts_evaluation.csv"
es_artifacts_evaluation.to_csv(output_path, index=False)

In [227]:
# fc artifacts evaluation

new_col_names = {
   'How thoroughly does the summary explanation cover all aspects of the claim that need verification?': 'summary_coverage',
   'How well does the summary explanation support the final verdict given to the claim?': 'summary_verdict_support',
   'How factually aligned is Summary 1 with Summary 2?': 'summaries_allignment',
   'As a journalist, which summary would you consider more credible and rigorous for fact-checking the claim?': 'summary_choice'
}

rename_mapping = {}

for old_col, new_base in new_col_names.items():
   rename_mapping[old_col] = f"{new_base}_claim1"

for i in range(1, 5):
   for old_col, new_base in new_col_names.items():
       old_col_with_suffix = f"{old_col}.{i}"
       rename_mapping[old_col_with_suffix] = f"{new_base}_claim{i+1}"

columns_to_select = ['prolific_id'] + list(rename_mapping.keys())
fc_artifacts_evaluation = df[columns_to_select].rename(columns=rename_mapping)

output_path = f"{output_dir}/fc_artifacts_evaluation.csv"
fc_artifacts_evaluation.to_csv(output_path, index=False)

In [228]:
claims_data = [
   {
       'number': 1,
       'text': 'Says Ford agreed to invest $900 million at an Ohio plant because Donald Trump lowered taxes and is now moving the project to Mexico because Joe Biden is increasing taxes.',
       'political_stance': 'pro_republican',
       'evidence_id': '-1365092868303902720'
   },
   {
       'number': 2,
       'text': 'The Biden administration inherited gains of 50,000 jobs a month. We\'re now finally back to 500,000 jobs a month. We inherited a country where 4,000 people a day were dying from Covid. That\'s now down 75%.',
       'political_stance': 'pro_democrat', 
       'evidence_id': '5948245359632679936'
   },
   {
       'number': 3,
       'text': 'Officials recommend that women who get one of these (COVID-19) shots should absolutely not get pregnant for at least the first two months after they\'ve been injected.',
       'political_stance': 'neutral',
       'evidence_id': '1015833665715298432'
   },
   {
       'number': 4,
       'text': 'Joe Biden and Kamala Harris government-run health care plan could lead to hospitals being closed, put Medicare coverage at risk, and give benefits to illegal immigrants.',
       'political_stance': 'pro_republican',
       'evidence_id': '7928069865906505728'
   },
   {
       'number': 5,
       'text': 'Wisconsin was the last state to start paying COVID-related federal unemployment benefits.',
       'political_stance': 'neutral',
       'evidence_id': '8719092942913588224'
   }
]

claims = pd.DataFrame(claims_data)

output_path = f"{output_dir}/claims.csv"
claims.to_csv(output_path, index=False)

In [229]:
processed_dir = os.path.join(output_dir, 'processed')
if not os.path.exists(processed_dir):
   os.makedirs(processed_dir)

In [230]:
# process evaluations processing

explainability_map = {
    'No clear explainability': 1,
    'Limited explainability': 2,
    'Adequate explainability': 3,
    'High explainability': 4,
    'Exceptional explainability': 5
}

process_transparency_map = {
    'No clear transparency': 1,
    'Limited transparency': 2,
    'Adequate transparency': 3,
    'High transparency': 4,
    'Exceptional transparency': 5
}

sources_transparency_map = {
    'No clear transparency': 1,
    'Limited transparency': 2,
    'Adequate transparency': 3,
    'High transparency': 4,
    'Exceptional transparency': 5
}

trust_map = {
    'No clear trustworthiness': 1,
    'Limited trustworthiness': 2,
    'Adequate trustworthiness': 3,
    'High trustworthiness': 4,
    'Exceptional trustworthiness': 5
}

credibility_map = {
    'No prior experience with fact-checking systems': 0,
    'Lower credibility': 1,
    'Similar credibility': 2,
    'Superior credibility': 3
}

df = pd.read_csv(f"{output_dir}/process_evaluation.csv")

process_evaluation = df.copy()

process_evaluation['process_explainability'] = process_evaluation['process_explainability'].map(explainability_map)
process_evaluation['process_transparency'] = process_evaluation['process_transparency'].map(process_transparency_map)
process_evaluation['sources_transparency'] = process_evaluation['sources_transparency'].map(sources_transparency_map)
process_evaluation['process_level_of_trust'] = process_evaluation['process_level_of_trust'].map(trust_map)
process_evaluation['process_credibility'] = process_evaluation['process_credibility'].map(credibility_map)

# Save to processed directory
output_path = os.path.join(processed_dir, 'process_evaluation_phase_1.csv')
process_evaluation.to_csv(output_path, index=False)

In [231]:
# cd artifacts processing

coverage_map = {
   'No coverage': 1,
   'Limited coverage': 2,
   'Adequate coverage': 3, 
   'High coverage': 4,
   'Excellent coverage': 5
}

relevance_map = {
   'No relevance': 1,
   'Limited relevance': 2,
   'Adequate relevance': 3,
   'High relevance': 4, 
   'Excellent relevance': 5
}

formulation_map = {
   'Poor formulation': 1,
   'Limited formulation': 2,
   'Adequate formulation': 3,
   'Strong formulation': 4,
   'Excellent formulation': 5
}

justification_map = {
   'No relevance': 1,
   'Limited relevance': 2,
   'Adequate relevance': 3,
   'High relevance': 4,
   'Excellent relevance': 5
}

cd_df = pd.read_csv(f"{output_dir}/cd_artifacts_evaluation.csv")
cd_processed = cd_df.copy()

for col in cd_processed.columns:
   if 'questions_coverage' in col:
       cd_processed[col] = cd_processed[col].map(coverage_map)
   elif 'questions_relevance' in col:
       cd_processed[col] = cd_processed[col].map(relevance_map)
   elif 'questions_formulation' in col:
       cd_processed[col] = cd_processed[col].map(formulation_map)
   elif 'justifications_explainability' in col:
       cd_processed[col] = cd_processed[col].map(justification_map)
   else:
       print(f'Not processing column: {col}')

output_path = os.path.join(processed_dir, 'cd_process_phase_1.csv')
cd_processed.to_csv(output_path, index=False)

Not processing column: prolific_id


In [232]:
# es artifacts processing

relevance_map = {
   'No relevance': 1,
   'Limited relevance': 2,
   'Adequate relevance': 3,
   'High relevance': 4, 
   'Excellent relevance': 5
}

effectiveness_map = {
   'No effectiveness': 1,
   'Limited effectiveness': 2,
   'Adequate effectiveness': 3,
   'High effectiveness': 4,
   'Excellent effectiveness': 5
}

logical_connection_map = {
   'No logical connection': 1,
   'Limited logical connection': 2,
   'Adequate logical connection': 3,
   'Strong logical connection': 4,
   'Excellent logical connection': 5
}

es_df = pd.read_csv(f"{output_dir}/es_artifacts_evaluation.csv")
es_processed = es_df.copy()

for col in es_processed.columns:
   if 'explanations_relevance' in col:
       es_processed[col] = es_processed[col].map(relevance_map)
   elif 'explanations_effectiveness' in col:
       es_processed[col] = es_processed[col].map(effectiveness_map)
   elif 'verdicts_logical_connection' in col:
       es_processed[col] = es_processed[col].map(logical_connection_map)
   else:
       print(f'Not processing column: {col}')

output_path = os.path.join(processed_dir, 'es_process_phase_1.csv')
es_processed.to_csv(output_path, index=False)

Not processing column: prolific_id


In [233]:
# fc artifacts processing

coverage_map = {
   'No coverage': 1,
   'Limited coverage': 2,
   'Adequate coverage': 3,
   'High coverage': 4, 
   'Excellent coverage': 5
}

support_map = {
   'No support': 1,
   'Limited support': 2,
   'Adequate support': 3,
   'Strong support': 4,
   'Excellent support': 5
}

alignment_map = {
   'Completely misaligned (Contains fabricated content that completely alters the meaning)': 1,
   'Major misalignment (Contains factual errors that significantly misrepresent the content)': 2,
   'Minor misalignment (Some details differ but maintains the overall message)': 3,
   'Completely aligned (Accurately represents the same meaning and details)': 4
}

choice_map = {
   'Summary 1 is significantly more credible and rigorous': 5,
   'Summary 1 is somewhat more credible and rigorous': 4,
   'Both summaries are equally credible and rigorous': 3,
   'Summary 2 is somewhat more credible and rigorous': 2,
   'Summary 2 is significantly more credible and rigorous': 1
}

fc_df = pd.read_csv(f"{output_dir}/fc_artifacts_evaluation.csv")
fc_processed = fc_df.copy()

for col in fc_processed.columns:
   if 'summary_coverage' in col:
       fc_processed[col] = fc_processed[col].map(coverage_map)
   elif 'summary_verdict_support' in col:
       fc_processed[col] = fc_processed[col].map(support_map)
   elif 'summaries_allignment' in col:
       fc_processed[col] = fc_processed[col].map(alignment_map)
   elif 'summary_choice' in col:
       fc_processed[col] = fc_processed[col].map(choice_map)
   else:
       print(f'Not processing column: {col}')

output_path = os.path.join(processed_dir, 'fc_process_phase_1.csv')
fc_processed.to_csv(output_path, index=False)

Not processing column: prolific_id


In [234]:
import scipy

proc_df = pd.read_csv(f"{output_dir}/processed/process_evaluation_phase_1.csv")

def get_comprehensive_stats(values, scale_max):
   mean = np.mean(values)
   std = np.std(values)
   n = len(values)
   se = scipy.stats.sem(values)
   ci = scipy.stats.t.interval(confidence=0.95, df=n-1, loc=mean, scale=se)
   
   # Value distribution
   value_counts = pd.Series(values).value_counts().sort_index()
   percentages = (value_counts / len(values) * 100).round(2)
   
   return {
       'basic_stats': {
           'mean': mean,
           'std': std,
           'median': np.median(values),
           'min': np.min(values),
           'max': np.max(values),
           'q1': np.percentile(values, 25),
           'q3': np.percentile(values, 75),
           'ci_lower': ci[0],
           'ci_upper': ci[1],
           'mean_percentage': (mean/scale_max)*100,
           'n': n
       },
       'distribution': {
           'values': value_counts.index.tolist(),
           'counts': value_counts.tolist(),
           'percentages': percentages.tolist()
       },
       'raw_values': values
   }

dimensions_5scale = {
   'Process Explainability': {'col': 'process_explainability', 'stats': None},
   'Process Transparency': {'col': 'process_transparency', 'stats': None},
   'Sources Transparency': {'col': 'sources_transparency', 'stats': None},
   'Process Level of Trust': {'col': 'process_level_of_trust', 'stats': None}
}

for dim_name, dim_data in dimensions_5scale.items():
   dim_data['stats'] = get_comprehensive_stats(proc_df[dim_data['col']].values, 5)

def print_basic_stats(stats, dimension, scale_max):
   bs = stats['basic_stats']
   print(f"\n{dimension} Statistics (Scale 1-{scale_max}):")
   print(f"Mean: {bs['mean']:.2f} (95% CI: [{bs['ci_lower']:.2f}, {bs['ci_upper']:.2f}])")
   print(f"Standard Deviation: {bs['std']:.2f}")
   print(f"Median: {bs['median']:.2f}")
   print(f"Min: {bs['min']:.2f}")
   print(f"Max: {bs['max']:.2f}")
   print(f"Q1: {bs['q1']:.2f}")
   print(f"Q3: {bs['q3']:.2f}")
   print(f"Mean as % of maximum: {bs['mean_percentage']:.1f}%")

def print_distribution(stats, dimension):
   dist = stats['distribution']
   print(f"\n{dimension} Value Distribution:")
   for val, count, pct in zip(dist['values'], dist['counts'], dist['percentages']):
       print(f"Value {val}: {count} occurrences ({pct:.1f}%)")

for dim_name, dim_data in dimensions_5scale.items():
   print_basic_stats(dim_data['stats'], dim_name, 5)
   print_distribution(dim_data['stats'], dim_name)

credibility_data = proc_df['process_credibility']
zero_count = (credibility_data == 0).sum()
total_count = len(credibility_data)
zero_percentage = (zero_count / total_count) * 100

print("\nProcess Credibility Analysis:")
print(f"Participants with no prior experience: {zero_count} ({zero_percentage:.1f}%)")

non_zero_data = credibility_data[credibility_data != 0]
if len(non_zero_data) > 0:
   credibility_stats = get_comprehensive_stats(non_zero_data, 3)
   bs = credibility_stats['basic_stats']
   
   print("\nStatistics for participants with prior experience (Scale 1-3):")
   print(f"N = {bs['n']} participants")
   print(f"Mean: {bs['mean']:.2f} (95% CI: [{bs['ci_lower']:.2f}, {bs['ci_upper']:.2f}])")
   print(f"Standard Deviation: {bs['std']:.2f}")
   print(f"Median: {bs['median']:.2f}")
   print(f"Min: {bs['min']:.2f}")
   print(f"Max: {bs['max']:.2f}")
   print(f"Q1: {bs['q1']:.2f}")
   print(f"Q3: {bs['q3']:.2f}")
   print(f"Mean as % of maximum: {bs['mean_percentage']:.1f}%")
   print_distribution(credibility_stats, "Process Credibility (excluding no prior experience)")


Process Explainability Statistics (Scale 1-5):
Mean: 3.56 (95% CI: [3.41, 3.71])
Standard Deviation: 0.76
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00
Mean as % of maximum: 71.3%

Process Explainability Value Distribution:
Value 1: 1 occurrences (1.0%)
Value 2: 7 occurrences (6.8%)
Value 3: 35 occurrences (34.0%)
Value 4: 53 occurrences (51.5%)
Value 5: 7 occurrences (6.8%)

Process Transparency Statistics (Scale 1-5):
Mean: 3.53 (95% CI: [3.38, 3.68])
Standard Deviation: 0.76
Median: 4.00
Min: 2.00
Max: 5.00
Q1: 3.00
Q3: 4.00
Mean as % of maximum: 70.7%

Process Transparency Value Distribution:
Value 2: 10 occurrences (9.7%)
Value 3: 35 occurrences (34.0%)
Value 4: 51 occurrences (49.5%)
Value 5: 7 occurrences (6.8%)

Sources Transparency Statistics (Scale 1-5):
Mean: 3.25 (95% CI: [3.09, 3.42])
Standard Deviation: 0.83
Median: 3.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00
Mean as % of maximum: 65.0%

Sources Transparency Value Distribution:
Value 1: 2 occurrences (1.9%)
Value 2: 

In [235]:
# cd artifacts statistics
cd_df = pd.read_csv(f"{output_dir}/processed/cd_process_phase_1.csv")

coverage_cols = [col for col in cd_df.columns if 'questions_coverage' in col]
relevance_cols = [col for col in cd_df.columns if 'questions_relevance' in col]
formulation_cols = [col for col in cd_df.columns if 'questions_formulation' in col]
explainability_cols = [col for col in cd_df.columns if 'justifications_explainability' in col]

def get_comprehensive_stats(df, cols):
    all_values = df[cols].values.ravel()
    
    mean = np.mean(all_values)
    std = np.std(all_values)
    n = len(all_values)
    se = scipy.stats.sem(all_values)
    ci = scipy.stats.t.interval(confidence=0.95, df=n-1, loc=mean, scale=se)
    
    value_counts = pd.Series(all_values).value_counts().sort_index()
    percentages = (value_counts / len(all_values) * 100).round(2)
    
    return {
        'basic_stats': {
            'mean': mean,
            'std': std,
            'median': np.median(all_values),
            'min': np.min(all_values),
            'max': np.max(all_values),
            'q1': np.percentile(all_values, 25),
            'q3': np.percentile(all_values, 75),
            'ci_lower': ci[0],
            'ci_upper': ci[1]
        },
        'distribution': {
            'values': value_counts.index.tolist(),
            'counts': value_counts.tolist(),
            'percentages': percentages.tolist()
        },
        'raw_values': all_values
    }

dimensions = {
    'Question Coverage': {'cols': coverage_cols, 'stats': None},
    'Question Relevance': {'cols': relevance_cols, 'stats': None},
    'Question Formulation': {'cols': formulation_cols, 'stats': None},
    'Justification Explainability': {'cols': explainability_cols, 'stats': None}
}

for dim_name, dim_data in dimensions.items():
    dim_data['stats'] = get_comprehensive_stats(cd_df, dim_data['cols'])

claim_stats = {}
for i in range(1, 6):
    claim_cols = [col for col in cd_df.columns if f'claim{i}' in col]
    claim_stats[f'claim_{i}'] = get_comprehensive_stats(cd_df, claim_cols)

def print_basic_stats(stats, dimension):
    bs = stats['basic_stats']
    print(f"\n{dimension} Statistics:")
    print(f"Mean: {bs['mean']:.2f} (95% CI: [{bs['ci_lower']:.2f}, {bs['ci_upper']:.2f}])")
    print(f"Standard Deviation: {bs['std']:.2f}")
    print(f"Median: {bs['median']:.2f}")
    print(f"Min: {bs['min']:.2f}")
    print(f"Max: {bs['max']:.2f}")
    print(f"Q1: {bs['q1']:.2f}")
    print(f"Q3: {bs['q3']:.2f}")

def print_distribution(stats, dimension):
    dist = stats['distribution']
    print(f"\n{dimension} Value Distribution:")
    for val, count, pct in zip(dist['values'], dist['counts'], dist['percentages']):
        print(f"Value {val}: {count} occurrences ({pct:.1f}%)")

for dim_name, dim_data in dimensions.items():
    print_basic_stats(dim_data['stats'], dim_name)
    print_distribution(dim_data['stats'], dim_name)

print("\nPer-Claim Statistics:")
for i in range(1, 6):
    stats = claim_stats[f'claim_{i}']
    mean = stats['basic_stats']['mean']
    ci_lower = stats['basic_stats']['ci_lower']
    ci_upper = stats['basic_stats']['ci_upper']
    print(f"\nClaim {i}: {mean:.2f} (95% CI: [{ci_lower:.2f}, {ci_upper:.2f}])")
    print_distribution(stats, f"Claim {i}")

stats_data = {
    'dimensions': dimensions,
    'claims': claim_stats
}


Question Coverage Statistics:
Mean: 3.92 (95% CI: [3.84, 4.00])
Standard Deviation: 0.92
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 5.00

Question Coverage Value Distribution:
Value 1: 1 occurrences (0.2%)
Value 2: 43 occurrences (8.3%)
Value 3: 104 occurrences (20.2%)
Value 4: 217 occurrences (42.1%)
Value 5: 150 occurrences (29.1%)

Question Relevance Statistics:
Mean: 3.85 (95% CI: [3.77, 3.93])
Standard Deviation: 0.92
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 5.00

Question Relevance Value Distribution:
Value 1: 1 occurrences (0.2%)
Value 2: 47 occurrences (9.1%)
Value 3: 114 occurrences (22.1%)
Value 4: 217 occurrences (42.1%)
Value 5: 136 occurrences (26.4%)

Question Formulation Statistics:
Mean: 3.77 (95% CI: [3.69, 3.84])
Standard Deviation: 0.88
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00

Question Formulation Value Distribution:
Value 1: 5 occurrences (1.0%)
Value 2: 39 occurrences (7.6%)
Value 3: 129 occurrences (25.1%)
Value 4: 241 occurrences (46.8%)
Value

In [236]:
# es statistics

es_df = pd.read_csv(f"{output_dir}/processed/es_process_phase_1.csv")

relevance_cols = [col for col in es_df.columns if 'explanations_relevance' in col]
effectiveness_cols = [col for col in es_df.columns if 'explanations_effectiveness' in col]
logical_cols = [col for col in es_df.columns if 'verdicts_logical_connection' in col]

def get_comprehensive_stats(df, cols):
   all_values = df[cols].values.ravel()
   
   mean = np.mean(all_values)
   std = np.std(all_values)
   n = len(all_values)
   se = scipy.stats.sem(all_values)
   ci = scipy.stats.t.interval(confidence=0.95, df=n-1, loc=mean, scale=se)
   
   value_counts = pd.Series(all_values).value_counts().sort_index()
   percentages = (value_counts / len(all_values) * 100).round(2)
   
   return {
       'basic_stats': {
           'mean': mean,
           'std': std,
           'median': np.median(all_values),
           'min': np.min(all_values),
           'max': np.max(all_values),
           'q1': np.percentile(all_values, 25),
           'q3': np.percentile(all_values, 75),
           'ci_lower': ci[0],
           'ci_upper': ci[1]
       },
       'distribution': {
           'values': value_counts.index.tolist(),
           'counts': value_counts.tolist(),
           'percentages': percentages.tolist()
       },
       'raw_values': all_values
   }

dimensions = {
   'Explanations Relevance': {'cols': relevance_cols, 'stats': None},
   'Explanations Effectiveness': {'cols': effectiveness_cols, 'stats': None},
   'Verdicts Logical Connection': {'cols': logical_cols, 'stats': None}
}

for dim_name, dim_data in dimensions.items():
   dim_data['stats'] = get_comprehensive_stats(es_df, dim_data['cols'])

claim_stats = {}
for i in range(1, 6):
   claim_cols = [col for col in es_df.columns if f'claim{i}' in col]
   claim_stats[f'claim_{i}'] = get_comprehensive_stats(es_df, claim_cols)

def print_basic_stats(stats, dimension):
   bs = stats['basic_stats']
   print(f"\n{dimension} Statistics:")
   print(f"Mean: {bs['mean']:.2f} (95% CI: [{bs['ci_lower']:.2f}, {bs['ci_upper']:.2f}])")
   print(f"Standard Deviation: {bs['std']:.2f}")
   print(f"Median: {bs['median']:.2f}")
   print(f"Min: {bs['min']:.2f}")
   print(f"Max: {bs['max']:.2f}")
   print(f"Q1: {bs['q1']:.2f}")
   print(f"Q3: {bs['q3']:.2f}")

def print_distribution(stats, dimension):
   dist = stats['distribution']
   print(f"\n{dimension} Value Distribution:")
   for val, count, pct in zip(dist['values'], dist['counts'], dist['percentages']):
       print(f"Value {val}: {count} occurrences ({pct:.1f}%)")

for dim_name, dim_data in dimensions.items():
   print_basic_stats(dim_data['stats'], dim_name)
   print_distribution(dim_data['stats'], dim_name)

print("\nPer-Claim Statistics:")
for i in range(1, 6):
   stats = claim_stats[f'claim_{i}']
   mean = stats['basic_stats']['mean']
   ci_lower = stats['basic_stats']['ci_lower']
   ci_upper = stats['basic_stats']['ci_upper']
   print(f"\nClaim {i}: {mean:.2f} (95% CI: [{ci_lower:.2f}, {ci_upper:.2f}])")
   print_distribution(stats, f"Claim {i}")

stats_data = {
   'dimensions': dimensions,
   'claims': claim_stats
}


Explanations Relevance Statistics:
Mean: 3.68 (95% CI: [3.59, 3.76])
Standard Deviation: 0.95
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00

Explanations Relevance Value Distribution:
Value 1: 7 occurrences (1.4%)
Value 2: 56 occurrences (10.9%)
Value 3: 133 occurrences (25.8%)
Value 4: 220 occurrences (42.7%)
Value 5: 99 occurrences (19.2%)

Explanations Effectiveness Statistics:
Mean: 3.61 (95% CI: [3.52, 3.69])
Standard Deviation: 1.00
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00

Explanations Effectiveness Value Distribution:
Value 1: 11 occurrences (2.1%)
Value 2: 65 occurrences (12.6%)
Value 3: 135 occurrences (26.2%)
Value 4: 208 occurrences (40.4%)
Value 5: 96 occurrences (18.6%)

Verdicts Logical Connection Statistics:
Mean: 3.67 (95% CI: [3.58, 3.75])
Standard Deviation: 0.96
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00

Verdicts Logical Connection Value Distribution:
Value 1: 6 occurrences (1.2%)
Value 2: 59 occurrences (11.5%)
Value 3: 141 occurrences (27.

In [237]:
# fc statistics

fc_df = pd.read_csv(f"{output_dir}/processed/fc_process_phase_1.csv")

coverage_cols = [col for col in fc_df.columns if 'summary_coverage' in col]
support_cols = [col for col in fc_df.columns if 'summary_verdict_support' in col]
alignment_cols = [col for col in fc_df.columns if 'summaries_allignment' in col]
choice_cols = [col for col in fc_df.columns if 'summary_choice' in col]

def get_comprehensive_stats(df, cols, scale_max):
    all_values = df[cols].values.ravel()
    
    mean = np.mean(all_values)
    std = np.std(all_values)
    n = len(all_values)
    se = scipy.stats.sem(all_values)
    ci = scipy.stats.t.interval(confidence=0.95, df=n-1, loc=mean, scale=se)
    
    value_counts = pd.Series(all_values).value_counts().sort_index()
    percentages = (value_counts / len(all_values) * 100).round(2)
    
    return {
        'basic_stats': {
            'mean': mean,
            'std': std,
            'median': np.median(all_values),
            'min': np.min(all_values),
            'max': np.max(all_values),
            'q1': np.percentile(all_values, 25),
            'q3': np.percentile(all_values, 75),
            'ci_lower': ci[0],
            'ci_upper': ci[1],
            'mean_percentage': (mean/scale_max)*100
        },
        'distribution': {
            'values': value_counts.index.tolist(),
            'counts': value_counts.tolist(),
            'percentages': percentages.tolist()
        },
        'raw_values': all_values
    }

dimensions = {
    'Summary Coverage': {'cols': coverage_cols, 'scale_max': 5, 'stats': None},
    'Summary Verdict Support': {'cols': support_cols, 'scale_max': 5, 'stats': None},
    'Summaries Alignment': {'cols': alignment_cols, 'scale_max': 4, 'stats': None},
    'Summary Choice': {'cols': choice_cols, 'scale_max': 5, 'stats': None}
}

for dim_name, dim_data in dimensions.items():
    dim_data['stats'] = get_comprehensive_stats(fc_df, dim_data['cols'], dim_data['scale_max'])

def print_basic_stats(stats, dimension, scale_max):
    bs = stats['basic_stats']
    print(f"\n{dimension} Statistics (Scale 1-{scale_max}):")
    print(f"Mean: {bs['mean']:.2f} (95% CI: [{bs['ci_lower']:.2f}, {bs['ci_upper']:.2f}])")
    print(f"Standard Deviation: {bs['std']:.2f}")
    print(f"Median: {bs['median']:.2f}")
    print(f"Min: {bs['min']:.2f}")
    print(f"Max: {bs['max']:.2f}")
    print(f"Q1: {bs['q1']:.2f}")
    print(f"Q3: {bs['q3']:.2f}")
    print(f"Mean as % of maximum: {bs['mean_percentage']:.1f}%")

def print_distribution(stats, dimension):
    dist = stats['distribution']
    print(f"\n{dimension} Value Distribution:")
    for val, count, pct in zip(dist['values'], dist['counts'], dist['percentages']):
        print(f"Value {val}: {count} occurrences ({pct:.1f}%)")

for dim_name, dim_data in dimensions.items():
    print_basic_stats(dim_data['stats'], dim_name, dim_data['scale_max'])
    print_distribution(dim_data['stats'], dim_name)

print("\nPer-Claim Analysis (Separated by Scale):")

scale5_dimensions = ['Summary Coverage', 'Summary Verdict Support', 'Summary Choice']
print("\nAnalysis for 1-5 scale metrics:")
for i in range(1, 6):
    claim_cols = []
    for dim in scale5_dimensions:
        cols = [col for col in dimensions[dim]['cols'] if f'claim{i}' in col]
        claim_cols.extend(cols)
    if claim_cols:
        stats = get_comprehensive_stats(fc_df, claim_cols, 5)
        print(f"\nClaim {i} (1-5 scale metrics):")
        print(f"Mean: {stats['basic_stats']['mean']:.2f} (95% CI: [{stats['basic_stats']['ci_lower']:.2f}, {stats['basic_stats']['ci_upper']:.2f}])")
        print_distribution(stats, f"Claim {i}")

print("\nAnalysis for 1-4 scale metric (Summaries Alignment):")
for i in range(1, 6):
    claim_cols = [col for col in alignment_cols if f'claim{i}' in col]
    if claim_cols:
        stats = get_comprehensive_stats(fc_df, claim_cols, 4)
        print(f"\nClaim {i} Alignment:")
        print(f"Mean: {stats['basic_stats']['mean']:.2f} (95% CI: [{stats['basic_stats']['ci_lower']:.2f}, {stats['basic_stats']['ci_upper']:.2f}])")
        print_distribution(stats, f"Claim {i}")


Summary Coverage Statistics (Scale 1-5):
Mean: 3.60 (95% CI: [3.52, 3.69])
Standard Deviation: 0.97
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00
Mean as % of maximum: 72.1%

Summary Coverage Value Distribution:
Value 1: 3 occurrences (0.6%)
Value 2: 71 occurrences (13.8%)
Value 3: 152 occurrences (29.5%)
Value 4: 190 occurrences (36.9%)
Value 5: 99 occurrences (19.2%)

Summary Verdict Support Statistics (Scale 1-5):
Mean: 3.57 (95% CI: [3.48, 3.65])
Standard Deviation: 0.97
Median: 4.00
Min: 1.00
Max: 5.00
Q1: 3.00
Q3: 4.00
Mean as % of maximum: 71.3%

Summary Verdict Support Value Distribution:
Value 1: 7 occurrences (1.4%)
Value 2: 67 occurrences (13.0%)
Value 3: 160 occurrences (31.1%)
Value 4: 189 occurrences (36.7%)
Value 5: 92 occurrences (17.9%)

Summaries Alignment Statistics (Scale 1-4):
Mean: 2.79 (95% CI: [2.71, 2.87])
Standard Deviation: 0.92
Median: 3.00
Min: 1.00
Max: 4.00
Q1: 2.00
Q3: 3.00
Mean as % of maximum: 69.7%

Summaries Alignment Value Distribution:
Value 

In [238]:
# processing participants data

df = pd.read_csv(f"{output_dir}/participants_data.csv")
participants_processed = df.copy()

participants_processed = participants_processed.drop('email', axis=1)

participants_processed['english_level'] = participants_processed['english_level'].str.extract(r'\((.*?)\)', expand=False)
participants_processed['political_orientation'] = participants_processed['political_orientation'].str.extract(r'\((.*?)\)', expand=False)

output_path = os.path.join(processed_dir, 'participants_data_phase_1.csv')
participants_processed.to_csv(output_path, index=False)

In [239]:
# demographics data information

participants = pd.read_csv(f"{output_dir}/processed/participants_data_phase_1.csv")

print("Demographics Distribution:\n")
print("Age Groups:")
print(participants['age_group'].value_counts(normalize=True) * 100)
print("\nEducation Levels:")
print(participants['education_level'].value_counts(normalize=True) * 100)
print("\nEnglish Proficiency:")
print(participants['english_level'].value_counts(normalize=True) * 100)
print("\nPolitical Orientation:")
print(participants['political_orientation'].value_counts(normalize=True) * 100)
print("\nFact-checking Experience (years):")
print(participants['fc_years_of_experience'].value_counts(normalize=True) * 100)

Demographics Distribution:

Age Groups:
age_group
26-35 years old      36.893204
18-25 years old      29.126214
36-50 years old      26.213592
Over 50 years old     7.766990
Name: proportion, dtype: float64

Education Levels:
education_level
Bachelor's degree                    52.427184
Master's degree                      34.951456
High school diploma or equivalent     7.766990
Doctoral degree (PhD)                 2.912621
I prefer not to answer                1.941748
Name: proportion, dtype: float64

English Proficiency:
english_level
Native speaker          64.705882
High proficiency        25.490196
Moderate proficiency     8.823529
Basic comprehension      0.980392
Name: proportion, dtype: float64

Political Orientation:
political_orientation
Very Liberal               37.373737
Moderately Liberal         32.323232
Moderate                   21.212121
Moderately Conservative     8.080808
Very Conservative           1.010101
Name: proportion, dtype: float64

Fact-checking Experi

In [240]:
# processed averages per participant

cd_df = pd.read_csv(f"{output_dir}/processed/cd_process_phase_1.csv")
es_df = pd.read_csv(f"{output_dir}/processed/es_process_phase_1.csv")
fc_df = pd.read_csv(f"{output_dir}/processed/fc_process_phase_1.csv")
proc_df = pd.read_csv(f"{output_dir}/processed/process_evaluation_phase_1.csv")
participants_df = pd.read_csv(f"{output_dir}/processed/participants_data_phase_1.csv")

participant_averages = {}
participant_ids = cd_df['prolific_id'].unique()

for pid in participant_ids:
    participant_averages[pid] = {}
    
    cd_participant = cd_df[cd_df['prolific_id'] == pid]
    participant_averages[pid].update({
        'cd_overall': cd_participant.iloc[:, 1:].mean().mean(),  # Skip prolific_id column
        'cd_coverage': cd_participant[[col for col in cd_participant.columns if 'coverage' in col]].mean().mean(),
        'cd_relevance': cd_participant[[col for col in cd_participant.columns if 'relevance' in col]].mean().mean(),
        'cd_formulation': cd_participant[[col for col in cd_participant.columns if 'formulation' in col]].mean().mean(),
        'cd_explainability': cd_participant[[col for col in cd_participant.columns if 'explainability' in col]].mean().mean()
    })
    
    # ES averages
    es_participant = es_df[es_df['prolific_id'] == pid]
    participant_averages[pid].update({
        'es_overall': es_participant.iloc[:, 1:].mean().mean(),  # Skip prolific_id column
        'es_relevance': es_participant[[col for col in es_participant.columns if 'relevance' in col]].mean().mean(),
        'es_effectiveness': es_participant[[col for col in es_participant.columns if 'effectiveness' in col]].mean().mean(),
        'es_logical_connection': es_participant[[col for col in es_participant.columns if 'logical_connection' in col]].mean().mean()
    })
    
    fc_participant = fc_df[fc_df['prolific_id'] == pid]
    scale5_cols = ([col for col in fc_participant.columns if 'coverage' in col] + 
                   [col for col in fc_participant.columns if 'support' in col] + 
                   [col for col in fc_participant.columns if 'choice' in col])
    participant_averages[pid].update({
        'fc_scale5_overall': fc_participant[scale5_cols].mean().mean(),
        'fc_coverage': fc_participant[[col for col in fc_participant.columns if 'coverage' in col]].mean().mean(),
        'fc_support': fc_participant[[col for col in fc_participant.columns if 'support' in col]].mean().mean(),
        'fc_choice': fc_participant[[col for col in fc_participant.columns if 'choice' in col]].mean().mean(),
        'fc_alignment': fc_participant[[col for col in fc_participant.columns if 'allignment' in col]].mean().mean()
    })
    
    proc_participant = proc_df[proc_df['prolific_id'] == pid]
    scale5_cols = ['process_explainability', 'process_transparency', 'sources_transparency', 'process_level_of_trust']
    participant_averages[pid].update({
        'proc_scale5_overall': proc_participant[scale5_cols].mean().mean(),
        'proc_explainability': proc_participant['process_explainability'].values[0],
        'proc_transparency': proc_participant['process_transparency'].values[0],
        'proc_sources_transparency': proc_participant['sources_transparency'].values[0],
        'proc_trust': proc_participant['process_level_of_trust'].values[0],
        'proc_credibility': proc_participant['process_credibility'].values[0]
    })
    
    participant_demo = participants_df[participants_df['prolific_id'] == pid]
    participant_averages[pid].update({
        'age_group': participant_demo['age_group'].values[0],
        'english_level': participant_demo['english_level'].values[0],
        'education_level': participant_demo['education_level'].values[0],
        'political_orientation': participant_demo['political_orientation'].values[0],
        'fc_years_of_experience': participant_demo['fc_years_of_experience'].values[0]
    })

participant_averages_df = pd.DataFrame.from_dict(participant_averages, orient='index')
participant_averages_df.index.name = 'prolific_id'
output_path = f"{output_dir}/processed/participants_averages_phase_1.csv"
participant_averages_df.to_csv(output_path)

In [241]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore') 

df = pd.read_csv(f"{output_dir}/processed/participants_averages_phase_1.csv")

evaluation_metrics = {
    'Claim Decomposition': ['cd_overall', 'cd_coverage', 'cd_relevance', 'cd_formulation', 'cd_explainability'],
    'Evidence Synthesis': ['es_overall', 'es_relevance', 'es_effectiveness', 'es_logical_connection'],
    'Final Conclusion': ['fc_scale5_overall', 'fc_coverage', 'fc_support', 'fc_choice', 'fc_alignment'],
    'Process Evaluation': ['proc_scale5_overall', 'proc_explainability', 'proc_transparency', 
                          'proc_sources_transparency', 'proc_trust']
}

def run_statistical_tests(group_column, value_columns, df):
    results = []
    
    for col in value_columns:
        valid_data = df[[group_column, col]].dropna()
        
        if len(valid_data) > 0:
            # Kruskal-Wallis H-test
            groups = [group for _, group in valid_data.groupby(group_column)[col]]
            h_stat, p_value = stats.kruskal(*groups)
            
            # Get mean values and counts for each group
            group_stats = valid_data.groupby(group_column)[col].agg(['mean', 'count'])
            
            results.append({
                'Metric': col,
                'H-statistic': h_stat,
                'p-value': p_value,
                'Group_Stats': group_stats
            })
    
    return pd.DataFrame(results)

def analyze_demographic(demographic_col, df, metrics_dict):
    print(f"\nAnalysis for {demographic_col}:")
    print("-" * 50)
    
    # Print distribution of demographic groups
    print("\nGroup Distribution:")
    group_dist = df[demographic_col].value_counts()
    for group, count in group_dist.items():
        print(f"{group}: {count} participants ({count/len(df)*100:.1f}%)")
    
    for metric_group, metrics in metrics_dict.items():
        results = run_statistical_tests(demographic_col, metrics, df)
        
        # Print significant results (p < 0.05)
        sig_results = results[results['p-value'] < 0.05]
        if not sig_results.empty:
            print(f"\n{metric_group} Metrics:")
            print("\nSignificant differences found:")
            for _, row in sig_results.iterrows():
                print(f"\n{row['Metric']}:")
                print(f"H-statistic: {row['H-statistic']:.2f}")
                print(f"p-value: {row['p-value']:.4f}")
                
                # Print group statistics
                stats_df = row['Group_Stats']
                print("\nGroup statistics:")
                for group in stats_df.index:
                    mean = stats_df.loc[group, 'mean']
                    count = stats_df.loc[group, 'count']
                    print(f"  {group}: {mean:.2f} (n={count})")
                
                # Only make comparisons for groups with sufficient sample size (e.g., n > 5)
                valid_groups = stats_df[stats_df['count'] > 5]
                if len(valid_groups) >= 2:
                    print("\nComparison between groups with sufficient sample size:")
                    print(f"Range of scores: {valid_groups['mean'].min():.2f} to {valid_groups['mean'].max():.2f}")
                    
        else:
            print(f"\n{metric_group} Metrics:")
            print("No significant differences found between groups")

# Run analysis for each demographic factor including fc_years_of_experience
demographics = ['age_group', 'education_level', 'english_level', 
                'political_orientation', 'fc_years_of_experience']
for demo in demographics:
    analyze_demographic(demo, df, evaluation_metrics)


Analysis for age_group:
--------------------------------------------------

Group Distribution:
26-35 years old: 38 participants (36.9%)
18-25 years old: 30 participants (29.1%)
36-50 years old: 27 participants (26.2%)
Over 50 years old: 8 participants (7.8%)

Claim Decomposition Metrics:

Significant differences found:

cd_formulation:
H-statistic: 8.07
p-value: 0.0446

Group statistics:
  18-25 years old: 3.75 (n=30)
  26-35 years old: 3.59 (n=38)
  36-50 years old: 3.98 (n=27)
  Over 50 years old: 3.92 (n=8)

Comparison between groups with sufficient sample size:
Range of scores: 3.59 to 3.98

Evidence Synthesis Metrics:
No significant differences found between groups

Final Conclusion Metrics:
No significant differences found between groups

Process Evaluation Metrics:
No significant differences found between groups

Analysis for education_level:
--------------------------------------------------

Group Distribution:
Bachelor's degree: 54 participants (52.4%)
Master's degree: 36 p

In [248]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats

metrics = {
    'Claim Decomposition': [
        'cd_overall', 'cd_coverage', 'cd_relevance', 'cd_formulation', 'cd_explainability'
    ],
    'Evidence Synthesis': [
        'es_overall', 'es_relevance', 'es_effectiveness', 'es_logical_connection'
    ],
    'Final Conclusion': [
        'fc_scale5_overall', 'fc_coverage', 'fc_support', 'fc_choice'  # Removed fc_alignment
    ],
    'Process Evaluation': [
        'proc_scale5_overall', 'proc_explainability', 'proc_transparency', 
        'proc_sources_transparency', 'proc_trust'
    ]
}

def analyze_correlations(df, output_dir):
    # Create list of all metrics
    all_metrics = []
    for phase_metrics in metrics.values():
        all_metrics.extend(phase_metrics)

    # Calculate correlations
    correlations = df[all_metrics].corr()
    p_values = pd.DataFrame(np.zeros_like(correlations), columns=correlations.columns, index=correlations.index)

    # Calculate p-values
    for i in range(len(correlations.columns)):
        for j in range(len(correlations.columns)):
            coef, p = stats.spearmanr(df[correlations.columns[i]], df[correlations.columns[j]], nan_policy='omit')
            p_values.iloc[i,j] = p

    # Create and save correlation heatmap
    plt.figure(figsize=(15, 12))
    sns.heatmap(correlations, annot=True, cmap='RdBu', center=0, fmt='.2f')
    plt.title('Correlation Heatmap of All Metrics (Excluding 4-point Scale Items)')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/processed/correlation_heatmap_no_4point.png")
    plt.close()

    # Save significant correlations to CSV
    significant_correlations = []
    for i in range(len(correlations.columns)):
        for j in range(i + 1, len(correlations.columns)):  # Only upper triangle to avoid duplicates
            metric1 = correlations.columns[i]
            metric2 = correlations.columns[j]
            corr = correlations.iloc[i,j]
            p_val = p_values.iloc[i,j]
            
            if p_val < 0.05:  # Significant correlation
                significant_correlations.append({
                    'Metric1': metric1,
                    'Metric2': metric2,
                    'Correlation': corr,
                    'P-value': p_val,
                    'Strength': 'Strong' if abs(corr) >= 0.5 else 'Moderate' if abs(corr) >= 0.3 else 'Weak',
                    'Direction': 'Positive' if corr > 0 else 'Negative'
                })

    # Save to CSV
    pd.DataFrame(significant_correlations).to_csv(
        f"{output_dir}/processed/significant_correlations_no_4point.csv", 
        index=False
    )

    return correlations, p_values

# Function to print significant correlations between phases
def print_phase_correlations(correlations, p_values, phase1, phase2):
    print(f"\nSignificant correlations between {phase1} and {phase2}:")
    print("-" * 50)
    
    metrics1 = metrics[phase1]
    metrics2 = metrics[phase2]
    
    found_significant = False
    
    for m1 in metrics1:
        for m2 in metrics2:
            if m1 != m2:  # Don't report correlation with itself
                corr = correlations.loc[m1, m2]
                p_val = p_values.loc[m1, m2]
                
                if p_val < 0.05:  # Significant correlation
                    found_significant = True
                    print(f"\n{m1} vs {m2}:")
                    print(f"Correlation coefficient: {corr:.2f}")
                    print(f"P-value: {p_val:.4f}")
                    print("Interpretation:", end=" ")
                    if corr > 0:
                        print(f"Positive correlation - as {m1} increases, {m2} tends to increase")
                    else:
                        print(f"Negative correlation - as {m1} increases, {m2} tends to decrease")
                    
                    # Add strength interpretation
                    abs_corr = abs(corr)
                    if abs_corr < 0.3:
                        print("Strength: Weak correlation")
                    elif abs_corr < 0.5:
                        print("Strength: Moderate correlation")
                    else:
                        print("Strength: Strong correlation")
    
    if not found_significant:
        print("No significant correlations found")

# Read the data
df = pd.read_csv(f"{output_dir}/processed/participants_averages_phase_1.csv")

# Run the analysis
correlations, p_values = analyze_correlations(df, output_dir)

# Print correlations between all pairs of phases
phases = list(metrics.keys())
for i in range(len(phases)):
    for j in range(i + 1, len(phases)):
        print_phase_correlations(correlations, p_values, phases[i], phases[j])


Significant correlations between Claim Decomposition and Evidence Synthesis:
--------------------------------------------------

cd_overall vs es_overall:
Correlation coefficient: 0.73
P-value: 0.0000
Interpretation: Positive correlation - as cd_overall increases, es_overall tends to increase
Strength: Strong correlation

cd_overall vs es_relevance:
Correlation coefficient: 0.71
P-value: 0.0000
Interpretation: Positive correlation - as cd_overall increases, es_relevance tends to increase
Strength: Strong correlation

cd_overall vs es_effectiveness:
Correlation coefficient: 0.65
P-value: 0.0000
Interpretation: Positive correlation - as cd_overall increases, es_effectiveness tends to increase
Strength: Strong correlation

cd_overall vs es_logical_connection:
Correlation coefficient: 0.70
P-value: 0.0000
Interpretation: Positive correlation - as cd_overall increases, es_logical_connection tends to increase
Strength: Strong correlation

cd_coverage vs es_overall:
Correlation coefficient: 

In [243]:
# performance by claim type
import scipy

claims_df = pd.read_csv(f"{output_dir}/claims.csv")
cd_df = pd.read_csv(f"{output_dir}/processed/cd_process_phase_1.csv")
es_df = pd.read_csv(f"{output_dir}/processed/es_process_phase_1.csv")
fc_df = pd.read_csv(f"{output_dir}/processed/fc_process_phase_1.csv")

def get_claim_stats(df, claim_number, metric_identifier):
    claim_cols = [col for col in df.columns if f'claim{claim_number}' in col and metric_identifier in col]
    if claim_cols:
        claim_data = df[claim_cols].values.ravel()
        return {
            'mean': np.mean(claim_data),
            'std': np.std(claim_data),
            'median': np.median(claim_data),
            'n': len(claim_data),
            'ci': scipy.stats.t.interval(confidence=0.95, 
                                 df=len(claim_data)-1,
                                 loc=np.mean(claim_data),
                                 scale=scipy.stats.sem(claim_data))
        }
    return None

metrics = {
    'CD': {
        'coverage': 'coverage',
        'relevance': 'relevance',
        'formulation': 'formulation',
        'explainability': 'explainability'
    },
    'ES': {
        'relevance': 'relevance',
        'effectiveness': 'effectiveness',
        'logical_connection': 'logical_connection'
    },
    'FC': {
        'coverage': 'coverage',
        'support': 'support',
        'alignment': 'allignment',
        'choice': 'choice'
    }
}

# Analyze each claim
claim_analysis = []
for idx, claim in claims_df.iterrows():
    claim_num = claim['number']
    claim_data = {
        'claim_number': claim_num,
        'text': claim['text'],
        'political_stance': claim['political_stance']
    }
    
    # Get CD metrics
    for metric_name, metric_id in metrics['CD'].items():
        stats = get_claim_stats(cd_df, claim_num, metric_id)
        if stats:
            claim_data[f'cd_{metric_name}_mean'] = stats['mean']
            claim_data[f'cd_{metric_name}_ci_lower'] = stats['ci'][0]
            claim_data[f'cd_{metric_name}_ci_upper'] = stats['ci'][1]
    
    # Get ES metrics
    for metric_name, metric_id in metrics['ES'].items():
        stats = get_claim_stats(es_df, claim_num, metric_id)
        if stats:
            claim_data[f'es_{metric_name}_mean'] = stats['mean']
            claim_data[f'es_{metric_name}_ci_lower'] = stats['ci'][0]
            claim_data[f'es_{metric_name}_ci_upper'] = stats['ci'][1]
    
    # Get FC metrics
    for metric_name, metric_id in metrics['FC'].items():
        stats = get_claim_stats(fc_df, claim_num, metric_id)
        if stats:
            claim_data[f'fc_{metric_name}_mean'] = stats['mean']
            claim_data[f'fc_{metric_name}_ci_lower'] = stats['ci'][0]
            claim_data[f'fc_{metric_name}_ci_upper'] = stats['ci'][1]
    
    claim_analysis.append(claim_data)

# Convert to DataFrame
claims_analysis_df = pd.DataFrame(claim_analysis)

# Save detailed analysis
claims_analysis_df.to_csv(f"{output_dir}/processed/claims_analysis.csv", index=False)

# Analyze by political stance
def print_stance_analysis(df, metric_prefix, metric_name):
    print(f"\n{metric_prefix.upper()} - {metric_name}")
    print("-" * 50)
    
    for stance in df['political_stance'].unique():
        stance_data = df[df['political_stance'] == stance]
        col_name = f"{metric_prefix}_{metric_name}_mean"
        mean = stance_data[col_name].mean()
        ci = scipy.stats.t.interval(confidence=0.95,
                            df=len(stance_data)-1,
                            loc=mean,
                            scale=scipy.stats.sem(stance_data[col_name]))
        print(f"{stance}:")
        print(f"Mean: {mean:.2f} (95% CI: [{ci[0]:.2f}, {ci[1]:.2f}])")
        print(f"N = {len(stance_data)}")

print("\nAnalysis by Political Stance:")
for phase, phase_metrics in metrics.items():
    for metric_name in phase_metrics.keys():
        print_stance_analysis(claims_analysis_df, phase.lower(), metric_name)

print("\nOverall Claim Performance:")
print("-" * 50)
for idx, claim in claims_analysis_df.iterrows():
    print(f"\nClaim {claim['claim_number']} ({claim['political_stance']}):")
    print(f"Text: {claim['text']}")
    
    # Print averages for each phase
    for phase in ['cd', 'es', 'fc']:
        phase_cols = [col for col in claims_analysis_df.columns if col.startswith(f"{phase}_") and col.endswith("_mean")]
        if phase_cols:
            phase_mean = claim[phase_cols].mean()
            print(f"{phase.upper()} Average: {phase_mean:.2f}")


Analysis by Political Stance:

CD - coverage
--------------------------------------------------
pro_republican:
Mean: 3.87 (95% CI: [2.64, 5.11])
N = 2
pro_democrat:
Mean: 3.88 (95% CI: [nan, nan])
N = 1
neutral:
Mean: 3.98 (95% CI: [3.67, 4.28])
N = 2

CD - relevance
--------------------------------------------------
pro_republican:
Mean: 3.83 (95% CI: [2.71, 4.94])
N = 2
pro_democrat:
Mean: 3.83 (95% CI: [nan, nan])
N = 1
neutral:
Mean: 3.90 (95% CI: [3.10, 4.70])
N = 2

CD - formulation
--------------------------------------------------
pro_republican:
Mean: 3.71 (95% CI: [1.80, 5.63])
N = 2
pro_democrat:
Mean: 3.78 (95% CI: [nan, nan])
N = 1
neutral:
Mean: 3.81 (95% CI: [3.26, 4.37])
N = 2

CD - explainability
--------------------------------------------------
pro_republican:
Mean: 3.87 (95% CI: [2.08, 5.66])
N = 2
pro_democrat:
Mean: 3.81 (95% CI: [nan, nan])
N = 1
neutral:
Mean: 3.92 (95% CI: [3.55, 4.29])
N = 2

ES - relevance
--------------------------------------------------
