In [2]:
import pandas as pd
import numpy as np
import json
from utils.helpers import get_git_root
from tqdm import tqdm
from io import StringIO

In [3]:
def read_large_json(file_path, max_lines=None):
    total_lines = 0
    with open(file_path, 'r') as f:
        for _ in f:
            total_lines += 1
    print(f"Total lines in file: {total_lines}")

    def read_json_in_chunks(file_path, chunksize=1000):
        with open(file_path, 'r') as f:
            chunks = []
            for i, line in enumerate(f):
                if max_lines and i >= max_lines:
                    break
                try:
                    chunks.append(pd.read_json(StringIO(line), lines=True))
                    print(f"Successfully processed line {i+1}")
                except Exception as e:
                    print(f"Error processing line {i+1}: {e}")
                if (i + 1) % chunksize == 0:
                    yield pd.concat(chunks)
                    chunks = []
            if chunks:
                yield pd.concat(chunks)
    
    chunks = []
    print("Reading file in chunks...")
    for chunk in tqdm(read_json_in_chunks(file_path, chunksize=1000)):
        chunks.append(chunk)
        print(f"Chunk processed, current total rows: {sum(len(c) for c in chunks)}")
    print("Concatenating chunks...")
    merged = pd.concat(chunks, ignore_index=True)
    print(f"Final number of rows: {len(merged)}")
    return merged

In [4]:
filename = 'politifact_all_labels.jsonl'
path = get_git_root() / 'llm_selection' / 'data'
max_lines = None
df = read_large_json(path / filename, max_lines=None)

Total lines in file: 117
Reading file in chunks...


0it [00:00, ?it/s]

Successfully processed line 1
Successfully processed line 2
Successfully processed line 3
Successfully processed line 4
Successfully processed line 5
Successfully processed line 6
Successfully processed line 7
Successfully processed line 8
Successfully processed line 9
Successfully processed line 10
Successfully processed line 11
Successfully processed line 12
Successfully processed line 13
Successfully processed line 14
Successfully processed line 15
Successfully processed line 16
Successfully processed line 17
Successfully processed line 18
Successfully processed line 19
Successfully processed line 20
Successfully processed line 21
Successfully processed line 22
Successfully processed line 23
Successfully processed line 24
Successfully processed line 25
Successfully processed line 26
Successfully processed line 27
Successfully processed line 28
Successfully processed line 29
Successfully processed line 30
Successfully processed line 31
Successfully processed line 32
Successfully proc

1it [00:34, 34.28s/it]

Successfully processed line 117
Chunk processed, current total rows: 117
Concatenating chunks...
Final number of rows: 117





In [4]:
# Overall statistics
print(f"Number of rows: {len(df)}")
print(f"Number of columns: {len(df.columns)}")
print("\nColumn Data Types:")
print(df.dtypes)
print("\nNull Values Count:")
print(df.isnull().sum())
print(df.describe(include='all'))
print("\nClass Distribution:")
print(df['class'].value_counts())
print("\nCategory Distribution:")
print(df['category'].value_counts())
print("\nSubcategory Distribution:")
print(df['subcategory'].value_counts())

Number of rows: 117
Number of columns: 7

Column Data Types:
class                     object
category                  object
subcategory               object
count                      int64
percentage_occurrence    float64
claims                    object
artifacts                 object
dtype: object

Null Values Count:
class                    0
category                 0
subcategory              0
count                    0
percentage_occurrence    0
claims                   0
artifacts                0
dtype: int64
              class        category subcategory       count  \
count           117             117         117  117.000000   
unique            5               4          11         NaN   
top     mostly-true  True Negatives    Politics         NaN   
freq             25              52          19         NaN   
mean            NaN             NaN         NaN    4.358974   
std             NaN             NaN         NaN    5.945821   
min             NaN            

In [5]:
print("\nUnique combinations in dataset:")
print(f"Number of unique classes: {df['class'].nunique()}")
print(f"Number of unique categories: {df['category'].nunique()}")
print(f"Number of unique subcategories: {df['subcategory'].nunique()}")


Unique combinations in dataset:
Number of unique classes: 5
Number of unique categories: 4
Number of unique subcategories: 11


In [6]:
well_classified = df[(df['category'] == 'True Negatives') | (df['category'] == 'True Positives')]
misclassified = df[(df['category'] == 'False Negatives') | (df['category'] == 'False Positives')]

w_class = well_classified.groupby('class')['count'].sum()
m_class = misclassified.groupby('class')['count'].sum()

print("\nWell-classified claims per class:")
print(w_class)

print("\nMisclassified claims per class:")
print(m_class)

print("\nPercentages of well-classified claims by class:")
print((w_class / w_class.sum() * 100).round(2))

print("\nPercentages of misclassified claims by class:")
print((m_class / m_class.sum() * 100).round(2))

num_claims_w, num_claims_m, total_claims = 0, 0, 0

for r in w_class:
    num_claims_w += r
print(f'\nTotal num of well classified claims: {num_claims_w}')
total_claims += num_claims_w

for r in m_class:
    num_claims_m += r
print(f'Total num of misclassified claims: {num_claims_m}')
total_claims += num_claims_m

print(f'Total num of claims: {total_claims}')
print("\nPercentage of well-classified claims:")
print(((num_claims_w / total_claims) * 100))

print("\nPercentage of misclassified claims:")
print(((num_claims_m / total_claims) * 100))


Well-classified claims per class:
class
barely-true    92
false          92
half-true      93
mostly-true    90
true           93
Name: count, dtype: int64

Misclassified claims per class:
class
barely-true    10
false          10
half-true       9
mostly-true    12
true            9
Name: count, dtype: int64

Percentages of well-classified claims by class:
class
barely-true    20.00
false          20.00
half-true      20.22
mostly-true    19.57
true           20.22
Name: count, dtype: float64

Percentages of misclassified claims by class:
class
barely-true    20.0
false          20.0
half-true      18.0
mostly-true    24.0
true           18.0
Name: count, dtype: float64

Total num of well classified claims: 460
Total num of misclassified claims: 50
Total num of claims: 510

Percentage of well-classified claims:
90.19607843137256

Percentage of misclassified claims:
9.803921568627452


In [7]:
w_sub = well_classified.groupby('subcategory')['count'].sum().sort_values(ascending=False)
m_sub = misclassified.groupby('subcategory')['count'].sum().sort_values(ascending=False)

print("\nWell-classified claims per subcategory:")
print(w_sub)

print("\nMisclassified claims per subcategory:")
print(m_sub)

print("\nPercentages of well-classified claims by subcategory:")
print((w_sub / w_sub.sum() * 100).round(2))

print("\nPercentages of misclassified claims by subcategory:")
print((m_sub / m_sub.sum() * 100).round(2))


Well-classified claims per subcategory:
subcategory
Other                  145
Politics               124
Immigration             54
Politicians             38
Guns                    25
Quotes                  19
Abortion                18
Ballot Box              18
Conspiracy Theories      8
Not Verifiable           8
Imagery                  3
Name: count, dtype: int64

Misclassified claims per subcategory:
subcategory
Politics               16
Other                  10
Immigration             6
Quotes                  6
Abortion                2
Ballot Box              2
Conspiracy Theories     2
Imagery                 2
Not Verifiable          2
Politicians             2
Name: count, dtype: int64

Percentages of well-classified claims by subcategory:
subcategory
Other                  31.52
Politics               26.96
Immigration            11.74
Politicians             8.26
Guns                    5.43
Quotes                  4.13
Abortion                3.91
Ballot Box       

In [8]:
w_class_sub = well_classified.groupby(['class', 'subcategory'])['count'].sum().sort_values(ascending=False)
m_class_sub = misclassified.groupby(['class', 'subcategory'])['count'].sum().sort_values(ascending=False)

print("\nWell-classified claims per class per subcategory:")
print(w_class_sub)

print("\nMisclassified claims per class per subcategory:")
print(m_class_sub)

print("\nPercentages of well-classified claims by class and subcategory:")
print((w_class_sub / w_class_sub.sum() * 100).round(2))

print("\nPercentages of misclassified claims by class and subcategory:")
print((m_class_sub / m_class_sub.sum() * 100).round(2))


Well-classified claims per class per subcategory:
class        subcategory        
half-true    Other                  31
false        Other                  30
mostly-true  Other                  29
true         Other                  28
barely-true  Other                  27
half-true    Politics               26
false        Politics               26
true         Politics               25
barely-true  Politics               24
mostly-true  Politics               23
true         Immigration            12
half-true    Immigration            11
false        Immigration            11
barely-true  Immigration            10
mostly-true  Immigration            10
half-true    Politicians             8
barely-true  Politicians             8
true         Politicians             8
mostly-true  Politicians             7
false        Politicians             7
barely-true  Guns                    5
mostly-true  Quotes                  5
true         Guns                    5
barely-true  Quotes

In [9]:
def analyze_row_data(row):
   """Analyze a single row's claims and artifacts with all their components"""
   stats = {}
   
   # Analyze claims
   claims_list = row['claims']
   stats['num_claims'] = len(claims_list)
   stats['claim_lengths'] = [len(str(c['claim'])) for c in claims_list]
   stats['claim_dates'] = [c['date'] for c in claims_list]
   
   # Analyze artifacts
   artifacts_list = row['artifacts']
   
   # For each claim in this category
   questions_per_claim = []
   justifications_per_claim = []
   explanations_per_claim = []
   question_lengths = []
   justification_lengths = []
   explanation_lengths = []
   urls_per_question = []
   evidence_dates = []
   page_content_lengths = []
   page_summary_lengths = []
   page_contents_per_question = []
   page_summaries_per_question = []
   
   
   for artifacts in artifacts_list:  # For each claim's artifacts
       # Count components per claim
       num_questions = len(artifacts)
       questions_per_claim.append(num_questions)
       
       num_justifications = len([a for a in artifacts if a.get('decomposed_justification')])
       justifications_per_claim.append(num_justifications)
       
       num_explanations = len([a for a in artifacts if a.get('decomposed_question_explanation')])
       explanations_per_claim.append(num_explanations)
       
       # Analyze each question's components
       for question_artifact in artifacts:
           # Component lengths
           question_lengths.append(len(str(question_artifact.get('decomposed_question', ''))))
           justification_lengths.append(len(str(question_artifact.get('decomposed_justification', ''))))
           explanation_lengths.append(len(str(question_artifact.get('decomposed_question_explanation', ''))))
           
           # Evidence analysis
           evidence = question_artifact.get('evidence', [])
           urls_per_question.append(len(evidence))
           
           # Count contents and summaries for this question
           num_contents = len([e for e in evidence if e.get('page_content')])
           num_summaries = len([e for e in evidence if e.get('page_summary')])
           page_contents_per_question.append(num_contents)
           page_summaries_per_question.append(num_summaries)
           
           # Analyze page content and summaries
           for e in evidence:
               page_content_lengths.append(len(str(e.get('page_content', ''))))
               page_summary_lengths.append(len(str(e.get('page_summary', ''))))
               if e.get('page_timestamp'):
                   evidence_dates.append(e['page_timestamp'])
   
   stats['questions_per_claim'] = questions_per_claim
   stats['justifications_per_claim'] = justifications_per_claim
   stats['explanations_per_claim'] = explanations_per_claim
   stats['question_lengths'] = question_lengths
   stats['justification_lengths'] = justification_lengths
   stats['explanation_lengths'] = explanation_lengths
   stats['urls_per_question'] = urls_per_question
   stats['evidence_dates'] = evidence_dates
   stats['page_content_lengths'] = page_content_lengths
   stats['page_summary_lengths'] = page_summary_lengths
   stats['page_contents_per_question'] = page_contents_per_question
   stats['page_summaries_per_question'] = page_summaries_per_question
   
   return stats

def aggregate_stats(stats_list):
   aggregated = {
       'avg_claims_per_combination': np.mean([s['num_claims'] for s in stats_list]),
       'avg_claim_length': np.mean([l for s in stats_list for l in s['claim_lengths']]),
       'avg_questions_per_claim': np.mean([q for s in stats_list for q in s['questions_per_claim']]),
       'avg_justifications_per_claim': np.mean([j for s in stats_list for j in s['justifications_per_claim']]),
       'avg_explanations_per_claim': np.mean([e for s in stats_list for e in s['explanations_per_claim']]),
       'avg_question_length': np.mean([l for s in stats_list for l in s['question_lengths']]),
       'avg_justification_length': np.mean([l for s in stats_list for l in s['justification_lengths']]),
       'avg_explanation_length': np.mean([l for s in stats_list for l in s['explanation_lengths']]),
       'avg_urls_per_question': np.mean([u for s in stats_list for u in s['urls_per_question']]),
       'avg_page_content_length': np.mean([l for s in stats_list for l in s['page_content_lengths']]),
       'avg_page_summary_length': np.mean([l for s in stats_list for l in s['page_summary_lengths']]),
       'avg_page_contents_per_question': np.mean([n for s in stats_list for n in s['page_contents_per_question']]),
       'avg_page_summaries_per_question': np.mean([n for s in stats_list for n in s['page_summaries_per_question']]),
       'total_claims': sum(s['num_claims'] for s in stats_list),
   }
   return aggregated

def analyze_distributions(stats_list):
   def get_distribution_stats(values):
       return {
           'min': np.min(values),
           'max': np.max(values),
           'median': np.median(values),
           'std': np.std(values),
           'count': len(values)
       }
   
   return {
       'claim_lengths': get_distribution_stats([l for s in stats_list for l in s['claim_lengths']]),
       'questions_per_claim': get_distribution_stats([q for s in stats_list for q in s['questions_per_claim']]),
       'justifications_per_claim': get_distribution_stats([j for s in stats_list for j in s['justifications_per_claim']]),
       'explanations_per_claim': get_distribution_stats([e for s in stats_list for e in s['explanations_per_claim']]),
       'question_lengths': get_distribution_stats([l for s in stats_list for l in s['question_lengths']]),
       'justification_lengths': get_distribution_stats([l for s in stats_list for l in s['justification_lengths']]),
       'explanation_lengths': get_distribution_stats([l for s in stats_list for l in s['explanation_lengths']]),
       'urls_per_question': get_distribution_stats([u for s in stats_list for u in s['urls_per_question']]),
       'page_content_lengths': get_distribution_stats([l for s in stats_list for l in s['page_content_lengths']]),
       'page_summary_lengths': get_distribution_stats([l for s in stats_list for l in s['page_summary_lengths']]),
       'page_contents_per_question': get_distribution_stats([n for s in stats_list for n in s['page_contents_per_question']]),
        'page_summaries_per_question': get_distribution_stats([n for s in stats_list for n in s['page_summaries_per_question']]),
   }

# Apply analysis
print("Analyzing well-classified data...")
well_stats = well_classified.apply(analyze_row_data, axis=1).tolist()
print("Analyzing misclassified data...")
mis_stats = misclassified.apply(analyze_row_data, axis=1).tolist()

# Print results
print("\nWell-classified patterns:")
print(aggregate_stats(well_stats))
print("\nWell-classified distributions:")
print(analyze_distributions(well_stats))

print("\nMisclassified patterns:")
print(aggregate_stats(mis_stats))
print("\nMisclassified distributions:")
print(analyze_distributions(mis_stats))

Analyzing well-classified data...
Analyzing misclassified data...

Well-classified patterns:
{'avg_claims_per_combination': 5.609756097560975, 'avg_claim_length': 155.75, 'avg_questions_per_claim': 10.0, 'avg_justifications_per_claim': 10.0, 'avg_explanations_per_claim': 10.0, 'avg_question_length': 100.65730994152047, 'avg_justification_length': 83.78050682261208, 'avg_explanation_length': 832.2463937621833, 'avg_urls_per_question': 8.100779727095517, 'avg_page_content_length': 158635.08205597132, 'avg_page_summary_length': 2031.3524797266405, 'avg_page_contents_per_question': 8.100779727095517, 'avg_page_summaries_per_question': 8.053411306042886, 'total_claims': 460}

Well-classified distributions:
{'claim_lengths': {'min': 66, 'max': 335, 'median': 147.0, 'std': 46.58373129901823, 'count': 460}, 'questions_per_claim': {'min': 10, 'max': 10, 'median': 10.0, 'std': 0.0, 'count': 513}, 'justifications_per_claim': {'min': 10, 'max': 10, 'median': 10.0, 'std': 0.0, 'count': 513}, 'expla

In [10]:
def analyze_dates(row):
    """Analyze claim dates and evidence dates for a single row"""
    date_stats = {
        'claim_dates': [],
        'evidence_dates': [],
        'claim_evidence_pairs': []  # To analyze claim-evidence relationships
    }
    
    # Get claim dates
    claims_list = row['claims']
    for claim in claims_list:
        if claim['date'] != 'placeholder':
            try:
                claim_date = pd.to_datetime(claim['date'])
                date_stats['claim_dates'].append(claim_date)
            except Exception:
                continue
    
    # Get evidence dates and pair them with claims
    artifacts_list = row['artifacts']
    for claim_date, artifacts in zip(date_stats['claim_dates'], artifacts_list):
        claim_evidence_dates = []
        for question_artifact in artifacts:
            evidence = question_artifact.get('evidence', [])
            for e in evidence:
                timestamp = e.get('page_timestamp')
                if timestamp and timestamp != 'placeholder':
                    try:
                        evidence_date = pd.to_datetime(timestamp)
                        date_stats['evidence_dates'].append(evidence_date)
                        claim_evidence_dates.append(evidence_date)
                    except Exception:
                        continue
        
        if claim_evidence_dates:
            date_stats['claim_evidence_pairs'].append({
                'claim_date': claim_date,
                'evidence_dates': claim_evidence_dates
            })
    
    return date_stats

def analyze_temporal_patterns(date_stats_list):
    """Analyze temporal patterns in the data"""
    
    # 1. Claims per year
    all_claim_dates = [d for stats in date_stats_list for d in stats['claim_dates']]
    claims_per_year = pd.Series(all_claim_dates).dt.year.value_counts().sort_index()
    
    # 2. Evidence temporal analysis
    time_differences = []
    evidence_age_distribution = []
    claims_with_future_evidence = 0
    claims_with_only_past_evidence = 0
    
    for stats in date_stats_list:
        for pair in stats['claim_evidence_pairs']:
            claim_date = pair['claim_date']
            
            # Calculate time differences for each piece of evidence
            for evidence_date in pair['evidence_dates']:
                time_diff = (claim_date - evidence_date).days
                time_differences.append(time_diff)
                
                # Track evidence age distribution
                evidence_age_distribution.append({
                    'days_before_claim': time_diff,
                    'evidence_year': evidence_date.year,
                    'claim_year': claim_date.year
                })
            
            # Check if claim has any future evidence
            has_future_evidence = any(e > claim_date for e in pair['evidence_dates'])
            if has_future_evidence:
                claims_with_future_evidence += 1
            else:
                claims_with_only_past_evidence += 1
    
    temporal_analysis = {
        'claims_distribution': {
            'claims_per_year': claims_per_year.to_dict(),
            'total_claims': len(all_claim_dates),
            'year_range': f"{min(claims_per_year.index)} - {max(claims_per_year.index)}"
        },
        'evidence_timing': {
            'avg_days_before_claim': np.mean(time_differences) if time_differences else None,
            'median_days_before_claim': np.median(time_differences) if time_differences else None,
            'max_days_before_claim': max(time_differences) if time_differences else None,
            'min_days_before_claim': min(time_differences) if time_differences else None,
            'claims_with_future_evidence': claims_with_future_evidence,
            'claims_with_only_past_evidence': claims_with_only_past_evidence
        },
        'evidence_age_stats': {
            'evidence_within_1_week': len([d for d in time_differences if 0 <= d <= 7]),
            'evidence_within_1_month': len([d for d in time_differences if 0 <= d <= 30]),
            'evidence_within_1_year': len([d for d in time_differences if 0 <= d <= 365]),
            'evidence_older_than_1_year': len([d for d in time_differences if d > 365])
        }
    }
    
    return temporal_analysis

# Apply analysis
print("Analyzing temporal patterns for well-classified claims...")
well_date_stats = well_classified.apply(analyze_dates, axis=1).tolist()
well_temporal_analysis = analyze_temporal_patterns(well_date_stats)

print("Analyzing temporal patterns for misclassified claims...")
mis_date_stats = misclassified.apply(analyze_dates, axis=1).tolist()
mis_temporal_analysis = analyze_temporal_patterns(mis_date_stats)

# Print results
print("\nWell-classified temporal patterns:")
print(json.dumps(well_temporal_analysis, indent=2, default=str))
print("\nMisclassified temporal patterns:")
print(json.dumps(mis_temporal_analysis, indent=2, default=str))

Analyzing temporal patterns for well-classified claims...
Analyzing temporal patterns for misclassified claims...

Well-classified temporal patterns:
{
  "claims_distribution": {
    "claims_per_year": {
      "2013": 5,
      "2014": 10,
      "2015": 16,
      "2016": 96,
      "2017": 64,
      "2018": 51,
      "2019": 43,
      "2020": 115,
      "2021": 55
    },
    "total_claims": 455,
    "year_range": "2013 - 2021"
  },
  "evidence_timing": {
    "avg_days_before_claim": 527.57419245825,
    "median_days_before_claim": 396.0,
    "max_days_before_claim": 8073,
    "min_days_before_claim": -2645,
    "claims_with_future_evidence": 174,
    "claims_with_only_past_evidence": 281
  },
  "evidence_age_stats": {
    "evidence_within_1_week": 860,
    "evidence_within_1_month": 1786,
    "evidence_within_1_year": 8311,
    "evidence_older_than_1_year": 18279
  }
}

Misclassified temporal patterns:
{
  "claims_distribution": {
    "claims_per_year": {
      "2015": 4,
      "2016": 1

In [18]:
# Get false positives and negatives for false claims
false_fp = df[(df['class'] == 'false') & (df['category'] == 'False Positives')]
false_fn = df[(df['class'] == 'false') & (df['category'] == 'False Negatives')]

def analyze_misclassified_claims(df_subset, category_name):
   """Analyze claims from a subset of misclassified data"""
   print(f"\n{'='*20} {category_name} {'='*20}")
   
   claim_count = 0
   for _, row in df_subset.iterrows():
       for claim_dict, artifacts_list in zip(row['claims'], row['artifacts']):
           claim_count += 1
           print(f"\nClaim #{claim_count}:")
           print(f"Text: {claim_dict['claim']}")
           print(f"Date: {claim_dict['date']}")
           
           print("\nQuestions, Explanations and Sources:")
           for i, artifact in enumerate(artifacts_list, 1):
               print(f"\n{i}. Question: {artifact.get('decomposed_question', 'No question available')}")
               print(f"   Explanation: {artifact.get('decomposed_question_explanation', 'No explanation available')}")
               
               # Print evidence sources
               evidence = artifact.get('evidence', [])
               if evidence:
                   print(f"   Sources ({len(evidence)} URLs):")
                   for j, e in enumerate(evidence, 1):
                       print(f"      {j}. {e.get('page_url', 'No URL available')}")
                       print(f"         Date: {e.get('page_timestamp', 'No date available')}")
               else:
                   print("   Sources: No evidence sources available")
           
           print("\n" + "-"*50)

# Analyze False Positives
print(f"\nAnalyzing False Positives (claims classified as false but weren't)...")
analyze_misclassified_claims(false_fp, "FALSE POSITIVES")

# Analyze False Negatives
print(f"\nAnalyzing False Negatives (claims that were false but weren't classified as such)...")
analyze_misclassified_claims(false_fn, "FALSE NEGATIVES")


Analyzing False Positives (claims classified as false but weren't)...


Claim #1:
Text: “(Retail) milk has gone up 7.5% since this time last year. The price farmers are paid has dropped 23%.”
Date: May 2, 2020

Questions, Explanations and Sources:

1. Question: Has the retail price of milk increased by 7.5% over the past year?
   Explanation: The price of milk was first increased by 10% and then decreased by 20%, resulting in a net percentage change of 12%. There is no information provided about a 7.5% increase in the retail price of milk over the past year. Additionally, there is no mention of the retail price of milk or any relevant details regarding its increase. Various economic indicators and retail sales growth are discussed, but milk prices are not specifically addressed.
   Sources (10 URLs):
      1. https://www.careerride.com/mchoice/percentage-quantitative-aptitude-mcq-questions-28545.aspx
         Date: 2016-01-01
      2. https://www.uktaxcalculators.co.uk/tax-guides/pers

In [19]:
def analyze_future_sources(df_subset, category_name):
   """Analyze claims that have sources from their future"""
   print(f"\n{'='*20} {category_name} {'='*20}")
   
   claim_count = 0
   for _, row in df_subset.iterrows():
       for claim_dict, artifacts_list in zip(row['claims'], row['artifacts']):
           try:
               claim_date = pd.to_datetime(claim_dict['date'])
               if claim_dict['date'] == 'placeholder':
                   continue
                   
               # Check if any evidence is from the future
               has_future_sources = False
               future_sources = []
               
               for artifact in artifacts_list:
                   evidence = artifact.get('evidence', [])
                   for e in evidence:
                       if e.get('page_timestamp') != 'placeholder':
                           try:
                               source_date = pd.to_datetime(e.get('page_timestamp'))
                               if source_date > claim_date:
                                   has_future_sources = True
                                   future_sources.append({
                                       'question': artifact.get('decomposed_question'),
                                       'url': e.get('page_url'),
                                       'source_date': source_date,
                                       'days_in_future': (source_date - claim_date).days
                                   })
                           except:
                               continue
               
               # If claim has future sources, print the information
               if has_future_sources:
                   claim_count += 1
                   print(f"\nClaim #{claim_count}:")
                   print(f"Text: {claim_dict['claim']}")
                   print(f"Date: {claim_date}")
                   
                   print("\nFuture Sources:")
                   for source in future_sources:
                       print(f"\nQuestion: {source['question']}")
                       print(f"URL: {source['url']}")
                       print(f"Source Date: {source['source_date']}")
                       print(f"Days in future: {source['days_in_future']}")
                   
                   print("\n" + "-"*50)
                   
           except Exception as e:
               continue

# Analyze both False Positives and False Negatives
print("\nAnalyzing claims with future sources...")
analyze_future_sources(false_fp, "FALSE POSITIVES WITH FUTURE SOURCES")
analyze_future_sources(false_fn, "FALSE NEGATIVES WITH FUTURE SOURCES")


Analyzing claims with future sources...


Claim #1:
Text: Says Donald Trump "says he has foreign policy experience because he ran the Miss Universe pageant in Russia."
Date: 2016-06-02 00:00:00

Future Sources:

Question: Did Donald Trump publicly state that he has foreign policy experience because he ran the Miss Universe pageant in Russia?
URL: https://www.pulse.ng/articles/world/hilary-clinton-democratic-presidential-candidate-says-trumps-foreign-policy-threat-to-2024080509223630198
Source Date: 2016-06-03 00:00:00
Days in future: 1

Question: Did Donald Trump explicitly deny claiming foreign policy experience due to hosting the Miss Universe pageant in Russia?
URL: https://www.pulse.ng/articles/world/hilary-clinton-democratic-presidential-candidate-says-trumps-foreign-policy-threat-to-2024080509223630198
Source Date: 2016-06-03 00:00:00
Days in future: 1

Question: Was the statement about foreign policy experience and the Miss Universe pageant intended as satire or a joke?
URL: ht

In [20]:
def analyze_future_sources_all_claims(df):
   """Analyze all claims that have sources from their future"""
   print(f"\n{'='*20} ALL CLAIMS WITH FUTURE SOURCES {'='*20}")
   
   claim_count = 0
   for _, row in df.iterrows():
       for claim_dict, artifacts_list in zip(row['claims'], row['artifacts']):
           try:
               # Skip claims with placeholder dates
               if claim_dict['date'] == 'placeholder':
                   continue
               claim_date = pd.to_datetime(claim_dict['date'])
               
               # Check if any evidence is from the future
               has_future_sources = False
               future_sources = []
               
               for artifact in artifacts_list:
                   evidence = artifact.get('evidence', [])
                   for e in evidence:
                       if e.get('page_timestamp') != 'placeholder':
                           try:
                               source_date = pd.to_datetime(e.get('page_timestamp'))
                               if source_date > claim_date:
                                   has_future_sources = True
                                   future_sources.append({
                                       'question': artifact.get('decomposed_question'),
                                       'url': e.get('page_url'),
                                       'source_date': source_date,
                                       'days_in_future': (source_date - claim_date).days
                                   })
                           except:
                               continue
               
               # If claim has future sources, print the information
               if has_future_sources:
                   claim_count += 1
                   print(f"\nClaim #{claim_count}:")
                   print(f"Class: {row['class']}")
                   print(f"Category: {row['category']}")
                   print(f"Text: {claim_dict['claim']}")
                   print(f"Date: {claim_date}")
                   
                   print("\nFuture Sources:")
                   for source in future_sources:
                       print(f"\nQuestion: {source['question']}")
                       print(f"URL: {source['url']}")
                       print(f"Source Date: {source['source_date']}")
                       print(f"Days in future: {source['days_in_future']}")
                   
                   print("\n" + "-"*50)
                   
           except Exception as e:
               continue

   print(f"\nTotal claims with future sources: {claim_count}")

# Analyze all claims
print("\nAnalyzing all claims with future sources...")
analyze_future_sources_all_claims(df)


Analyzing all claims with future sources...


Claim #1:
Class: barely-true
Category: False Positives
Text: “On February 7, the WHO warned about the limited stock of PPE. That same day, the Trump administration announced it was sending 18 tons of masks, gowns and respirators to China.”
Date: 2020-03-31 00:00:00

Future Sources:

Question: Did the Trump administration announce the shipment of 18 tons of PPE to China on February 7, 2020?
URL: https://www.rasmussenreports.com/public_content/political_commentary/commentary_by_michelle_malkin/questions_about_state_department_s_china_first_shipment
Source Date: 2020-04-01 00:00:00
Days in future: 1

Question: Did the Trump administration announce the shipment of 18 tons of PPE to China on February 7, 2020?
URL: https://www.cnn.com/2020/04/01/politics/pence-task-force-ppe-freeze/index.html
Source Date: 2020-04-01 00:00:00
Days in future: 1

Question: Did any official sources report the shipment of PPE from the U.S. to China on or around Febru

In [22]:
def analyze_future_sources_correlation():
    """Analyze correlation between future sources and misclassification"""
    
    # Initialize counters
    stats = {
        'well_classified': {'with_future': 0, 'total': 0},
        'misclassified': {'with_future': 0, 'total': 0}
    }
    
    # Process all claims
    for _, row in df.iterrows():
        is_misclassified = row['category'] in ['False Positives', 'False Negatives']
        category = 'misclassified' if is_misclassified else 'well_classified'
        
        for claim_dict, artifacts_list in zip(row['claims'], row['artifacts']):
            try:
                if claim_dict['date'] == 'placeholder':
                    continue
                claim_date = pd.to_datetime(claim_dict['date'])
                
                # Count total claims
                stats[category]['total'] += 1
                
                # Check if claim has future sources
                has_future_sources = False
                for artifact in artifacts_list:
                    evidence = artifact.get('evidence', [])
                    for e in evidence:
                        if e.get('page_timestamp') != 'placeholder':
                            try:
                                source_date = pd.to_datetime(e.get('page_timestamp'))
                                if source_date > claim_date:
                                    has_future_sources = True
                                    break
                            except:
                                continue
                    if has_future_sources:
                        break
                
                if has_future_sources:
                    stats[category]['with_future'] += 1
                    
            except Exception as e:
                continue
    
    # Calculate percentages and print results
    print("\nCorrelation Analysis: Future Sources vs Classification")
    print("=" * 50)
    
    for category in ['well_classified', 'misclassified']:
        total = stats[category]['total']
        with_future = stats[category]['with_future']
        percentage = (with_future / total * 100) if total > 0 else 0
        
        print(f"\n{category.replace('_', ' ').title()} Claims:")
        print(f"Total claims: {total}")
        print(f"Claims with future sources: {with_future}")
        print(f"Percentage with future sources: {percentage:.2f}%")
    
    # Calculate statistical significance
    contingency_table = [
        [stats['well_classified']['with_future'], 
         stats['well_classified']['total'] - stats['well_classified']['with_future']],
        [stats['misclassified']['with_future'], 
         stats['misclassified']['total'] - stats['misclassified']['with_future']]
    ]
    
    from scipy.stats import chi2_contingency
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    
    print("\nStatistical Analysis:")
    print(f"Chi-square statistic: {chi2:.2f}")
    print(f"p-value: {p_value:.4f}")
    print(f"Statistically significant correlation: {'Yes' if p_value < 0.05 else 'No'}")

# Run the analysis
analyze_future_sources_correlation()


Correlation Analysis: Future Sources vs Classification

Well Classified Claims:
Total claims: 455
Claims with future sources: 177
Percentage with future sources: 38.90%

Misclassified Claims:
Total claims: 50
Claims with future sources: 21
Percentage with future sources: 42.00%

Statistical Analysis:
Chi-square statistic: 0.07
p-value: 0.7845
Statistically significant correlation: No


In [24]:
def analyze_subcategory_correlation():
   """Analyze correlation between subcategories and misclassification"""
   
   # Initialize dictionaries to store counts
   stats = {
       'well_classified': {},  # {subcategory: count}
       'misclassified': {},   # {subcategory: count}
       'total_by_subcategory': {}  # {subcategory: total_count}
   }
   
   # Count claims by subcategory and classification status
   for _, row in df.iterrows():
       subcategory = row['subcategory']
       is_misclassified = row['category'] in ['False Positives', 'False Negatives']
       num_claims = len(row['claims'])
       
       # Update total counts
       stats['total_by_subcategory'][subcategory] = stats['total_by_subcategory'].get(subcategory, 0) + num_claims
       
       # Update counts by classification status
       if is_misclassified:
           stats['misclassified'][subcategory] = stats['misclassified'].get(subcategory, 0) + num_claims
       else:
           stats['well_classified'][subcategory] = stats['well_classified'].get(subcategory, 0) + num_claims
   
   # Calculate and print statistics
   print("\nSubcategory Correlation Analysis")
   print("=" * 50)
   
   # Sort subcategories by total number of claims
   sorted_subcategories = sorted(stats['total_by_subcategory'].items(), 
                               key=lambda x: x[1], 
                               reverse=True)
   
   print("\nDetailed Analysis by Subcategory:")
   print(f"{'Subcategory':<30} {'Total':<10} {'Misclassified':<15} {'% Misclassified':<15}")
   print("-" * 70)
   
   for subcategory, total in sorted_subcategories:
       misclassified = stats['misclassified'].get(subcategory, 0)
       percentage = (misclassified / total * 100) if total > 0 else 0
       
       print(f"{subcategory:<30} {total:<10} {misclassified:<15} {percentage:.2f}%")
   
   # Find subcategories with highest misclassification rates
   print("\nTop 5 Most Problematic Subcategories (by misclassification rate):")
   subcategory_rates = []
   for subcategory, total in stats['total_by_subcategory'].items():
       misclassified = stats['misclassified'].get(subcategory, 0)
       if total >= 5:  # Only consider subcategories with at least 5 claims
           rate = (misclassified / total * 100)
           subcategory_rates.append((subcategory, rate, total, misclassified))
   
   for subcategory, rate, total, misclassified in sorted(subcategory_rates, 
                                                        key=lambda x: x[1], 
                                                        reverse=True)[:5]:
       print(f"\nSubcategory: {subcategory}")
       print(f"Misclassification rate: {rate:.2f}%")
       print(f"Total claims: {total}")
       print(f"Misclassified claims: {misclassified}")

   # Calculate chi-square test for subcategories with sufficient data
   from scipy.stats import chi2_contingency
   print("\nStatistical Significance Test:")
   significant_subcategories = []
   
   for subcategory in stats['total_by_subcategory'].keys():
       contingency_table = [
           [stats['misclassified'].get(subcategory, 0),
            stats['well_classified'].get(subcategory, 0)],
           [sum(stats['misclassified'].values()) - stats['misclassified'].get(subcategory, 0),
            sum(stats['well_classified'].values()) - stats['well_classified'].get(subcategory, 0)]
       ]
       
       try:
           chi2, p_value, _, _ = chi2_contingency(contingency_table)
           if p_value < 0.05:
               significant_subcategories.append((subcategory, p_value))
       except:
           continue
   
   if significant_subcategories:
       print("\nStatistically significant correlations found:")
       for subcategory, p_value in sorted(significant_subcategories, key=lambda x: x[1]):
           print(f"{subcategory}: p-value = {p_value:.4f}")
   else:
       print("\nNo statistically significant correlations found.")

# Run the analysis
analyze_subcategory_correlation()


Subcategory Correlation Analysis

Detailed Analysis by Subcategory:
Subcategory                    Total      Misclassified   % Misclassified
----------------------------------------------------------------------
Other                          155        10              6.45%
Politics                       140        16              11.43%
Immigration                    60         6               10.00%
Politicians                    40         2               5.00%
Quotes                         25         6               24.00%
Guns                           25         0               0.00%
Ballot Box                     20         2               10.00%
Abortion                       20         2               10.00%
Conspiracy Theories            10         2               20.00%
Not Verifiable                 10         2               20.00%
Imagery                        5          2               40.00%

Top 5 Most Problematic Subcategories (by misclassification rate):

Subcat

In [33]:
from scipy.stats import chi2_contingency
import numpy as np

def analyze_quotes_subcategory_correlation():
    
    # Debug: Print unique subcategories
    print("\nUnique subcategories in dataset:")
    print(df['subcategory'].unique())
    
    # Debug: Print distribution of subcategories
    print("\nSubcategory distribution:")
    print(df['subcategory'].value_counts())
    
    # Debug: Look at quotes specifically
    quotes_df = df[df['subcategory'] == 'Quotes']
    print("\nNumber of rows with 'quotes' subcategory:", len(quotes_df))
    print("\nCategories in quotes subcategory:")
    print(quotes_df['category'].value_counts())
    
    misclassified_quotes = 0
    correct_quotes = 0
    misclassified_others = 0
    correct_others = 0
    
    for _, row in df.iterrows():
        is_quotes = row['subcategory'] == 'Quotes'
        is_misclassified = row['category'] in ['False Positives', 'False Negatives']
        num_claims = len(row['claims'])
        
        if is_quotes:
            if is_misclassified:
                misclassified_quotes += num_claims
            else:
                correct_quotes += num_claims
        else:
            if is_misclassified:
                misclassified_others += num_claims
            else:
                correct_others += num_claims
    
    contingency = np.array([
        [correct_others, misclassified_others],    # Row for other subcategories
        [correct_quotes, misclassified_quotes]     # Row for quotes subcategory
    ])
    # Chi-square test
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    
    total_claims = contingency.sum()
    # Calculate Phi coefficient
    phi = np.sqrt(chi2 / total_claims)
    
    # Print results
    print("\nQuotes Subcategory and Misclassification Correlation Analysis")
    print("=" * 50)
    
    print("\nContingency Table:")
    print("                  Misclassified  Correctly Classified")
    print(f"Quotes          {contingency[1,1]:<14.0f} {contingency[1,0]:<20.0f}")
    print(f"Other           {contingency[0,1]:<14.0f} {contingency[0,0]:<20.0f}")
    
    # Calculate proportions for interpretation
    quotes_total = contingency[1,0] + contingency[1,1]
    other_total = contingency[0,0] + contingency[0,1]
    
    quotes_misclass_rate = (contingency[1,1] / quotes_total * 100) if quotes_total > 0 else 0
    other_misclass_rate = (contingency[0,1] / other_total * 100) if other_total > 0 else 0
    
    print("\nProportional Analysis:")
    print(f"Total claims in 'quotes' subcategory: {quotes_total:.0f}")
    print(f"Total claims in other subcategories: {other_total:.0f}")
    print(f"Misclassification rate for 'quotes' subcategory: {quotes_misclass_rate:.2f}%")
    print(f"Misclassification rate for other subcategories: {other_misclass_rate:.2f}%")
    
    print("\nStatistical Tests:")
    print(f"Chi-square statistic: {chi2:.4f}")
    print(f"P-value: {p_value:.4f}")
    print(f"Phi coefficient: {phi:.4f}")
    print(f"\nInterpretation:")
    print(f"- Statistical significance: {'Yes' if p_value < 0.05 else 'No'}")
    if p_value < 0.05:
        print(f"- Effect size (Phi coefficient): {interpret_phi(phi)}")

def interpret_phi(phi):
    """Interpret Phi coefficient based on common guidelines"""
    abs_phi = abs(phi)
    if abs_phi < 0.1:
        return "negligible effect"
    elif abs_phi < 0.3:
        return "small effect"
    elif abs_phi < 0.5:
        return "medium effect"
    else:
        return "large effect"

# Run the analysis
analyze_quotes_subcategory_correlation()


Unique subcategories in dataset:
['Other' 'Politics' 'Immigration' 'Quotes' 'Politicians' 'Ballot Box'
 'Conspiracy Theories' 'Imagery' 'Not Verifiable' 'Abortion' 'Guns']

Subcategory distribution:
subcategory
Politics               19
Other                  17
Immigration            13
Politicians            11
Quotes                 10
Ballot Box              9
Abortion                9
Guns                    9
Not Verifiable          8
Conspiracy Theories     7
Imagery                 5
Name: count, dtype: int64

Number of rows with 'quotes' subcategory: 10

Categories in quotes subcategory:
category
True Negatives     5
False Positives    2
True Positives     2
False Negatives    1
Name: count, dtype: int64

Quotes Subcategory and Misclassification Correlation Analysis

Contingency Table:
                  Misclassified  Correctly Classified
Quotes          6              19                  
Other           44             441                 

Proportional Analysis:
Total claim

In [37]:
from urllib.parse import urlparse
from collections import defaultdict

def extract_domain(url):
    """Extract base domain from URL"""
    try:
        parsed = urlparse(url)
        # Get domain, remove 'www.' if present
        domain = parsed.netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except:
        return None

def analyze_source_correlation():
    """Analyze correlation between source domains and misclassification"""
    
    # Initialize counters for each domain
    domain_stats = defaultdict(lambda: {'misclassified': 0, 'correct': 0})
    
    # Process all claims
    for _, row in df.iterrows():
        is_misclassified = row['category'] in ['False Positives', 'False Negatives']
        
        # Process each claim's artifacts
        for artifacts in row['artifacts']:
            for artifact in artifacts:
                evidence = artifact.get('evidence', [])
                for e in evidence:
                    url = e.get('page_url')
                    if url:
                        domain = extract_domain(url)
                        if domain:
                            if is_misclassified:
                                domain_stats[domain]['misclassified'] += 1
                            else:
                                domain_stats[domain]['correct'] += 1

    # Calculate statistics for domains with sufficient data
    print("\nDomain Correlation Analysis")
    print("=" * 50)
    
    # Filter and sort domains by total occurrences
    analyzed_domains = []
    for domain, stats in domain_stats.items():
        total = stats['misclassified'] + stats['correct']
        if total >= 5:  # Only analyze domains with at least 5 occurrences
            misclass_rate = (stats['misclassified'] / total * 100)
            analyzed_domains.append({
                'domain': domain,
                'total': total,
                'misclassified': stats['misclassified'],
                'correct': stats['correct'],
                'misclass_rate': misclass_rate
            })
    
    # Sort by total occurrences
    analyzed_domains = sorted(analyzed_domains, key=lambda x: x['total'], reverse=True)
    
    print("\nTop domains by occurrence (minimum 5 occurrences):")
    print(f"{'Domain':<30} {'Total':<10} {'Misclass':<10} {'Rate %':<10}")
    print("-" * 60)
    for domain in analyzed_domains:
        print(f"{domain['domain']:<30} {domain['total']:<10} {domain['misclassified']:<10} {domain['misclass_rate']:.2f}")
    
    # Chi-square test for top domains
    for domain in analyzed_domains:
        contingency = np.array([
            [domain['correct'], domain['misclassified']],
            [sum(d['correct'] for d in analyzed_domains) - domain['correct'],
             sum(d['misclassified'] for d in analyzed_domains) - domain['misclassified']]
        ])
        
        try:
            # print(contingency)
            chi2, p_value, _, _ = chi2_contingency(contingency)
            phi = np.sqrt(chi2 / contingency.sum())
            
            if p_value < 0.05:
                print(f"\nSignificant correlation found for {domain['domain']}:")
                print(f"Chi-square statistic: {chi2:.4f}")
                print(f"P-value: {p_value:.4f}")
                print(f"Phi coefficient: {phi:.4f}")
                print(f"Effect size: {interpret_phi(phi)}")
        except:
            continue

# Run the analysis
analyze_source_correlation()


Domain Correlation Analysis

Top domains by occurrence (minimum 5 occurrences):
Domain                         Total      Misclass   Rate %    
------------------------------------------------------------
cnn.com                        1825       306        16.77
npr.org                        1375       122        8.87
nbcnews.com                    1365       110        8.06
vox.com                        1270       180        14.17
cbsnews.com                    1205       130        10.79
pewresearch.org                1015       118        11.63
huffpost.com                   930        106        11.40
time.com                       880        118        13.41
abcnews.go.com                 855        100        11.70
pbs.org                        735        82         11.16
theatlantic.com                715        62         8.67
forbes.com                     700        68         9.71
foxnews.com                    660        60         9.09
bbc.com                        6

In [45]:
def get_claim_url_counts(claims, artifacts):
    """
    Return a list of integers where each integer is the # of URLs for that claim.
    """
    claim_url_counts = []
    for i in range(len(claims)):
        artifact_list = artifacts[i]
        num_urls = 0
        for artifact in artifact_list:
            evidence = artifact.get('evidence', [])
            num_urls += sum(1 for e in evidence if e.get('page_url'))
        claim_url_counts.append(num_urls)
    return claim_url_counts

def flatten_claims(df):
    """
    Flatten the original DataFrame so each row corresponds to exactly ONE claim.
    We'll create a new DataFrame with these columns (for example):
      - 'class'
      - 'category'
      - 'subcategory'
      - 'claim' (the text or content of the claim)
      - 'num_urls' (# of URLs for this specific claim)
      - 'is_misclassified' (True/False)
      - etc.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        claims = row['claims']
        artifacts = row['artifacts']
        
        # Count URLs per claim
        urls_per_claim = get_claim_url_counts(claims, artifacts)
        
        # For each claim, create one row in the flattened structure
        for i, claim_dict in enumerate(claims):
            flattened_rows.append({
                'class': row['class'],
                'category': row['category'],
                'subcategory': row['subcategory'],
                'claim': claim_dict.get('claim', ''),
                'num_urls': urls_per_claim[i],
                'is_misclassified': row['category'] in ['False Positives', 'False Negatives']
            })
    
    # Create a new DataFrame
    flattened_df = pd.DataFrame(flattened_rows)
    return flattened_df

def analyze_urls_vs_misclassification(df):
    """
    Example of analyzing correlation between (binned) # of URLs per *claim*
    and misclassification (True/False) at the *claim* level.
    """
    # First, flatten the DataFrame so each row is a single claim
    flattened_df = flatten_claims(df)
    
    # Define bins for the number of URLs
    # E.g., [0-4], [5-7], [8-9], [10], etc. Adjust as needed
    def bin_urls(num_urls):
        if 11 <= num_urls <= 20:
            return '11–20'
        elif 21 <= num_urls <= 40:
            return '21–40'
        elif 41 <= num_urls <= 60:
            return '41–60'
        elif 61 <= num_urls <= 80:
            return '61–80'
        else:  # 81–100
            return '81–100'

    
    flattened_df['url_bin'] = flattened_df['num_urls'].apply(bin_urls)
    
    # Build contingency table: rows = url_bin, columns = misclassified (True/False)
    contingency_table = pd.crosstab(flattened_df['url_bin'], flattened_df['is_misclassified'])
    
    # For a quick look:
    print("\nContingency Table (rows = url_bin, cols = [False, True])")
    print(contingency_table)
    
    # Convert to numpy array for chi2
    contingency_values = contingency_table.values  # shape = (n_bins, 2)
    
    # If there's not enough data or if certain bins are empty, we can remove them:
    # Example: remove any row that has zero total
    row_sums = contingency_values.sum(axis=1)
    keep_rows = row_sums > 0
    contingency_values = contingency_values[keep_rows, :]
    final_index = contingency_table.index[keep_rows]
    
    # Now check shape
    if contingency_values.shape[0] < 2:
        print("\nNot enough non-empty bins to run chi-square.")
        return
    
    chi2, p_val, dof, expected = chi2_contingency(contingency_values)
    
    # Compute Cramér's V for effect size
    n = contingency_values.sum()
    r, c = contingency_values.shape
    min_dim = min(r, c)
    cramer_v = np.sqrt(chi2 / (n * (min_dim - 1)))
    
    print("\nChi-square Test:")
    print(f"  chi2 = {chi2:.4f}, p = {p_val:.4f}, dof = {dof}")
    print(f"  Cramér's V = {cramer_v:.4f}")

# Usage Example:
analyze_urls_vs_misclassification(df)



Contingency Table (rows = url_bin, cols = [False, True])
is_misclassified  False  True 
url_bin                       
11–20                 9      0
21–40                 5      0
41–60                39      4
61–80               128     13
81–100              279     33

Chi-square Test:
  chi2 = 1.7992, p = 0.7726, dof = 4
  Cramér's V = 0.0594


In [44]:
flattened_df = flatten_claims(df)

# 1. Print how many total claims we have
total_claims = len(flattened_df)
print(f"Total number of claims: {total_claims}")

# 2. Show the frequency of each distinct num_urls value
url_counts = flattened_df['num_urls'].value_counts().sort_index()
print("\nFrequency of each URL count (num_urls):")
print(url_counts)

# 3. Quick check: how many claims have exactly 10 URLs?
num_with_10 = (flattened_df['num_urls'] == 10).sum()
print(f"\nNumber of claims with exactly 10 URLs: {num_with_10}")

# 4. Print the percentage that have 10 URLs
pct_with_10 = (num_with_10 / total_claims) * 100 if total_claims > 0 else 0
print(f"Percentage of claims with 10 URLs: {pct_with_10:.2f}%")

# 5. Optional: check if *all* claims have 10 URLs
all_10 = (flattened_df['num_urls'] == 10).all()
print(f"\nDo all claims have exactly 10 URLs? {all_10}")


Total number of claims: 510

Frequency of each URL count (num_urls):
num_urls
11      4
19      5
39      5
46      5
48      5
49      5
53      8
54      5
55      5
56      5
59      5
64      5
65      9
66     10
68     15
69     14
70     10
72      5
73     10
74     19
76     10
77      5
78     15
79     10
80      4
81     10
82     25
83      5
85     15
86     10
87     19
89     13
90     19
91     37
92     10
93     25
94     15
95     20
96     15
97     10
98     20
99     10
100    34
Name: count, dtype: int64

Number of claims with exactly 10 URLs: 0
Percentage of claims with 10 URLs: 0.00%

Do all claims have exactly 10 URLs? False


In [49]:
from scipy.stats import pointbiserialr

def total_summary_size(artifacts_for_one_claim):
    """
    For a single claim's artifacts (a list of artifact dicts),
    sum the length of 'page_summary' across all evidence entries.
    """
    total_size = 0
    for artifact in artifacts_for_one_claim:
        evidence_list = artifact.get('evidence', [])
        for e in evidence_list:
            summary = e.get('page_summary', "")  # Default to empty string if missing
            total_size += len(summary)
    return total_size

def flatten_claims_with_summary(df):
    """
    Convert each row in the original DataFrame into multiple rows,
    one per claim, while computing the total summary size for each claim.
    """
    rows = []
    
    for _, row in df.iterrows():
        claims_list = row['claims']            # list of dicts
        artifacts_list = row['artifacts']      # parallel list of artifact lists
        category = row['category']             # e.g., 'False Positives', 'True Negatives', etc.
        
        # is_misclassified is True if category is in ['False Positives', 'False Negatives']
        is_misclass = category in ['False Positives', 'False Negatives']
        
        # Process each claim
        for i, claim_dict in enumerate(claims_list):
            # Corresponding artifacts for this claim
            artifacts_for_this_claim = artifacts_list[i]
            
            # Calculate total summary size
            summary_size = total_summary_size(artifacts_for_this_claim)
            
            # Store flattened row
            rows.append({
                'class': row['class'],
                'category': category,
                'subcategory': row['subcategory'],
                'claim': claim_dict.get('claim', ''),
                'is_misclassified': is_misclass,
                'total_summary_size': summary_size
            })
    
    return pd.DataFrame(rows)

def analyze_summary_size_correlation(df):
    """
    Flatten the data so each row = one claim, compute correlation
    between total summary size and misclassification (binary).
    """
    flattened_df = flatten_claims_with_summary(df)
    
    # Convert is_misclassified to numeric (0 or 1) for correlation
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    print(flattened_df)
    # Point-biserial correlation between:
    #   X = total_summary_size (continuous)
    #   Y = misclass_int (0 or 1)
    corr, pval = pointbiserialr(flattened_df['misclass_int'], 
                                flattened_df['total_summary_size'])
    print(pval)
    print(pval < 0.05)
    # Print results
    print("\nCorrelation Between Total Summary Size and Misclassification")
    print("===========================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")  # scientific notation if very small
    
    # Interpretation
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: larger summary size associated with higher odds of misclassification.")
    elif corr < 0:
        print("=> Negative correlation: larger summary size associated with lower odds of misclassification.")
    else:
        print("=> Zero correlation: no relationship.")

# Finally, call your function on the main df:
analyze_summary_size_correlation(df)


           class         category          subcategory  \
0    barely-true  False Positives                Other   
1    barely-true  False Positives                Other   
2    barely-true  False Positives             Politics   
3    barely-true  False Negatives             Politics   
4    barely-true  False Negatives             Politics   
..           ...              ...                  ...   
505         true   True Negatives           Ballot Box   
506         true   True Negatives  Conspiracy Theories   
507         true   True Negatives  Conspiracy Theories   
508         true   True Negatives       Not Verifiable   
509         true   True Negatives              Imagery   

                                                 claim  is_misclassified  \
0    “On February 7, the WHO warned about the limit...              True   
1    “It wasn’t all women that lost jobs (in the De...              True   
2    In first week as governor, Tim Kaine "proposed...              True   

In [50]:
def flatten_claims_for_length(df):
    """
    Convert each row in the original DataFrame into multiple rows, one per claim.
    We'll record the text length of each claim and whether it was misclassified.
    """
    rows = []
    
    for _, row in df.iterrows():
        claims_list = row['claims']     # list of dicts: [{ 'claim': ..., 'date': ... }, ...]
        category = row['category']      # e.g., 'False Positives', 'True Negatives', etc.
        
        # Determine if this row is misclassified
        is_misclass = category in ['False Positives', 'False Negatives']
        
        for claim_dict in claims_list:
            # Extract the claim text (some key 'claim')
            claim_text = claim_dict.get('claim', '')
            
            # Calculate length (could be # of characters, # of tokens, etc.)
            claim_length = len(claim_text)  
            
            rows.append({
                'category': category,
                'claim': claim_text,
                'claim_length': claim_length,
                'is_misclassified': is_misclass
            })
    
    return pd.DataFrame(rows)

def analyze_claim_length_correlation(df):
    """
    Flatten the data so each row = one claim,
    then compute the point-biserial correlation between
    claim length (continuous) and misclassification (binary).
    """
    # 1. Flatten into a new DataFrame
    flattened_df = flatten_claims_for_length(df)
    
    # 2. Convert is_misclassified to numeric (0 or 1) for correlation
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # 3. Compute point-biserial correlation
    corr, pval = pointbiserialr(flattened_df['misclass_int'], 
                                flattened_df['claim_length'])
    
    print("\nCorrelation Between Claim Length and Misclassification")
    print("=======================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    # Interpretation
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: longer claims tend to be associated with misclassification.")
    elif corr < 0:
        print("=> Negative correlation: longer claims tend to be correctly classified (less misclassification).")
    else:
        print("=> Zero correlation: no relationship.")

# Finally, call your function on the main df:
analyze_claim_length_correlation(df)



Correlation Between Claim Length and Misclassification
Point-biserial correlation (r): -0.0060
P-value: 8.9202e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: longer claims tend to be correctly classified (less misclassification).


In [52]:
import pandas as pd
from textstat import flesch_kincaid_grade
from scipy.stats import pointbiserialr

def flatten_claims_for_readability(df):
    """
    Convert each row in the original DataFrame into multiple rows, one per claim.
    For each claim, compute a reading-level score (e.g., Flesch–Kincaid Grade).
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        claims_list = row['claims']     # list of dicts: [ {'claim': "...", 'date': ...}, ...]
        category = row['category']      # e.g., 'False Positives', 'True Negatives', etc.
        
        # Determine if this row is misclassified (binary)
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            
            # Calculate the Flesch–Kincaid Grade Level
            # (Other textstat functions: flesch_reading_ease, smog_index, etc.)
            fk_grade = flesch_kincaid_grade(claim_text)
            
            flattened_rows.append({
                'claim': claim_text,
                'category': category,
                'is_misclassified': is_misclassified,
                'fk_grade': fk_grade
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_reading_level_correlation(df):
    """
    1. Flatten the DataFrame so each row = one claim.
    2. Compute Flesch–Kincaid Grade for each claim.
    3. Correlate reading-level score (continuous) with misclassification (binary).
    """
    # Step 1: Flatten
    flattened_df = flatten_claims_for_readability(df)
    
    # Convert is_misclassified to numeric (0 or 1) for correlation
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # Step 2: Pull out the reading-level scores and binary label
    reading_scores = flattened_df['fk_grade']
    misclass_ints = flattened_df['misclass_int']
    
    # Step 3: Point-biserial correlation
    corr, pval = pointbiserialr(misclass_ints, reading_scores)
    
    # Print results
    print("\nCorrelation Between Claim Reading Level and Misclassification")
    print("==============================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    # Interpretation
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: Higher reading level claims are more likely misclassified.")
    elif corr < 0:
        print("=> Negative correlation: Higher reading level claims are less likely misclassified.")
    else:
        print("=> Zero correlation: No relationship.")

# Finally, call the function:
analyze_reading_level_correlation(df)



Correlation Between Claim Reading Level and Misclassification
Point-biserial correlation (r): 0.0439
P-value: 3.2280e-01
=> Not statistically significant (p >= 0.05).
=> Positive correlation: Higher reading level claims are more likely misclassified.


In [54]:
import pandas as pd
from textblob import TextBlob
from scipy.stats import pointbiserialr

def flatten_claims_for_subjectivity(df):
    """
    Convert each row in the original DataFrame into multiple rows, one per claim.
    For each claim, compute a subjectivity score (0.0 = objective, 1.0 = subjective).
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        claims_list = row['claims']     # list of dicts: [{'claim': "...", 'date': ...}, ...]
        category = row['category']      # e.g., 'False Positives', 'True Negatives', etc.
        
        # is_misclassified = True if category is in ['False Positives', 'False Negatives']
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            
            # Use TextBlob to compute sentiment subjectivity
            blob = TextBlob(claim_text)
            subjectivity = blob.sentiment.subjectivity
            
            flattened_rows.append({
                'claim': claim_text,
                'category': category,
                'is_misclassified': is_misclassified,
                'subjectivity': subjectivity
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_subjectivity_correlation(df):
    """
    1. Flatten the DataFrame so each row = one claim.
    2. Compute TextBlob subjectivity score for each claim.
    3. Correlate subjectivity (continuous) with misclassification (binary) using point-biserial.
    """
    # Step 1: Flatten
    flattened_df = flatten_claims_for_subjectivity(df)
    
    # Convert is_misclassified to numeric (0 or 1) for correlation
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # Step 2: Subjectivity scores vs. misclassification
    subj_scores = flattened_df['subjectivity']
    misclass_ints = flattened_df['misclass_int']
    
    # Step 3: Point-biserial correlation
    corr, pval = pointbiserialr(misclass_ints, subj_scores)
    
    print("\nCorrelation Between Claim Subjectivity and Misclassification")
    print("============================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    # Interpretation
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: More subjective claims tend to be misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: More subjective claims tend to be misclassified less often.")
    else:
        print("=> Zero correlation: No relationship.")

# Finally, run your analysis:
analyze_subjectivity_correlation(df)



Correlation Between Claim Subjectivity and Misclassification
Point-biserial correlation (r): 0.0325
P-value: 4.6340e-01
=> Not statistically significant (p >= 0.05).
=> Positive correlation: More subjective claims tend to be misclassified more often.


In [55]:
def flatten_claims_for_keywords(df):
    """
    Flatten DataFrame so each row = one claim.
    We'll also flag whether each claim contains any of a set of keywords.
    """
    # Define your keyword list
    keywords = [
    "fake", "hoax", "fraud", "scam", "lie",
    "deceit", "deception", "dishonesty", "falsify",
    "fabrication", "misrepresentation", "manipulation",
    "duplicity", "subterfuge", "impostor", "con",
    "rip-off", "swindle", "hustle", "shell game",
    "phony", "bogus", "counterfeit", "forgery",
    "scheme", "untrue", "falsehood", "lying",
    "inaccuracy", "misleading", "spurious", "sham",
    "fake news", "tall tale", "tall story",
    "too good to be true", "get-rich-quick",
    "miracle cure", "guaranteed results",
    "limited time offer", "secret formula",
    "hidden agenda"
]
    
    flattened_rows = []
    
    for _, row in df.iterrows():
        claims_list = row['claims']  # e.g. [{'claim': "...", 'date': ...}, ...]
        category = row['category']
        
        # Is it misclassified?
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '').lower()
            
            # Check if any keyword appears in the claim_text
            keyword_found = any(kw.lower() in claim_text for kw in keywords)
            
            flattened_rows.append({
                'claim': claim_text,
                'is_misclassified': is_misclassified,
                'keyword_present': keyword_found
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_keyword_presence_chi2(df):
    """
    1) Flatten the data into one claim per row.
    2) Build a 2x2 contingency table: keyword_present (True/False) vs. misclassified (True/False).
    3) Run a Chi-square test of independence.
    """
    flattened_df = flatten_claims_for_keywords(df)
    
    # Build a 2x2 contingency table using pandas crosstab
    contingency = pd.crosstab(flattened_df['keyword_present'], 
                              flattened_df['is_misclassified'])
    
    print("\nContingency Table (rows=keyword_present, cols=is_misclassified):")
    print(contingency)
    
    # Make sure we have both True/False in each dimension 
    # If the data is skewed, you might only get one row or one column.
    if contingency.shape != (2, 2):
        print("\nWARNING: We don't have a full 2x2 table. Possibly all claims are in one category.")
        return
    
    chi2, p, dof, expected = chi2_contingency(contingency)
    
    print("\nChi-Square Test of Independence")
    print("===============================")
    print(f"Chi2 Statistic: {chi2:.4f}")
    print(f"P-value:        {p:.4e}")
    print(f"Degrees of Freedom: {dof}")
    print("\nExpected Frequencies:")
    print(expected)

    # Quick check if significant
    if p < 0.05:
        print("\n=> Statistically SIGNIFICANT association between keyword presence and misclassification.")
    else:
        print("\n=> No statistically significant association (p >= 0.05).")

def analyze_keyword_presence_correlation(df):
    """
    Alternatively, treat both keyword_present and is_misclassified as binary numeric (0/1)
    and compute the point-biserial correlation.
    """
    flattened_df = flatten_claims_for_keywords(df)
    
    # Convert booleans to int (0/1)
    flattened_df['keyword_present_int'] = flattened_df['keyword_present'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # Point-biserial correlation
    corr, p = pointbiserialr(flattened_df['misclass_int'], 
                             flattened_df['keyword_present_int'])
    
    print("\nPoint-Biserial Correlation for Keyword Presence vs. Misclassification")
    print("======================================================================")
    print(f"r = {corr:.4f}, p = {p:.4e}")
    
    if p < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpretation of sign
    if corr > 0:
        print("=> Positive correlation: claims with these keywords are more often misclassified.")
    elif corr < 0:
        print("=> Negative correlation: claims with these keywords are less often misclassified.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_keyword_presence_chi2(df)
analyze_keyword_presence_correlation(df)



Contingency Table (rows=keyword_present, cols=is_misclassified):
is_misclassified  False  True 
keyword_present               
False               350     40
True                110     10

Chi-Square Test of Independence
Chi2 Statistic: 0.1971
P-value:        6.5706e-01
Degrees of Freedom: 1

Expected Frequencies:
[[351.76470588  38.23529412]
 [108.23529412  11.76470588]]

=> No statistically significant association (p >= 0.05).

Point-Biserial Correlation for Keyword Presence vs. Misclassification
r = -0.0274, p = 5.3651e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: claims with these keywords are less often misclassified.


In [56]:
import re
import pandas as pd
from scipy.stats import chi2_contingency, pointbiserialr

# Regex pattern to match years (very simple: 4 digits, 19xx or 20xx, up to 2099, etc.)
# You can refine as needed.
YEAR_PATTERN = re.compile(r'\b(19[0-9]{2}|20[0-9]{2}|2100)\b')

def flatten_claims_for_temporal(df):
    """
    Flatten DataFrame so each row = one claim.
    We'll detect if that claim contains an explicit year reference.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        claims_list = row['claims']     # e.g., [{'claim': "...", 'date': ...}, ...]
        category = row['category']      # e.g. 'False Positives', etc.
        
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            # Convert to lower case for consistency
            claim_text_lower = claim_text.lower()
            
            # Check if there's at least one match for a year
            found_year = bool(YEAR_PATTERN.search(claim_text_lower))
            
            flattened_rows.append({
                'claim': claim_text,
                'claim_lower': claim_text_lower,
                'is_misclassified': is_misclassified,
                'has_explicit_year': found_year
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_temporal_references(df):
    """
    1. Flatten the DataFrame so each row = one claim.
    2. Detect explicit year references (via regex).
    3. Perform statistical tests on whether 'has_explicit_year' correlates with misclassification.
    """
    flattened_df = flatten_claims_for_temporal(df)
    
    # ------------------------------
    # A) Chi-square test (2×2 table)
    # ------------------------------
    contingency = pd.crosstab(flattened_df['has_explicit_year'], 
                              flattened_df['is_misclassified'])
    
    print("\nContingency Table (rows = has_explicit_year, cols = is_misclassified):")
    print(contingency)
    
    # If shape != (2,2), it means one of the categories had zero data.
    if contingency.shape == (2, 2):
        chi2, p, dof, expected = chi2_contingency(contingency)
        print("\nChi-Square Test of Independence (Temporal Reference vs. Misclassification)")
        print("=============================================================================")
        print(f"Chi2 Statistic: {chi2:.4f}")
        print(f"P-value:        {p:.4e}")
        print(f"Degrees of Freedom: {dof}")
        print("Expected frequencies:")
        print(expected)
        
        if p < 0.05:
            print("\n=> Statistically SIGNIFICANT association (p < 0.05).")
        else:
            print("\n=> No statistically significant association (p >= 0.05).")
    else:
        print("\nWARNING: The data does not form a 2x2 table (perhaps all are True or all are False).")
    
    # --------------------------------------------------------
    # B) Point-Biserial correlation (both variables binary)
    # --------------------------------------------------------
    # Convert booleans to int (0/1)
    flattened_df['year_int'] = flattened_df['has_explicit_year'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, p_val = pointbiserialr(flattened_df['misclass_int'], flattened_df['year_int'])
    print("\nPoint-Biserial Correlation (Temporal Reference vs. Misclassification)")
    print("=====================================================================")
    print(f"r = {corr:.4f}, p-value = {p_val:.4e}")
    
    if p_val < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: claims referencing explicit years are more often misclassified.")
    elif corr < 0:
        print("=> Negative correlation: claims referencing explicit years are less often misclassified.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_temporal_references(df)



Contingency Table (rows = has_explicit_year, cols = is_misclassified):
is_misclassified   False  True 
has_explicit_year              
False                444     46
True                  16      4

Chi-Square Test of Independence (Temporal Reference vs. Misclassification)
Chi2 Statistic: 1.3943
P-value:        2.3768e-01
Degrees of Freedom: 1
Expected frequencies:
[[441.96078431  48.03921569]
 [ 18.03921569   1.96078431]]

=> No statistically significant association (p >= 0.05).

Point-Biserial Correlation (Temporal Reference vs. Misclassification)
r = 0.0693, p-value = 1.1819e-01
=> Not statistically significant (p >= 0.05).
=> Positive correlation: claims referencing explicit years are more often misclassified.


In [57]:
import pandas as pd
from scipy.stats import chi2_contingency, pointbiserialr

def flatten_claims_for_quotes(df):
    """
    Convert each row in the original DataFrame into multiple rows, 
    one per claim, recording if the subcategory is 'Quotes' and if misclassified.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        subcat = row['subcategory']      # e.g., 'Quotes', 'immigration', etc.
        category = row['category']       # e.g. 'False Positives', 'True Negatives', etc.
        claims_list = row['claims']      # list of dicts like [{'claim': 'some text', ...}, ...]
        
        # is_misclassified = True if category in ['False Positives', 'False Negatives']
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        for claim_info in claims_list:
            flattened_rows.append({
                'subcategory': subcat,
                'is_misclassified': is_misclassified,
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_quotes_subcategory_correlation(df):
    """
    1. Flatten the data, one claim per row.
    2. Create a boolean for 'is_quotes' subcategory.
    3. Build a 2x2 contingency table and run a chi-square test.
    4. Optionally, do a point-biserial correlation as well.
    """
    flattened_df = flatten_claims_for_quotes(df)
    
    # Boolean flag for quotes vs. other subcategories
    flattened_df['is_quotes'] = (flattened_df['subcategory'] == 'Quotes')
    
    # Build 2x2 table:
    # Rows = is_quotes (False/True), Cols = is_misclassified (False/True)
    contingency = pd.crosstab(flattened_df['is_quotes'], flattened_df['is_misclassified'])
    
    print("\nContingency Table (rows=is_quotes, columns=is_misclassified):")
    print(contingency)
    
    if contingency.shape == (2, 2):
        chi2, p, dof, expected = chi2_contingency(contingency)
        
        print("\nChi-Square Test of Independence (Quotes vs. Misclassification)")
        print("================================================================")
        print(f"Chi2 Statistic: {chi2:.4f}")
        print(f"P-value:        {p:.4e}")
        print(f"Degrees of Freedom: {dof}")
        print("Expected Frequencies:")
        print(expected)
        
        if p < 0.05:
            print("\n=> Statistically SIGNIFICANT association (p < 0.05).")
        else:
            print("\n=> No statistically significant association (p >= 0.05).")
    else:
        print("\nWARNING: The table did not form a 2x2 matrix. Possibly all are 'Quotes' or none are.")
    
    # (Optional) Point-biserial correlation for binary vs. binary
    flattened_df['quotes_int'] = flattened_df['is_quotes'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, p_val = pointbiserialr(flattened_df['quotes_int'], flattened_df['misclass_int'])
    print("\nPoint-Biserial Correlation (Quotes vs. Misclassification)")
    print("=========================================================")
    print(f"r = {corr:.4f}, p-value = {p_val:.4e}")
    
    if p_val < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: 'Quotes' subcategory tends to be misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: 'Quotes' subcategory tends to be misclassified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Finally, run the analysis:
analyze_quotes_subcategory_correlation(df)



Contingency Table (rows=is_quotes, columns=is_misclassified):
is_misclassified  False  True 
is_quotes                     
False               441     44
True                 19      6

Chi-Square Test of Independence (Quotes vs. Misclassification)
Chi2 Statistic: 4.4220
P-value:        3.5478e-02
Degrees of Freedom: 1
Expected Frequencies:
[[437.45098039  47.54901961]
 [ 22.54901961   2.45098039]]

=> Statistically SIGNIFICANT association (p < 0.05).

Point-Biserial Correlation (Quotes vs. Misclassification)
r = 0.1084, p-value = 1.4329e-02
=> Statistically SIGNIFICANT correlation.
=> Positive correlation: 'Quotes' subcategory tends to be misclassified more often.


In [58]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import math

def flatten_claims_subcategories(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll store:
      - subcategory
      - is_misclassified
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        # category: e.g., 'False Positives', 'True Positives', etc.
        main_category = row['category']
        subcat = row['subcategory']  # e.g., 'Quotes', 'immigration', ...
        
        # is_misclassified = True if in ['False Positives', 'False Negatives']
        is_misclassified = main_category in ['False Positives', 'False Negatives']
        
        # row['claims'] is a list of claim dicts
        claims_list = row['claims']
        
        for claim_info in claims_list:
            flattened_rows.append({
                'subcategory': subcat,
                'is_misclassified': is_misclassified
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_subcategories_all(df):
    """
    1) Flatten the data so each row is one claim, with subcategory info.
    2) Build an r×2 contingency table: each row = a subcategory, 
       columns = [not misclassified, misclassified].
    3) Perform chi-square and compute Cramér’s V.
    """
    # Flatten to claim-level
    flattened_df = flatten_claims_subcategories(df)
    
    # Build contingency table: rows = subcategory, columns = is_misclassified (False/True)
    contingency = pd.crosstab(flattened_df['subcategory'], 
                              flattened_df['is_misclassified'])
    
    # Just for clarity, rename columns
    # Right now columns are [False, True]. Let's rename them:
    contingency.columns = ['Correct', 'Misclassified']
    
    print("\nContingency Table (Subcategory vs. Misclassification):")
    print("-------------------------------------------------------")
    print(contingency)
    
    # Sum of all counts
    total_counts = contingency.values.sum()
    print(f"\nTotal claims in table: {total_counts}")
    
    # If your data has subcategories with zero total counts, filter them out
    row_sums = contingency.sum(axis=1)
    # Keep only subcats with > 0 total
    keep_mask = (row_sums > 0)
    contingency = contingency.loc[keep_mask]
    
    # If after filtering we have fewer than 2 subcategories, chi-square won't be valid
    if len(contingency) < 2:
        print("\nNot enough subcategories with data to run chi-square.")
        return
    
    # Chi-square test
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    
    print("\nChi-Square Test for Subcategories vs. Misclassification")
    print("=======================================================")
    print(f"Chi-square statistic: {chi2:.4f}")
    print(f"P-value:             {p_value:.4e}")
    print(f"Degrees of freedom:  {dof}")
    
    print("\nExpected Frequencies:")
    print(expected)
    
    # Check significance
    if p_value < 0.05:
        print("=> Statistically SIGNIFICANT association (p < 0.05).")
    else:
        print("=> No statistically significant association (p >= 0.05).")
    
    # Compute Cramér’s V for r×2 table
    r, c = contingency.shape
    min_dim = min(r, c)  # should be 2 if c=2
    # Cramér’s V = sqrt(Chi2 / (N * (k - 1)))
    cramer_v = math.sqrt(chi2 / (total_counts * (min_dim - 1)))
    print(f"\nCramér's V = {cramer_v:.4f} (Effect size)")

# Example usage:
analyze_subcategories_all(df)



Contingency Table (Subcategory vs. Misclassification):
-------------------------------------------------------
                     Correct  Misclassified
subcategory                                
Abortion                  18              2
Ballot Box                18              2
Conspiracy Theories        8              2
Guns                      25              0
Imagery                    3              2
Immigration               54              6
Not Verifiable             8              2
Other                    145             10
Politicians               38              2
Politics                 124             16
Quotes                    19              6

Total claims in table: 510

Chi-Square Test for Subcategories vs. Misclassification
Chi-square statistic: 19.3579
P-value:             3.5945e-02
Degrees of freedom:  10

Expected Frequencies:
[[ 18.03921569   1.96078431]
 [ 18.03921569   1.96078431]
 [  9.01960784   0.98039216]
 [ 22.54901961   2.45098039]
 [  4.

In [59]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

def flatten_claims_subcategories(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll store:
      - subcategory
      - is_misclassified
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives'
        subcat = row['subcategory']
        
        # is_misclassified = True if category in ['False Positives', 'False Negatives']
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        # row['claims'] is a list of claim dicts
        claims_list = row['claims']
        
        for claim_info in claims_list:
            flattened_rows.append({
                'subcategory': subcat,
                'is_misclassified': is_misclassified
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_each_subcategory_separately(df):
    """
    For each unique subcategory:
      1) Build a 2x2 table:
         - Row 1 = subcategory == X
         - Row 2 = subcategory != X
         - Col 1 = Correct (not misclassified)
         - Col 2 = Misclassified
      2) Perform a chi-square test.
      3) Print results.

    NOTE: If you have many subcategories, you may wish to apply
    multiple-comparison corrections (Bonferroni, Holm, etc.).
    """
    flattened_df = flatten_claims_subcategories(df)
    
    # Unique subcategories
    unique_subcats = flattened_df['subcategory'].unique()
    
    # We'll store results to possibly do multiple-comparison correction
    results = []
    
    for subcat in unique_subcats:
        # Boolean mask: is this row's subcategory = current subcat?
        is_this_subcat = (flattened_df['subcategory'] == subcat)
        
        # We also have 'is_misclassified'
        # Build a 2x2 table:
        # Rows: [this_subcat, not_this_subcat]
        # Cols: [Correct, Misclassified]
        
        # Count how many claims are subcat + correct
        subcat_correct = np.sum(is_this_subcat & (flattened_df['is_misclassified'] == False))
        # subcat + misclassified
        subcat_misclass = np.sum(is_this_subcat & (flattened_df['is_misclassified'] == True))
        # not_subcat + correct
        not_subcat_correct = np.sum((~is_this_subcat) & (flattened_df['is_misclassified'] == False))
        # not_subcat + misclassified
        not_subcat_misclass = np.sum((~is_this_subcat) & (flattened_df['is_misclassified'] == True))
        
        contingency = np.array([
            [subcat_correct, subcat_misclass],
            [not_subcat_correct, not_subcat_misclass]
        ])
        
        # Check if we have a valid table (both rows/cols > 0 total)
        if contingency.sum() == 0:
            # All zeros? skip
            results.append((subcat, 0, 1.0, "Not enough data"))  # no effect
            continue
        
        # Do chi-square
        chi2, p, dof, expected = chi2_contingency(contingency)
        
        results.append((subcat, chi2, p, "ok"))
    
    # Optionally, apply a multiple-comparison correction if you have many subcats
    # Example: Bonferroni correction
    # n_tests = len(results)
    # for i, (sc, chi2, pval, status) in enumerate(results):
    #     corrected_p = min(pval * n_tests, 1.0)  # Bonferroni
    #     results[i] = (sc, chi2, pval, corrected_p, status)
    
    # Print results
    print("Chi-Square Tests for Each Subcategory (vs. all other subcategories)\n")
    
    # If you did a correction, you might adjust your output format
    # but here we'll just print the raw p-values
    for subcat, chi2, p, status in results:
        print(f"Subcategory: {subcat}")
        print(f"  Chi2 = {chi2:.4f}, p = {p:.4e}, status = {status}")
        if p < 0.05:
            print("  => Statistically SIGNIFICANT (p < 0.05).")
        else:
            print("  => Not significant (p >= 0.05).")
        print("------------------------------------------------------")
    
    print("\nDone.")

# Example usage
analyze_each_subcategory_separately(df)


Chi-Square Tests for Each Subcategory (vs. all other subcategories)

Subcategory: Other
  Chi2 = 2.3115, p = 1.2842e-01, status = ok
  => Not significant (p >= 0.05).
------------------------------------------------------
Subcategory: Politics
  Chi2 = 0.3506, p = 5.5377e-01, status = ok
  => Not significant (p >= 0.05).
------------------------------------------------------
Subcategory: Immigration
  Chi2 = 0.0000, p = 1.0000e+00, status = ok
  => Not significant (p >= 0.05).
------------------------------------------------------
Subcategory: Quotes
  Chi2 = 4.4220, p = 3.5478e-02, status = ok
  => Statistically SIGNIFICANT (p < 0.05).
------------------------------------------------------
Subcategory: Politicians
  Chi2 = 0.6200, p = 4.3106e-01, status = ok
  => Not significant (p >= 0.05).
------------------------------------------------------
Subcategory: Ballot Box
  Chi2 = 0.0000, p = 1.0000e+00, status = ok
  => Not significant (p >= 0.05).
--------------------------------------

In [60]:
import pandas as pd
from dateutil import parser
from scipy.stats import pointbiserialr
import numpy as np

def extract_year(date_str):
    """
    Try to parse the date_str and return the year as an integer.
    If parsing fails or date_str is missing, return None.
    """
    if not date_str:
        return None
    try:
        dt = parser.parse(date_str)
        return dt.year
    except:
        return None

def flatten_claims_for_year(df):
    """
    Flatten the DataFrame so each row = exactly one claim,
    extracting the claim's year from 'date' if available.
    We'll store:
      - year (int or None)
      - is_misclassified (bool)
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g. 'False Positives', etc.
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        # row['claims'] is a list of dicts, e.g. [{'claim': "some text", 'date': "..."}, ...]
        claims_list = row['claims']
        
        for claim_info in claims_list:
            # Extract the 'date' field and parse its year if possible
            date_str = claim_info.get('date', '')
            claim_year = extract_year(date_str)
            
            flattened_rows.append({
                'year': claim_year,
                'is_misclassified': is_misclassified
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_claim_year_correlation(df):
    """
    1) Flatten data so each row is one claim, extracting the year.
    2) Compute a point-biserial correlation between (year) and (is_misclassified).
    """
    flattened_df = flatten_claims_for_year(df)
    
    # Filter out rows with no valid year
    valid_df = flattened_df.dropna(subset=['year']).copy()
    if len(valid_df) == 0:
        print("No valid years found in the data. Cannot do correlation.")
        return
    
    # Convert is_misclassified to int (0/1)
    valid_df['misclass_int'] = valid_df['is_misclassified'].astype(int)
    
    # Numeric variable = 'year'
    # Binary variable = 'misclass_int'
    corr, pval = pointbiserialr(valid_df['misclass_int'], valid_df['year'])
    
    print("\nCorrelation Between Claim Year and Misclassification")
    print("====================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: claims from later years are misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims from later years are misclassified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_claim_year_correlation(df)



Correlation Between Claim Year and Misclassification
Point-biserial correlation (r): -0.0046
P-value: 9.1712e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: claims from later years are misclassified less often.


In [61]:
import pandas as pd
from scipy.stats import pointbiserialr

def sum_explanation_length(artifacts_for_one_claim):
    """
    Sum the length of 'decomposed_question_explanation' strings across all artifacts
    for a single claim. We assume 'artifacts_for_one_claim' is a list of artifact dicts:
    [
      {
        'decomposed_question': ...,
        'decomposed_justification': ...,
        'decomposed_question_explanation': ...,
        'evidence': ...
      }, 
      ...
    ]
    """
    total_len = 0
    for artifact in artifacts_for_one_claim:
        explanation_text = artifact.get('decomposed_question_explanation', '')
        total_len += len(str(explanation_text))  # Convert to string safely, then measure length
    return total_len

def flatten_claims_for_explanation_length(df):
    """
    Convert each row in the DataFrame into multiple rows, one per claim.
    For each claim, compute the sum of explanation lengths from artifacts.
    We'll store:
      - explanation_length (int)
      - is_misclassified (bool)
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Positives', etc.
        # is_misclassified = True if in ['False Positives', 'False Negatives']
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']       # list of claim dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists, same length as claims
        
        # Iterate over each claim in this row
        for i, claim_info in enumerate(claims_list):
            # Artifacts for the i-th claim
            artifacts_for_this_claim = artifacts_list[i]  # e.g., a list of artifact dicts
            
            # Sum up explanation lengths
            explanation_sum = sum_explanation_length(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'explanation_length': explanation_sum,
                # optionally store the claim text or other fields
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_explanation_length_correlation(df):
    """
    1) Flatten the data so each row = one claim, summing up 
       'decomposed_question_explanation' length across artifacts.
    2) Run a point-biserial correlation between explanation_length (continuous)
       and is_misclassified (binary).
    """
    flattened_df = flatten_claims_for_explanation_length(df)
    
    # Convert boolean to int (0=correct, 1=misclassified)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # We'll correlate 'explanation_length' with 'misclass_int'
    corr, pval = pointbiserialr(flattened_df['misclass_int'], flattened_df['explanation_length'])
    
    print("\nCorrelation Between Explanation Length and Misclassification")
    print("============================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value:                        {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: longer explanations tend to be misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: longer explanations tend to be misclassified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_explanation_length_correlation(df)



Correlation Between Explanation Length and Misclassification
Point-biserial correlation (r): -0.0927
P-value:                        3.6382e-02
=> Statistically SIGNIFICANT correlation (p < 0.05).
=> Negative correlation: longer explanations tend to be misclassified less often.


In [22]:
import pandas as pd
from scipy.stats import pointbiserialr

def sum_explanation_length(artifacts_for_one_claim):
    """
    Sum the length of 'decomposed_question_explanation' strings across all artifacts
    for a single claim. We assume 'artifacts_for_one_claim' is a list of artifact dicts:
    [
      {
        'decomposed_question': ...,
        'decomposed_justification': ...,
        'decomposed_question_explanation': ...,
        'evidence': ...
      }, 
      ...
    ]
    """
    total_len = 0
    for artifact in artifacts_for_one_claim:
        explanation_text = artifact.get('decomposed_question_explanation', '')
        total_len += len(str(explanation_text))  # Convert to string safely, then measure length
    return total_len

def flatten_claims_for_explanation_length(df):
    """
    Convert each row in the DataFrame into multiple rows, one per claim.
    For each claim, compute the sum of explanation lengths from artifacts.
    We'll store:
      - explanation_length (int)
      - is_misclassified (bool)
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Positives', etc.
        # is_misclassified = True if in ['False Positives', 'False Negatives']
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']       # list of claim dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists, same length as claims
        
        # Iterate over each claim in this row
        for i, claim_info in enumerate(claims_list):
            # Artifacts for the i-th claim
            artifacts_for_this_claim = artifacts_list[i]  # e.g., a list of artifact dicts
            
            # Sum up explanation lengths
            explanation_sum = sum_explanation_length(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'explanation_length': explanation_sum,
                # optionally store the claim text or other fields
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_explanation_length_correlation(df):
    """
    1) Flatten the data so each row = one claim, summing up 
       'decomposed_question_explanation' length across artifacts.
    2) Run a point-biserial correlation between explanation_length (continuous)
       and is_misclassified (binary).
    """
    flattened_df = flatten_claims_for_explanation_length(df)
    
    # Convert boolean to int (0=correct, 1=misclassified)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # We'll correlate 'explanation_length' with 'misclass_int'
    corr, pval = pointbiserialr(flattened_df['misclass_int'], flattened_df['explanation_length'])
    
    print("\nCorrelation Between Explanation Length and Misclassification")
    print("============================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value:                        {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: longer explanations tend to be misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: longer explanations tend to be misclassified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_explanation_length_correlation(df)



Correlation Between Explanation Length and Misclassification
Point-biserial correlation (r): -0.0927
P-value:                        3.6382e-02
=> Statistically SIGNIFICANT correlation (p < 0.05).
=> Negative correlation: longer explanations tend to be misclassified less often.


In [62]:
import pandas as pd
from scipy.stats import pointbiserialr

def sum_page_content(evidences):
    """
    Sum the length of 'page_content' from a list of evidence dicts.
    Each evidence dict might look like:
      {
        'page_url': ...,
        'page_content': ...,
        'page_summary': ...,
        'page_timestamp': ...
      }
    """
    total_len = 0
    for e in evidences:
        content_text = e.get('page_content', '')
        total_len += len(str(content_text))
    return total_len

def sum_evidence_contents_for_claim(artifacts_for_one_claim):
    """
    For a single claim's artifacts (a list of artifact dicts), 
    each artifact has an 'evidence' key which is a list of evidence dicts.
    We sum all 'page_content' lengths from all evidence in all artifacts.
    """
    total_length = 0
    for artifact in artifacts_for_one_claim:
        # 'evidence' is typically a list of dictionaries
        evidence_list = artifact.get('evidence', [])
        total_length += sum_page_content(evidence_list)
    return total_length

def flatten_claims_for_page_content(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll calculate the sum of page_content lengths for all evidence.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Negatives'
        # Determine misclassification
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']       # list of dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists
        
        # For each claim in this row
        for i, claim_info in enumerate(claims_list):
            artifacts_for_this_claim = artifacts_list[i]  # list of artifact dicts
            
            # Sum total page content across all evidence in these artifacts
            total_page_content = sum_evidence_contents_for_claim(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'total_page_content_length': total_page_content,
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_page_content_correlation(df):
    """
    1) Flatten the data: one claim per row, summing page content length of all evidences.
    2) Compute point-biserial correlation between total_page_content_length (continuous)
       and is_misclassified (binary).
    """
    flattened_df = flatten_claims_for_page_content(df)
    
    # Convert boolean to int for correlation: 1 = misclassified, 0 = correct
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # Our numeric variable = total_page_content_length
    # Our binary variable = misclass_int
    corr, pval = pointbiserialr(flattened_df['misclass_int'], 
                                flattened_df['total_page_content_length'])
    
    print("\nCorrelation Between Total Page Content Length and Misclassification")
    print("==================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    # Check significance
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpretation of sign
    if corr > 0:
        print("=> Positive correlation: claims with more page content tend to be misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims with more page content tend to be misclassified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_page_content_correlation(df)



Correlation Between Total Page Content Length and Misclassification
Point-biserial correlation (r): 0.0593
P-value: 1.8094e-01
=> Not statistically significant (p >= 0.05).
=> Positive correlation: claims with more page content tend to be misclassified more often.


In [65]:
import pandas as pd
from scipy.stats import pointbiserialr

def sum_page_content(evidences):
    """
    Sum the length of 'page_content' from a list of evidence dicts.
    Each evidence dict might look like:
      {
        'page_url': ...,
        'page_content': ...,
        'page_summary': ...,
        'page_timestamp': ...
      }
    """
    total_len = 0
    for e in evidences:
        content_text = e.get('page_summary', '')
        total_len += len(str(content_text))
    return total_len

def sum_evidence_contents_for_claim(artifacts_for_one_claim):
    """
    For a single claim's artifacts (a list of artifact dicts), 
    each artifact has an 'evidence' key which is a list of evidence dicts.
    We sum all 'page_content' lengths from all evidence in all artifacts.
    """
    total_length = 0
    for artifact in artifacts_for_one_claim:
        # 'evidence' is typically a list of dictionaries
        evidence_list = artifact.get('evidence', [])
        total_length += sum_page_content(evidence_list)
    return total_length

def flatten_claims_for_page_content(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll calculate the sum of page_content lengths for all evidence.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Negatives'
        # Determine misclassification
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']       # list of dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists
        
        # For each claim in this row
        for i, claim_info in enumerate(claims_list):
            artifacts_for_this_claim = artifacts_list[i]  # list of artifact dicts
            
            # Sum total page content across all evidence in these artifacts
            total_page_content = sum_evidence_contents_for_claim(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'total_page_content_length': total_page_content,
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_page_content_correlation(df):
    """
    1) Flatten the data: one claim per row, summing page content length of all evidences.
    2) Compute point-biserial correlation between total_page_content_length (continuous)
       and is_misclassified (binary).
    """
    flattened_df = flatten_claims_for_page_content(df)
    
    # Convert boolean to int for correlation: 1 = misclassified, 0 = correct
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # Our numeric variable = total_page_content_length
    # Our binary variable = misclass_int
    corr, pval = pointbiserialr(flattened_df['misclass_int'], 
                                flattened_df['total_page_content_length'])
    
    print("\nCorrelation Between Total Page Content Length and Misclassification")
    print("==================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    # Check significance
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpretation of sign
    if corr > 0:
        print("=> Positive correlation: claims with more page summary tend to be misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims with more page summary tend to be misclassified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_page_content_correlation(df)



Correlation Between Total Page Content Length and Misclassification
Point-biserial correlation (r): 0.0416
P-value: 3.4840e-01
=> Not statistically significant (p >= 0.05).
=> Positive correlation: claims with more page summary tend to be misclassified more often.


In [66]:
from urllib.parse import urlparse

def get_domain(url):
    """
    Extract the domain from a URL, removing 'www.' if present.
    Returns None if parsing fails or if url is empty.
    """
    if not url:
        return None
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except:
        return None

def count_distinct_domains_for_claim(artifacts_for_one_claim):
    """
    Given a list of artifact dicts for one claim, each artifact may contain:
      {
        'evidence': [
          {'page_url': ..., 'page_content': ..., ...},
          ...
        ],
        ...
      }
    We extract all 'page_url' from 'evidence' and count distinct domains.
    """
    distinct_domains = set()
    
    for artifact in artifacts_for_one_claim:
        evidence_list = artifact.get('evidence', [])
        for e in evidence_list:
            url = e.get('page_url', '')
            domain = get_domain(url)
            if domain:
                distinct_domains.add(domain)
    
    return len(distinct_domains)

def flatten_claims_for_distinct_domains(df):
    """
    Flatten so each row = exactly one claim.
    For each claim, we compute the number of distinct domains among all evidence URLs.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Negatives'
        # misclassified if category is in ['False Positives', 'False Negatives']
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']       # list of dicts: [{'claim': ..., 'date': ...}, ...]
        artifacts_list = row['artifacts'] # parallel list of artifact lists, same length as claims
        
        for i, claim_info in enumerate(claims_list):
            # Artifacts for the i-th claim
            artifacts_for_this_claim = artifacts_list[i]
            
            # Count distinct domains in these artifacts
            distinct_domains_count = count_distinct_domains_for_claim(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'num_distinct_domains': distinct_domains_count,
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

from scipy.stats import pointbiserialr

def analyze_distinct_domains_correlation(df):
    """
    1) Flatten the data: one row per claim, counting how many distinct domains in evidence.
    2) Point-biserial correlation between num_distinct_domains (continuous) and is_misclassified (binary).
    """
    flattened_df = flatten_claims_for_distinct_domains(df)
    
    # Convert boolean to int for correlation
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # Numeric variable = num_distinct_domains, Binary variable = misclass_int
    corr, pval = pointbiserialr(flattened_df['misclass_int'], 
                                flattened_df['num_distinct_domains'])
    
    print("\nCorrelation Between Number of Distinct Domains and Misclassification")
    print("===================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpretation of sign
    if corr > 0:
        print("=> Positive correlation: more distinct domains → more misclassification.")
    elif corr < 0:
        print("=> Negative correlation: more distinct domains → less misclassification.")
    else:
        print("=> No linear relationship (r = 0).")

# Example usage:
analyze_distinct_domains_correlation(df)



Correlation Between Number of Distinct Domains and Misclassification
Point-biserial correlation (r): 0.0447
P-value: 3.1340e-01
=> Not statistically significant (p >= 0.05).
=> Positive correlation: more distinct domains → more misclassification.


In [None]:
from urllib.parse import urlparse

def get_domain(url):
    """Extract domain from a URL, removing 'www.' if present."""
    if not url:
        return None
    try:
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    except:
        return None

def gather_domain_categories_for_claim(artifacts_for_one_claim):
    """
    For each claim's artifacts (list of artifact dicts),
    each artifact has 'evidence': list of evidence dicts with 'page_url'.
    We'll gather distinct domain categories for all the evidence.
    """
    categories_used = set()
    
    for artifact in artifacts_for_one_claim:
        evidence_list = artifact.get('evidence', [])
        for e in evidence_list:
            url = e.get('page_url', '')
            domain = get_domain(url)
            if domain:
                cat = get_domain_category(domain)  # uses the mapping or heuristic
                categories_used.add(cat)
    
    return categories_used

def flatten_claims_for_domain_categories(df):
    """
    Flatten so each row = exactly one claim,
    storing a set of domain categories used by that claim.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Negatives', etc.
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']       # list of dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists
        
        for i, claim_info in enumerate(claims_list):
            artifacts_for_this_claim = artifacts_list[i]
            
            # Gather domain categories for this claim
            domain_cats = gather_domain_categories_for_claim(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'domain_categories': domain_cats,  # store as a set
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_domain_categories_separately(df):
    """
    1) Flatten data so each row = one claim, with a set of domain categories used.
    2) For each category, build a 2x2 table:
        - "Uses category" vs. "Does not use category"
        - "Misclassified" vs. "Correct"
       Run a chi-square test.
    3) Print results.
    
    NOTE: If you have many categories, consider multiple-comparison correction.
    """
    flattened_df = flatten_claims_for_domain_categories(df)
    
    # Get a list/set of all categories in the data
    all_categories = set()
    for cats in flattened_df['domain_categories']:
        all_categories.update(cats)
    
    all_categories = sorted(list(all_categories))  # for consistent ordering
    
    results = []
    
    for cat in all_categories:
        # We'll see how many claims use this cat vs. not
        # Then how many are misclassified vs. not
        
        # "Uses cat" AND "misclassified"
        uses_cat_misclass = 0
        # "Uses cat" AND "correct"
        uses_cat_correct = 0
        # "Does not use cat" AND "misclassified"
        no_cat_misclass = 0
        # "Does not use cat" AND "correct"
        no_cat_correct = 0
        
        for _, row in flattened_df.iterrows():
            uses_cat = (cat in row['domain_categories'])
            misclass = row['is_misclassified']
            
            if uses_cat and misclass:
                uses_cat_misclass += 1
            elif uses_cat and not misclass:
                uses_cat_correct += 1
            elif (not uses_cat) and misclass:
                no_cat_misclass += 1
            else:
                no_cat_correct += 1
        
        contingency = np.array([
            [uses_cat_correct, uses_cat_misclass],
            [no_cat_correct, no_cat_misclass]
        ])
        
        # If all zero, skip
        if contingency.sum() == 0:
            results.append((cat, 0, 1.0, "No data"))
            continue
        
        chi2, p, dof, expected = chi2_contingency(contingency)
        results.append((cat, chi2, p, "Ok"))
    
    # Print or store results
    print("Chi-Square Tests for Each Domain Category vs. Misclassification\n")
    for cat, chi2, p, status in results:
        print(f"Category: {cat}")
        print(f"  Chi2 = {chi2:.4f}, p = {p:.4e}, status = {status}")
        if p < 0.05:
            print("  => Statistically SIGNIFICANT (p < 0.05).")
        else:
            print("  => Not significant (p >= 0.05).")
        print("-" * 60)

    print("\nDone.\n")


In [67]:
import re

def claim_has_numeric_data(claim_text):
    """
    Return True if the claim text contains at least one digit (0-9).
    This is a simple proxy for 'quantitative data.'
    """
    # Simple regex to detect any digit
    pattern = re.compile(r'\d')
    if pattern.search(claim_text):
        return True
    return False

def flatten_claims_for_quant_data(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll detect if the claim text has numeric data (digits) 
    and whether it's misclassified.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Positives', etc.
        # True if misclassified, else False
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']  # list of dicts: [{'claim': '...', 'date': '...'}, ...]
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            has_quant_data = claim_has_numeric_data(claim_text)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'has_quant_data': has_quant_data,
                'claim': claim_text
            })
    
    return pd.DataFrame(flattened_rows)

import pandas as pd
from scipy.stats import chi2_contingency

def analyze_quant_data_chi2(df):
    """
    1) Flatten data, each row = one claim, includes 'has_quant_data' and 'is_misclassified'.
    2) Build a 2x2 contingency table: rows=has_quant_data, cols=is_misclassified.
    3) Perform chi-square.
    """
    flattened_df = flatten_claims_for_quant_data(df)
    
    contingency = pd.crosstab(flattened_df['has_quant_data'], 
                              flattened_df['is_misclassified'])
    
    # This should produce a 2×2 table if we have both True and False in each dimension
    print("\nContingency Table (rows = has_quant_data, cols = is_misclassified):")
    print(contingency)
    
    if contingency.shape != (2, 2):
        print("\nWarning: not a full 2x2 table (maybe all claims are one type).")
        return
    
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    
    print("\nChi-Square Test (Quantitative Data vs. Misclassification)")
    print("=========================================================")
    print(f"Chi2 statistic: {chi2:.4f}")
    print(f"P-value:        {p_value:.4e}")
    print(f"Degrees of Freedom: {dof}")
    print("Expected frequencies:")
    print(expected)
    
    if p_value < 0.05:
        print("\n=> Statistically SIGNIFICANT association (p < 0.05).")
    else:
        print("\n=> Not statistically significant (p >= 0.05).")
from scipy.stats import pointbiserialr

def analyze_quant_data_correlation(df):
    """
    Similar approach but uses point-biserial correlation for two binary variables.
    """
    flattened_df = flatten_claims_for_quant_data(df)
    
    # Convert booleans to int
    flattened_df['quant_data_int'] = flattened_df['has_quant_data'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, pval = pointbiserialr(flattened_df['misclass_int'], 
                                flattened_df['quant_data_int'])
    
    print("\nPoint-Biserial Correlation (Quantitative Data vs. Misclassification)")
    print("====================================================================")
    print(f"r = {corr:.4f}, p-value = {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: claims with numeric data are misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims with numeric data are misclassified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_quant_data_chi2(df)
analyze_quant_data_correlation(df)




Contingency Table (rows = has_quant_data, cols = is_misclassified):
is_misclassified  False  True 
has_quant_data                
False               293     32
True                167     18

Chi-Square Test (Quantitative Data vs. Misclassification)
Chi2 statistic: 0.0000
P-value:        1.0000e+00
Degrees of Freedom: 1
Expected frequencies:
[[293.1372549  31.8627451]
 [166.8627451  18.1372549]]

=> Not statistically significant (p >= 0.05).

Point-Biserial Correlation (Quantitative Data vs. Misclassification)
r = -0.0019, p-value = 9.6618e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: claims with numeric data are misclassified less often.


In [68]:
import pandas as pd
from textblob import TextBlob
from scipy.stats import pointbiserialr

def flatten_claims_for_sentiment(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim, compute a sentiment polarity score with TextBlob.
    We'll store:
      - sentiment_polarity (float in [-1, +1])
      - is_misclassified (bool)
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g. 'False Positives', 'True Negatives', etc.
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']  # e.g. [{'claim': 'Some text', 'date': ...}, ...]
        
        for claim_dict in claims_list:
            claim_text = claim_dict.get('claim', '')
            
            # Use TextBlob to get sentiment polarity
            blob = TextBlob(claim_text)
            polarity = blob.sentiment.polarity
            # Range: [-1.0, +1.0] (negative to positive)
            
            flattened_rows.append({
                'claim': claim_text,
                'is_misclassified': is_misclassified,
                'polarity': polarity
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_claim_polarity_correlation(df):
    """
    1) Flatten data so each row = a single claim with a sentiment polarity score.
    2) Compute point-biserial correlation between polarity (continuous) and misclassification (binary).
    """
    flattened_df = flatten_claims_for_sentiment(df)
    
    # Convert misclassification to 0/1
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # X = polarity, Y = binary misclass_int
    corr, pval = pointbiserialr(flattened_df['misclass_int'], flattened_df['polarity'])
    
    print("\nCorrelation Between Claim Polarity (TextBlob) and Misclassification")
    print("====================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: more positive claims more likely misclassified.")
    elif corr < 0:
        print("=> Negative correlation: more positive claims less likely misclassified.")
    else:
        print("=> Zero correlation: no linear relationship.")
    
    print("\nDone.")

# Usage:
analyze_claim_polarity_correlation(df)




Correlation Between Claim Polarity (TextBlob) and Misclassification
Point-biserial correlation (r): 0.1184
P-value: 7.4338e-03
=> Statistically SIGNIFICANT correlation (p < 0.05).
=> Positive correlation: more positive claims more likely misclassified.

Done.


In [72]:
import spacy
from spacy.lang.en import English
0,00743
# Load the spaCy English model (small). Adjust if you have a larger or custom model.
nlp = spacy.load("en_core_web_sm")

def get_named_entities_info(text):
    """
    Use spaCy to extract named entities from the text.
    Returns:
      - total_count: total # of entities found
      - type_counts: dict { 'PERSON': X, 'ORG': Y, 'GPE': Z, ... }
    """
    doc = nlp(text)
    total_count = 0
    type_counts = {}
    
    for ent in doc.ents:
        total_count += 1
        ent_label = ent.label_
        type_counts[ent_label] = type_counts.get(ent_label, 0) + 1
    
    return total_count, type_counts

def flatten_claims_for_ner(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim, extract named entity info (total count, type counts).
    We'll store:
      - total_ents
      - type_counts (dict)
      - is_misclassified
      - claim text
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']
        is_misclassified = (category in ['False Positives', 'False Negatives'])
        
        claims_list = row['claims']
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            
            total_count, type_counts = get_named_entities_info(claim_text)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'claim': claim_text,
                'total_ents': total_count,
                'type_counts': type_counts  # e.g., { 'PERSON': 2, 'ORG': 1, ... }
            })
    
    return pd.DataFrame(flattened_rows)

from scipy.stats import pointbiserialr

def analyze_total_entities_correlation(df):
    """
    1) Flatten the data: one row per claim, storing total_ents.
    2) Point-biserial correlation between total_ents (continuous) and is_misclassified (binary).
    """
    flattened_df = flatten_claims_for_ner(df)
    
    # Convert the boolean misclassification to int
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, pval = pointbiserialr(flattened_df['misclass_int'], flattened_df['total_ents'])
    
    print("\nCorrelation Between Total Named Entities and Misclassification")
    print("=============================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant.")
    
    if corr > 0:
        print("=> Positive correlation: claims with more named entities are misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims with more named entities are misclassified less often.")
    else:
        print("=> No relationship (r=0).")


import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

def analyze_entity_types_separately(df):
    """
    1) Flatten data, extracting a dict of type_counts.
    2) Collect all possible entity types from all claims.
    3) For each entity type, build a 2x2 table:
         - row 1 = has entity type, row 2 = does not have entity type
         - col 1 = correct, col 2 = misclassified
       Then run chi-square.
    """
    flattened_df = flatten_claims_for_ner(df)
    
    # Gather all entity types used in the dataset
    all_types = set()
    for type_counts in flattened_df['type_counts']:
        all_types.update(type_counts.keys())
    all_types = sorted(list(all_types))
    
    results = []
    
    for ent_type in all_types:
        # We'll do a 2x2 table
        # rows: [has ent_type, does not have ent_type]
        # cols: [correct, misclassified]
        
        has_type_correct = 0
        has_type_misclass = 0
        no_type_correct = 0
        no_type_misclass = 0
        
        for _, row in flattened_df.iterrows():
            is_mis = row['is_misclassified']
            tc = row['type_counts']
            
            # This claim has ent_type if type_counts[ent_type] > 0
            # (We'll use get(ent_type, 0) to avoid key error.)
            has_it = (tc.get(ent_type, 0) > 0)
            
            if has_it and is_mis:
                has_type_misclass += 1
            elif has_it and not is_mis:
                has_type_correct += 1
            elif not has_it and is_mis:
                no_type_misclass += 1
            else:
                no_type_correct += 1
        
        contingency = np.array([
            [has_type_correct, has_type_misclass],
            [no_type_correct, no_type_misclass]
        ])
        
        if contingency.sum() == 0:
            # No data? skip
            results.append((ent_type, 0, 1.0, "No data"))
            continue
        
        chi2, p, dof, expected = chi2_contingency(contingency)
        results.append((ent_type, chi2, p, "Ok"))
    
    print("Chi-Square Tests for Each Named Entity Type (Presence vs. Misclassification)\n")
    for ent_type, chi2, p, status in results:
        print(f"Entity Type: {ent_type}")
        print(f"  Chi2 = {chi2:.4f}, p = {p:.4e}, status = {status}")
        if p < 0.05:
            print("  => Statistically SIGNIFICANT.")
        else:
            print("  => Not significant.")
        print("-"*50)
    
    print("\nDone.")

analyze_total_entities_correlation(df)
analyze_entity_types_separately(df)


Correlation Between Total Named Entities and Misclassification
Point-biserial correlation (r): 0.0435
P-value: 3.2708e-01
=> Not statistically significant.
=> Positive correlation: claims with more named entities are misclassified more often.
Chi-Square Tests for Each Named Entity Type (Presence vs. Misclassification)

Entity Type: CARDINAL
  Chi2 = 2.1039, p = 1.4692e-01, status = Ok
  => Not significant.
--------------------------------------------------
Entity Type: DATE
  Chi2 = 3.6756, p = 5.5215e-02, status = Ok
  => Not significant.
--------------------------------------------------
Entity Type: EVENT
  Chi2 = 2.3291, p = 1.2697e-01, status = Ok
  => Not significant.
--------------------------------------------------
Entity Type: FAC
  Chi2 = 0.3114, p = 5.7680e-01, status = Ok
  => Not significant.
--------------------------------------------------
Entity Type: GPE
  Chi2 = 0.6434, p = 4.2249e-01, status = Ok
  => Not significant.
----------------------------------------------

In [75]:
import spacy
import numpy as np
from scipy.stats import pointbiserialr

nlp = spacy.load("en_core_web_md")  # use a model with vectors, e.g. 'md' or 'lg'

def average_question_similarity(questions):
    """
    Given a list of question strings, compute average pairwise similarity using spaCy vectors.
    Returns a single float: the mean similarity in [0,1].
    If there's only 0 or 1 question, return 1.0 or 0.0 or any suitable default.
    """
    if len(questions) <= 1:
        return 1.0  # or 0.0, depending on how you define "similarity" of a single question
    
    docs = [nlp(q) for q in questions]
    sims = []
    
    for i in range(len(docs)):
        for j in range(i+1, len(docs)):
            sim = docs[i].similarity(docs[j])  # spaCy doc.similarity
            sims.append(sim)
    
    if not sims:
        return 0.0
    
    return np.mean(sims)

def flatten_claims_for_question_similarity(df):
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']
        artifacts_list = row['artifacts']
        
        for i, claim_info in enumerate(claims_list):
            artifacts_for_this_claim = artifacts_list[i]
            
            # Gather the question texts
            questions = []
            for artifact in artifacts_for_this_claim:
                dq = artifact.get('decomposed_question', '')
                dq_str = str(dq).strip()
                if dq_str:
                    questions.append(dq_str)
            
            avg_sim = average_question_similarity(questions)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'avg_question_similarity': avg_sim,
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_question_similarity_correlation(df):
    flattened_df = flatten_claims_for_question_similarity(df)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, pval = pointbiserialr(flattened_df['misclass_int'], 
                                flattened_df['avg_question_similarity'])
    
    print("\nCorrelation: Avg Similarity of Decomposed Qs vs. Misclassification")
    print("==================================================================")
    print(f"r = {corr:.4f}, p = {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant.")
    
    if corr > 0:
        print("=> Positive correlation: more similar questions → more misclassification.")
    elif corr < 0:
        print("=> Negative correlation: more similar questions → less misclassification.")
    else:
        print("=> No linear relationship.")

analyze_question_similarity_correlation(df)


Correlation: Avg Similarity of Decomposed Qs vs. Misclassification
r = -0.0327, p = 4.6135e-01
=> Not statistically significant.
=> Negative correlation: more similar questions → less misclassification.


In [76]:
import spacy
from scipy.stats import pointbiserialr

# Load a spaCy model that has NER
nlp = spacy.load("en_core_web_sm")

def has_relative_temporal_reference(claim_text):
    """
    Returns True if the claim text contains a DATE entity
    that is likely a relative expression (no explicit digits).
    This is a simple heuristic approach.
    """
    doc = nlp(claim_text)
    for ent in doc.ents:
        if ent.label_ == "DATE":
            # Check if the entity text has digits or not
            # If no digits, we guess it might be something like 'last year', 'this Monday', etc.
            # Example heuristic: if there's no digit in ent.text, treat it as a relative time
            if not any(char.isdigit() for char in ent.text):
                return True
    return False

def flatten_claims_for_temporal_expressions(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll check if there's a relative temporal expression (heuristic).
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives'
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']  # list of dicts: [{'claim': '...', 'date': ...}, ...]
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            # Check for relative time reference
            has_relative = has_relative_temporal_reference(claim_text)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'claim': claim_text,
                'has_relative_temporal': has_relative
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_relative_temporal_correlation(df):
    """
    1) Flatten the data, checking each claim for 'has_relative_temporal' reference.
    2) Correlate presence/absence (binary) with misclassification.
    """
    flattened_df = flatten_claims_for_temporal_expressions(df)
    
    # Convert booleans to int
    flattened_df['relative_temp_int'] = flattened_df['has_relative_temporal'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # Point-biserial correlation for two binary variables is valid,
    # but often a simple chi-square on the 2×2 table is typical. We'll do both:

    # A) Chi-square
    import pandas as pd
    from scipy.stats import chi2_contingency
    
    contingency = pd.crosstab(flattened_df['has_relative_temporal'],
                              flattened_df['is_misclassified'])
    print("\nContingency Table (Relative Temp vs. Misclassification):")
    print(contingency)
    
    if contingency.shape == (2, 2):
        chi2, p_value, dof, expected = chi2_contingency(contingency)
        print("\nChi-Square Test:")
        print(f"  Chi2 = {chi2:.4f}, p = {p_value:.4e}, dof = {dof}")
        if p_value < 0.05:
            print("  => Statistically SIGNIFICANT association.")
        else:
            print("  => No significant association.")
    else:
        print("\nWarning: Not a 2×2 table. Possibly all claims are one category.")
    
    # B) Point-biserial correlation
    from scipy.stats import pointbiserialr
    corr, pval = pointbiserialr(flattened_df['misclass_int'],
                                flattened_df['relative_temp_int'])
    
    print("\nPoint-Biserial Correlation (Relative Temp vs. Misclassification)")
    print("=================================================================")
    print(f"r = {corr:.4f}, p = {pval:.4e}")
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant.")

# Example usage:
analyze_relative_temporal_correlation(df)



Contingency Table (Relative Temp vs. Misclassification):
is_misclassified       False  True 
has_relative_temporal              
False                    392     38
True                      68     12

Chi-Square Test:
  Chi2 = 2.2420, p = 1.3430e-01, dof = 1
  => No significant association.

Point-Biserial Correlation (Relative Temp vs. Misclassification)
r = 0.0754, p = 8.9072e-02
=> Not statistically significant.


In [79]:
import pandas as pd
from scipy.stats import chi2_contingency, pointbiserialr

RELATIVE_TIME_PHRASES = [
    "last year", "last month", "last week", 
    "next year", "next month", "next week", 
    "yesterday", "tomorrow", "today",
    "two days ago", "three days ago", "one week ago", 
    "this month", "this week", "this year",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"
    # Add more as needed
]

def has_manual_relative_time(claim_text):
    """
    Returns True if any phrase in RELATIVE_TIME_PHRASES 
    is found in the claim_text (case-insensitive).
    """
    text_lower = claim_text.lower()
    for phrase in RELATIVE_TIME_PHRASES:
        if phrase in text_lower:
            return True
    return False


def flatten_claims_for_relative_time(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll detect if the claim text has any 'relative time' phrase.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        # Determine if misclassified
        category = row['category']  # e.g. 'False Positives', 'True Negatives'
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']  # e.g. [{'claim': 'some text', 'date': ...}, ...]
        
        for claim_dict in claims_list:
            claim_text = claim_dict.get('claim', '')
            relative_time_present = has_manual_relative_time(claim_text)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'claim': claim_text,
                'has_relative_time': relative_time_present
            })
    
    return pd.DataFrame(flattened_rows)


def analyze_relative_time_correlation(df):
    """
    1) Flatten data so each row is one claim with has_relative_time = True/False.
    2) Perform both:
       - Chi-square test (2x2)
       - Point-biserial correlation (binary vs. binary)
    """
    flattened_df = flatten_claims_for_relative_time(df)
    
    # --- Chi-square approach ---
    contingency = pd.crosstab(flattened_df['has_relative_time'], 
                              flattened_df['is_misclassified'])
    
    print("\nContingency Table (rows = has_relative_time, cols = is_misclassified):")
    print(contingency)
    
    if contingency.shape == (2,2):
        chi2, p_value, dof, expected = chi2_contingency(contingency)
        print("\nChi-Square Test (Relative Time vs. Misclassification)")
        print("=====================================================")
        print(f"Chi2 Statistic: {chi2:.4f}")
        print(f"P-value:        {p_value:.4e}")
        print(f"Degrees of freedom: {dof}")
        print("Expected Frequencies:")
        print(expected)
        
        if p_value < 0.05:
            print("\n=> Statistically SIGNIFICANT association (p < 0.05).")
        else:
            print("\n=> No significant association (p >= 0.05).")
    else:
        print("\nWARNING: The data did not form a 2×2 table. Possibly all claims are in one category.")
    
    # --- Point-biserial correlation ---
    # Convert booleans to int
    flattened_df['rel_time_int'] = flattened_df['has_relative_time'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, pval = pointbiserialr(flattened_df['rel_time_int'], flattened_df['misclass_int'])
    print("\nPoint-Biserial Correlation (Relative Time vs. Misclassification)")
    print("================================================================")
    print(f"r = {corr:.4f}, p-value = {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation.")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: claims with relative time references tend to be misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims with relative time references tend to be misclassified less often.")
    else:
        print("=> No linear relationship (r = 0).")

# Example usage:
analyze_relative_time_correlation(df)



Contingency Table (rows = has_relative_time, cols = is_misclassified):
is_misclassified   False  True 
has_relative_time              
False                442     48
True                  18      2

Chi-Square Test (Relative Time vs. Misclassification)
Chi2 Statistic: 0.0000
P-value:        1.0000e+00
Degrees of freedom: 1
Expected Frequencies:
[[441.96078431  48.03921569]
 [ 18.03921569   1.96078431]]

=> No significant association (p >= 0.05).

Point-Biserial Correlation (Relative Time vs. Misclassification)
r = 0.0013, p-value = 9.7606e-01
=> Not statistically significant (p >= 0.05).
=> Positive correlation: claims with relative time references tend to be misclassified more often.


In [80]:
# Example hedge words list (expand as needed for your domain)
HEDGE_WORDS = [
    "might", "could", "possibly", "likely", "reportedly", "allegedly", 
    "suggests", "seems", "appear", "somewhat", "around", "about", "apparently", 
    "arguably", "allege", "presumably", "tentative", "hypothetical", "uncertain", 
    "questionable", "doubtful"
]

def contains_hedge_language(claim_text):
    """
    Returns True if any hedge word (case-insensitive) is found in the claim_text.
    Simple substring approach—may have false positives (e.g., 'likelyhood' contains 'likely').
    Could refine with regex word boundaries if needed.
    """
    text_lower = claim_text.lower()
    for hw in HEDGE_WORDS:
        if hw in text_lower:
            return True
    return False

import re

hedge_pattern = re.compile(r"\b(" + "|".join(map(re.escape, HEDGE_WORDS)) + r")\b", re.IGNORECASE)

def contains_hedge_language(claim_text):
    return bool(hedge_pattern.search(claim_text))
def flatten_for_hedge_words(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim, detect if it contains at least one hedge word.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g. 'False Positives', 'True Negatives', etc.
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']  # e.g., [{'claim': 'some text', 'date': ...}, ...]
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            # Check hedge words
            has_hedge = contains_hedge_language(claim_text)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'has_hedge_words': has_hedge,
                'claim': claim_text
            })
    
    return pd.DataFrame(flattened_rows)

import pandas as pd
from scipy.stats import chi2_contingency, pointbiserialr

import pandas as pd
from scipy.stats import chi2_contingency, pointbiserialr

def analyze_hedge_words_correlation(df):
    """
    1) Flatten data so each row is a single claim with has_hedge_words = True/False.
    2) Perform:
       - Chi-square test (2x2)
       - Point-biserial correlation (binary vs. binary)
    """
    flattened_df = flatten_for_hedge_words(df)
    
    # --- A) Chi-square on 2x2 table ---
    contingency = pd.crosstab(flattened_df['has_hedge_words'], 
                              flattened_df['is_misclassified'])
    
    print("\nContingency Table (Hedge Words vs. Misclassification):")
    print(contingency)
    
    if contingency.shape == (2,2):
        chi2, p_value, dof, expected = chi2_contingency(contingency)
        print("\nChi-Square Test (Hedge Words vs. Misclassification)")
        print("====================================================")
        print(f"Chi2 Statistic: {chi2:.4f}")
        print(f"P-value:        {p_value:.4e}")
        print(f"Degrees of freedom: {dof}")
        print("Expected Frequencies:")
        print(expected)
        
        if p_value < 0.05:
            print("=> Statistically SIGNIFICANT association (p < 0.05).")
        else:
            print("=> No significant association (p >= 0.05).")
    else:
        print("\nWARNING: Did not form a full 2×2 table. Possibly all claims are in one category.")
    
    # --- B) Point-biserial correlation for binary vs. binary ---
    flattened_df['hedge_int'] = flattened_df['has_hedge_words'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, pval = pointbiserialr(flattened_df['hedge_int'], flattened_df['misclass_int'])
    print("\nPoint-Biserial Correlation (Hedge Words vs. Misclassification)")
    print("================================================================")
    print(f"r = {corr:.4f}, p-value = {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: claims with hedge words are misclassified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims with hedge words are misclassified less often.")
    else:
        print("=> No relationship (r=0).")

# Finally, just run:
analyze_hedge_words_correlation(df)


Contingency Table (Hedge Words vs. Misclassification):
is_misclassified  False  True 
has_hedge_words               
False               396     44
True                 64      6

Chi-Square Test (Hedge Words vs. Misclassification)
Chi2 Statistic: 0.0246
P-value:        8.7527e-01
Degrees of freedom: 1
Expected Frequencies:
[[396.8627451  43.1372549]
 [ 63.1372549   6.8627451]]
=> No significant association (p >= 0.05).

Point-Biserial Correlation (Hedge Words vs. Misclassification)
r = -0.0165, p-value = 7.0956e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: claims with hedge words are misclassified less often.


In [81]:
import pandas as pd

def flatten_claims_subcategories_well_classified(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll store:
      - subcategory
      - is_well_classified (True if category in [True Positives, True Negatives])
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        # category: e.g. 'False Positives', 'True Positives', etc.
        cat = row['category']
        
        # is_well_classified = True for (True Positives, True Negatives)
        #                      False for (False Positives, False Negatives)
        is_well_classified = cat in ['True Positives', 'True Negatives']
        
        subcat = row['subcategory']  # e.g., 'Quotes', 'immigration', ...
        claims_list = row['claims']  # typically a list of dicts
        
        # Each original row can contain multiple claims. Flatten them:
        for claim_info in claims_list:
            flattened_rows.append({
                'subcategory': subcat,
                'is_well_classified': is_well_classified,
                # optional: store the claim text if needed
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

from scipy.stats import chi2_contingency
import numpy as np
import math

def analyze_well_classified_subcategories_r_by_2(df):
    """
    1) Flatten data so each row is one claim with 'subcategory' and 'is_well_classified'.
    2) Build an r×2 contingency table:
       - Rows = subcategories
       - Columns = [False, True] (not well-classified vs well-classified)
    3) Run a chi-square test + compute Cramér’s V.
    """
    flattened_df = flatten_claims_subcategories_well_classified(df)
    
    # Build the contingency table using crosstab
    contingency = pd.crosstab(flattened_df['subcategory'], 
                              flattened_df['is_well_classified'])
    
    # Rename columns just for clarity
    # If subcategories appear in rows, columns might be [False, True]
    # We'll rename them to: ['Not Well', 'Well'] for printing clarity
    # Only do this if both columns exist:
    columns_in_table = list(contingency.columns)
    if len(columns_in_table) == 2:
        new_col_names = {False: 'Not Well', True: 'Well'}
        contingency.rename(columns=new_col_names, inplace=True)
    
    print("\nContingency Table (Subcategory vs. Well-Classified):")
    print("-----------------------------------------------------")
    print(contingency)
    
    total_counts = contingency.values.sum()
    print(f"\nTotal claims in table: {total_counts}")
    
    # Filter out subcategories with zero total
    row_sums = contingency.sum(axis=1)
    keep_mask = (row_sums > 0)
    contingency = contingency.loc[keep_mask]
    
    if len(contingency) < 2 or contingency.shape[1] < 2:
        print("\nNot enough data to run a valid chi-square (need at least 2 rows & 2 columns).")
        return
    
    # Chi-square
    chi2, p_value, dof, expected = chi2_contingency(contingency)
    
    print("\nChi-Square Test for Subcategory vs. Well Classification")
    print("========================================================")
    print(f"Chi-square statistic: {chi2:.4f}")
    print(f"P-value:             {p_value:.4e}")
    print(f"Degrees of freedom:  {dof}")
    
    print("\nExpected Frequencies:")
    print(expected)
    
    if p_value < 0.05:
        print("=> Statistically SIGNIFICANT association (p < 0.05).")
    else:
        print("=> No statistically significant association (p >= 0.05).")
    
    # Cramér’s V
    r, c = contingency.shape
    min_dim = min(r, c)
    cramer_v = math.sqrt(chi2 / (total_counts * (min_dim - 1)))
    print(f"\nCramér's V = {cramer_v:.4f} (Effect size)")

def analyze_well_classified_subcategories_individual(df):
    """
    For each unique subcategory:
      1) Build a 2x2 table:
         - Row 1 = subcategory == X, Row 2 = subcategory != X
         - Col 1 = not well classified, Col 2 = well classified
      2) Run chi-square for each.
      3) Print results.
      
    NOTE: This can produce multiple p-values. Consider multiple-comparison corrections.
    """
    flattened_df = flatten_claims_subcategories_well_classified(df)
    unique_subcats = flattened_df['subcategory'].unique()
    
    results = []
    
    for sc in unique_subcats:
        # For this subcat, we see if each claim is in sc or not
        # Then compare well_classified or not
        in_sc_well = 0
        in_sc_not_well = 0
        not_sc_well = 0
        not_sc_not_well = 0
        
        for _, row in flattened_df.iterrows():
            is_sc = (row['subcategory'] == sc)
            is_well = row['is_well_classified']
            
            if is_sc and is_well:
                in_sc_well += 1
            elif is_sc and not is_well:
                in_sc_not_well += 1
            elif not is_sc and is_well:
                not_sc_well += 1
            else:
                not_sc_not_well += 1
        
        contingency = pd.DataFrame([
            [in_sc_not_well, in_sc_well],
            [not_sc_not_well, not_sc_well]
        ], columns=['Not Well', 'Well'], index=[f"Subcat={sc}", f"Subcat!= {sc}"])
        
        # If everything is zero, skip
        if contingency.values.sum() == 0:
            results.append((sc, 0, 1.0, "No Data"))
            continue
        
        chi2, p, dof, expected = chi2_contingency(contingency)
        results.append((sc, chi2, p, "Ok"))
    
    # Print or store
    print("Chi-Square Tests for Each Subcategory (Well-Classified vs. Others)\n")
    for (subcat, chi2, pval, status) in results:
        print(f"Subcategory: {subcat}")
        print(f"  Chi2 = {chi2:.4f}, p = {pval:.4e}, status = {status}")
        if pval < 0.05:
            print("  => Statistically SIGNIFICANT (p < 0.05).")
        else:
            print("  => Not significant (p >= 0.05).")
        print("-"*60)
    
    print("\nNOTE: If you're doing many subcategories, consider multiple-testing corrections.\n")

analyze_well_classified_subcategories_individual(df)
analyze_well_classified_subcategories_individual(df)

Chi-Square Tests for Each Subcategory (Well-Classified vs. Others)

Subcategory: Other
  Chi2 = 2.3115, p = 1.2842e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Subcategory: Politics
  Chi2 = 0.3506, p = 5.5377e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Subcategory: Immigration
  Chi2 = 0.0000, p = 1.0000e+00, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Subcategory: Quotes
  Chi2 = 4.4220, p = 3.5478e-02, status = Ok
  => Statistically SIGNIFICANT (p < 0.05).
------------------------------------------------------------
Subcategory: Politicians
  Chi2 = 0.6200, p = 4.3106e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Subcategory: Ballot Box
  Chi2 = 0.0000, p = 1.0000e+00, status = Ok
  => Not significant (p >= 0.05).
---------

In [82]:
def sum_explanation_length(artifacts_for_one_claim):
    """
    Sum the length of 'decomposed_question_explanation' across all artifacts
    for a single claim.
    
    Each artifact dict might look like:
      {
        'decomposed_question': ...,
        'decomposed_justification': ...,
        'decomposed_question_explanation': ...,
        'evidence': ...
      }
    """
    total_len = 0
    for artifact in artifacts_for_one_claim:
        explanation_text = artifact.get('decomposed_question_explanation', '')
        total_len += len(str(explanation_text))  # Convert to string in case it's None
    return total_len

def flatten_claims_for_explanation_length_well_classified(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim, compute the sum of explanation lengths from artifacts.
    We'll store:
      - explanation_length (int)
      - is_well_classified (bool)
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g. 'False Positives', 'True Positives', etc.
        
        # is_well_classified = True for (True Positives, True Negatives)
        #                     = False for (False Positives, False Negatives)
        is_well_classified = category in ['True Positives', 'True Negatives']
        
        claims_list = row['claims']       # list of claim dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists (same length)
        
        # Process each claim in this row
        for i, claim_info in enumerate(claims_list):
            # Artifacts for the i-th claim
            artifacts_for_this_claim = artifacts_list[i]
            
            # Sum explanation lengths
            explanation_sum = sum_explanation_length(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_well_classified': is_well_classified,
                'explanation_length': explanation_sum,
                # optionally store the claim text or other fields
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

from scipy.stats import pointbiserialr

def analyze_explanation_length_well_classified_correlation(df):
    """
    1) Flatten the data so each row = one claim, summing the length of explanations.
    2) Run a point-biserial correlation between explanation_length (continuous)
       and is_well_classified (binary).
    """
    flattened_df = flatten_claims_for_explanation_length_well_classified(df)
    
    # Convert boolean to int for correlation: 1 = well-classified, 0 = not well-classified
    flattened_df['well_class_int'] = flattened_df['is_well_classified'].astype(int)
    
    # Our numeric variable = explanation_length
    # Our binary variable = well_class_int
    corr, pval = pointbiserialr(flattened_df['well_class_int'], 
                                flattened_df['explanation_length'])
    
    print("\nCorrelation Between Explanation Length and Being Well-Classified")
    print("=================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpret the sign of corr
    if corr > 0:
        print("=> Positive correlation: longer explanations tend to be well-classified more often.")
    elif corr < 0:
        print("=> Negative correlation: longer explanations tend to be well-classified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_explanation_length_well_classified_correlation(df)



Correlation Between Explanation Length and Being Well-Classified
Point-biserial correlation (r): 0.0927
P-value: 3.6382e-02
=> Statistically SIGNIFICANT correlation (p < 0.05).
=> Positive correlation: longer explanations tend to be well-classified more often.


In [86]:
import pandas as pd
from textblob import TextBlob
from scipy.stats import pointbiserialr

def flatten_claims_for_sentiment(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim, compute a sentiment polarity score with TextBlob.
    We'll store:
      - sentiment_polarity (float in [-1, +1])
      - is_misclassified (bool)
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g. 'False Positives', 'True Negatives', etc.
        is_misclassified = category in ['True Positives', 'True Negatives']
        
        claims_list = row['claims']  # e.g. [{'claim': 'Some text', 'date': ...}, ...]
        
        for claim_dict in claims_list:
            claim_text = claim_dict.get('claim', '')
            
            # Use TextBlob to get sentiment polarity
            blob = TextBlob(claim_text)
            polarity = blob.sentiment.polarity
            # Range: [-1.0, +1.0] (negative to positive)
            
            flattened_rows.append({
                'claim': claim_text,
                'is_misclassified': is_misclassified,
                'polarity': polarity
            })
    
    return pd.DataFrame(flattened_rows)

def analyze_claim_polarity_correlation(df):
    """
    1) Flatten data so each row = a single claim with a sentiment polarity score.
    2) Compute point-biserial correlation between polarity (continuous) and misclassification (binary).
    """
    flattened_df = flatten_claims_for_sentiment(df)
    
    # Convert misclassification to 0/1
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    # X = polarity, Y = binary misclass_int
    corr, pval = pointbiserialr(flattened_df['misclass_int'], flattened_df['polarity'])
    
    print("\nCorrelation Between Claim Polarity (TextBlob) and Misclassification")
    print("====================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    if corr > 0:
        print("=> Positive correlation: more positive claims more likely well classified.")
    elif corr < 0:
        print("=> Negative correlation: more positive claims less likely well classified.")
    else:
        print("=> Zero correlation: no linear relationship.")
    
    print("\nDone.")

# Usage:
analyze_claim_polarity_correlation(df)




Correlation Between Claim Polarity (TextBlob) and Misclassification
Point-biserial correlation (r): -0.1184
P-value: 7.4338e-03
=> Statistically SIGNIFICANT correlation (p < 0.05).
=> Negative correlation: more positive claims less likely well classified.

Done.


In [87]:
def sum_page_summaries(evidences):
    """
    Sum the length of 'page_summary' fields from a list of evidence dicts.
    Each evidence dict might look like:
      {
        'page_url': ...,
        'page_content': ...,
        'page_summary': ...,
        'page_timestamp': ...
      }
    """
    total_len = 0
    for e in evidences:
        summary_text = e.get('page_summary', '')
        total_len += len(str(summary_text))
    return total_len

def sum_page_summaries_for_claim(artifacts_for_one_claim):
    """
    For a single claim's artifacts (list of artifact dicts),
    each artifact has an 'evidence' list of evidence dicts.
    We sum the lengths of 'page_summary' across ALL evidence.
    """
    total_length = 0
    for artifact in artifacts_for_one_claim:
        evidence_list = artifact.get('evidence', [])
        total_length += sum_page_summaries(evidence_list)
    return total_length

def flatten_claims_for_summary_length_well_classified(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim, we sum the total length of 'page_summary' fields
    from all evidences, and note whether the claim is well-classified
    (True Positives or True Negatives).
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'True Positives', 'False Negatives', etc.
        # is_well_classified = True for True Positives or True Negatives
        is_well_classified = category in ['True Positives', 'True Negatives']
        
        claims_list = row['claims']       # list of claim dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists
        
        for i, claim_info in enumerate(claims_list):
            artifacts_for_this_claim = artifacts_list[i]  # list of artifact dicts
            
            # Sum page_summary lengths across all evidence
            total_page_summary = sum_page_summaries_for_claim(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_well_classified': is_well_classified,
                'total_page_summary_length': total_page_summary,
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)
from scipy.stats import pointbiserialr

def analyze_page_summary_well_classified_correlation(df):
    """
    1) Flatten data so each row = one claim, summing page_summary length of all evidences.
    2) Use point-biserial correlation between total_page_summary_length (continuous)
       and is_well_classified (binary).
    """
    flattened_df = flatten_claims_for_summary_length_well_classified(df)
    
    # Convert boolean to int: 1 = well-classified, 0 = not well-classified
    flattened_df['well_class_int'] = flattened_df['is_well_classified'].astype(int)
    
    # Our numeric variable = total_page_summary_length
    # Our binary variable = well_class_int
    corr, pval = pointbiserialr(flattened_df['total_page_summary_length'], 
                                flattened_df['well_class_int'])
    
    print("\nCorrelation Between Page Summary Length and Good Classification")
    print("================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value: {pval:.4e}")
    
    # Significance
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpret sign
    if corr > 0:
        print("=> Positive correlation: claims with longer summaries are more often well-classified.")
    elif corr < 0:
        print("=> Negative correlation: claims with longer summaries are less often well-classified.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_page_summary_well_classified_correlation(df)



Correlation Between Page Summary Length and Good Classification
Point-biserial correlation (r): -0.0416
P-value: 3.4840e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: claims with longer summaries are less often well-classified.


In [88]:
import csv

def save_domains_csv(domains, filename='domains.csv'):
    unique_domains = sorted(set(domains))
    
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['domain'])  # Header
        for domain in unique_domains:
            writer.writerow([domain])
            
domains = []
for _, row in df.iterrows():
        for artifacts in row['artifacts']:
            for artifact in artifacts:
                evidence = artifact.get('evidence', [])
                for e in evidence:
                    url = e.get('page_url')
                    if url:
                        domain = extract_domain(url)
                        domains.append(domain)

save_domains_csv(domains)

In [91]:
def sum_page_content(evidences):
    """
    Sum the length of 'page_content' fields from a list of evidence dicts.
    Each evidence dict might look like:
      {
        'page_url': ...,
        'page_content': ...,
        'page_summary': ...,
        'page_timestamp': ...
      }
    """
    total_len = 0
    for e in evidences:
        content_text = e.get('page_content', '')
        total_len += len(str(content_text))  # Handle None or non-string
    return total_len

def sum_page_content_for_claim(artifacts_for_one_claim):
    """
    For a single claim's artifacts (list of artifact dicts),
    each artifact has an 'evidence' list of evidence dicts.
    We'll sum all 'page_content' lengths across ALL evidence.
    """
    total_length = 0
    for artifact in artifacts_for_one_claim:
        evidence_list = artifact.get('evidence', [])
        total_length += sum_page_content(evidence_list)
    return total_length

def flatten_claims_for_page_content_well_classified(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim, sum total 'page_content' length of all evidences,
    and label is_well_classified if category ∈ {True Positives, True Negatives}.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'True Positives', 'False Negatives', etc.
        
        # is_well_classified = True for True Positives or True Negatives
        is_well_classified = category in ['True Positives', 'True Negatives']
        
        claims_list = row['claims']       # list of claim dicts
        artifacts_list = row['artifacts'] # parallel list of artifact lists
        
        for i, claim_info in enumerate(claims_list):
            artifacts_for_this_claim = artifacts_list[i]
            
            # Sum page_content length across all evidence in these artifacts
            total_page_content = sum_page_content_for_claim(artifacts_for_this_claim)
            
            flattened_rows.append({
                'is_well_classified': is_well_classified,
                'total_page_content_length': total_page_content,
                'claim': claim_info.get('claim', '')
            })
    
    return pd.DataFrame(flattened_rows)

from scipy.stats import pointbiserialr

def analyze_page_content_well_classified_correlation(df):
    """
    1) Flatten data so each row = one claim with sum of all 'page_content' lengths.
    2) Use a point-biserial correlation between total_page_content_length (continuous)
       and is_well_classified (binary).
    """
    flattened_df = flatten_claims_for_page_content_well_classified(df)
    
    # Convert boolean to int: 1 = well-classified, 0 = not well-classified
    flattened_df['well_class_int'] = flattened_df['is_well_classified'].astype(int)
    
    # Numeric variable = total_page_content_length
    # Binary variable = well_class_int
    corr, pval = pointbiserialr(flattened_df['total_page_content_length'], 
                                flattened_df['well_class_int'])
    
    print("\nCorrelation Between Page Content Length and Good Classification")
    print("================================================================")
    print(f"Point-biserial correlation (r): {corr:.4f}")
    print(f"P-value:                        {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpret correlation sign
    if corr > 0:
        print("=> Positive correlation: claims with more page content tend to be well-classified more often.")
    elif corr < 0:
        print("=> Negative correlation: claims with more page content tend to be well-classified less often.")
    else:
        print("=> Zero correlation: no relationship.")

# Example usage:
analyze_page_content_well_classified_correlation(df)



Correlation Between Page Content Length and Good Classification
Point-biserial correlation (r): -0.0593
P-value:                        1.8094e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: claims with more page content tend to be well-classified less often.


In [5]:
import re

# A simple regex that matches any common quote-like characters: " ' “ ”
quote_pattern = re.compile(r'[\"“”\']')

def has_quotes_in_text(claim_text):
    """
    Returns True if the text contains at least one quotation mark
    (single or double quotes, including fancy quotes).
    """
    return bool(quote_pattern.search(claim_text))

def flatten_claims_for_quotes(df):
    """
    Flatten the DataFrame so each row = exactly one claim.
    We'll detect if the claim text has quotes and if it's misclassified.
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        category = row['category']  # e.g., 'False Positives', 'True Positives', etc.
        is_misclassified = category in ['False Positives', 'False Negatives']
        
        claims_list = row['claims']  # typically a list of dicts: [{'claim': '...', 'date': ...}, ...]
        
        for claim_info in claims_list:
            claim_text = claim_info.get('claim', '')
            # Check presence of quotes
            has_quotes = has_quotes_in_text(claim_text)
            
            flattened_rows.append({
                'is_misclassified': is_misclassified,
                'claim': claim_text,
                'has_quotes': has_quotes
            })
    
    return pd.DataFrame(flattened_rows)

import pandas as pd
from scipy.stats import chi2_contingency

def analyze_quotes_misclass_chi2(df):
    """
    1) Flatten the data into one claim per row, with `has_quotes` and `is_misclassified`.
    2) Build a 2x2 contingency table and run a chi-square test.
    """
    flattened_df = flatten_claims_for_quotes(df)
    
    contingency = pd.crosstab(flattened_df['has_quotes'],
                              flattened_df['is_misclassified'])
    
    print("\nContingency Table (rows = has_quotes, cols = is_misclassified):")
    print(contingency)
    
    # If shape != (2,2), it means there's no variation in the data (e.g., all claims have no quotes).
    if contingency.shape == (2,2):
        chi2, p_value, dof, expected = chi2_contingency(contingency)
        print("\nChi-Square Test (Quotes vs. Misclassification)")
        print("=================================================")
        print(f"Chi2 Statistic: {chi2:.4f}")
        print(f"P-value:        {p_value:.4e}")
        print(f"Degrees of Freedom: {dof}")
        print("Expected Frequencies:")
        print(expected)
        
        if p_value < 0.05:
            print("=> Statistically SIGNIFICANT association (p < 0.05).")
        else:
            print("=> No statistically significant association (p >= 0.05).")
    else:
        print("\nWARNING: Did not form a 2x2 table. Possibly all are True or all are False for quotes.")

from scipy.stats import pointbiserialr

def analyze_quotes_misclass_correlation(df):
    """
    1) Flatten data (has_quotes, is_misclassified).
    2) Convert booleans to 0/1 and compute point-biserial correlation.
    """
    flattened_df = flatten_claims_for_quotes(df)
    
    # Convert booleans to int
    flattened_df['quotes_int'] = flattened_df['has_quotes'].astype(int)
    flattened_df['misclass_int'] = flattened_df['is_misclassified'].astype(int)
    
    corr, pval = pointbiserialr(flattened_df['quotes_int'],
                                flattened_df['misclass_int'])
    
    print("\nPoint-Biserial Correlation (Quotes vs. Misclassification)")
    print("==========================================================")
    print(f"r = {corr:.4f}, p-value = {pval:.4e}")
    
    if pval < 0.05:
        print("=> Statistically SIGNIFICANT correlation (p < 0.05).")
    else:
        print("=> Not statistically significant (p >= 0.05).")
    
    # Interpretation of sign
    if corr > 0:
        print("=> Positive correlation: claims with quotes more likely to be misclassified.")
    elif corr < 0:
        print("=> Negative correlation: claims with quotes less likely to be misclassified.")
    else:
        print("=> r=0 → No linear relationship.")

# Usage examples:
analyze_quotes_misclass_chi2(df)
analyze_quotes_misclass_correlation(df)




Contingency Table (rows = has_quotes, cols = is_misclassified):
is_misclassified  False  True 
has_quotes                    
False                16      4
True                444     46

Chi-Square Test (Quotes vs. Misclassification)
Chi2 Statistic: 1.3943
P-value:        2.3768e-01
Degrees of Freedom: 1
Expected Frequencies:
[[ 18.03921569   1.96078431]
 [441.96078431  48.03921569]]
=> No statistically significant association (p >= 0.05).

Point-Biserial Correlation (Quotes vs. Misclassification)
r = -0.0693, p-value = 1.1819e-01
=> Not statistically significant (p >= 0.05).
=> Negative correlation: claims with quotes less likely to be misclassified.


In [9]:
import os
import openai
import pandas as pd
import os
openai.api_key = os.getenv("OPENAI_API_KEY")  # If stored in an environment variable

def categorize_domain(domain):
    """
    Calls the OpenAI API to classify the domain into a broad category.
    Returns a string category label.
    """
    system_instructions = """
    You are an expert at categorizing website domains. Your goal is to classify domains into the most appropriate category.
    
    Initial suggested categories include:
    - Government (gov sites, public services)
    - News & Media (news outlets, magazines)
    - Social Media (social networks, community platforms)
    - E-commerce (online stores, marketplaces)
    - Education (universities, online learning)
    - Technology (tech companies, software)
    - Entertainment (streaming, games, movies)
    - Business (company websites, B2B services)
    - Finance (banking, investment, crypto)
    - Healthcare (medical, wellness)
    - Sports (teams, leagues, fitness)
    - Reference (wikis, databases, research)
    - Forums & Communities
    - Blogs & Personal
    - Non-Profit & Advocacy
    
    However, if you identify a more accurate or specific category that better describes the domain's primary purpose, 
    you should use that instead. Always prioritize accuracy and specificity over using the suggested categories.
    
    Return ONLY the category name, nothing else. Be specific but concise (1-3 words maximum).
    """

    user_prompt = f"Categorize this domain based on its likely primary purpose: {domain}"

    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.0,  # for more deterministic output
            max_tokens=50
        )

        # Extract the assistant's reply
        reply_content = response.choices[0].message.content.strip()
        print(f'Domain: {domain}\n Category: {reply_content}')
        return reply_content
    
    except Exception as e:
        print(f"Error categorizing domain {domain}: {e}")
        return "Error"

def categorize_domains_csv(input_csv, output_csv):
    # 1) Load data
    df = pd.read_csv(input_csv)

    # Ensure there's a 'domain' column
    if 'domain' not in df.columns:
        raise ValueError("CSV must have a 'domain' column")

    # 2) Create a new column 'category'
    categories = []
    total_domains = len(df)
    
    for index, domain in enumerate(df['domain'], 1):
        cat_label = categorize_domain(domain)
        categories.append(cat_label)
        
        # Progress counter
        remaining = total_domains - index
        print(f"Processed {index}/{total_domains} domains. {remaining} remaining...", end='\r')

    # 3) Add this list to the DataFrame
    df['category'] = categories

    # 4) Save to new CSV
    df.to_csv(output_csv, index=False)
    print(f"Categorized domains saved to {output_csv}.")

categorize_domains_csv('domains.csv', 'url_categorization.csv')

Domain: nan
 Category: I'm sorry, but I need the domain name in order to categorize it accurately. Could you please provide the domain name you would like me to categorize?
Domain: 10news.comomains. 1450 remaining...
 Category: News & Media
Domain: 12160.infoomains. 1449 remaining...
 Category: News & Media
Domain: 1news.co.nzmains. 1448 remaining...
 Category: News & Media
Domain: 2016election.procon.orgremaining...
 Category: Reference
Domain: 6abc.com domains. 1446 remaining...
 Category: News & Media
Domain: abajournal.comns. 1445 remaining...
 Category: News & Media
Domain: abc.net.auomains. 1444 remaining...
 Category: News & Media
Domain: abc11.comdomains. 1443 remaining...
 Category: News & Media
Domain: abc13.comdomains. 1442 remaining...
 Category: News & Media
Domain: abc15.com domains. 1441 remaining...
 Category: News & Media
Domain: abc3340.comomains. 1440 remaining...
 Category: News & Media
Domain: abc6onyourside.com 1439 remaining...
 Category: News & Media
Domain: abc

In [16]:
def load_domain_category_map(categorization_csv):
    """
    Reads 'url_categorization.csv' which has columns: domain, category
    Returns a dict: { 'nytimes.com': 'News', 'facebook.com': 'Social Media', ... }
    """
    cat_df = pd.read_csv(categorization_csv)
    # Ensure columns exist
    if 'domain' not in cat_df.columns or 'category' not in cat_df.columns:
        raise ValueError("url_categorization.csv must have columns [domain, category]")
    
    # Build a dictionary
    domain_to_cat = dict(zip(cat_df['domain'], cat_df['category']))
    return domain_to_cat

def extract_domain(url):
    """
    Extract netloc from the URL, remove 'www.' if present.
    """
    if not url or pd.isna(url):
        return None
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain

def flatten_claims_for_domain_categories(df, domain_map):
    """
    Flatten the DataFrame so each row = exactly one claim.
    For each claim:
      1. Collect all domains from the artifacts' evidence.
      2. Map each domain to a category (via 'domain_map').
      3. We'll store a set of categories used by that claim.
      4. We'll label 'is_misclassified' = True if category ∈ {False Positives, False Negatives}.
    Returns a new DataFrame with columns:
      - 'is_misclassified' (bool)
      - 'categories_used' (set of strings)
    """
    flattened_rows = []
    
    for _, row in df.iterrows():
        cat = row['category']  # e.g. 'False Positives', 'True Positives', etc.
        is_misclassified = cat in ['False Positives', 'False Negatives']
        
        # row['artifacts'] is a list of artifact lists, each artifact has 'evidence'
        claims_list = row['claims']
        artifacts_list = row['artifacts']
        
        # We'll collect all domain categories for this claim
        claim_categories = set()
        
        # Iterate each claim in 'claims_list'
        for i, _ in enumerate(claims_list):
            # Artifacts for the i-th claim
            artifact_list = artifacts_list[i]
            for artifact in artifact_list:
                evidence_list = artifact.get('evidence', [])
                for e in evidence_list:
                    url = e.get('page_url', '')
                    domain = extract_domain(url)
                    if domain:
                        # Look up its category
                        cat_label = domain_map.get(domain, "Other")  # default to 'Other' if not in map
                        claim_categories.add(cat_label)
        
        flattened_rows.append({
            'is_misclassified': is_misclassified,
            'categories_used': claim_categories
        })
    
    return pd.DataFrame(flattened_rows)

def get_all_categories(domain_map):
    """
    domain_map is like { 'nytimes.com': 'News', 'cnn.com': 'News', 'facebook.com': 'Social Media', ... }
    Return the unique set of category values.
    """
    return set(domain_map.values())

def analyze_url_categories_misclassification(flattened_df, categories):
    results = []
    
    for cat in categories:
        has_cat_misclass = 0
        has_cat_correct = 0
        no_cat_misclass = 0
        no_cat_correct = 0
        
        for _, row in flattened_df.iterrows():
            miscl = row['is_misclassified']
            cat_set = row['categories_used']
            
            if cat in cat_set:
                if miscl:
                    has_cat_misclass += 1
                else:
                    has_cat_correct += 1
            else:
                if miscl:
                    no_cat_misclass += 1
                else:
                    no_cat_correct += 1
        
        # Build the 2x2 table
        contingency = np.array([
            [has_cat_misclass,  has_cat_correct ],
            [no_cat_misclass,   no_cat_correct  ]
        ])
        
        # Check if the entire table is empty
        if contingency.sum() == 0:
            # e.g., no claims at all for this category
            results.append((cat, 0, 1.0, "No Data (all zero)"))
            continue
        
        # --- FIX: Check row/column sums to avoid the "zero element at (0,0)" error
        row_sums = contingency.sum(axis=1)
        col_sums = contingency.sum(axis=0)
        if (row_sums[0] == 0 or row_sums[1] == 0 or
            col_sums[0] == 0 or col_sums[1] == 0):
            # This means at least one row or column is completely zero
            # leading to an expected frequency of zero in chi-square
            results.append((cat, 0, 1.0, "Skipped (row or column sum=0)"))
            continue
        
        # Now safe to run chi-square
        chi2, p, dof, expected = chi2_contingency(contingency)
        results.append((cat, chi2, p, "Ok"))
    
    print("Chi-Square Tests for URL Categories vs. Misclassification\n")
    for cat, chi2, pval, status in results:
        print(f"Category: {cat}")
        print(f"  Chi2 = {chi2:.4f}, p = {pval:.4e}, status = {status}")
        if pval < 0.05 and status == "Ok":
            print("  => Statistically SIGNIFICANT (p < 0.05).")
        elif status == "Ok":
            print("  => Not significant (p >= 0.05).")
        else:
            print("  =>", status)
        print("-"*60)
    
    print("\nDone.")


def run_category_misclass_analysis(df, url_categorization_csv):
    # 1. Load domain->category map
    domain_map = load_domain_category_map(url_categorization_csv)
    # 2. Flatten claims
    flattened_df = flatten_claims_for_domain_categories(df, domain_map)
    # 3. Gather all unique categories from domain_map + "Other"
    unique_cats = set(domain_map.values())
    unique_cats.add("Other")  # in case some domains were not found
    analyze_url_categories_misclassification(flattened_df, unique_cats)

run_category_misclass_analysis(df, "url_categorization.csv")

Chi-Square Tests for URL Categories vs. Misclassification

Category: Conspiracy Theory
  Chi2 = 0.0000, p = 9.9660e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Category: Religion & Spirituality
  Chi2 = 0.7962, p = 3.7223e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Category: Science News
  Chi2 = 1.7951, p = 1.8031e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Category: Medical Research
  Chi2 = 0.7962, p = 3.7223e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Category: Legal Journals
  Chi2 = 0.9880, p = 3.2023e-01, status = Ok
  => Not significant (p >= 0.05).
------------------------------------------------------------
Category: Satire News
  Chi2 = 0.9880, p = 3.2023e-01, status = Ok
  => Not significant (p >= 0.05).
---

In [17]:
import numpy as np
from scipy.stats import chi2_contingency

def analyze_url_categories_misclassification_direction(flattened_df, categories):
    """
    For each category in 'categories':
      1. Build a 2×2 table:
            Rows = (has_this_category, does_not_have_this_category)
            Columns = (misclassified=True, misclassified=False)
      2. Run chi-square to see if there's a significant association.
      3. If significant, compare misclassification rates to determine the direction
         (positive vs. negative correlation).
    """
    results = []
    
    for cat in categories:
        has_cat_misclass = 0
        has_cat_correct = 0
        no_cat_misclass = 0
        no_cat_correct = 0
        
        for _, row in flattened_df.iterrows():
            miscl = row['is_misclassified']
            cat_set = row['categories_used']
            
            if cat in cat_set:
                if miscl:
                    has_cat_misclass += 1
                else:
                    has_cat_correct += 1
            else:
                if miscl:
                    no_cat_misclass += 1
                else:
                    no_cat_correct += 1
        
        # Build the 2×2 table
        contingency = np.array([
            [has_cat_misclass,  has_cat_correct ],
            [no_cat_misclass,   no_cat_correct  ]
        ])
        
        # If there's no data at all, skip
        if contingency.sum() == 0:
            results.append((cat, 0.0, 1.0, "NoData", "No Data"))
            continue
        
        # Check row/column sums to avoid zero-sum rows/columns
        row_sums = contingency.sum(axis=1)
        col_sums = contingency.sum(axis=0)
        if (row_sums[0] == 0 or row_sums[1] == 0 or
            col_sums[0] == 0 or col_sums[1] == 0):
            results.append((cat, 0.0, 1.0, "Skipped", "Row/Col=0"))
            continue
        
        # Chi-square
        chi2, p, dof, expected = chi2_contingency(contingency)
        
        # We'll also compute misclass rates with vs. without the category
        has_cat_total = has_cat_misclass + has_cat_correct
        no_cat_total  = no_cat_misclass + no_cat_correct
        if has_cat_total == 0 or no_cat_total == 0:
            # Can't compute rates if zero total
            results.append((cat, chi2, p, "Ok", "No Rate"))
            continue
        
        rate_with_cat = has_cat_misclass / has_cat_total
        rate_without_cat = no_cat_misclass / no_cat_total
        
        # Determine direction
        if p < 0.05:
            if rate_with_cat > rate_without_cat:
                direction = "Positive Correlation (HIGHER misclassification with category)"
            elif rate_with_cat < rate_without_cat:
                direction = "Negative Correlation (LOWER misclassification with category)"
            else:
                direction = "Same Rate?"
        else:
            direction = "No significant difference"
        
        results.append((cat, chi2, p, "Ok", direction))
    
    # Print results
    print("Chi-Square Tests for URL Categories vs. Misclassification (with direction)\n")
    for cat, chi2, pval, status, direction in results:
        print(f"Category: {cat}")
        print(f"  Chi2 = {chi2:.4f}, p = {pval:.4e}, status = {status}")
        if status == "Ok":
            if pval < 0.05:
                print(f"  => Statistically SIGNIFICANT (p < 0.05). {direction}")
            else:
                print(f"  => Not significant (p >= 0.05). {direction}")
        else:
            print(f"  => {direction}")
        print("-"*60)
    
    print("\nDone.")
    
domain_map = load_domain_category_map("url_categorization.csv")
# 2. Flatten claims
flattened_df = flatten_claims_for_domain_categories(df, domain_map)
# 3. Gather all unique categories from domain_map + "Other"
unique_cats = set(domain_map.values())
unique_cats.add("Other")  # in case some domains were not found
analyze_url_categories_misclassification_direction(flattened_df, unique_cats)

Chi-Square Tests for URL Categories vs. Misclassification (with direction)

Category: Conspiracy Theory
  Chi2 = 0.0000, p = 9.9660e-01, status = Ok
  => Not significant (p >= 0.05). No significant difference
------------------------------------------------------------
Category: Religion & Spirituality
  Chi2 = 0.7962, p = 3.7223e-01, status = Ok
  => Not significant (p >= 0.05). No significant difference
------------------------------------------------------------
Category: Science News
  Chi2 = 1.7951, p = 1.8031e-01, status = Ok
  => Not significant (p >= 0.05). No significant difference
------------------------------------------------------------
Category: Medical Research
  Chi2 = 0.7962, p = 3.7223e-01, status = Ok
  => Not significant (p >= 0.05). No significant difference
------------------------------------------------------------
Category: Legal Journals
  Chi2 = 0.9880, p = 3.2023e-01, status = Ok
  => Not significant (p >= 0.05). No significant difference
------------------

In [21]:
file = pd.read_csv('url_categorization.csv')
len(file['category'].unique())

91