In [1]:
import os
import json
from collections import Counter, defaultdict

def collect_stats(base_dir):
    stats = {
        "article_count": 0,
        "total_words": 0,
        "image_counts": [],
        "author_counter": Counter()
    }

    for root, dirs, files in os.walk(base_dir):
        if "news content.json" in files:
            json_path = os.path.join(root, "news content.json")
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                    stats["article_count"] += 1
                    stats["total_words"] += len(data.get("text", "").split())
                    stats["image_counts"].append(len(data.get("images", [])))
                    stats["author_counter"].update(data.get("authors", []))
            except Exception as e:
                print(f"Error reading {json_path}: {e}")
    
    return stats

def print_stats(label, stats):
    print(f"\n--- {label.upper()} ARTICLES ---")
    if stats["article_count"] == 0:
        print("No articles found.")
        return
    
    avg_words = stats["total_words"] / stats["article_count"]
    avg_images = sum(stats["image_counts"]) / stats["article_count"]
    
    print(f"Total articles: {stats['article_count']}")
    print(f"Average article length: {avg_words:.2f} words")
    print(f"Average images per article: {avg_images:.2f}")
    print("Top authors:")
    for author, count in stats["author_counter"].most_common(5):
        print(f"  {author}: {count}")

# Set paths
base_fake = "data/Archive/GOSSIPCOP/fake"
base_real = "data/Archive/GOSSIPCOP/real"

# Collect stats
stats_fake = collect_stats(base_fake)
stats_real = collect_stats(base_real)

# Print stats
print_stats("Fake", stats_fake)
print_stats("Real", stats_real)



--- FAKE ARTICLES ---
Total articles: 4947
Average article length: 550.09 words
Average images per article: 26.80
Top authors:
  People Editorial Guidelines: 143
  Condé Nast: 74
  Us Weekly Staff: 71
  Andrew Shuster: 68
  Image: 67

--- REAL ARTICLES ---
Total articles: 15420
Average article length: 609.09 words
Average images per article: 32.33
Top authors:
  People Editorial Guidelines: 864
  Condé Nast: 401
  Us Weekly Staff: 221
  Dailymail.Com Reporter: 217
  .Wp-Block-Post-Author Display Flex Flex-Wrap Wrap .Wp-Block-Post-Author__Byline Font-Size Margin-Bottom Margin-Top Width: 158


In [2]:
# Set paths
base_fake = "data/Archive/POLITIFACT/fake"
base_real = "data/Archive/POLITIFACT/real"

# Collect stats
stats_fake = collect_stats(base_fake)
stats_real = collect_stats(base_real)

# Print stats
print_stats("Fake", stats_fake)
print_stats("Real", stats_real)


--- FAKE ARTICLES ---
Total articles: 397
Average article length: 345.35 words
Average images per article: 16.91
Top authors:
  Trending Story Found: 13
  Please Enter Your Name Here: 6
  Jim Hoft: 4
  Marius Bogdan Dinu: 2
  Sam Tide: 2

--- REAL ARTICLES ---
Total articles: 549
Average article length: 1853.61 words
Average images per article: 11.54
Top authors:
  Abc News: 23
  March: 6
  Paul Krugman: 4
  August: 4
  Team Fix: 3


In [4]:
import pandas as pd

column_names = [
    "id", "label", "statement", "subject", "speaker", "job_title",
    "state", "party", "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts", "context"
]

base_path = "data/Archive/LIAR/"

column_names = [
    "id", "label", "statement", "subject", "speaker", "job_title",
    "state", "party", "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts", "context"
]

train_df = pd.read_csv(base_path + "train.tsv", sep="\t", header=None, names=column_names)
valid_df = pd.read_csv(base_path + "valid.tsv", sep="\t", header=None, names=column_names)
test_df  = pd.read_csv(base_path + "test.tsv",  sep="\t", header=None, names=column_names)

# Combine all for general stats
full_df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# Basic stats
num_statements = len(full_df)
avg_length_words = full_df["statement"].dropna().apply(lambda x: len(str(x).split())).mean()
label_counts = Counter(full_df["label"])
top_subjects = Counter([s for subjects in full_df["subject"].dropna() for s in subjects.split(", ")]).most_common(5)
top_speakers = Counter(full_df["speaker"].dropna()).most_common(5)
party_counts = Counter(full_df["party"].dropna())

# Print results
print("\n--- LIAR DATASET STATS ---")
print(f"Total statements: {num_statements}")
print(f"Average statement length: {avg_length_words:.2f} words")

print("\nLabel distribution:")
for label, count in label_counts.items():
    print(f"  {label}: {count}")

print("\nTop 5 subjects:")
for subject, count in top_subjects:
    print(f"  {subject}: {count}")

print("\nTop 5 speakers:")
for speaker, count in top_speakers:
    print(f"  {speaker}: {count}")



--- LIAR DATASET STATS ---
Total statements: 12791
Average statement length: 18.04 words

Label distribution:
  false: 2507
  half-true: 2627
  mostly-true: 2454
  true: 2053
  barely-true: 2103
  pants-fire: 1047

Top 5 subjects:
  health-care: 474
  taxes: 356
  education: 309
  elections: 304
  immigration: 303

Top 5 speakers:
  barack-obama: 611
  donald-trump: 343
  hillary-clinton: 297
  mitt-romney: 212
  john-mccain: 189
