# In [46]:
import json
import os
import glob
from collections import defaultdict

# import tiktoken
# tokenizer = tiktoken.encoding_for_model("gpt-4")

from transformers import AutoTokenizer
# Tokenizer used by count_tokens() for every token count in this script,
# loaded from a local directory.
tokenizer = AutoTokenizer.from_pretrained("./models/raw/Meta-Llama-3-8B")


# Number of files taken for each consecutive group; the main loop walks the
# sorted file list and assigns this many files to each group in turn.
# NOTE(review): the meaning of the individual sizes is not visible here —
# confirm against the dataset split they are meant to mirror.
group_sizes = [486, 926, 1417, 2206, 2587, 2615, 3792, 7295]

# One summary dict per group is appended here by the main loop, in the same
# order as group_sizes.
grouped_reviewer_stats = []

def count_tokens(text):
    """Return how many token ids the module-level tokenizer produces for text.

    NOTE(review): the count is whatever ``tokenizer.encode`` returns with its
    default arguments, so it may include special tokens depending on the
    tokenizer's configuration — confirm this is intended.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Directory with the converted JSON files. Files are processed in sorted
# filename order and dealt out to the groups consecutively.
path = "data/converted/ICLR/"
all_files = sorted(glob.glob(os.path.join(path, "*.json")))

# Position of the next unprocessed entry in all_files; carries over from one
# group to the next so that every file belongs to exactly one group.
current_index = 0

for group_size in group_sizes:
    # filename -> reviewer_id -> {'first_reply_tokens', 'total_dialogue_tokens',
    # 'paper_tokens'}
    group_stats = defaultdict(lambda: defaultdict(dict))

    # Group-level accumulators.
    total_reviewers = 0
    total_paper_tokens = 0
    total_first_reply_tokens = 0
    total_tokens = 0
    total_meta_review_tokens = 0

    # Take the next group_size files. Slicing clamps at the end of the list,
    # so a group that runs out of files is simply shorter (possibly empty).
    group_files = all_files[current_index:current_index + group_size]
    current_index += len(group_files)

    for filename in group_files:
        # Only the first element of the top-level JSON array is used. The file
        # is closed before the (slow) tokenization starts.
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)[0]

        history = data['history']
        paper_text = data['input']
        meta_review = data['output']

        # Tokenize the paper once per file and reuse the count for every
        # reviewer entry below.
        paper_tokens = count_tokens(paper_text)
        total_paper_tokens += paper_tokens
        total_meta_review_tokens += count_tokens(meta_review)

        # Reviewer ids already recorded for this file; only the first thread
        # of each reviewer is counted.
        seen_reviewers = set()

        # NOTE(review): assumes history is laid out in consecutive triples
        # whose first entry is a reviewer turn with a role string containing
        # 'Reviewer' — confirm against the conversion script. A role without
        # 'Reviewer' raises IndexError here.
        for i in range(0, len(history), 3):
            role = history[i][0]
            first_reply = history[i][1]
            reviewer_id = role.split('Reviewer')[1].strip().split(',')[0]

            if reviewer_id in seen_reviewers:
                continue
            seen_reviewers.add(reviewer_id)

            first_reply_tokens = count_tokens(first_reply)

            # Tokens of the whole triple: the first reply (already counted)
            # plus the following turns. Slicing tolerates a trailing,
            # incomplete triple instead of raising IndexError.
            dialogue_tokens = first_reply_tokens + sum(
                count_tokens(turn[1]) for turn in history[i + 1:i + 3]
            )

            reviewer_stats = group_stats[filename][reviewer_id]
            reviewer_stats['first_reply_tokens'] = first_reply_tokens
            reviewer_stats['total_dialogue_tokens'] = dialogue_tokens
            reviewer_stats['paper_tokens'] = paper_tokens

            total_first_reply_tokens += first_reply_tokens
            total_tokens += dialogue_tokens

        total_reviewers += len(seen_reviewers)

    grouped_reviewer_stats.append({
        'total_reviewers': total_reviewers,
        'total_paper_tokens': total_paper_tokens,
        'total_first_reply_tokens': total_first_reply_tokens,
        'total_dialogue_tokens': total_tokens,
        'total_meta_review_tokens': total_meta_review_tokens,
        'group_stats': group_stats
    })

# Print the aggregate counts of every group, followed by a blank line.
for group_number, summary in enumerate(grouped_reviewer_stats, start=1):
    report_lines = [
        f"Group {group_number}:",
        f"  Total Reviewers: {summary['total_reviewers']}",
        f"  Total Paper Tokens: {summary['total_paper_tokens']}",
        f"  Total Initial Review Tokens: {summary['total_first_reply_tokens']}",
        f"  Total Author/Review Tokens: {summary['total_dialogue_tokens']}",
        f"  Total Meta Reivew Tokens: {summary['total_meta_review_tokens']}",
        "",  # trailing empty entry reproduces the blank separator line
    ]
    print("\n".join(report_lines))