### Estimate the number of tokens in the eRisk 2025 datasets

In [1]:
# List the current working directory via the Windows `dir` shell command
# (on a POSIX shell the equivalent would be `!ls -l`).
!dir

 Volume in drive D is Data
 Volume Serial Number is 6A96-9CEA

 Directory of D:\SRC\DS@GT\eRisk25\eRisk25-datasets

02/14/2025  01:36 PM    <DIR>          .
02/14/2025  01:36 PM    <DIR>          ..
02/14/2025  12:54 PM    <DIR>          .ipynb_checkpoints
02/09/2025  03:48 PM           826,340 00-erisk25task1EDA.ipynb
02/14/2025  01:34 PM             2,564 01-erisktokenestimate.ipynb
02/06/2025  12:09 AM     1,977,791,083 merged_output.parquet
02/06/2025  09:54 PM       231,616,082 merged_output_2023.parquet
02/06/2025  10:29 PM     2,816,315,923 merged_output_2024.parquet
02/07/2025  01:41 PM            55,251 most_common_bigrams_chart.png
02/07/2025  01:36 PM            37,899 most_common_words_chart.png
02/04/2025  10:02 PM             1,572 t1_parquet.py
02/03/2025  11:00 PM    <DIR>          task1-symptom-ranking
02/06/2025  10:08 PM             1,716 task1_parquetmerge.py
02/06/2025  10:04 PM             2,104 task1_trec2parquet.py
01/16/2025  08:18 PM    <DIR>          task2-co

In [3]:
import pandas as pd
import pyarrow.parquet as pq
import tiktoken
import os

# Parquet files to scan, relative to the notebook's working directory
parquet_files = [
    "merged_output.parquet",
    "merged_output_2023.parquet",
    "merged_output_2024.parquet",
]

# Name of the column that holds the document text in every input file
TEXT_COLUMN = "TEXT"

# Load GPT-3.5 tokenizer (the cl100k_base encoding)
tokenizer = tiktoken.get_encoding("cl100k_base")


def count_tokens_in_parquet(path, encoding, text_column=TEXT_COLUMN):
    """Count the tokens stored in one column of a Parquet file.

    The file is read one row group at a time, and only ``text_column`` is
    loaded, so a whole file is never held in memory at once.

    Args:
        path: Path of the Parquet file to scan.
        encoding: tiktoken encoding used to tokenize each text.
        text_column: Name of the column that holds the text.

    Returns:
        A ``(token_count, row_count, null_count)`` tuple: total tokens over
        all non-null texts, total rows read, and rows whose text is null.
    """
    parquet_file = pq.ParquetFile(path)
    token_count = 0
    row_count = 0
    null_count = 0

    for row_group_index in range(parquet_file.num_row_groups):
        # Read a single row group (to avoid memory overload)
        texts = parquet_file.read_row_group(
            row_group_index, columns=[text_column]
        ).to_pandas()[text_column]
        present_texts = texts.dropna()

        row_count += len(texts)
        null_count += len(texts) - len(present_texts)
        # encode_ordinary treats special-token strings such as "<|endoftext|>"
        # as plain text. The default encode() raises ValueError on them, which
        # would abort the whole run on a single unlucky document.
        token_count += sum(
            len(encoding.encode_ordinary(text)) for text in present_texts
        )

    return token_count, row_count, null_count


# Initialize counters
total_token_count = 0
total_documents = 0
total_null_texts = 0
missing_files = []

# Process each Parquet file
for file in parquet_files:
    if not os.path.exists(file):
        print(f"⚠️ File not found: {file}")
        # Remember the gap so the final summary is not mistaken for a full run
        missing_files.append(file)
        continue

    print(f"🔄 Processing {file} ...")

    file_tokens, file_rows, file_nulls = count_tokens_in_parquet(file, tokenizer)
    total_token_count += file_tokens
    total_documents += file_rows
    total_null_texts += file_nulls

    print(f"✅ Completed: {file}")

# Final results
print(f"\n✅ Total documents processed: {total_documents}")
print(f"📊 Estimated total token count (GPT-3.5 tokenizer): {total_token_count:,}")

# Rows with a null text are included in the document total but add no tokens
if total_null_texts:
    print(f"ℹ️ Documents with no text (0 tokens each): {total_null_texts:,}")
if missing_files:
    print(f"⚠️ Totals exclude {len(missing_files)} missing file(s): {', '.join(missing_files)}")

🔄 Processing merged_output.parquet ...
✅ Completed: merged_output.parquet
🔄 Processing merged_output_2023.parquet ...
✅ Completed: merged_output_2023.parquet
🔄 Processing merged_output_2024.parquet ...
✅ Completed: merged_output_2024.parquet

✅ Total documents processed: 37360334
📊 Estimated total token count (GPT-3.5 tokenizer): 713,612,490
