In [1]:
import os
import re

import numpy as np
import pandas as pd
from transformers import AutoTokenizer

# 1. Load Data
# The CSV is expected to be downloaded/extracted already. Its location can be
# overridden through the MBTI_CSV_PATH environment variable so the notebook is
# not tied to one machine's home directory; when the variable is unset, the
# original absolute path is used, so existing runs behave exactly as before.
MBTI_CSV_PATH = os.environ.get(
    "MBTI_CSV_PATH", "/home/prg/GitHub/mlops_group70/data/raw/mbti_1.csv"
)
df = pd.read_csv(MBTI_CSV_PATH)

# 2. Define Cleaning Logic (Mirrored from your MBTIDataModule)
def clean_text(text: str) -> str:
    """Normalize one raw post string into plain, single-spaced text.

    Steps, applied in this order:
      1. If the string starts with ``b'`` or ``b"``, drop those two characters
         and the final character.
      2. Replace every ``|||`` separator with a space.
      3. Delete http/https URLs, then markdown image tags (``![...](...)``).
      4. Replace every character that is not an ASCII letter, a digit,
         whitespace, or one of ``. , ! ? ' - : ;`` with a space.
      5. Collapse whitespace runs into a single space and trim both ends.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    if text.startswith(("b'", 'b"')):
        # Looks like a stringified bytes literal: strip the prefix and the
        # last character (the closing quote, when present).
        text = text[2:-1]

    text = text.replace("|||", " ")

    # (pattern, replacement) pairs; order matters because each pass sees the
    # output of the previous one.
    # NOTE(review): the URL pattern's `$-_` range also matches ")", so a
    # markdown image that points at an http(s) URL loses its closing
    # parenthesis in the first pass and is then NOT matched by the image
    # pattern. Left as-is to stay in sync with the logic this mirrors.
    substitutions = (
        (r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", ""),
        (r"!\[.*?\]\(.*?\)", ""),
        (r"[^a-zA-Z0-9\s\.\,\!\?\'\-\:\;]", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

# 3. Process
# Run every post (coerced to str) through clean_text and keep the results
# as a plain Python list, which is what the tokenizer call below receives.
print("Cleaning text...")
clean_texts = [clean_text(post) for post in df["posts"].astype(str)]

# 4. Tokenize & Count
# No truncation/padding arguments are passed, so each encoding keeps the
# full token sequence and its length reflects the whole text.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print("Tokenizing (this may take a moment)...")
encodings = tokenizer(clean_texts, add_special_tokens=True, verbose=False)
lengths = list(map(len, encodings["input_ids"]))

# 5. Stats
# 512 is the limit being checked against — presumably the model's maximum
# sequence length; confirm against the training configuration.
mean_len = np.asarray(lengths).mean()
median_len = np.median(lengths)
percent_over_limit = np.greater(lengths, 512).mean() * 100

for summary_line in (
    f"Average Token Count: {mean_len:.2f}",
    f"Median Token Count:  {median_len:.2f}",
    f"Samples > 512 tokens: {percent_over_limit:.2f}%",
):
    print(summary_line)

Cleaning text...
Tokenizing (this may take a moment)...
Average Token Count: 1737.75
Median Token Count:  1810.00
Samples > 512 tokens: 99.00%
