In [None]:
import polars as pl

In [None]:
import sys
from pathlib import Path

# Use relative paths: the parent of the current working directory is treated
# as the project root, and the data directory lives directly under it.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath('data')

# Insert the project root at the very front of the module search path.
sys.path.insert(0, str(PROJECT_ROOT))

# With the search path extended, the project package can now be imported.
from src.loading import DataLoader

# Loader bound to the step-1 preprocessed parquet file in the silver directory.
loader = DataLoader(
    output_file=DATA_DIR.joinpath('silver', 'maude_preprocess_step1.parquet'),
)

In [None]:
# Backend name and the keyword options forwarded verbatim to loader.load().
adapter = 'polars'
polars_kwargs = dict(
    use_statistics=True,
    parallel='auto',
    low_memory=False,
    rechunk=False,
    cache=True,
)

# Load the dataset and display the resulting object as the cell output.
maude_lf = loader.load(adapter=adapter, **polars_kwargs)
maude_lf

In [None]:
# Count the rows whose product_code is one of the codes of interest.
target_product_codes = ['MHE', 'MHY', 'HHS', 'MIH', 'MDS', 'LMH']
target_rows_lf = maude_lf.filter(pl.col('product_code').is_in(target_product_codes))

# Collect the single-cell count and show it as a one-element list.
target_rows_lf.select(pl.len()).collect().to_series().to_list()

In [None]:
import polars as pl
# Show the ten largest distinct values of date_received, newest first.
(
    maude_lf
    .select('date_received')
    .unique()
    .sort('date_received', descending=True)
    .head(10)
    .collect()
)

In [None]:
# The 10 rows with the longest mdr_text (by character count).
# NOTE(review): the original comment said 100 rows while the code takes 10;
# the code's value is kept here — confirm which was intended.
# nulls_last=True keeps rows whose mdr_text is null (and whose text_len is
# therefore null) from being ranked ahead of the genuinely longest texts.
df = maude_lf.select(['mdr_text', 'product_problems']).with_columns(
    pl.col('mdr_text').str.len_chars().alias('text_len')
).sort('text_len', descending=True, nulls_last=True).head(10).collect()

In [None]:
from transformers import AutoTokenizer
import polars as pl
from tqdm import tqdm
from src.preprocess.prompt import GeneralPrompt

import numpy as np

# Prompt builder: provides the system instruction and formats the user prompt.
prompt = GeneralPrompt()

SYSTEM_INSTRUCTION = prompt.SYSTEM_INSTRUCTION

# 1. Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("nvidia/Qwen3-32B-NVFP4")
MAX_TOKENS = 12496  # vLLM max_model_len setting

# 2. Token-counting helper.
def count_tokens(text):
    """Return the number of token ids the tokenizer produces for ``text``."""
    return len(tokenizer.encode(text))

# Measure the token length of the full prompt for every row of `df`, in batches.
# Iterating a polars DataFrame directly yields its columns, not its rows, so the
# two prompt inputs are selected explicitly and walked with iter_rows(), which
# yields one (mdr_text, product_problems) tuple per row.
# NOTE(review): the prompt is measured as plain string concatenation; if the
# serving side applies a chat template the real token count may differ — confirm.
batch_size = 1000
token_counts = []
prompt_inputs = df.select(['mdr_text', 'product_problems'])
for i in tqdm(range(0, prompt_inputs.height, batch_size)):
    batch = prompt_inputs[i:i + batch_size]
    prompts = [
        SYSTEM_INSTRUCTION + prompt.format_user_prompt(mdr_text, product_problems)
        for mdr_text, product_problems in batch.iter_rows()
    ]
    token_counts.extend(len(ids) for ids in tokenizer(prompts)["input_ids"])

# Summary statistics, including how many prompts exceed the model length limit.
print(f"Max tokens: {np.max(token_counts)}")
print(f"Mean tokens: {np.mean(token_counts):.0f}")
print(f"Over limit: {np.sum(np.array(token_counts) > MAX_TOKENS)} rows")