In [None]:
# !pip install rapidfuzz

In [None]:
# !pip install country-named-entity-recognition

In [None]:
# !pip install clean-text

In [None]:
from typing import Tuple, List, Dict, Any, Sequence, Union
import polars as pl
import polars.selectors as cs
import pandas as pd
from pprint import pprint, pformat
import sys
from pathlib import Path
from tqdm import tqdm, trange
import psutil
import re


# Resolve paths relative to the notebook's working directory:
# the project root is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / 'data'

# Put the project root at the very front of sys.path (removing any existing
# entry first) so the `src.*` imports below resolve to this checkout.
if str(PROJECT_ROOT) in sys.path:
    sys.path.remove(str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT))

# 이제 import
from src.utils import process_lazyframe_in_chunks
from src.loading import DataLoader
from src.preprocess import TextPreprocessor, create_udi_preprocessor, create_company_preprocessor, create_generic_preprocessor, create_number_preprocessor
from src.preprocess.preprocess import get_pattern_cols, \
    get_unique_by_cols_safe, get_unique, \
    analyze_null_values, replace_pattern_with_null, overview_col

In [None]:
# maude 데이터 불러오기
loader1 = DataLoader(
    start=2024,
    end=2024,
    output_file = DATA_DIR / 'bronze' / 'maude3_raw.parquet',
    max_workers=4
)

adapter = 'polars'
polars_kwargs = {
    'use_statistics': True,
    'parallel': 'auto',
    'low_memory': False,
    'rechunk': False,
    'cache': True,
}
maude_lf = loader1.load(adapter=adapter, **polars_kwargs)
maude_lf

In [None]:
maude_lf.select(pl.col('mdr_report_key').n_unique()).head().collect()

In [None]:
IDENTIFIER_PATTERNS = [
    r"^device_\d+_brand_name$",
    r"identifiers_\d+_id", 
    r"identifiers_\d+_issuing_agency", 
    r"identifiers_\d+_package_discontinue_date", 
    r"identifiers_\d+_package_status", 
    r"identifiers_\d+_package_type", 
    r"identifiers_\d+_quantity_per_package", 
    r"identifiers_\d+_type", 
    r"identifiers_\d+_unit_of_use_id"
]
UDI_DI_PATTERNS = [r'^identifiers_\d+_id$']
TYPE_PATTERNS = [
    r'identifiers_\d+_type'
]

CUSTOMER_PATTERNS = [r'^customer']
DEVICE_SIZE_PATTERNS = [r'^device_sizes']
STORAGE_PATTERNS = [r'^storage']
GMDN_PATTERNS = [r'^gmdn']
PREMARKET_PATTERNS = [r'^premarket']


# 열 drop

In [None]:
# 기본 변수
BASE_COLS = [
    'mdr_report_key',
    'report_number', 
    'adverse_event_flag', 
    'product_problem_flag', 
    'event_type',
    'previous_use_code', 
    'single_use_flag', 
    'reprocessed_and_reused_flag',
    'product_problems'
]

DATE_COLS = [
    'date_of_event', 
    'date_received', 
    'device_date_of_manufacturer', 
]

DEVICE_COLS = [
    "device_0_manufacturer_d_name",
    "device_0_manufacturer_d_postal_code",
    "device_0_brand_name",
    "device_0_catalog_number",
    "device_0_model_number",
    "device_0_udi_di",
    "device_0_lot_number",
    "device_0_udi_public",
    "device_0_device_report_product_code",
    "device_0_device_operator",
    "device_0_openfda_device_class",
    "device_0_openfda_device_name",
]


PATIENT_COLS = [
    "patient_0_patient_age",
    "patient_0_sequence_number_outcome",
]



MDR_TEXT_PATTERNS = [
    r"^mdr_text_.*_text$",
    r"^mdr_text_.*_text_type_code$",
]

MDR_TEXT_COLS = get_pattern_cols(maude_lf, MDR_TEXT_PATTERNS[:1])
MDR_COLS = get_pattern_cols(maude_lf, MDR_TEXT_PATTERNS)
TOTAL_COLS = BASE_COLS + DATE_COLS + DEVICE_COLS + PATIENT_COLS + MDR_COLS

In [None]:
maude_lf = maude_lf.select(TOTAL_COLS)

# device 3만 선택 (필터링)

In [None]:
maude_lf = maude_lf.filter(
    pl.col('device_0_openfda_device_class').eq('3'), 
    pl.col('mdr_text_0_text').is_not_null()
)

maude_lf.select(pl.len()).collect().item()

# 클렌징

In [None]:
maude_step1_path = DATA_DIR / 'silver' / 'clean_step1.parquet'
maude_step2_path = DATA_DIR / 'silver' / 'clean_step2.parquet'
maude_step3_path = DATA_DIR / 'silver' / 'clean_step3.parquet'
maude_step4_path = DATA_DIR / 'silver' / 'clean_step4.parquet'

### (+추가) 지영님 1:1 매칭  
- device_0_device_name과 product_code에서 최빈값으로 join

In [None]:
def device_name_clean(maude_lf, col_name):
    """
    Normalize a free-text device-name column in place.

    Steps, in order: drop every character that is not an ASCII letter,
    digit or whitespace; lower-case; trim leading/trailing whitespace;
    collapse whitespace runs into a single space.

    Parameters:
        maude_lf: frame containing ``col_name``.
        col_name: name of the string column to normalize.

    Returns:
        The frame with ``col_name`` replaced by its normalized form.
    """
    normalized = pl.col(col_name).str.replace_all(r'[^a-zA-Z0-9\s]', '')
    normalized = normalized.str.to_lowercase().str.strip_chars()
    normalized = normalized.str.replace_all(r'\s+', ' ')
    return maude_lf.with_columns(normalized.alias(col_name))
    
def device_name_product_code_match(maude_lf, name_col, code_col):
    """
    Unify device names per product code using the most frequent name.

    For every value of ``code_col`` the name in ``name_col`` that occurs on
    the most rows is chosen as the canonical name (ties are broken by the
    alphabetically smallest name so the result is reproducible). Every row
    with that code then gets the canonical name; rows whose code has no
    canonical name (null code, or a code that only ever appears with null
    names) keep their original name.

    Parameters:
        maude_lf: polars LazyFrame containing both columns.
        name_col: device-name column to overwrite.
        code_col: product-code column used as the grouping key.

    Returns:
        LazyFrame with the same columns, ``name_col`` unified per code.
    """
    # Frequency of each (code, name) pair, then pick the top name per code.
    # The pick is done with an explicit in-group sort instead of relying on
    # row order surviving a second group_by.
    code_to_name = (
        maude_lf
        .select([name_col, code_col])
        .filter(pl.col(name_col).is_not_null() & pl.col(code_col).is_not_null())
        .group_by([code_col, name_col])
        .agg(pl.len().alias("_pair_count"))
        .group_by(code_col)
        .agg(
            pl.col(name_col)
            .sort_by(["_pair_count", name_col], descending=[True, False])
            .first()
            .alias("canonical_name")
        )
    )

    # Left join keeps every original row; coalesce falls back to the
    # original name when no canonical name exists for the row's code.
    result = (
        maude_lf
        .join(
            code_to_name,
            on=code_col,
            how="left"
        )
        .with_columns(
            pl.coalesce(pl.col("canonical_name"), pl.col(name_col)).alias(name_col)
        )
        .drop("canonical_name")
    )

    return result

In [None]:
# 테스트
# 1. device_name 클린징
maude_lf = device_name_clean(maude_lf, "device_0_openfda_device_name")

# 2. product_code와 매칭
maude_lf = device_name_product_code_match(maude_lf, "device_0_openfda_device_name", "device_0_device_report_product_code")

# 확인
# maude_lf.select(['device_0_openfda_device_name','device_0_device_report_product_code']).head().collect()
maude_lf.select(pl.len()).collect().item()

### manufacturer_d_name & postal_code 
- 매칭 후 join (최빈값)

In [None]:
def manufacturer_postal_match(maude_lf, name_col, postal_col):
    """
    Fill and unify manufacturer names using the postal code as a key.

    For every postal code the most frequent non-null manufacturer name is
    chosen as canonical (ties broken by the alphabetically smallest name).
    Rows with that postal code get the canonical name, which both fills
    missing names and unifies spelling variants. Rows whose postal code has
    no canonical name keep their original name.

    Parameters:
        maude_lf: polars LazyFrame containing both columns.
        name_col: manufacturer-name column to overwrite.
        postal_col: postal-code column used as the grouping key.

    Returns:
        LazyFrame with the same columns, ``name_col`` filled/unified.
    """
    # NOTE: the pairs must NOT be de-duplicated before taking the mode;
    # doing so makes every name occur exactly once per postal code, which
    # turns "most frequent" into an arbitrary pick.
    postal_to_name = (
        maude_lf
        .select([name_col, postal_col])
        .filter(pl.col(name_col).is_not_null() & pl.col(postal_col).is_not_null())
        .group_by(postal_col)
        .agg(
            # mode() may return several values on a tie; sort for determinism.
            pl.col(name_col).mode().sort().first().alias("canonical_name")
        )
    )

    # Left join keeps every original row; use the canonical name when one
    # exists, otherwise keep the original value.
    result = (
        maude_lf
        .join(
            postal_to_name,
            on=postal_col,
            how="left"
        )
        .with_columns(
            pl.coalesce(pl.col("canonical_name"), pl.col(name_col)).alias(name_col)
        )
        .drop("canonical_name")
    )

    return result

In [None]:
maude_lf = manufacturer_postal_match(maude_lf, 'device_0_manufacturer_d_name', 'device_0_manufacturer_d_postal_code')

# maude_lf.select(['device_0_manufacturer_d_name', 'device_0_manufacturer_d_postal_code']).head().collect()
maude_lf.select(pl.len()).collect().item()

### device_report_product_code 특문 시작 코드 삭제

In [None]:
def product_code_clean(maude_lf, col_name):
    """
    Strip everything except upper-case ASCII letters from a code column.

    Parameters:
        maude_lf: frame containing ``col_name``.
        col_name: name of the string column holding product codes.

    Returns:
        The frame with ``col_name`` reduced to the characters A-Z.
    """
    letters_only = pl.col(col_name).str.replace_all(r'[^A-Z]', '')
    return maude_lf.with_columns(letters_only.alias(col_name))

In [None]:
maude_lf = product_code_clean(
    maude_lf, 'device_0_device_report_product_code'
)
maude_lf.select(pl.len()).collect().item()

### 나이 컷

In [None]:
def process_age_columns(
        maude_lf : pl.LazyFrame,
        src_col : str = "patient_0_patient_age"
) -> pl.LazyFrame:
    """
    Convert a free-text patient-age column into an integer number of days.

    The text is upper-cased, NA-like values are nulled through
    ``replace_pattern_with_null`` (project helper; presumably nulls values
    matching the pattern - confirm against its implementation), the first
    run of digits is taken as the value and the first recognized unit token
    decides the multiplier (day=1, week=7, month=30, year=365). The result
    is rounded, cast to Int64 and clipped to the range [0, 120*365].

    Parameters:
        maude_lf: input LazyFrame.
        src_col: name of the age column; it is overwritten with the result.

    Returns:
        The LazyFrame with ``src_col`` expressed in days (null when no
        value or no unit could be extracted). If ``src_col`` is not present
        the frame is returned unchanged.
    """
    # Guard clause: always hand back a LazyFrame, even when there is
    # nothing to do (previously this path fell through and returned None).
    if src_col not in maude_lf.collect_schema().names():
        return maude_lf

    # Work on an upper-cased text copy so unit matching is case-insensitive.
    maude_lf = maude_lf.with_columns(
        pl.col(src_col)
        .cast(pl.Utf8)
        .str.to_uppercase()
        .alias("_age_text_upper")
    )

    na_patterns = r'UNK|NA|VARIOUS'
    maude_lf = replace_pattern_with_null(maude_lf, '_age_text_upper', na_patterns)

    maude_lf = maude_lf.with_columns(
        # Numeric part: first run of digits.
        pl.col("_age_text_upper")
        .str.extract(r"(\d+)", 1)
        .cast(pl.Float64)
        .alias("_age_value"),
        # Unit part: first recognized unit token.
        pl.col("_age_text_upper")
        .str.extract(r"(DAY|DA|DAYS|D|WEEK|WEEKS|WK|WKS|MONTH|MONTHS|MO|YEAR|YEARS|YR|YRS)",
                      group_index=1)
        .alias("_age_unit"),
    )

    # Convert to days according to the unit; anything unparseable is null.
    maude_lf = maude_lf.with_columns(
        pl.when(pl.col("_age_value").is_null())
          .then(None)  # no numeric part at all
        .when(pl.col("_age_unit").is_null())
          .then(None)  # number without a recognizable unit
        .when(pl.col("_age_unit").str.contains("DAY|DA|DAYS|D"))
          .then(pl.col("_age_value"))
        .when(pl.col("_age_unit").str.contains("WEEK|WEEKS|WK|WKS"))
          .then(pl.col("_age_value") * 7)
        .when(pl.col("_age_unit").str.contains("MONTH|MONTHS|MO"))
          .then(pl.col("_age_value") * 30)
        .when(pl.col("_age_unit").str.contains("YEAR|YEARS|YR|YRS"))
          .then(pl.col("_age_value") * 365)
        .otherwise(None)
        .alias("_age_days"))

    # Round, store as integer days and cap at 120 years. The result is
    # written back to the column that was actually processed.
    maude_lf = maude_lf.with_columns(
        pl.col("_age_days")
        .round(0)
        .cast(pl.Int64)
        .clip(0, 120*365)
        .alias(src_col)
    )

    return maude_lf.drop(["_age_value", "_age_text_upper", "_age_unit", "_age_days"])

In [None]:
maude_lf = process_age_columns(maude_lf, "patient_0_patient_age")
maude_lf.select(
    pl.col("patient_0_patient_age").min().alias("min_age"),
    pl.col("patient_0_patient_age").max().alias("max_age")
).collect()
maude_lf.select(pl.len()).collect().item()

### 1900년대 -> year 변환 필요

In [None]:
import polars as pl

def cast_date_cols_safe(
    maude_lf: pl.LazyFrame,
    date_cols: list[str],
    fmt: str = "%Y%m%d",
) -> pl.LazyFrame:
    """
    Coerce the listed columns to ``pl.Date`` without raising on bad values.

    Columns missing from the schema are ignored, columns that are already
    ``Date`` are left untouched, ``Datetime`` columns are down-cast to
    ``Date``, and every other dtype is cast to text and parsed with ``fmt``
    in non-strict mode.

    Parameters:
        maude_lf: input LazyFrame.
        date_cols: candidate column names.
        fmt: strptime format used for non-temporal columns.

    Returns:
        The LazyFrame with the applicable columns converted; the input
        frame itself when nothing needs converting.
    """
    schema = maude_lf.collect_schema()
    conversions = []

    for name in date_cols:
        if name not in schema:
            continue

        current = schema[name]
        if current == pl.Date:
            # Already the target dtype.
            continue

        if current == pl.Datetime:
            # Temporal already: only drop the time component.
            converted = pl.col(name).cast(pl.Date, strict=False)
        else:
            # Text / integer / anything else: parse through a string.
            converted = (
                pl.col(name)
                .cast(pl.Utf8)
                .str.strptime(pl.Date, format=fmt, strict=False)
            )
        conversions.append(converted.alias(name))

    if conversions:
        return maude_lf.with_columns(conversions)
    return maude_lf


In [None]:
maude_lf = cast_date_cols_safe(
    maude_lf,
    date_cols=DATE_COLS
)
maude_lf.select(pl.len()).collect().item()

In [None]:
# 1) clean 컬럼 생성 (dtype 고정)
maude_lf = maude_lf.with_columns(
    pl.when(pl.col("date_of_event").dt.year().is_between(1900, 1999))
      .then(pl.lit(None, dtype=pl.Date))          # <- 핵심: Date로 null 지정
      .otherwise(pl.col("date_of_event"))        
      .alias("date_of_event")
)
maude_lf.select(pl.len()).collect().item()

In [None]:
print("date_of_event 범위:")
print(maude_lf.select(
    pl.col("date_of_event").min().alias("min_date"),
    pl.col("date_of_event").max().alias("max_date"),
).collect())

display(maude_lf.select(pl.len()).head().collect().item())
maude_lf.select(pl.col('mdr_report_key').n_unique()).collect().item()

### 팀장님꺼

In [None]:
# 1단계
preprocessor1 = create_udi_preprocessor()
preprocessor1.apply_to_lazyframe(
    maude_lf, 'device_0_udi_di', maude_step1_path, chunk_size=100_000
)
del preprocessor1  # 명시적 삭제

In [None]:
# Step 2: company/brand name cleaning.
# Read the output of step 1 and feed THAT frame to the preprocessor, so the
# UDI cleaning from step 1 is carried forward into step 2's output file.
maude_lf2 = pl.scan_parquet(maude_step1_path)
preprocessor2 = create_company_preprocessor()
preprocessor2.apply_to_lazyframe(
    maude_lf2, ['device_0_manufacturer_d_name', 'device_0_brand_name'], 
    maude_step2_path, chunk_size=100_000
)
del maude_lf2, preprocessor2

In [None]:
# 중간 파일 정리
maude_step1_path.unlink(missing_ok=True)

# 3단계
maude_lf3 = pl.scan_parquet(maude_step2_path)
preprocessor3 = create_number_preprocessor()
preprocessor3.apply_to_lazyframe(
    maude_lf3, ['device_0_model_number', 'device_0_catalog_number', 'device_0_lot_number'], 
    maude_step3_path, chunk_size=100_000
)
del maude_lf3, preprocessor3

In [None]:
# 중간 파일 정리
maude_step2_path.unlink(missing_ok=True)

# 4단계
maude_lf4 = pl.scan_parquet(maude_step3_path)
preprocessor4 = create_generic_preprocessor()
preprocessor4.apply_to_lazyframe(
    maude_lf4, MDR_TEXT_COLS,
    maude_step4_path, chunk_size=100_000
)
del maude_lf4, preprocessor4

In [None]:
maude_step3_path.unlink(missing_ok=True)

# maude 데이터 불러오기
loader4 = DataLoader(
    start=2024,
    end=2024,
    output_file = maude_step4_path,
)

cleaned_maude_lf = loader4.load(adapter=adapter, **polars_kwargs)
cleaned_maude_lf

display(cleaned_maude_lf.select(pl.len()).head().collect())
cleaned_maude_lf.select(pl.col('mdr_report_key').n_unique()).head().collect()

### MDR_TEXT 

In [None]:
def combine_mdr_texts(lf: pl.LazyFrame, n: int = 5) -> pl.LazyFrame:
    """
    Build ``combined_mdr_text`` from the first ``n`` MDR text columns.

    Text columns are those named ``mdr_text_*_text``; each is paired with
    its ``..._text_type_code`` sibling when that column exists. Per row the
    texts are de-duplicated and formatted by ``deduplicate_and_format`` and
    the resulting pieces are joined with a blank line between them.

    Parameters:
        lf: input LazyFrame.
        n: maximum number of text columns to consider, taken in ascending
           numeric index order (0, 1, 2, ..., 10, ...).

    Returns:
        The LazyFrame with an added String column ``combined_mdr_text``
        (all null when no text/type-code pair is available).
    """
    cols = lf.collect_schema().names()
    available = set(cols)

    def _natural_key(col):
        # Order by the numeric index so that mdr_text_2_text precedes
        # mdr_text_10_text; names without a numeric index go last.
        m = re.fullmatch(r'mdr_text_(\d+)_text', col)
        return (0, int(m.group(1)), col) if m else (1, 0, col)

    # Keep at most n text columns.
    text_cols = sorted(
        (c for c in cols if c.startswith('mdr_text_') and c.endswith('_text')),
        key=_natural_key,
    )[:n]

    pairs = []
    for text_col in text_cols:
        type_col = re.sub(r'_text$', '_text_type_code', text_col)
        if type_col in available:
            pairs.append((text_col, type_col))

    if not pairs:
        # Typed null keeps the column dtype identical to the normal path.
        return lf.with_columns(pl.lit(None, dtype=pl.String).alias('combined_mdr_text'))

    # 1. Per-row list of de-duplicated, formatted text pieces.
    lf = lf.with_columns(
        pl.struct([pl.col(tc) for tc, _ in pairs] + [pl.col(ty) for _, ty in pairs])
        .map_elements(
            lambda s: deduplicate_and_format(s, pairs),
            return_dtype=pl.List(pl.String)
        )
        .alias('deduplicated_formatted')
    )

    # 2. Join the pieces into one string.
    lf = lf.with_columns(
        pl.col('deduplicated_formatted')
        .list.join("\n\n")
        .alias('combined_mdr_text')
    )

    return lf.drop('deduplicated_formatted')


def deduplicate_and_format(struct_val, pairs):
    """
    Format the distinct, non-empty texts of one row.

    Parameters:
        struct_val: mapping of column name -> value for a single row.
        pairs: ordered (text_column, type_code_column) tuples.

    Returns:
        A list of ``"[<type code>]\\n<text>"`` strings, one per distinct
        non-empty text, in the order of ``pairs``. A missing/empty type
        code renders as ``[]``.
    """
    emitted = set()
    blocks = []

    for text_key, code_key in pairs:
        body = struct_val.get(text_key)
        # Skip missing, empty and already-emitted texts.
        if body is None or body == "" or body in emitted:
            continue
        emitted.add(body)
        label = struct_val.get(code_key) or ""
        blocks.append(f"[{label}]\n{body}")

    return blocks

In [None]:
# 결합 실행
combined_lf = combine_mdr_texts(cleaned_maude_lf)

display(combined_lf.select(pl.len()).collect().item())

combined_lf.select(
    pl.col('mdr_text_0_text').n_unique()
).head().collect()

combined_lf.select(pl.col('mdr_report_key').n_unique()).head().collect()

# udi 매칭

### primary 컬럼 생성

In [None]:
# udi 데이터 불러오기
udi_loader = DataLoader(
    name='udi',
    output_file=DATA_DIR / 'bronze' / 'udi_raw.parquet',
    max_workers = 2
)

udi_lf = udi_loader.load(adapter, **polars_kwargs)
udi_lf

클렌징 전 필터링 (class 3)

In [None]:
import polars.selectors as cs

# Keep rows where at least one column whose name ends in "device_class"
# equals '3'. The selector takes a regex, not a glob, so no leading '*'.
udi_lf = udi_lf.filter(
    pl.any_horizontal(cs.matches(r'device_class$') == '3')
)

udi_lf.select(pl.len()).collect().item()

In [None]:
drop_patterns = CUSTOMER_PATTERNS + DEVICE_SIZE_PATTERNS + STORAGE_PATTERNS + GMDN_PATTERNS + PREMARKET_PATTERNS

regex = "|".join(drop_patterns)

udi_lf = udi_lf.select(
    ~cs.matches(regex)
)

# udi_lf.collect_schema().names()

In [None]:
udi_di_cols = get_pattern_cols(udi_lf, UDI_DI_PATTERNS)
identifiers_cols = get_pattern_cols(udi_lf, IDENTIFIER_PATTERNS)
type_cols = get_pattern_cols(udi_lf, TYPE_PATTERNS)

In [None]:
# type-udi_di 쌍 만들기 (인덱스로 매칭)
def extract_index(col_name):
    """Return N as an int from a name containing ``identifiers_N_``, else None."""
    found = re.search(r'identifiers_(\d+)_', col_name)
    if found is None:
        return None
    return int(found.group(1))

type_id_pairs = []
for type_col in type_cols:
    idx = extract_index(type_col)
    udi_di_col = f'identifiers_{idx}_id'
    if udi_di_col in udi_di_cols:
        type_id_pairs.append((type_col, udi_di_col))

len(type_id_pairs)

In [None]:
udi_lf = cast_date_cols_safe(
    udi_lf,
    date_cols=['publish_date'],
    fmt='%Y-%m-%d'
)

udi_lf.filter(
    pl.col('publish_date').is_not_null()
).head().select('publish_date').collect()

In [None]:
udi_step1_path = DATA_DIR / 'silver' / 'udi_primary.parquet'
udi_step2_path = DATA_DIR / 'silver' / 'udi_clean.parquet'

## 매칭 시작

In [None]:
# Extract the identifier whose type is "Primary"
def primary_transform(lf: pl.LazyFrame):
    """
    Add ``primary_udi_di``: the id of the first identifier slot typed "Primary".

    Uses the module-level ``type_id_pairs`` list of
    (type_column, id_column) tuples; slots are tried in list order and the
    first non-null hit wins.
    """
    candidates = []
    for kind_col, di_col in type_id_pairs:
        # Yields the id when this slot is the primary one, null otherwise.
        candidates.append(
            pl.when(pl.col(kind_col).eq("Primary")).then(pl.col(di_col))
        )
    return lf.with_columns(pl.coalesce(candidates).alias('primary_udi_di'))

process_lazyframe_in_chunks(
    udi_lf, 
    primary_transform, 
    udi_step1_path, 
    10_000,
    desc="Primary extraction"
)

In [None]:
# udi 데이터 불러오기
udi_loader = DataLoader(
    name='udi',
    output_file=udi_step1_path,
)

primary_udi_lf = udi_loader.load(adapter, **polars_kwargs)

In [None]:
# 회사 이름 정제
preprocessor1 = create_company_preprocessor()

preprocessor1.apply_to_lazyframe(
    primary_udi_lf,
    ['company_name', 'brand_name'],
    udi_step2_path,
    10_000,
)

del preprocessor1, primary_udi_lf
udi_step1_path.unlink(missing_ok=True)

In [None]:
# udi 데이터 불러오기
udi_loader = DataLoader(
    name='udi',
    output_file=udi_step2_path,
)

cleaned_udi_lf = udi_loader.load(adapter, **polars_kwargs)

### 컬럼 이름 통일

In [None]:
rename_udi_lf = cleaned_udi_lf.rename({
    'company_name': 'manufacturer',
    'brand_name': 'brand',
    'version_or_model_number': 'model_number',
    'primary_udi_di': 'udi_di',
})

rename_maude_lf = combined_lf.rename({
    'device_0_manufacturer_d_name': 'manufacturer',
    'device_0_brand_name': 'brand',
    'device_0_model_number': 'model_number',
    'device_0_catalog_number': 'catalog_number',
    'device_0_lot_number': 'lot_number',
    'device_0_udi_di': 'udi_di',
    'device_0_udi_public': 'udi_public'
})

In [None]:
target_cols = [
    'manufacturer',
    'brand',
    'model_number',
    'catalog_number'
]

join_col = 'udi_di'

common_cols = target_cols + [join_col]

maude_cols = common_cols + [
    'mdr_report_key',
]

udi_cols = common_cols + udi_di_cols

In [None]:
udi_necessary_lf = rename_udi_lf.select(pl.col(udi_cols))
maude_necessary_lf = rename_maude_lf.select(pl.col(maude_cols))

maude_necessary_lf.select(pl.len()).collect().item()

In [None]:
maude_necessary_lf.select(pl.col('mdr_report_key').n_unique()).head().collect()

### primary와 매칭해서 가져오는 함수

In [None]:
output_path=DATA_DIR / 'silver' / "maude_with_udi.parquet"

In [None]:
import polars as pl
from pathlib import Path
from src.preprocess.udi_preprocessor import UDIProcessor

# 처리
processor = UDIProcessor()
result_path = processor.process(
    maude_lf=rename_maude_lf,
    udi_lf=rename_udi_lf,
    output_path=Path(output_path),
    chunk_size=10_000
)


In [None]:
loader4 = DataLoader(
    name='event',
    output_file=output_path
)

semifinal_lf = loader4.load(adapter=adapter, **polars_kwargs)
display(semifinal_lf.select(pl.len()).collect().item())
display(semifinal_lf.select(pl.col('mdr_report_key').n_unique()).head().collect().item())

# 저품질 행, 필터링

- 제조사 존재 안함
- UDI, 제품군이 존재하지 않음
- Class 3에서 UDI가 없는 것들
- top 10으로만 설정 -> 시간.. 중요...

In [None]:
filtered_lf = semifinal_lf.filter(
    pl.col('manufacturer_final').is_not_null(),
    ~ pl.col('device_version_id').str.starts_with('UNK'),
    pl.col('udi_confidence').ne('VERY_LOW'),
    pl.col('udi_confidence').ne('LOW'),
)

top10_lst = filtered_lf.group_by('device_0_device_report_product_code').agg(
    pl.len().alias('count')
).sort('count', descending=True).head(10).collect().to_pandas()['device_0_device_report_product_code'].to_list()

filtered_lf.select(pl.len()).collect().item()

In [None]:
# filtered_lf = filtered_lf.filter(
#     pl.col('device_0_device_report_product_code').is_in(top10_lst),
# )

display(filtered_lf.select(pl.len()).collect().item())

display(
    filtered_lf.select(
        pl.col(['combined_mdr_text', 'product_problems']).n_unique()
    ).head().collect()
)

filtered_lf.group_by(['combined_mdr_text', 'product_problems']).agg(
    pl.col('mdr_report_key').n_unique()
).select(pl.len()).head().collect()

In [None]:
matched_lf = filtered_lf.clone()


# 타입 변환

## Categorical

### event_type
- categorical

In [None]:
def convert_event_type(matched_lf, verbose=True):
    """
    Cast the ``event_type`` column to ``pl.Categorical``.

    Parameters:
        matched_lf: polars LazyFrame containing an ``event_type`` column.
        verbose: accepted for interface compatibility; currently unused.

    Returns:
        The LazyFrame with ``event_type`` stored as Categorical.
    """
    as_category = pl.col('event_type').cast(pl.Categorical)
    return matched_lf.with_columns(as_category)

In [None]:
# convert_event_type takes (frame, verbose); the column name is fixed inside
# the function, so do not pass "event_type" into the verbose slot.
matched_lf = convert_event_type(matched_lf)

matched_lf.select(pl.len()).collect().item()

## boolean
- True / False로

### bool처리 + I까지 
- boolean

adverse_event_flag

product_problem_flag

report_to_fda

manufacturer_link_flag

In [None]:
def cast_flags_to_bool(matched_lf: pl.LazyFrame, flag_cols: str | list[str], true_val: str = 'Y', false_val: str = 'N') -> pl.LazyFrame:
    """
    Convert two-valued flag columns to Boolean.

    - value equal to ``true_val``  -> True
    - value equal to ``false_val`` -> False
    - anything else (e.g. 'I', null, other codes) -> null

    Parameters:
        matched_lf: input LazyFrame.
        flag_cols: one column name or a list of names; names that are not
            in the schema are skipped silently.
        true_val: raw value mapped to True.
        false_val: raw value mapped to False.

    Returns:
        The LazyFrame with each present flag column replaced by a Boolean
        column of the same name.
    """
    if isinstance(flag_cols, str):
        flag_cols = [flag_cols]

    schema_names = matched_lf.collect_schema().names()

    return matched_lf.with_columns([
        pl.when(pl.col(col).eq(true_val))
          .then(True)
        .when(pl.col(col).eq(false_val))
          .then(False)
        .otherwise(None)  # every other value, including null, becomes null
        .alias(col)
        for col in flag_cols
        if col in schema_names
    ])


In [None]:
# 사용 (한 번만!) 
matched_lf = cast_flags_to_bool(matched_lf, [
    "adverse_event_flag",
    "product_problem_flag",
    'reprocessed_and_reused_flag',
    'single_use_flag',
    "report_to_fda"  # ← "I"도 자동으로 None 처리됨
])

matched_lf.select(pl.len()).collect().item()

In [None]:
matched_lf = cast_flags_to_bool(
    matched_lf, 
    'previous_use_code', 
    true_val='I', false_val='R'
)

matched_lf.select(pl.len()).collect().item()

In [None]:
matched_lf = cast_flags_to_bool(
    matched_lf, 
    'device_0_device_operator', 
    true_val='HEALTH PROFESSIONAL', false_val='LAY USER/PATIENT'
)

matched_lf.select(pl.len()).collect().item()

### mdr_report_key (int 32?)

In [None]:
def cast_to_int32(
    matched_lf: pl.LazyFrame,
    int_cols: str | list[str],
) -> pl.LazyFrame:
    """
    Cast the given columns to ``pl.Int32`` in non-strict mode.

    Parameters:
        matched_lf: input LazyFrame.
        int_cols: one column name or a list of names.

    Returns:
        The LazyFrame with every requested column that exists cast to
        Int32. A warning line is printed for each requested column that is
        missing; if none exist the frame is returned unchanged.
    """
    requested = [int_cols] if isinstance(int_cols, str) else int_cols

    schema = matched_lf.collect_schema()
    present = [name for name in requested if name in schema]

    # Report requested columns that are not in the frame.
    for missing in set(requested) - set(present):
        print(f"[WARN] Column '{missing}' not found. Skipped.")

    if present:
        casts = [pl.col(name).cast(pl.Int32, strict=False) for name in present]
        return matched_lf.with_columns(casts)
    return matched_lf
    

In [None]:
matched_lf = cast_to_int32(
    matched_lf, 'mdr_report_key'
)

matched_lf.select(pl.len()).collect().item()

# 중복 제거

In [None]:
# Promote every "*_final*" column to the same name with "_final" removed
# (overwriting an existing column of that name), then drop the originals.
final_to_base = {
    name: name.replace('_final', '')
    for name in matched_lf.collect_schema().names()
    if '_final' in name
}
columns_with_final = list(final_to_base)

matched_lf = matched_lf.with_columns(
    [pl.col(src).alias(dst) for src, dst in final_to_base.items()]
).drop(columns_with_final)

matched_lf.select(pl.len()).collect().item()

In [None]:
# 1차 : 중복 제거 컬럼 
dedup_cols = [
    'report_number',
    'date_of_event', 
    'manufacturer',
    'device_version_id',
    'lot_number',
    'udi_public'
]

# Regex of Unknown / N/A placeholders. remove_na_values upper-cases the text
# before matching, so the pattern must contain the upper-case form ^NONE$;
# ^None$ is kept for any caller that matches without upper-casing.
na_patterns = r'^None$|^NONE$|^UNK|NOT APPLICABLE|NOT REPORTED|^N/A$|^NA$|^$|^\s+$|^UNKNOWN$|^NI$|^NULL$'

In [None]:
# 일단 중복된 행들만 확인
# 조합 (6개 컬럼)의 개수
matched_lf_with_cnt = matched_lf.with_columns(
    pl.len().over(dedup_cols).alias('duplicate_cnt')
)

# cnt가 2 이상이 경우에만 
# cnt가 1인 경우에는 삭제할 필요가 없으므로
matched_lf_duplicates_only = matched_lf_with_cnt.filter(
    pl.col('duplicate_cnt') >= 2
)

duplicate_cnt = matched_lf_duplicates_only.select(pl.len()).collect().item()
print(f"중복된 행의 개수: {duplicate_cnt:,}개")

unique_cnt = matched_lf.unique(subset=dedup_cols, maintain_order=True).select(pl.len()).collect().item()
print(f"유일한 행의 개수: {unique_cnt:,}개")

matched_lf.select(pl.len()).collect().item()

In [None]:
def remove_na_values(matched_lf: pl.LazyFrame, dedup_cols, na_patterns, verbose=True):
    """
    Drop rows that have a null or NA/Unknown-like value in any key column.

    A row is kept only if, for every column of ``dedup_cols`` that exists
    in the frame, the value is non-null and its upper-cased text form does
    not match ``na_patterns``.

    Parameters:
        matched_lf: input LazyFrame.
        dedup_cols: column names to check; names missing from the schema
            are skipped.
        na_patterns: regex describing NA/Unknown placeholders; it is
            matched against the upper-cased value.
        verbose: when True, print progress and before/after row counts
            (this triggers two extra collects). When False nothing is
            printed and nothing is collected.

    Returns:
        The filtered LazyFrame, or the input frame unchanged when none of
        ``dedup_cols`` exists.
    """
    if verbose:
        print("NA 값 제거")
        print(f"패턴: {na_patterns}")
        before_cnt = matched_lf.select(pl.len()).collect().item()
        print(f"제거 전 행 개수: {before_cnt:,}개")

    # Resolve the schema once instead of once per column.
    schema_names = set(matched_lf.collect_schema().names())

    conditions = []
    for col in dedup_cols:
        if col not in schema_names:
            if verbose:
                print(f"  컬럼 '{col}'이(가) 존재하지 않음. 건너뜀")
            continue

        # Valid value: not null and not matching an NA/Unknown pattern.
        conditions.append(
            pl.col(col).is_not_null()
            & ~pl.col(col).cast(pl.Utf8).str.to_uppercase().str.contains(na_patterns)
        )
        if verbose:
            print(f"  컬럼 '{col}'에 대해 NA/Unknown 값 제거 조건 추가")

    if not conditions:
        if verbose:
            print("제거할 조건이 없음. 원본 반환")
        return matched_lf

    # Every per-column condition must hold.
    final_condition = pl.all_horizontal(conditions)

    if verbose:
        print('='*50)
        print(final_condition)

    matched_lf_cleaned = matched_lf.filter(final_condition)

    if verbose:
        after_cnt = matched_lf_cleaned.select(pl.len()).collect().item()
        removed_cnt = before_cnt - after_cnt
        print(f"제거 후 행 개수: {after_cnt:,}개")
        print(f"제거된 행 개수: {removed_cnt:,}개")

    return matched_lf_cleaned


In [None]:
def analyze_duplicates(matched_lf, group_cols, verbose = True):
    """
    Count total, unique and duplicate rows with respect to ``group_cols``.

    Parameters:
        matched_lf: polars LazyFrame to inspect.
        group_cols: columns that define row identity for duplicate checks.
        verbose: when True, print the three counts and the list of
            columns used.

    Returns:
        tuple: (total row count, unique row count, duplicate row count),
        where duplicates = total - unique.
    """
    if verbose:
        print(" 중복 확인")

    # Total number of rows.
    total_cnt = matched_lf.select(pl.len()).collect().item()

    # Number of distinct key combinations (row order is irrelevant for a count).
    unique_cnt = matched_lf.unique(
        subset = group_cols
    ).select(pl.len()).collect().item()

    duplicate_cnt = total_cnt - unique_cnt

    if verbose:
        print(f"전체 개수 : {total_cnt:,}개")
        print(f"고유 개수 : {unique_cnt:,}개")
        print(f"중복 개수 : {duplicate_cnt:,}개")
        # List the columns that were actually used (the parameter, not a global).
        for i, col in enumerate(group_cols, start=1):
            print(f"{i}. {col}")

    return total_cnt, unique_cnt, duplicate_cnt

In [None]:
def remove_duplicates(matched_lf, dedup_cols, keep = 'first', verbose = True):
    """
    Remove rows that are duplicates with respect to ``dedup_cols``.

    Parameters:
        matched_lf: polars LazyFrame to de-duplicate.
        dedup_cols: columns that define row identity.
        keep: which row of each duplicate group to retain; forwarded to
            ``LazyFrame.unique`` ('first' keeps the first row, 'last' the
            last row).
        verbose: when True, print the settings and before/after row counts
            (this collects the frame twice).

    Returns:
        The de-duplicated LazyFrame, in the original row order.
    """
    if verbose:
        print("중복 제거 시작")
        print(f"중복 판단 컬럼: {dedup_cols}")
        print(f"유지 옵션: {keep}")

    # Honor the caller's keep option instead of always keeping the first row.
    matched_lf_deduped = matched_lf.unique(
        subset = dedup_cols,
        maintain_order = True,
        keep = keep
    )

    if verbose:
        before_cnt = matched_lf.select(pl.len()).collect().item()
        after_cnt = matched_lf_deduped.select(pl.len()).collect().item()
        removed_cnt = before_cnt - after_cnt

        print(f"제거 전 행 개수 : {before_cnt:,}개")
        print(f"제거 후 행 개수 : {after_cnt:,}개")
        print(f"제거된 행 개수 : {removed_cnt:,}개")

    return matched_lf_deduped

In [None]:
# na 값들 제거
matched_lf_cleaned = remove_na_values(matched_lf_duplicates_only, dedup_cols, na_patterns)
print(matched_lf_cleaned)

matched_lf_cleaned.select(pl.len()).collect().item()

In [None]:
total, unique, duplicate = analyze_duplicates(matched_lf_cleaned, dedup_cols)
pprint((total, unique, duplicate))

matched_lf_cleaned.select(pl.len()).collect().item()

In [None]:
remove= remove_duplicates(matched_lf_cleaned, dedup_cols, keep='first')
print(remove)

remove.select(pl.len()).collect().item()

In [None]:
remove.select(
    pl.col('combined_mdr_text').n_unique()
).head().collect()

In [None]:
matched_lf.select(pl.col('mdr_report_key').n_unique()).head().collect()

In [None]:
# Option 1: anti join (recommended)
# Keeps the rows of matched_lf whose mdr_report_key does NOT appear in `remove`.
# NOTE(review): `remove` is the output of remove_duplicates(..., keep='first')
# applied to the NA-free duplicated rows, i.e. it holds the one row per
# duplicate group that was RETAINED. Anti-joining on it therefore drops those
# retained rows' keys from matched_lf rather than the surplus duplicates -
# confirm this is the intended direction.
result_lf = matched_lf.join(
    remove,
    on='mdr_report_key',  # join key
    how='anti'  # keep only rows without a match
)

display(result_lf.select(pl.col('mdr_report_key').n_unique()).head().collect().item())
remove.select(pl.len()).collect().item(), result_lf.select(pl.len()).collect().item(), matched_lf.select(pl.len()).collect().item()

In [None]:
matched_lf = result_lf.clone()

# categorical 인코딩

In [None]:
def sequence_number_outcome_clean(matched_lf, col_name):
    """
    One-hot encode a stringified outcome-list column (the frame stays lazy).

    The column is parsed as text: a leading ``[`` and trailing ``]`` are
    removed, single quotes are dropped, the remainder is split on ``,`` and
    each piece is trimmed. Every known outcome label is then replaced by its
    one-letter code and one boolean column ``outcome_<code>`` is added per
    code in ``L, H, S, C, R, D, O``.

    Parameters:
    matched_lf : frame that contains ``col_name``
    col_name : str -> name of the outcome column
        (e.g. ``patient_0_sequence_number_outcome``)

    Returns:
    The input frame with the ``outcome_*`` columns appended and ``col_name``
    (plus the temporary ``_outcome_list`` column) dropped.

    NOTE(review): a null value in ``col_name`` presumably yields null (not
    False) in every ``outcome_*`` column -- confirm this is what downstream
    code expects.
    """

    # Outcome label -> one-letter code. All non-specific labels share 'O'.
    outcome_mapping = {
        'Life Threatening': 'L',
        'Hospitalization': 'H',
        'Disability': 'S',
        'Congenital Anomaly': 'C',
        'Required Intervention': 'R',
        'Death': 'D',
        'Other': 'O',
        'Invalid Data': 'O',
        'Unknown': 'O',
        'No Information': 'O',
        'Not Applicable': 'O',
    }

    # One flag column is created per code, in this order.
    all_outcomes = ['L', 'H', 'S', 'C', 'R', 'D', 'O']

    # Per-element transformation, built once: trim the label, then apply every
    # label -> code substitution. Folding the substitutions into a single
    # expression replaces one with_columns/list.eval pass per mapping key.
    # literal=True matches the labels as plain text rather than as regexes.
    to_code = pl.element().str.strip_chars()
    for label, code in outcome_mapping.items():
        to_code = to_code.str.replace(label, code, literal=True)

    result = matched_lf.with_columns(
        pl.col(col_name)
        .str.replace_all(r'^\[|\]$', "")
        .str.replace_all(r"'", "")
        .str.split(",")
        .list.eval(to_code)
        .alias("_outcome_list")
    )

    # Create all one-hot flags in a single with_columns call.
    result = result.with_columns(
        [
            pl.col("_outcome_list").list.contains(code).alias(f"outcome_{code}")
            for code in all_outcomes
        ]
    )

    # The helper list and the raw source column are no longer needed.
    return result.drop("_outcome_list", col_name)

In [None]:
# One-hot encode the outcome column on the whole LazyFrame; only the final
# preview below is limited to the first rows via head().
matched_lf = sequence_number_outcome_clean(
    matched_lf,
    "patient_0_sequence_number_outcome",
)

# Sanity checks: distinct report keys and total row count after encoding.
display(matched_lf.select(pl.col('mdr_report_key').n_unique()).head().collect().item())
display(matched_lf.select(pl.len()).collect().item())

# Preview the generated flag columns.
outcome_flag_cols = ['outcome_L', 'outcome_H', 'outcome_S', 'outcome_C', 'outcome_R', 'outcome_D', 'outcome_O']
matched_lf.select(outcome_flag_cols).head().collect()


# Flag (품질 & 관련성 파악) + 파생변수

### adverse_event_flag x event_type

In [None]:
# Derive an adverse-event flag from event_type: True for "Death" / "Injury",
# False for every other value. is_in() already yields a boolean, so no
# when/then/otherwise wrapper is needed; fill_null(False) makes a missing
# event_type count as False instead of null.
matched_lf = matched_lf.with_columns(
    pl.col("event_type")
      .is_in(["Death", "Injury"])
      .fill_null(False)
      .alias("adverse_event_flag_logic")
)

# Sanity checks: distinct report keys and total row count are unchanged by
# adding a column.
display(matched_lf.select(pl.col('mdr_report_key').n_unique()).head().collect().item())
display(matched_lf.select(pl.len()).collect().item())

In [None]:
# The derived column is named "adverse_event_flag_logic".
# Columns shown when comparing it against the reported adverse_event_flag.
cols = ["event_type", "adverse_event_flag", "adverse_event_flag_logic"]
reported_flag = pl.col("adverse_event_flag")
derived_flag = pl.col("adverse_event_flag_logic")

# 1) Up to 20 rows where the derived flag is True (Death / Injury).
true_df = matched_lf.filter(derived_flag).select(cols).collect()
true_samples = true_df.sample(n=min(20, true_df.height), seed=42)

# 2) Up to 20 rows where the derived flag is False (everything else).
false_df = matched_lf.filter(~derived_flag).select(cols).collect()
false_samples = false_df.sample(n=min(20, false_df.height), seed=42)

# 3) Up to 20 conflicting rows: the reported flag is present but differs
#    from the derived one.
conflict_df = (
    matched_lf
    .filter(reported_flag.is_not_null(), reported_flag.ne(derived_flag))
    .select(cols)
    .collect()
)
conflict_samples = conflict_df.sample(n=min(20, conflict_df.height), seed=42)

# Distinct report keys / row count before the consistency filter.
display(matched_lf.select(pl.col('mdr_report_key').n_unique()).head().collect().item())
display(matched_lf.select(pl.len()).collect().item())

# Keep only rows whose reported flag equals the derived flag.
logical_lf = matched_lf.filter(reported_flag == derived_flag)

# Distinct report keys / row count after the consistency filter.
display(logical_lf.select(pl.col('mdr_report_key').n_unique()).head().collect().item())
display(logical_lf.select(pl.len()).collect().item())

# NOTE(review): only the last two samples are converted to pandas for display.
true_samples, false_samples.to_pandas(), conflict_samples.to_pandas()


### 날짜 순서 모순 처리

In [None]:
# Keep only rows whose dates are in a consistent order:
#   manufacture date < event date <= received date.
# NOTE(review): rows where any of these dates is null are presumably dropped
# as well, because a null comparison does not pass the filter -- confirm that
# this is intended.
event_date = pl.col('date_of_event')
logical_lf = logical_lf.filter(
    (pl.col('device_date_of_manufacturer') < event_date)
    & (event_date <= pl.col('date_received'))
)

In [None]:
text_problem_cols = ['combined_mdr_text', 'product_problems']

# Rows remaining after the logical filters.
display(logical_lf.select(pl.len()).collect().item())

# Distinct-value count of each of the two columns, taken independently.
per_col_unique = logical_lf.select(pl.col(text_problem_cols).n_unique())
display(per_col_unique.head().collect())

# Number of distinct (text, problems) combinations, i.e. the number of groups.
reports_per_combo = logical_lf.group_by(text_problem_cols).agg(
    pl.col('mdr_report_key').n_unique()
)
reports_per_combo.select(pl.len()).head().collect().item()

# 텍스트 처리

# 2차 열 drop

In [None]:
# Sort the rows by mdr_report_key.
logical_lf = logical_lf.sort('mdr_report_key')

In [None]:
# Rename source columns to the final, shorter output names.
rename_map = {
    'patient_0_patient_age': 'patient_age',
    'device_0_device_operator': 'operator',
    'device_0_device_report_product_code': 'product_code',
    'device_0_openfda_device_name': 'product_name',
    'combined_mdr_text': 'mdr_text',
    'device_version_id': 'udi_di',
    'previous_use_code': 'previous_use_flag',
    'date_of_event': 'date_occurred',
    'device_date_of_manufacturer': 'date_manufactured',
    'brand': 'brand_name',
    'manufacturer': 'manufacturer_name',
}

# The existing 'udi_di' column is dropped first; device_version_id takes over
# that name in the rename (presumably to avoid a duplicate column name).
without_old_udi = logical_lf.drop('udi_di')
logical_lf = without_old_udi.rename(rename_map)

In [None]:
# Base (report-level) columns.
BASE_COLS = [
    'mdr_report_key',
    'adverse_event_flag',
    'product_problem_flag',
    'date_occurred',
    'date_received',
    'date_manufactured',
    'event_type',
    'previous_use_flag',
    'single_use_flag',
    'reprocessed_and_reused_flag',
    'product_problems',
]

# Device columns.
DEVICE_COLS = [
    "manufacturer_name",
    "brand_name",
    "model_number",
    "udi_di",
    "product_code",
    "operator",
    "product_name",
]

# Patient columns.
PATIENT_COLS = ["patient_age"]

# Free-text column.
MDR_TEXT_COLS = ['mdr_text']

# Regex used to look up the outcome_* columns in logical_lf.
OUTCOME_PATTERNS = [r'^outcome']
OUTCOME_COLS = get_pattern_cols(logical_lf, OUTCOME_PATTERNS)

# Final column order: base, device, patient, text, then outcome columns.
TOTAL_COLS = [
    *BASE_COLS,
    *DEVICE_COLS,
    *PATIENT_COLS,
    *MDR_TEXT_COLS,
    *OUTCOME_COLS,
]

In [None]:
# Project onto the final column set, then hand the frame and its column names
# to analyze_null_values.
final_lf = logical_lf.select(TOTAL_COLS)
final_schema = final_lf.collect_schema()
final_cols = final_schema.names()

analyze_null_values(final_lf, final_cols)

In [None]:
# Output location of the final dataset: <DATA_DIR>/silver/maude60.parquet
final_path = DATA_DIR / 'silver' / 'maude60.parquet'

In [None]:
# Write the final LazyFrame to Parquet (zstd, level 3); mkdir=True is passed
# so that a missing output directory is created.
final_lf.sink_parquet(
    final_path,
    compression='zstd',
    compression_level=3,
    mkdir=True,
)