In [None]:
from typing import Tuple, List, Dict, Any, Sequence
import polars as pl
import polars.selectors as cs
import pandas as pd
from pprint import pprint, pformat
import sys
from pathlib import Path
from tqdm import tqdm, trange
import psutil
import re


# Use paths relative to the working directory: the project root is its parent.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / 'data'

# Put the project root at the very front of sys.path
# (an existing entry is removed first so it is not merely left further back).
if str(PROJECT_ROOT) in sys.path:
    sys.path.remove(str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT))

# The project package is called `code`, which is also the name of a Python
# built-in module; drop any cached `code` module so the imports below are
# looked up again.
if 'code' in sys.modules:
    del sys.modules['code']

# Now import the project modules
from code.utils import process_lazyframe_in_chunks
from code.loading import DataLoader
from code.preprocess import TextPreprocessor, create_udi_preprocessor, create_company_preprocessor, create_generic_preprocessor
from code.preprocess.preprocess import get_pattern_cols, \
    get_unique_by_cols_safe, get_unique, \
    analyze_null_values, replace_pattern_with_null, overview_col

## 원본 데이터 불러오기

In [None]:
# Load the MAUDE data
loader1 = DataLoader(
    start=2020,
    end=2025,
    output_file = DATA_DIR / 'maude_sample.parquet',
    max_workers=4
)

# Adapter name and keyword arguments passed to DataLoader.load; the later
# loading cells pass the same two names.
adapter = 'polars'
polars_kwargs = {
    'use_statistics': True,
    'parallel': 'auto',
    'low_memory': False,
    'rechunk': False,
    'cache': True,
}
maude_lf = loader1.load(adapter=adapter, **polars_kwargs)
maude_lf

In [None]:
# Load the UDI data
udi_loader = DataLoader(
    name='udi',
    output_file=DATA_DIR/'udi.parquet',
)

# Same adapter and keyword arguments as the MAUDE load.
udi_lf = udi_loader.load(adapter, **polars_kwargs)
udi_lf

## UDI Dataset 전처리

In [None]:
# Regular-expression patterns for groups of UDI column names.
# NOTE(review): only some patterns are anchored with ^ / $; whether that matters
# depends on how get_pattern_cols applies them — confirm.
IDENTIFIER_PATTERNS = [
    r"^device_\d+_brand_name$",
    r"identifiers_\d+_id", 
    r"identifiers_\d+_issuing_agency", 
    r"identifiers_\d+_package_discontinue_date", 
    r"identifiers_\d+_package_status", 
    r"identifiers_\d+_package_type", 
    r"identifiers_\d+_quantity_per_package", 
    r"identifiers_\d+_type", 
    r"identifiers_\d+_unit_of_use_id"
]
# Columns holding the identifier value itself (identifiers_<N>_id).
UDI_DI_PATTERNS = [r'^identifiers_\d+_id$']
# Columns holding the identifier type (compared against "Primary" further down).
TYPE_PATTERNS = [
    r'identifiers_\d+_type'
]

# Prefix patterns of the column groups that are dropped from the UDI frame.
CUSTOMER_PATTERNS = [r'^customer']
DEVICE_SIZE_PATTERNS = [r'^device_sizes']
STORAGE_PATTERNS = [r'^storage']

### Drop 필요 없는 열

In [None]:
# Merge every drop pattern into one alternation regex.
drop_patterns = [*CUSTOMER_PATTERNS, *DEVICE_SIZE_PATTERNS, *STORAGE_PATTERNS]
regex = "|".join(drop_patterns)

# Keep all columns except those whose name matches the combined regex.
udi_lf = udi_lf.select(cs.all() - cs.matches(regex))

# udi_lf.collect_schema().names()

### Primary 추출

In [None]:
# Resolve each pattern list against the UDI frame
# (presumably yields the matching column names — confirm in get_pattern_cols).
udi_di_cols = get_pattern_cols(udi_lf, UDI_DI_PATTERNS)
identifiers_cols = get_pattern_cols(udi_lf, IDENTIFIER_PATTERNS)
type_cols = get_pattern_cols(udi_lf, TYPE_PATTERNS)

In [None]:
# Build type / udi_di column pairs (matched by index)
def extract_index(col_name):
    """Return the integer N found in ``identifiers_N_`` within *col_name*.

    Returns None when the name contains no such fragment.
    """
    found = re.search(r'identifiers_(\d+)_', col_name)
    if found is None:
        return None
    return int(found.group(1))

# For each type column, derive the id column sharing its index and keep the
# pair only when that id column is one of udi_di_cols.
type_id_pairs = [
    (type_name, id_name)
    for type_name, id_name in (
        (name, f'identifiers_{extract_index(name)}_id') for name in type_cols
    )
    if id_name in udi_di_cols
]

len(type_id_pairs)

In [None]:
# Collect every distinct identifier whose paired type column equals "Primary".
primary_udi_unique = set()

for type_col, id_col in tqdm(type_id_pairs, desc="Processing columns", unit="col"):
    try:
        # One query per column pair: filter to Primary rows and pull the distinct,
        # non-null ids. A separate row-count query beforehand is unnecessary,
        # because an empty result simply adds nothing to the set.
        ids = (
            udi_lf
            .filter(pl.col(type_col).eq("Primary"))
            .select(pl.col(id_col))
            .drop_nulls()
            .unique()
            .collect()
            .to_series()
            .to_list()
        )
    except Exception as e:
        # Best effort: report the failing column and carry on with the rest.
        tqdm.write(f"Error processing {type_col}: {e}")
        continue

    primary_udi_unique.update(ids)

print(f"\n{'='*50}")
print(f"UDI 데이터의 고유 Primary udi 개수: {len(primary_udi_unique):,}")
print(f"{'='*50}")

In [None]:
# File locations under data/silver for the two UDI steps (primary, clean).
udi_step1_path = DATA_DIR / 'silver' / 'udi_primary.parquet'
udi_step2_path = DATA_DIR / 'silver' / 'udi_clean.parquet'

In [None]:
# # Primary인 id를 추출
# def primary_transform(lf: pl.LazyFrame):
#     return lf.with_columns(
#         pl.coalesce([
#             pl.when(pl.col(type_col).eq("Primary"))
#             .then(pl.col(id_col))
#             for type_col, id_col in type_id_pairs
#         ]).alias('primary_udi_di')
#     )

# process_lazyframe_in_chunks(
#     udi_lf, 
#     primary_transform, 
#     udi_step1_path, 
#     10_000,
#     desc="Primary extraction"
# )

In [None]:
# # udi 데이터 불러오기
# udi_loader = DataLoader(
#     name='udi',
#     output_file=udi_step1_path,
# )

# primary_udi_lf = udi_loader.load(adapter, **polars_kwargs)

In [None]:
# # 회사 이름 정제
# preprocessor1 = create_company_preprocessor()

# preprocessor1.apply_to_lazyframe(
#     primary_udi_lf,
#     ['company_name', 'brand_name'],
#     udi_step2_path,
#     10_000,
# )

# del preprocessor1, primary_udi_lf
# udi_step1_path.unlink(missing_ok=True)

In [None]:
# Load the UDI data, this time pointing the loader at the step-2 file.
# NOTE(review): the cells that would write udi_step2_path are commented out
# above, so the file presumably has to exist already — confirm.
udi_loader = DataLoader(
    name='udi',
    output_file=udi_step2_path,
)

cleaned_udi_lf = udi_loader.load(adapter, **polars_kwargs)

In [None]:
# maude_lf.filter(
#     pl.col('device_0_udi_di').is_not_null()
# ).group_by(
#     ['device_0_manufacturer_d_name', 'device_0_brand_name', 'device_0_model_number', 'device_0_catalog_number']
# ).agg(
#     pl.col('device_0_udi_di').n_unique().alias('udi_nunique'),
#     pl.col('device_0_udi_di').unique().alias('udi_unique'),
#     pl.col('device_0_udi_di').mode().alias('udi_mode')
# ).sort('udi_nunique', descending=True).head(10).collect().to_pandas()

## MAUDE 1차 전처리

In [None]:
# Base (report-level) columns
BASE_COLS = [
    'mdr_report_key', 'report_number', 'adverse_event_flag', 'product_problem_flag', 
    'date_of_event', 'date_received', 'device_date_of_manufacturer', 'event_type',
    'previous_use_code', 'single_use_flag', 'report_source_code',
    'reprocessed_and_reused_flag', 'report_to_fda', 'event_location', 
    'manufacturer_link_flag', 'manufacturer_g1_name', 'manufacturer_g1_postal_code',
    'pma_pmn_number'
]

# Columns of the device at index 0 (device_0_*)
DEVICE_COLS = [
    "device_0_manufacturer_d_name",
    "device_0_manufacturer_d_postal_code",
    "device_0_brand_name",
    "device_0_catalog_number",
    "device_0_model_number",
    "device_0_udi_di",
    "device_0_lot_number",
    "device_0_udi_public",
    "device_0_device_report_product_code",
    "device_0_device_age_text",
    "device_0_device_operator",
    "device_0_openfda_device_class",
    "device_0_openfda_device_name",
]



# Columns of the patient at index 0 (patient_0_*)
PATIENT_COLS = [
    "patient_0_patient_sequence_number",
    "patient_0_patient_age",
    "patient_0_patient_sex",
    "patient_0_patient_weight",
    "patient_0_patient_race",
    "patient_0_patient_problems",
    "patient_0_sequence_number_outcome",
    "patient_0_sequence_number_treatment",
]



# Patterns for the mdr_text_* text and text-type-code columns
MDR_TEXT_PATTERNS = [
    r"^mdr_text_.*_text$",
    r"^mdr_text_.*_text_type_code$",
]

# The MDR text columns are resolved against the frame; the other lists are fixed.
MDR_COLS = get_pattern_cols(maude_lf, MDR_TEXT_PATTERNS)
TOTAL_COLS = BASE_COLS + DEVICE_COLS + PATIENT_COLS + MDR_COLS

### Drop 필요 없는 열

In [None]:
# Keep only the columns listed in TOTAL_COLS.
maude_lf = maude_lf.select(TOTAL_COLS)

# maude_lf.collect_schema().names()

In [None]:
# File locations under data/silver for the three MAUDE cleaning steps.
maude_step1_path = DATA_DIR / 'silver' / 'clean_step1.parquet'
maude_step2_path = DATA_DIR / 'silver' / 'clean_step2.parquet'
maude_step3_path = DATA_DIR / 'silver' / 'clean_step3.parquet'

In [None]:
# # 1단계
# preprocessor1 = create_udi_preprocessor()
# preprocessor1.apply_to_lazyframe(
#     maude_lf, 'device_0_udi_di', maude_step1_path, chunk_size=10_000
# )
# del preprocessor1  # 명시적 삭제

In [None]:
# # 2단계
# maude_lf2 = pl.scan_parquet(maude_step1_path)
# preprocessor2 = create_company_preprocessor()
# preprocessor2.apply_to_lazyframe(
#     maude_lf2, ['device_0_manufacturer_d_name', 'manufacturer_g1_name', 'device_0_brand_name'], 
#     maude_step2_path, chunk_size=10_000
# )
# del maude_lf2, preprocessor2

In [None]:
# # 중간 파일 정리
# maude_step1_path.unlink(missing_ok=True)

# # 3단계
# maude_lf3 = pl.scan_parquet(maude_step2_path)
# preprocessor3 = create_generic_preprocessor()
# preprocessor3.apply_to_lazyframe(
#     maude_lf3, ['device_0_model_number', 'device_0_catalog_number', 'device_0_lot_number'], 
#     maude_step3_path, chunk_size=10_000
# )
# del maude_lf3, preprocessor3

In [None]:
# Remove the step-2 intermediate file; missing_ok makes this a no-op when the
# file is already gone.
maude_step2_path.unlink(missing_ok=True)

# Load the MAUDE data, pointing the loader at the step-3 file.
loader3 = DataLoader(
    start=2020,
    end=2025,
    output_file=maude_step3_path,
)

# `adapter` and `polars_kwargs` are the ones defined in the first MAUDE loading
# cell; they are reused here rather than redefined so the settings cannot drift.
cleaned_maude_lf = loader3.load(adapter=adapter, **polars_kwargs)
cleaned_maude_lf

## 작업에 필요한 컬럼

In [None]:
# Give both frames the same short column names so they can be compared/joined.
rename_udi_lf = cleaned_udi_lf.rename({
    'company_name': 'manufacturer',
    'brand_name': 'brand',
    'version_or_model_number': 'model_number',
    'primary_udi_di': 'udi_di',
})

# Only the device_0_* columns listed here are renamed; every other MAUDE
# column keeps its original name.
rename_maude_lf = cleaned_maude_lf.rename({
    'device_0_manufacturer_d_name': 'manufacturer',
    'device_0_brand_name': 'brand',
    'device_0_model_number': 'model_number',
    'device_0_catalog_number': 'catalog_number',
    'device_0_lot_number': 'lot_number',
    'device_0_udi_di': 'udi_di',
    'device_0_udi_public': 'udi_public'
})

In [None]:
# Descriptive columns shared by both frames after renaming.
target_cols = [
    'manufacturer',
    'brand',
    'model_number',
    'catalog_number'
]

# Key column shared by both frames.
join_col = 'udi_di'

common_cols = target_cols + [join_col]

# MAUDE side: common columns plus the report key.
maude_cols = common_cols + [
    'mdr_report_key',
]

# UDI side: common columns plus every identifiers_<N>_id column.
udi_cols = common_cols + udi_di_cols

In [None]:
# Narrow both frames down to the columns chosen above.
udi_necessary_lf = rename_udi_lf.select(pl.col(udi_cols))
maude_necessary_lf = rename_maude_lf.select(pl.col(maude_cols))

## 고유값 추출

In [None]:
# Single column group, keyed 'udi', holding all identifiers_<N>_id columns.
cols_group = {
    'udi': udi_di_cols,
}

# Unique udi_di values of the UDI dataset (the result is looked up by the
# same 'udi' key used in cols_group).
udi_udi_unique = get_unique_by_cols_safe(
    udi_necessary_lf, 
    cols_group,
    memory_safety_ratio=0.3,
    calibration_factor = 1
)['udi']

In [None]:
# Unique udi_di values on the MAUDE side.
maude_udi_unique = get_unique(maude_necessary_lf, ['udi_di'])
# "angry": present in MAUDE but absent from the UDI data.
angry_udi_unique = maude_udi_unique - udi_udi_unique
# "survive": present in both MAUDE and the UDI data.
survive_udi_unique = maude_udi_unique & udi_udi_unique

print(f'UDI 데이터의 고유 udi 개수: {len(udi_udi_unique)}개')
print(f'MAUDE 데이터의 고유 udi 개수: {len(maude_udi_unique)}개')
print(f'UDI 데이터에 없는 MAUDE 데이터의 고유 udi 개수: {len(angry_udi_unique)}개')
print(f'UDI 데이터에 있는 MAUDE 데이터의 고유 udi 개수: {len(survive_udi_unique)}개')

In [None]:
# Split the shared UDIs by whether they appear among the Primary identifiers.
maude_primary_udi_unique = survive_udi_unique & primary_udi_unique
maude_secondary_udi_unique = survive_udi_unique - primary_udi_unique

print(f'Primary UDI인 MAUDE 데이터의 고유 udi 개수: {len(maude_primary_udi_unique)}개')
print(f'Primary UDI가 아닌 MAUDE 데이터의 고유 udi 개수: {len(maude_secondary_udi_unique)}개')

In [None]:
class UniqueUDIDI:
    def __init__(
        self,
        udi_udi_unique: set = None,
        maude_udi_unique: set = None,
        angry_udi_unique: set = None,
        survive_udi_unique: set = None,
        primary_udi_unique: set = None,
        maude_primary_udi_unique: set = None,
        maude_secondary_udi_unique: set = None
    ):
        self.udi = udi_udi_unique
        self.maude = maude_udi_unique
        self.angry = angry_udi_unique
        self.survive = survive_udi_unique
        self.primary = primary_udi_unique
        self.maude_primary = maude_primary_udi_unique
        self.maude_secondary = maude_secondary_udi_unique

    def print_stats(self):
        print(f'UDI 데이터의 고유 udi 개수: {len(self.udi)}개')
        print(f'UDI 데이터의 고유 Primary udi 개수: {len(self.primary)}개')
        print(f'MAUDE 데이터의 고유 udi 개수: {len(self.maude)}개')
        print(f'UDI 데이터에 있는 MAUDE 데이터의 고유 udi 개수: {len(self.survive)}개')
        print(f'Primary UDI인 MAUDE 데이터의 고유 udi 개수: {len(self.maude_primary)}개')
        print(f'Primary UDI가 아닌 MAUDE 데이터의 고유 udi 개수: {len(self.maude_secondary)}개')
        print(f'UDI 데이터에 없는 MAUDE 데이터의 고유 udi 개수: {len(self.angry)}개')

# Gather every unique-UDI set into one object, then print the summary.
unique_udi_di = UniqueUDIDI(
    udi_udi_unique=udi_udi_unique,
    maude_udi_unique=maude_udi_unique,
    angry_udi_unique=angry_udi_unique,
    survive_udi_unique=survive_udi_unique,
    primary_udi_unique=primary_udi_unique,
    maude_primary_udi_unique=maude_primary_udi_unique,
    maude_secondary_udi_unique=maude_secondary_udi_unique,
)

unique_udi_di.print_stats()

## UDI 데이터셋에 Primary 컬럼 생성

In [None]:
# Turn the MAUDE UDIs (the non-Primary ones found in the UDI data) into a LazyFrame
maude_udi_lf = pl.LazyFrame({
    'udi_di': list(unique_udi_di.maude_secondary)
})

In [None]:
# Unpivot, then filter immediately (to keep memory growth to a minimum)
udi_mapping_lf = (
    udi_necessary_lf
    .with_row_index('row_idx')
    # One output row per (source row, identifiers_<N>_id column).
    .unpivot(
        index=['row_idx'] + common_cols,
        on=udi_di_cols,
        variable_name='matched_col',
        value_name='match_udi_di'
    )
    .filter(
        pl.col('match_udi_di').is_not_null() &  # drop nulls
        pl.col('match_udi_di').is_in(unique_udi_di.maude_secondary)  # keep matches only
    )
    # The same id can sit in several identifier columns of one row; keep one.
    .unique(subset=['match_udi_di', 'row_idx'])
    .select(['match_udi_di', 'row_idx'] + common_cols)
)

In [None]:
# udi_mapping_df = udi_mapping_lf.collect().to_pandas()

In [None]:
# # 통계만 먼저 확인
# print(f"Total matches: {len(udi_mapping_df):,}")
# print(f"Unique UDIs: {udi_mapping_df['match_udi_di'].nunique():,}")

In [None]:
# udi_mapping_df[udi_mapping_df['match_udi_di'] == '00021292007706']

In [None]:
# Spot-check one specific udi_di on the MAUDE side (first 15 rows, transposed
# so each record reads as a column).
maude_necessary_lf.filter(
    pl.col('udi_di').eq('00021292007706')
).head(15).collect().to_pandas().transpose()

# 전처리 함수 설계

In [None]:
# 일치 점수 매기는 함수
# 1. 매핑한 행만 불러오기
# 2. company_name, brand_name, version_or_model_number, catalog_number에 대하여
# 2-1. 전부 대문자로
# 2-2. 일치 점수 매기기
# 2-3. 가장 점수가 높은 것의 primary_udi_di를 할당
# 2-4. 필요한 정보 가져오기

# udi를 깔끔하게 만듦
# primary: 그대로 primary_lf와 매칭해서 필요 info 가져옴
# secondary: primary_lf 중 udi 매칭한 lf에서 점수 높은 info 가져옴
# null: primary_lf에서 점수 높은 info 가져옴

### primary와 매칭해서 가져오는 함수

In [None]:
def extract_from_match(
    src_lf: pl.LazyFrame, desc_lf: pl.LazyFrame, 
    on: str | Sequence[str], 
    target_cols: str | Sequence[str]
):
    """Fill ``target_cols`` of ``desc_lf`` with values looked up in ``src_lf``.

    ``desc_lf`` is left-joined to ``src_lf`` on ``on``. For each target column
    the looked-up value wins; the value already in ``desc_lf`` is used only when
    the lookup produced null.

    Args:
        src_lf: frame that provides the values; must contain ``on`` and
            ``target_cols``.
        desc_lf: frame to enrich; its rows are preserved one-to-one.
        on: key column name, or a sequence of key column names.
        target_cols: column name(s) to fill. A target column that does not yet
            exist in ``desc_lf`` is added from ``src_lf``.

    Returns:
        A LazyFrame with the rows of ``desc_lf`` and the filled target columns.
    """
    # Normalise to lists so that str, list and tuple inputs can be mixed freely.
    on = [on] if isinstance(on, str) else list(on)
    target_cols = [target_cols] if isinstance(target_cols, str) else list(target_cols)

    # Rename the looked-up columns explicitly instead of relying on the join's
    # automatic suffix, which is only applied when the name already exists on
    # the left side.
    lookup_suffix = '__src'
    lookup_names = {col: f'{col}{lookup_suffix}' for col in target_cols}

    lookup = (
        src_lf
        .select(
            [pl.col(key) for key in on]
            + [pl.col(col).alias(alias) for col, alias in lookup_names.items()]
        )
        # One row per key: duplicate keys in the lookup would otherwise
        # multiply the matching rows of desc_lf in the left join.
        .unique(subset=on, keep='first', maintain_order=True)
    )

    existing = set(desc_lf.collect_schema().names())

    filled = [
        (
            pl.coalesce([alias, col]) if col in existing else pl.col(alias)
        ).alias(col)
        for col, alias in lookup_names.items()
    ]

    return (
        desc_lf
        .join(lookup, on=on, how='left')
        .with_columns(filled)
        .drop(list(lookup_names.values()))
    )

# Fill the target columns of the MAUDE frame from the UDI frame, keyed on udi_di.
maude_match_lf = extract_from_match(udi_necessary_lf, rename_maude_lf, join_col, target_cols)

# cleaned_maude_lf.head(10).collect().to_pandas()
# Preview the first rows of the result.
maude_match_lf.head(10).collect().to_pandas()

In [None]:
# Compare the frame after the UDI lookup (maude_match_lf) with the frame before
# it (rename_maude_lf): null analysis of the common columns, then an overview
# of the brand column — including the original maude_lf brand column.
analyze_null_values(maude_match_lf, common_cols)
analyze_null_values(rename_maude_lf, common_cols)
overview_col(maude_match_lf, 'brand', n_rows=100)
overview_col(rename_maude_lf, 'brand', n_rows=100)
overview_col(maude_lf, 'device_0_brand_name', n_rows=1000)

In [38]:
import polars as pl
from pathlib import Path
from code.preprocess.preprocessor import UDIProcessor
from code.preprocess.config import Config


# Destination file of the UDI processing run.
output_path=DATA_DIR / "maude_with_udi.parquet"

# Run the processing
processor = UDIProcessor(Config())
result_path = processor.process(
    maude_lf=rename_maude_lf,
    udi_lf=rename_udi_lf,
    output_path=Path(output_path),
    chunk_size=10_000
)


KeyboardInterrupt: 

In [None]:
# Load the file written to output_path by the processing cell.
loader4 = DataLoader(
    name='event',
    output_file=output_path
)

matched_lf = loader4.load(adapter=adapter, **polars_kwargs)

In [None]:
# Inspect the first rows whose match_source is 'udi_no_match' (transposed).
matched_lf.filter(
    pl.col('match_source').eq('udi_no_match')
    # & pl.col('udi_matched').eq(False)
).head(10).collect().to_pandas().transpose()

In [None]:
# Null analysis of the two date columns on the UDI side.
analyze_null_values(rename_udi_lf, ['publish_date', 'public_version_date'])

# Brand column after processing (brand_final) next to the MAUDE brand column.
overview_col(matched_lf, 'brand_final')
overview_col(rename_maude_lf, 'brand')

### 결측 패턴 분석

In [None]:
import polars as pl

# Missing-value pattern analysis of udi_di (Polars LazyFrame version)
def _udi_missing_rate_by(lf: pl.LazyFrame, key: str) -> pl.DataFrame:
    """Return the share of null udi_di values for each value of column *key*."""
    return (
        lf
        .group_by(key)
        .agg(pl.col('udi_di').is_null().mean().alias('missing_rate'))
        .collect()
    )


missing_pattern = {
    # Share of null udi_di over the whole frame (a single number).
    'overall_rate': (
        maude_match_lf
        .select(pl.col('udi_di').is_null().mean())
        .collect()
        .item()
    ),

    'by_manufacturer': _udi_missing_rate_by(maude_match_lf, 'manufacturer'),

    # date_received is parsed with the %Y%m%d format and reduced to its year.
    'by_year': _udi_missing_rate_by(
        maude_match_lf.with_columns(
            pl.col('date_received').str.to_datetime(format='%Y%m%d').dt.year().alias('year')
        ),
        'year',
    ),

    # The product-code column is never renamed in this notebook, so it still
    # carries its original MAUDE name; a plain 'product_code' column does not exist.
    'by_product_code': _udi_missing_rate_by(
        maude_match_lf, 'device_0_device_report_product_code'
    ),
}

In [None]:
# Look at rows whose brand is the purely numeric value '00421871' (first 10).
zero_brand = rename_maude_lf.filter(
    pl.col('brand').eq('00421871')
).head(10).collect()

zero_brand.to_pandas().transpose()

## MAUDE 데이터의 UDI를 깔끔하게

In [None]:
# Dump each UDI set to its own pretty-printed text file in the working directory.
for file_name, udi_values in (
    ('primary.txt', unique_udi_di.maude_primary),
    ('secondary.txt', unique_udi_di.maude_secondary),
    ('notfound.txt', unique_udi_di.angry),
):
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(pformat(udi_values, indent=4, width=80))

In [None]:
# Columns whose combination is checked against the number of distinct udi_di values.
group_cols = ['manufacturer', 'brand','model_number', 'lot_number']

In [None]:
# Count the distinct udi_di values per group_cols combination.
# The frame used here is rename_maude_lf: the previously referenced `test` is
# not defined in any visible cell (the cell would fail on a fresh kernel), and
# rename_maude_lf carries udi_di plus every column in group_cols.
group_lf = (
    rename_maude_lf
    .select(['udi_di', *group_cols])
    .group_by(group_cols)
    .agg(
        pl.col('udi_di').n_unique().alias('udi_di_unique'),
    )
)

# Collect once and reuse the result for both the count and the preview.
group_df = group_lf.collect()

# Groups that map to more than one distinct udi_di.
outlier = group_df.filter(pl.col('udi_di_unique').gt(1)).height

print(f'UDI-DI 이상치는 {outlier}개 있습니다.')
group_df.drop_nulls().sort('udi_di_unique', descending=True).head(100).to_pandas()

In [None]:
# Drill into one manufacturer / model / brand combination and list its
# fully populated rows (up to 1000).
rename_maude_lf.select(group_cols + ['udi_di']).filter(
    pl.col('manufacturer').eq('DEXCOM'),
    pl.col('model_number').eq('9500-161'),
    pl.col('brand').eq('DEXCOM G7 CONTINUOUS GLUCOSE MONITORING SYSTEM'),
).drop_nulls().head(1000).collect().to_pandas()