In [1]:
from typing import Tuple, List, Dict, Any, Sequence
import polars as pl
import polars.selectors as cs
import pandas as pd
from pprint import pprint, pformat
import sys
from pathlib import Path
from tqdm import tqdm, trange
import psutil
import re


# Use paths relative to the working directory.
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT.joinpath('data')

# Put the project root at the very front of sys.path (dropping an existing entry first).
try:
    sys.path.remove(str(PROJECT_ROOT))
except ValueError:
    pass  # it was not on the path yet
sys.path.insert(0, str(PROJECT_ROOT))

# Temporarily evict only the cached built-in `code` module.
sys.modules.pop('code', None)

# Ïù¥Ï†ú import
from code.utils import process_lazyframe_in_chunks
from code.loading import DataLoader
from code.preprocess import TextPreprocessor, create_udi_preprocessor, create_company_preprocessor, create_generic_preprocessor
from code.preprocess.preprocess import get_pattern_cols, \
    get_unique_by_cols_safe, get_unique, \
    analyze_null_values, replace_pattern_with_null, overview_col

## 원본 데이터 불러오기

In [2]:
# Load the MAUDE data.
adapter = 'polars'
polars_kwargs = dict(
    use_statistics=True,
    parallel='auto',
    low_memory=False,
    rechunk=False,
    cache=True,
)

loader1 = DataLoader(
    start=2020,
    end=2025,
    output_file=DATA_DIR / 'maude_sample.parquet',
    max_workers=4,
)
maude_lf = loader1.load(adapter=adapter, **polars_kwargs)
maude_lf


üìñ /Users/eric/yeeun/Project4/data/maude_sample.parquet Î°úÎî© Ï§ë... (adapter=polars)


In [3]:
# Load the UDI data.
udi_loader = DataLoader(name='udi', output_file=DATA_DIR / 'udi.parquet')

udi_lf = udi_loader.load(adapter, **polars_kwargs)
udi_lf


üìñ /Users/eric/yeeun/Project4/data/udi.parquet Î°úÎî© Ï§ë... (adapter=polars)


## UDI Dataset 전처리

In [4]:
# Regex patterns used to pick columns by name.
# The identifier family shares one prefix and differs only in the trailing field.
_IDENTIFIER_FIELDS = (
    "id",
    "issuing_agency",
    "package_discontinue_date",
    "package_status",
    "package_type",
    "quantity_per_package",
    "type",
    "unit_of_use_id",
)
IDENTIFIER_PATTERNS = [r"^device_\d+_brand_name$"] + [
    rf"identifiers_\d+_{field}" for field in _IDENTIFIER_FIELDS
]
UDI_DI_PATTERNS = [r'^identifiers_\d+_id$']
TYPE_PATTERNS = [r'identifiers_\d+_type']

# Prefix patterns for the column families that are dropped later.
CUSTOMER_PATTERNS = [r'^customer']
DEVICE_SIZE_PATTERNS = [r'^device_sizes']
STORAGE_PATTERNS = [r'^storage']

### Drop 필요없는 열

In [5]:
# Every column whose name matches a customer / device-size / storage pattern is left out.
drop_patterns = [*CUSTOMER_PATTERNS, *DEVICE_SIZE_PATTERNS, *STORAGE_PATTERNS]
regex = "|".join(drop_patterns)

unwanted_cols = cs.matches(regex)
udi_lf = udi_lf.select(~unwanted_cols)

# udi_lf.collect_schema().names()

### Primary 추출

In [6]:
# Resolve each pattern group to the matching column names of udi_lf via get_pattern_cols.
udi_di_cols = get_pattern_cols(udi_lf, UDI_DI_PATTERNS)
identifiers_cols = get_pattern_cols(udi_lf, IDENTIFIER_PATTERNS)
type_cols = get_pattern_cols(udi_lf, TYPE_PATTERNS)

In [7]:
# type-udi_di Ïåç ÎßåÎì§Í∏∞ (Ïù∏Îç±Ïä§Î°ú Îß§Ïπ≠)
def extract_index(col_name):
    """Return the integer N of the first ``identifiers_N_`` fragment in ``col_name``.

    Returns None when the name contains no such fragment.
    """
    found = re.search(r'identifiers_(\d+)_', col_name)
    if found is None:
        return None
    return int(found.group(1))

# Pair every type column with the id column that carries the same identifier index;
# type columns without such an id column are skipped.
type_id_pairs = []
for candidate in type_cols:
    paired_id = f'identifiers_{extract_index(candidate)}_id'
    if paired_id not in udi_di_cols:
        continue
    type_id_pairs.append((candidate, paired_id))

len(type_id_pairs)

132

In [8]:
# Gather every non-null id found in an identifier slot whose type is "Primary".
primary_udi_unique = set()

for type_col, id_col in tqdm(type_id_pairs, desc="Processing columns", unit="col"):
    try:
        # One scan per column pair is enough: when no row is "Primary" the result
        # is empty and update() adds nothing, so a separate row-count query
        # (which ran the same filter a second time) is not needed.
        ids = (
            udi_lf
            .filter(pl.col(type_col).eq("Primary"))
            .select(pl.col(id_col))
            .unique()
            .collect()
            .to_series()
            .drop_nulls()
            .to_list()
        )
        primary_udi_unique.update(ids)

    except Exception as e:
        # Best effort: report the failing column and carry on with the others.
        tqdm.write(f"Error processing {type_col}: {e}")
        continue

print(f"\n{'='*50}")
print(f"UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† Primary udi Í∞úÏàò: {len(primary_udi_unique):,}")
print(f"{'='*50}")

Processing columns: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 132/132 [00:00<00:00, 141.57col/s]


UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† Primary udi Í∞úÏàò: 4,903,764





In [9]:
# Intermediate UDI parquet files, kept in the 'silver' folder under DATA_DIR.
udi_step1_path = DATA_DIR.joinpath('silver', 'udi_primary.parquet')
udi_step2_path = DATA_DIR.joinpath('silver', 'udi_clean.parquet')

In [10]:
# # PrimaryÏù∏ idÎ•º Ï∂îÏ∂ú
# def primary_transform(lf: pl.LazyFrame):
#     return lf.with_columns(
#         pl.coalesce([
#             pl.when(pl.col(type_col).eq("Primary"))
#             .then(pl.col(id_col))
#             for type_col, id_col in type_id_pairs
#         ]).alias('primary_udi_di')
#     )

# process_lazyframe_in_chunks(
#     udi_lf, 
#     primary_transform, 
#     udi_step1_path, 
#     10_000,
#     desc="Primary extraction"
# )

In [11]:
# # udi Îç∞Ïù¥ÌÑ∞ Î∂àÎü¨Ïò§Í∏∞
# udi_loader = DataLoader(
#     name='udi',
#     output_file=udi_step1_path,
# )

# primary_udi_lf = udi_loader.load(adapter, **polars_kwargs)

In [12]:
# # ÌöåÏÇ¨ Ïù¥Î¶Ñ Ï†ïÏ†ú
# preprocessor1 = create_company_preprocessor()

# preprocessor1.apply_to_lazyframe(
#     primary_udi_lf,
#     ['company_name', 'brand_name'],
#     udi_step2_path,
#     10_000,
# )

# del preprocessor1, primary_udi_lf
# udi_step1_path.unlink(missing_ok=True)

In [13]:
# Load the UDI data from the step-2 parquet.
udi_loader = DataLoader(name='udi', output_file=udi_step2_path)
cleaned_udi_lf = udi_loader.load(adapter, **polars_kwargs)


üìñ /Users/eric/yeeun/Project4/data/silver/udi_clean.parquet Î°úÎî© Ï§ë... (adapter=polars)


In [14]:
# maude_lf.filter(
#     pl.col('device_0_udi_di').is_not_null()
# ).group_by(
#     ['device_0_manufacturer_d_name', 'device_0_brand_name', 'device_0_model_number', 'device_0_catalog_number']
# ).agg(
#     pl.col('device_0_udi_di').n_unique().alias('udi_nunique'),
#     pl.col('device_0_udi_di').unique().alias('udi_unique'),
#     pl.col('device_0_udi_di').mode().alias('udi_mode')
# ).sort('udi_nunique', descending=True).head(10).collect().to_pandas()

## MAUDE 1차 전처리

In [15]:
# Base report-level variables.
BASE_COLS = [
    'mdr_report_key',
    'report_number',
    'adverse_event_flag',
    'product_problem_flag',
    'date_of_event',
    'date_received',
    'device_date_of_manufacturer',
    'event_type',
    'previous_use_code',
    'single_use_flag',
    'report_source_code',
    'reprocessed_and_reused_flag',
    'report_to_fda',
    'event_location',
    'manufacturer_link_flag',
    'manufacturer_g1_name',
    'manufacturer_g1_postal_code',
    'pma_pmn_number',
]

# Columns of the first device entry; all share the "device_0_" prefix.
DEVICE_COLS = [
    f"device_0_{field}"
    for field in (
        "manufacturer_d_name",
        "manufacturer_d_postal_code",
        "brand_name",
        "catalog_number",
        "model_number",
        "udi_di",
        "lot_number",
        "udi_public",
        "device_report_product_code",
        "device_age_text",
        "device_operator",
        "openfda_device_class",
        "openfda_device_name",
    )
]

# Columns of the first patient entry; all share the "patient_0_" prefix.
PATIENT_COLS = [
    f"patient_0_{field}"
    for field in (
        "patient_sequence_number",
        "patient_age",
        "patient_sex",
        "patient_weight",
        "patient_race",
        "patient_problems",
        "sequence_number_outcome",
        "sequence_number_treatment",
    )
]

# Name patterns for the MDR narrative text columns and their type codes.
MDR_TEXT_PATTERNS = [
    r"^mdr_text_.*_text$",
    r"^mdr_text_.*_text_type_code$",
]

# MDR_COLS: whatever get_pattern_cols returns for the MDR text patterns on maude_lf.
# TOTAL_COLS: the complete list of columns to keep (base + device + patient + MDR text).
MDR_COLS = get_pattern_cols(maude_lf, MDR_TEXT_PATTERNS)
TOTAL_COLS = BASE_COLS + DEVICE_COLS + PATIENT_COLS + MDR_COLS

### Drop 필요 없는 열

In [16]:
# Keep only the columns listed in TOTAL_COLS (rebinds maude_lf to the narrowed frame).
maude_lf = maude_lf.select(TOTAL_COLS)

# maude_lf.collect_schema().names()

In [17]:
# Intermediate MAUDE parquet files, one per cleaning step, in the 'silver' folder under DATA_DIR.
maude_step1_path = DATA_DIR.joinpath('silver', 'clean_step1.parquet')
maude_step2_path = DATA_DIR.joinpath('silver', 'clean_step2.parquet')
maude_step3_path = DATA_DIR.joinpath('silver', 'clean_step3.parquet')

In [18]:
# # 1Îã®Í≥Ñ
# preprocessor1 = create_udi_preprocessor()
# preprocessor1.apply_to_lazyframe(
#     maude_lf, 'device_0_udi_di', maude_step1_path, chunk_size=10_000
# )
# del preprocessor1  # Î™ÖÏãúÏ†Å ÏÇ≠Ï†ú

In [19]:
# # 2Îã®Í≥Ñ
# maude_lf2 = pl.scan_parquet(maude_step1_path)
# preprocessor2 = create_company_preprocessor()
# preprocessor2.apply_to_lazyframe(
#     maude_lf2, ['device_0_manufacturer_d_name', 'manufacturer_g1_name', 'device_0_brand_name'], 
#     maude_step2_path, chunk_size=10_000
# )
# del maude_lf2, preprocessor2

In [20]:
# # Ï§ëÍ∞Ñ ÌååÏùº Ï†ïÎ¶¨
# maude_step1_path.unlink(missing_ok=True)

# # 3Îã®Í≥Ñ
# maude_lf3 = pl.scan_parquet(maude_step2_path)
# preprocessor3 = create_generic_preprocessor()
# preprocessor3.apply_to_lazyframe(
#     maude_lf3, ['device_0_model_number', 'device_0_catalog_number', 'device_0_lot_number'], 
#     maude_step3_path, chunk_size=10_000
# )
# del maude_lf3, preprocessor3

In [21]:
# Remove the step-2 intermediate only once the step-3 output exists. Step 3 reads
# the step-2 file, so deleting it unconditionally would destroy the only input
# from which a missing step-3 file could be rebuilt.
if maude_step3_path.exists():
    maude_step2_path.unlink(missing_ok=True)

# Load the MAUDE data from the step-3 parquet, reusing the `adapter` and
# `polars_kwargs` already defined when the original MAUDE data was loaded
# (they were re-declared here with identical values).
loader3 = DataLoader(
    start=2020,
    end=2025,
    output_file=maude_step3_path,
)

cleaned_maude_lf = loader3.load(adapter=adapter, **polars_kwargs)
cleaned_maude_lf


üìñ /Users/eric/yeeun/Project4/data/silver/clean_step3.parquet Î°úÎî© Ï§ë... (adapter=polars)


## 작업에 필요한 컬럼

In [22]:
# Give both datasets the same short column names so they can be compared and joined.
udi_renames = {
    'company_name': 'manufacturer',
    'brand_name': 'brand',
    'version_or_model_number': 'model_number',
    'primary_udi_di': 'udi_di',
}
maude_renames = {
    'device_0_manufacturer_d_name': 'manufacturer',
    'device_0_brand_name': 'brand',
    'device_0_model_number': 'model_number',
    'device_0_catalog_number': 'catalog_number',
    'device_0_lot_number': 'lot_number',
    'device_0_udi_di': 'udi_di',
    'device_0_udi_public': 'udi_public',
}

rename_udi_lf = cleaned_udi_lf.rename(udi_renames)
rename_maude_lf = cleaned_maude_lf.rename(maude_renames)

In [23]:
# Key used to join the two datasets, and the descriptive columns compared across them.
join_col = 'udi_di'
target_cols = ['manufacturer', 'brand', 'model_number', 'catalog_number']

# Column selections derived from the two lists above.
common_cols = [*target_cols, join_col]
maude_cols = [*common_cols, 'mdr_report_key']
udi_cols = [*common_cols, *udi_di_cols]

In [24]:
# Narrow each renamed frame to the columns listed in udi_cols / maude_cols.
udi_necessary_lf = rename_udi_lf.select(pl.col(udi_cols))
maude_necessary_lf = rename_maude_lf.select(pl.col(maude_cols))

## 고유값 추출

In [25]:
# Unique udi_di values of the UDI dataset, taken over its identifier-id columns.
cols_group = {'udi': udi_di_cols}

unique_by_group = get_unique_by_cols_safe(
    udi_necessary_lf,
    cols_group,
    memory_safety_ratio=0.3,
    calibration_factor=1,
)
udi_udi_unique = unique_by_group['udi']

Î¨∏ÏûêÏó¥ Í∏∏Ïù¥ ÌÜµÍ≥ÑÎ•º Ïã§Ï†ú Îç∞Ïù¥ÌÑ∞Î°úÎ∂ÄÌÑ∞ Ï∂îÏ†ï Ï§ë...
  Î¨∏ÏûêÏó¥ Í∏∏Ïù¥ ÌÜµÍ≥Ñ:
    - ÌèâÍ∑†(mean): 13.9Ïûê
    - Ï§ëÏïôÍ∞í(median): 14.0Ïûê
    - 75%ile: 14.0Ïûê
    - 90%ile: 14.0Ïûê
    - ÏµúÎåìÍ∞í: 23Ïûê
  ‚Üí ÏÇ¨Ïö©Ìï† ÌÅ¨Í∏∞(p75): 14.0Ïûê

=== Î©îÎ™®Î¶¨ Í∏∞Î∞ò ÏûêÎèô ÏûÑÍ≥ÑÍ∞í Í≥ÑÏÇ∞ ===
ÏÇ¨Ïö© Í∞ÄÎä•Ìïú Î©îÎ™®Î¶¨: 33.12 GB
ÏïàÏ†Ñ ÏÇ¨Ïö© Î©îÎ™®Î¶¨ (30%): 9.94 GB
ÏòàÏÉÅ Î∞îÏù¥Ìä∏/Í≥†Ïú†Í∞í: 106 bytes
  - Î¨∏ÏûêÏó¥ Îç∞Ïù¥ÌÑ∞: 28 bytes
  - str Ïò§Î≤ÑÌó§Îìú: 50 bytes
  - set Ïò§Î≤ÑÌó§Îìú: 28 bytes
Í≥ÑÏÇ∞Îêú ÏµúÎåÄ Í≥†Ïú†Í∞í: 100,652,909Í∞ú



Extracting unique values:   0%|          | 0/1 [00:00<?, ?it/s]

udi: 6,336,862Í∞úÏùò Í≥†Ïú†Í∞í (ÏòàÏÉÅ Î©îÎ™®Î¶¨: 640.6 MB)


Extracting unique values: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:03<00:00,  3.61s/it]

  ‚úì udi Ï∂îÏ∂ú ÏôÑÎ£å (Ïã§Ï†ú Î©îÎ™®Î¶¨: 636.11 MB)
    ‚Üí ÏòàÏÉÅÏπò Ï†ïÌôïÎèÑ: 100.7% (ÏòàÏÉÅ/Ïã§Ï†ú ÎπÑÏú®)


=== Ï∂îÏ∂ú ÏöîÏïΩ ===
ÏÑ±Í≥µ: 1/1
Ïã§Ìå®/Ïä§ÌÇµ: 0/1





In [26]:
# Split the MAUDE UDI values by whether the UDI dataset also contains them.
maude_udi_unique = get_unique(maude_necessary_lf, ['udi_di'])
survive_udi_unique = maude_udi_unique.intersection(udi_udi_unique)  # present in both
angry_udi_unique = maude_udi_unique.difference(udi_udi_unique)      # MAUDE only

print(f'UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(udi_udi_unique)}Í∞ú')
print(f'MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(maude_udi_unique)}Í∞ú')
print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏóÜÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(angry_udi_unique)}Í∞ú')
print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏûàÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(survive_udi_unique)}Í∞ú')

UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 6336861Í∞ú
MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 59072Í∞ú
UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏóÜÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 10593Í∞ú
UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏûàÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 48479Í∞ú


In [27]:
# Of the MAUDE UDIs found in the UDI dataset, separate those that are Primary ids from the rest.
maude_secondary_udi_unique = survive_udi_unique.difference(primary_udi_unique)
maude_primary_udi_unique = survive_udi_unique.intersection(primary_udi_unique)

print(f'Primary UDIÏù∏ MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(maude_primary_udi_unique)}Í∞ú')
print(f'Primary UDIÍ∞Ä ÏïÑÎãå MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(maude_secondary_udi_unique)}Í∞ú')

Primary UDIÏù∏ MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 46114Í∞ú
Primary UDIÍ∞Ä ÏïÑÎãå MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 2365Í∞ú


In [28]:
class UniqueUDIDI:
    def __init__(
        self,
        udi_udi_unique: set = None,
        maude_udi_unique: set = None,
        angry_udi_unique: set = None,
        survive_udi_unique: set = None,
        primary_udi_unique: set = None,
        maude_primary_udi_unique: set = None,
        maude_secondary_udi_unique: set = None
    ):
        self.udi = udi_udi_unique
        self.maude = maude_udi_unique
        self.angry = angry_udi_unique
        self.survive = survive_udi_unique
        self.primary = primary_udi_unique
        self.maude_primary = maude_primary_udi_unique
        self.maude_secondary = maude_secondary_udi_unique

    def print_stats(self):
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.udi)}Í∞ú')
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† Primary udi Í∞úÏàò: {len(self.primary)}Í∞ú')
        print(f'MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.maude)}Í∞ú')
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏûàÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.survive)}Í∞ú')
        print(f'Primary UDIÏù∏ MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.maude_primary)}Í∞ú')
        print(f'Primary UDIÍ∞Ä ÏïÑÎãå MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.maude_secondary)}Í∞ú')
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏóÜÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.angry)}Í∞ú')

# Bundle all of the sets computed so far; keywords make the seven-argument call unambiguous.
unique_udi_di = UniqueUDIDI(
    udi_udi_unique=udi_udi_unique,
    maude_udi_unique=maude_udi_unique,
    angry_udi_unique=angry_udi_unique,
    survive_udi_unique=survive_udi_unique,
    primary_udi_unique=primary_udi_unique,
    maude_primary_udi_unique=maude_primary_udi_unique,
    maude_secondary_udi_unique=maude_secondary_udi_unique,
)

unique_udi_di.print_stats()

UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 6336861Í∞ú
UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† Primary udi Í∞úÏàò: 4903764Í∞ú
MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 59072Í∞ú
UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏûàÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 48479Í∞ú
Primary UDIÏù∏ MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 46114Í∞ú
Primary UDIÍ∞Ä ÏïÑÎãå MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 2365Í∞ú
UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏóÜÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: 10593Í∞ú


## UDI 데이터셋에 Primary 컬럼 생성

In [29]:
# Turn the MAUDE-side secondary UDI set into a single-column LazyFrame.
secondary_udis = list(unique_udi_di.maude_secondary)
maude_udi_lf = pl.LazyFrame({'udi_di': secondary_udis})

In [30]:
# Filter immediately after the unpivot (keeps the memory growth to a minimum).
mapping_index_cols = ['row_idx'] + common_cols
is_secondary_hit = (
    pl.col('match_udi_di').is_not_null()  # drop nulls
    & pl.col('match_udi_di').is_in(unique_udi_di.maude_secondary)  # matching values only
)

udi_mapping_lf = (
    udi_necessary_lf
    .with_row_index('row_idx')
    .unpivot(
        index=mapping_index_cols,
        on=udi_di_cols,
        variable_name='matched_col',
        value_name='match_udi_di',
    )
    .filter(is_secondary_hit)
    .unique(subset=['match_udi_di', 'row_idx'])
    .select(['match_udi_di', 'row_idx'] + common_cols)
)

In [31]:
# udi_mapping_df = udi_mapping_lf.collect().to_pandas()

In [32]:
# # ÌÜµÍ≥ÑÎßå Î®ºÏ†Ä ÌôïÏù∏
# print(f"Total matches: {len(udi_mapping_df):,}")
# print(f"Unique UDIs: {udi_mapping_df['match_udi_di'].nunique():,}")

In [33]:
# udi_mapping_df[udi_mapping_df['match_udi_di'] == '00021292007706']

In [34]:
# Spot check for one UDI-DI: show up to 15 MAUDE rows carrying it, transposed so each row reads as a column.
maude_necessary_lf.filter(
    pl.col('udi_di').eq('00021292007706')
).head(15).collect().to_pandas().transpose()

Unnamed: 0,0,1
manufacturer,TRIVIDIA HEALTH,TRIVIDIA HEALTH
brand,TRUE METRIX GO,TRUE METRIX GO
model_number,KIT GO TVH TMX METERMG/DL,KIT GO CVS TMX METERMG/DL
catalog_number,,
udi_di,00021292007706,00021292007706
mdr_report_key,20041268,18722016


# 전처리 함수 설계

In [35]:
# ÏùºÏπò Ï†êÏàò Îß§Í∏∞Îäî Ìï®Ïàò
# 1. Îß§ÌïëÌïú ÌñâÎßå Î∂àÎü¨Ïò§Í∏∞
# 2. company_name, brand_name, version_or_mode_number, catalog_numberÏóê ÎåÄÌïòÏó¨ 
# 2-1. Ï†ÑÎ∂Ä ÎåÄÎ¨∏ÏûêÎ°ú
# 2-2. ÏùºÏπò Ï†êÏàò Îß§Í∏∞Í∏∞
# 2-3. Í∞ÄÏû• Ï†êÏàòÍ∞Ä ÎÜíÏùÄ Í≤ÉÏùò primary_udi_diÎ•º Ìï†Îãπ
# 2-4. ÌïÑÏöîÌïú Ï†ïÎ≥¥ Í∞ÄÏ†∏Ïò§Í∏∞

# udiÎ•º ÍπîÎÅîÌïòÍ≤å ÎßåÎì¶
# primary: Í∑∏ÎåÄÎ°ú primary_lfÏôÄ Îß§Ïπ≠Ìï¥ÏÑú ÌïÑÏöî info Í∞ÄÏ†∏Ïò¥
# secondary: primary_lf Ï§ë udi Îß§Ïπ≠Ìïú lfÏóêÏÑú Ï†êÏàò ÎÜíÏùÄ info Í∞ÄÏ†∏Ïò¥
# null: primary_lfÏóêÏÑú Ï†êÏàò ÎÜíÏùÄ info Í∞ÄÏ†∏Ïò¥

### primaryÏôÄ Îß§Ïπ≠Ìï¥ÏÑú Í∞ÄÏ†∏Ïò§Îäî Ìï®Ïàò

In [36]:
def extract_from_match(
    src_lf: pl.LazyFrame, desc_lf: pl.LazyFrame, 
    on: str | Sequence[str], 
    target_cols: str | Sequence[str]
):
    """Left-join ``src_lf`` onto ``desc_lf`` and overlay the target columns.

    For each target column that ``desc_lf`` already has, the joined ``src_lf``
    value wins when it is non-null; otherwise the ``desc_lf`` value is kept.
    Target columns that ``desc_lf`` lacks are simply added from ``src_lf``.

    Args:
        src_lf: frame providing the replacement values.
        desc_lf: frame to enrich; its rows (and all its columns) are kept.
        on: join key column name, or a sequence of names.
        target_cols: column name(s) to take from ``src_lf``.

    Returns:
        A LazyFrame with ``desc_lf``'s columns, the target columns overlaid.

    Note:
        This is a plain left join: when several ``src_lf`` rows share a key, the
        matching ``desc_lf`` rows are repeated once per match. De-duplicate
        ``src_lf`` on ``on`` beforehand if a one-to-one lookup is required.
    """
    # Accept any sequence (tuple, list, ...), not only list, for both arguments.
    on = [on] if isinstance(on, str) else list(on)
    target_cols = [target_cols] if isinstance(target_cols, str) else list(target_cols)

    # A key that is also named as a target would be selected twice; it is already
    # aligned by the join itself, so treat it as a key only.
    fill_cols = [col for col in target_cols if col not in on]

    udi_subset = src_lf.select(pl.col(on + fill_cols))

    result = desc_lf.join(
        udi_subset,
        on=on,
        how='left',
        suffix='_right',
    )

    # Only columns present on both sides get a '<col>_right' twin from the join;
    # columns missing from desc_lf arrive under their own name and need no merge.
    desc_names = set(desc_lf.collect_schema().names())
    shared = [col for col in fill_cols if col in desc_names]
    if not shared:
        return result

    return result.with_columns(
        [pl.coalesce([f'{col}_right', col]).alias(col) for col in shared]
    ).drop([f'{col}_right' for col in shared])

# Overlay manufacturer / brand / model / catalog values from the UDI frame onto the
# MAUDE frame wherever udi_di matches, then preview the first 10 rows.
maude_match_lf = extract_from_match(udi_necessary_lf, rename_maude_lf, join_col, target_cols)

# cleaned_maude_lf.head(10).collect().to_pandas()
maude_match_lf.head(10).collect().to_pandas()

Unnamed: 0,mdr_report_key,report_number,adverse_event_flag,product_problem_flag,date_of_event,date_received,device_date_of_manufacturer,event_type,previous_use_code,single_use_flag,...,mdr_text_5_text,mdr_text_5_text_type_code,mdr_text_6_text,mdr_text_6_text_type_code,mdr_text_7_text,mdr_text_7_text_type_code,mdr_text_8_text,mdr_text_8_text_type_code,mdr_text_9_text,mdr_text_9_text_type_code
0,19448579,3013756811-2024-97562,N,Y,20240510.0,20240603,20210201.0,Malfunction,R,N,...,,,,,,,,,,
1,19448674,8030965-2024-07013,N,Y,20240101.0,20240603,,Malfunction,U,N,...,,,,,,,,,,
2,19448702,3004753838-2024-130855,N,Y,20240504.0,20240603,20240101.0,Malfunction,I,Y,...,,,,,,,,,,
3,19448836,3013756811-2024-88985,N,Y,20240510.0,20240603,20221201.0,Malfunction,R,N,...,,,,,,,,,,
4,19448850,9612169-2024-00511,N,Y,,20240603,20231117.0,Malfunction,I,,...,,,,,,,,,,
5,19448931,3004753838-2024-130890,N,Y,20240506.0,20240603,20230524.0,Malfunction,I,N,...,,,,,,,,,,
6,19449093,3013756811-2024-93158,N,Y,20240509.0,20240603,20221101.0,Malfunction,R,N,...,,,,,,,,,,
7,19449660,2032227-2024-181093,N,Y,20240513.0,20240603,20230620.0,Malfunction,A,N,...,,,,,,,,,,
8,19449741,2518422-2024-16641,Y,Y,20231012.0,20240603,20160309.0,Injury,,N,...,,,,,,,,,,
9,19613702,3004753838-2024-151007,N,Y,20240508.0,20240626,20220425.0,Malfunction,I,Y,...,,,,,,,,,,


In [37]:
# Compare null counts over common_cols after (maude_match_lf) and before (rename_maude_lf)
# the UDI overlay, then look at the brand column in both frames and at the brand
# column of maude_lf (the frame loaded before the cleaning steps).
analyze_null_values(maude_match_lf, common_cols)
analyze_null_values(rename_maude_lf, common_cols)
overview_col(maude_match_lf, 'brand', n_rows=100)
overview_col(rename_maude_lf, 'brand', n_rows=100)
overview_col(maude_lf, 'device_0_brand_name', n_rows=1000)


=== Í≤∞Ï∏°Ïπò Î∂ÑÏÑù ===
Ï†ÑÏ≤¥ Ìñâ Ïàò: 2,627,150

udi_di                                       :    975,101Í∞ú ( 37.12%)
model_number                                 :    476,339Í∞ú ( 18.13%)
catalog_number                               :    315,278Í∞ú ( 12.00%)
brand                                        :     47,292Í∞ú (  1.80%)
manufacturer                                 :      3,065Í∞ú (  0.12%)

=== Í≤∞Ï∏°Ïπò Î∂ÑÏÑù ===
Ï†ÑÏ≤¥ Ìñâ Ïàò: 2,627,150

model_number                                 :  1,157,071Í∞ú ( 44.04%)
udi_di                                       :    975,101Í∞ú ( 37.12%)
catalog_number                               :    382,838Í∞ú ( 14.57%)
brand                                        :     48,248Í∞ú (  1.84%)
manufacturer                                 :      5,591Í∞ú (  0.21%)
brandÏùò Í≥†Ïú† Í∞úÏàò: 47189


Unnamed: 0,head_brand,tail_brand
0,,ZMAX IMPLANT SYSTEM
1,0 KEEL LEFT SIZE D CEMENTED TIBIA,ZMED II
2,0 KEEL TIBIAL PEG DRILL,ZMED II CATHETER
3,0 MM ML TAPER RASP SZ 4,ZMR
4,0 MM RASP LONG POST SIZE 4,ZNN CMN LAG SCREW REAMER SHORT
...,...,...
95,10MM TI CANN FRN GT 340MM RIGHT STERILE,ZYNO PUMPS
96,10MM TI CANN FRN GT 360MM LEFT STERILE,ZYPHR
97,10MM 3FLUTED ACORN REAMER,ZYPHR DISP CRAN PERF LRG 1411
98,10MM BLUNT BLADE AND TUBE SET,ZYSTON CURVE SPACER SYS


brandÏùò Í≥†Ïú† Í∞úÏàò: 61602


Unnamed: 0,head_brand,tail_brand
0,,ZOLL CARDIAC MONITOR
1,0 ENDOLOOP LIG WPDS II,ZOLL COOLGARD 3000 IVTM SYSTEM
2,0 KEEL LEFT SIZE D CEMENTED TIBIA,ZOLL DEFIBRILLATOR
3,0 KEEL LEFT SIZE G CEMENTED TIBIA,ZOLL DEFIBRILLATOR PADS
4,0 KEEL RIGHT SIZE C CEMENTED TIBIA,ZOLL DEFIBRILLATOR PATCHES
...,...,...
95,100 THIN SWIVEL DIRECT FLOW 30K,ZYNO MEDICAL Z800WF INFUSION PUMP SYSTEM
96,1000 TRIPLE BEND SWIVEL DIRECT FLOW 25K,ZYNO PUMP
97,1000 TRIPLE BEND SWIVEL DIRECT FLOW ULTRASONIC...,ZYNO PUMPS
98,1000ML PLEURX SPARE BOTTLE LATAM,ZYPHR DISP CRAN PERF LRG 1411


device_0_brand_nameÏùò Í≥†Ïú† Í∞úÏàò: 66169


Unnamed: 0,head_device_0_brand_name,tail_device_0_brand_name
0,,WHITE JET ROUTINE I TAPED CASSETTE NO/LID
1,"""1.5MM"" SYSTEM TW DRILL 1.1X50MM 15MMSTP W/NT",WHITE JET ROUTINE III TAPED CASSETTE W/LID
2,"""COMFORT, COMFORT SHORT, SILHOUETTE, NERIA SOF...",WHITE PEARL ANTERIOR CERVICAL PLATE
3,"""GORE- VIATORRI-TIPS ENDOPROSTHESIS WITH CONTR...",WHITE RELOAD FOR ECHELON 45
4,"""SILHOUETTE PARADIGM",WHITE RELOAD FOR ECHELON 60
...,...,...
995,2.4MM LCP(TM) T-PLATE 2 HOLES HEAD/7 HOLES SHAFT,"√ò8X 20MM BC IF SCRW, VENTED"
996,2.4MM LOCKING SCREW SELF-TAPPING-20MM,"√ò8X 30MM BC IF SCRW, VENTED"
997,2.4MM LOCKING SCREW SLF-TPNG WITH STARDRIVE RE...,"√ò9X 20MM BC IF SCRW, VENTED"
998,2.4MM LOCKING SCREW SLF-TPNG WITH STARDRIVE RE...,"√ò9X 30MM BC IF SCRW, VENTED"


In [38]:
import polars as pl
from pathlib import Path
from code.preprocess.preprocessor import UDIProcessor
from code.preprocess.config import Config


# Requested location of the matched output file.
output_path = DATA_DIR.joinpath("maude_with_udi.parquet")

# Run the processing.
processor = UDIProcessor(Config())
result_path = processor.process(
    maude_lf=rename_maude_lf,
    udi_lf=rename_udi_lf,
    output_path=output_path,
    chunk_size=50_000,
)


UDI Ï≤òÎ¶¨ ÌååÏù¥ÌîÑÎùºÏù∏ ÏãúÏûë (Ìö®Ïú®Ï†Å Îß§Ìïë)
üîß MAUDE Ï†ÑÏ≤òÎ¶¨...
   ‚úì Ï†ÑÏ≤òÎ¶¨ ÏôÑÎ£å (LazyFrame Ïú†ÏßÄ)
üîß UDI DB Ï†ÑÏ≤òÎ¶¨...
üîß Ï†úÏ°∞ÏÇ¨Î™Ö ÌçºÏßÄ Îß§Ïπ≠...
   Îß§Ïπ≠: 359/3910 Í±¥
üîß Lookup ÌÖåÏù¥Î∏î ÏÉùÏÑ±...
   UDI-DI Lookup: 4,903,764 Í±¥
   Ï†úÏ°∞ÏÇ¨ Full Lookup: 2,819,081 Í±¥
   Ï†úÏ°∞ÏÇ¨ Partial Lookup: 313,758 Í±¥
üîß UDI Îß§Ìïë ÌÖåÏù¥Î∏î ÏÉùÏÑ±...
   Unique UDI: 59,113 Í±¥
   - Primary Îß§Ïπ≠ ÏÑ±Í≥µ: 46,118 Í±¥
   - Primary Îß§Ïπ≠ Ïã§Ìå®: 12,995 Í±¥
   Secondary Îß§Ïπ≠ ÏãúÎèÑ Ï§ë... (264Í∞ú Ïª¨Îüº)
   Îß§Ïπ≠ ÎåÄÏÉÅ UDI: 12,995 Í±¥
Processing 4,903,764 rows in chunks of 100,000...


Secondary Îß§Ìïë ÏÉùÏÑ±: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:20<00:00,  2.45it/s]


Merging chunks...
‚úì Saved to data/temp_secondary_mapping.parquet
   - Secondary Îß§Ìïë ÏÉùÏÑ± ÏôÑÎ£å: 2,365 Í±¥
   - Secondary Îß§Ïπ≠ ÏÑ±Í≥µ: 2,123 Í±¥ (Îã®Ïùº Primary)
   - Secondary Îß§Ïπ≠ Ïã§Ìå®: 10,872 Í±¥
   ‚úÖ ÏµúÏ¢Ö UDI Îß§Ìïë: 48,483 Í±¥
      - udi_direct: 46,118
      - udi_secondary: 2,123
      - udi_no_match: 242

üîß Îß§Ïπ≠ Îã®Í≥Ñ (UDI Îß§Ìïë + Ï†úÏ°∞ÏÇ¨ Îß§Ïπ≠)...
Processing 2,627,150 rows in chunks of 50,000...


UDI Îß§Ìïë + Ï†úÏ°∞ÏÇ¨ Îß§Ïπ≠: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 53/53 [00:08<00:00,  5.93it/s]


Merging chunks...
‚úì Saved to /Users/eric/yeeun/Project4/data/maude_with_udi_temp.parquet

üîß ÌõÑÏ≤òÎ¶¨ (Îã§Ï§ë Îß§Ïπ≠ & Tier 3)...
Processing 2,627,150 rows in chunks of 50,000...


Îã§Ï§ë Îß§Ïπ≠ & Tier 3 Ï≤òÎ¶¨: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 53/53 [00:02<00:00, 18.91it/s]


Merging chunks...
‚úì Saved to /Users/eric/yeeun/Project4/data/maude_with_udi_temp_resolved.parquet
‚úÖ ÏµúÏ¢Ö Í≤∞Í≥º: /Users/eric/yeeun/Project4/data/maude_with_udi_temp_resolved.parquet

üìä ÏµúÏ¢Ö Í≤∞Í≥º

Îß§Ïπ≠ Ï∂úÏ≤ò Î∂ÑÌè¨:
shape: (8, 3)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ match_source         ‚îÜ count   ‚îÜ percent ‚îÇ
‚îÇ ---                  ‚îÜ ---     ‚îÜ ---     ‚îÇ
‚îÇ str                  ‚îÜ u32     ‚îÜ f64     ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ udi_direct           ‚îÜ 1414034 ‚îÜ 53.82   ‚îÇ
‚îÇ no_match             ‚îÜ 1052416 ‚îÜ 40.06   ‚îÇ
‚îÇ mfr_partial_multiple ‚îÜ 71581   ‚îÜ 2.72    ‚îÇ
‚îÇ udi_secondary        ‚îÜ 58729   ‚îÜ 2.24    ‚îÇ
‚îÇ mfr_partial_single   ‚îÜ 14529   ‚îÜ 0.55    ‚îÇ
‚îÇ mfr_full_single      ‚îÜ 11933   ‚îÜ 0.45    ‚îÇ
‚îÇ udi_no_m

In [42]:
# Load the file the pipeline reports as its final output rather than assuming it
# equals output_path: the run log names a '*_temp_resolved.parquet' file as the
# final result, so reading output_path could pick up a stale file.
# NOTE(review): assumes processor.process returns a path-like value - confirm.
reported_path = Path(result_path) if result_path else output_path
final_path = reported_path if reported_path.exists() else output_path  # fall back to the requested path

loader4 = DataLoader(
    name='event',
    output_file=final_path
)

matched_lf = loader4.load(adapter=adapter, **polars_kwargs)


üìñ /Users/eric/yeeun/Project4/data/maude_with_udi.parquet Î°úÎî© Ï§ë... (adapter=polars)


In [45]:
# Inspect the first 10 rows whose match_source is 'udi_secondary', transposed for reading.
matched_lf.filter(
    pl.col('match_source').eq('udi_secondary')
    # & pl.col('udi_matched').eq(False)
).head(10).collect().to_pandas().transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
mdr_report_key,19392650,19269798,19367186,19010255,19205752,19491886,19133855,19028605,19512788,20328805
report_number,3007521480-2024-00006,2954323-2024-15963,2954323-2024-17674,2954323-2024-10575,2954323-2024-14709,2954323-2024-19834,2954323-2024-13273,2954323-2024-10952,2954323-2024-20162,3003442380-2024-25072
adverse_event_flag,N,Y,Y,N,Y,Y,N,Y,N,N
product_problem_flag,Y,N,N,Y,N,N,Y,N,Y,Y
date_of_event,20240509,20240409,20240424,20240315,20240405,20240519,20240411,20240229,20240517,20240905
...,...,...,...,...,...,...,...,...,...,...
model_number_final,,,,,,,,,,
catalog_number_final,402395,,,,,,,,,
match_source,udi_secondary,udi_secondary,udi_secondary,udi_secondary,udi_secondary,udi_secondary,udi_secondary,udi_secondary,udi_secondary,udi_secondary
udi_confidence,MEDIUM,MEDIUM,MEDIUM,MEDIUM,MEDIUM,MEDIUM,MEDIUM,MEDIUM,MEDIUM,MEDIUM


In [41]:
# Null analysis of the matched frame over maude_cols, followed by an overview of the
# brand column after matching (brand_final) and before it (brand).
analyze_null_values(matched_lf, maude_cols)

overview_col(matched_lf, 'brand_final')
overview_col(rename_maude_lf, 'brand')


=== Í≤∞Ï∏°Ïπò Î∂ÑÏÑù ===
Ï†ÑÏ≤¥ Ìñâ Ïàò: 2,627,150

model_number                                 :  1,157,071Í∞ú ( 44.04%)
udi_di                                       :    975,101Í∞ú ( 37.12%)
catalog_number                               :    382,838Í∞ú ( 14.57%)
brand                                        :     48,248Í∞ú (  1.84%)
manufacturer                                 :      5,591Í∞ú (  0.21%)
mdr_report_key                               :          0Í∞ú (  0.00%)
brand_finalÏùò Í≥†Ïú† Í∞úÏàò: 47188


Unnamed: 0,head_brand_final,tail_brand_final
0,,ZMAX IMPLANT SYSTEM
1,0 KEEL LEFT SIZE D CEMENTED TIBIA,ZMED II
2,0 KEEL TIBIAL PEG DRILL,ZMED II CATHETER
3,0 MM ML TAPER RASP SZ 4,ZMR
4,0 MM RASP LONG POST SIZE 4,ZNN CMN LAG SCREW REAMER SHORT
...,...,...
95,10MM TI CANN FRN GT 340MM RIGHT STERILE,ZYNO PUMPS
96,10MM TI CANN FRN GT 360MM LEFT STERILE,ZYPHR
97,10MM 3FLUTED ACORN REAMER,ZYPHR DISP CRAN PERF LRG 1411
98,10MM BLUNT BLADE AND TUBE SET,ZYSTON CURVE SPACER SYS


brandÏùò Í≥†Ïú† Í∞úÏàò: 61602


Unnamed: 0,head_brand,tail_brand
0,,ZOLL CARDIAC MONITOR
1,0 ENDOLOOP LIG WPDS II,ZOLL COOLGARD 3000 IVTM SYSTEM
2,0 KEEL LEFT SIZE D CEMENTED TIBIA,ZOLL DEFIBRILLATOR
3,0 KEEL LEFT SIZE G CEMENTED TIBIA,ZOLL DEFIBRILLATOR PADS
4,0 KEEL RIGHT SIZE C CEMENTED TIBIA,ZOLL DEFIBRILLATOR PATCHES
...,...,...
95,100 THIN SWIVEL DIRECT FLOW 30K,ZYNO MEDICAL Z800WF INFUSION PUMP SYSTEM
96,1000 TRIPLE BEND SWIVEL DIRECT FLOW 25K,ZYNO PUMP
97,1000 TRIPLE BEND SWIVEL DIRECT FLOW ULTRASONIC...,ZYNO PUMPS
98,1000ML PLEURX SPARE BOTTLE LATAM,ZYPHR DISP CRAN PERF LRG 1411


### 결측 패턴 분석

In [None]:
import polars as pl

# Missing-value pattern analysis for udi_di (polars LazyFrame version).
udi_is_missing = pl.col('udi_di').is_null()


def _missing_rate_by(lf, key):
    """Collect, for each value of ``key``, the share of rows whose udi_di is null."""
    return (
        lf
        .group_by(key)
        .agg(udi_is_missing.mean().alias('missing_rate'))
        .collect()
    )


missing_pattern = {
    'overall_rate': (
        maude_match_lf
        .select(udi_is_missing.mean())
        .collect()
        .item()
    ),

    'by_manufacturer': _missing_rate_by(maude_match_lf, 'manufacturer'),

    'by_year': _missing_rate_by(
        maude_match_lf.with_columns(
            # The cast is a no-op for a string column and makes the parse work if the
            # loader inferred integers for the YYYYMMDD dates - TODO confirm the dtype.
            pl.col('date_received')
            .cast(pl.String)
            .str.to_datetime(format='%Y%m%d')
            .dt.year()
            .alias('year')
        ),
        'year',
    ),

    # The frame has no 'product_code' column; the product code kept from MAUDE is
    # 'device_0_device_report_product_code', which the renaming step leaves as is.
    'by_product_code': _missing_rate_by(
        maude_match_lf, 'device_0_device_report_product_code'
    ),
}

In [None]:
# Pull up to 10 MAUDE rows whose brand is the literal '00421871' and show them transposed.
zero_brand = rename_maude_lf.filter(
    pl.col('brand').eq('00421871')
).head(10).collect()

zero_brand.to_pandas().transpose()

## MAUDE Îç∞Ïù¥ÌÑ∞Ïùò UDIÎ•º ÍπîÎÅîÌïòÍ≤å

In [None]:
# Write each UDI group to its own pretty-printed text file in the working directory.
for file_name, udi_group in (
    ('primary.txt', unique_udi_di.maude_primary),
    ('secondary.txt', unique_udi_di.maude_secondary),
    ('notfound.txt', unique_udi_di.angry),
):
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(pformat(udi_group, indent=4, width=80))

In [None]:
# Device attributes used as the grouping key when counting UDI-DIs per device below.
group_cols = ['manufacturer', 'brand','model_number', 'lot_number']

In [None]:
# `test` is not defined anywhere in this notebook, so this cell failed on a fresh
# kernel. rename_maude_lf carries udi_di and every column in group_cols.
# NOTE(review): confirm this is the frame the check was meant to run on.
#
# Count the distinct non-null UDI-DIs per device group. n_unique() on its own
# counts null as a value, which would flag groups that have one real UDI-DI plus
# some rows where it is missing.
group_lf = (
    rename_maude_lf
    .select(['udi_di', *group_cols])
    .group_by(group_cols)
    .agg(
        pl.col('udi_di').drop_nulls().n_unique().alias('udi_di_unique'),
    )
)

outlier = group_lf.filter(
    pl.col('udi_di_unique').gt(1)
).select(pl.len()).collect().item()

print(f'UDI-DI Ïù¥ÏÉÅÏπòÎäî {outlier}Í∞ú ÏûàÏäµÎãàÎã§.')
group_lf.collect().drop_nulls().sort('udi_di_unique', descending=True).head(100).to_pandas()

In [None]:
# Drill into one device (fixed manufacturer, model number and brand): list up to 1000
# of its rows that have no nulls in the grouping columns or udi_di.
rename_maude_lf.select(group_cols + ['udi_di']).filter(
    pl.col('manufacturer').eq('DEXCOM'),
    pl.col('model_number').eq('9500-161'),
    pl.col('brand').eq('DEXCOM G7 CONTINUOUS GLUCOSE MONITORING SYSTEM'),
).drop_nulls().head(1000).collect().to_pandas()