In [27]:
import re
import sys
from pathlib import Path
from pprint import pprint, pformat
from typing import Tuple, List, Dict, Any, Optional

import pandas as pd
import polars as pl
import polars.selectors as cs
import psutil
from tqdm import tqdm, trange


# Resolve project paths relative to the notebook's working directory
PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / 'data'

# Put the project root at the very front of sys.path
# (dropping one earlier occurrence first, if there is one)
_project_root_entry = str(PROJECT_ROOT)
try:
    sys.path.remove(_project_root_entry)
except ValueError:
    pass
sys.path.insert(0, _project_root_entry)

# Evict only the cached `code` module so the `from code...` imports below
# are resolved again through sys.path
sys.modules.pop('code', None)

# Ïù¥Ï†ú import
from code.utils import process_lazyframe_in_chunks
from code.loading import DataLoader
from code.preprocess import TextPreprocessor, create_udi_preprocessor, create_company_preprocessor, create_generic_preprocessor
from code.preprocess.preprocess import get_pattern_cols, \
    get_unique_by_cols_safe, get_unique, \
    analyze_null_values, replace_pattern_with_null

## 원본 데이터 불러오기

In [28]:
# Load the MAUDE dataset
loader1 = DataLoader(
    start=2020,
    end=2025,
    output_file=DATA_DIR / 'maude.parquet',
    max_workers=4,
)

# Reader settings; later load(...) calls in this notebook reuse these two names
adapter = 'polars'
polars_kwargs = dict(
    use_statistics=True,
    parallel='auto',
    low_memory=False,
    rechunk=False,
    cache=True,
)
maude_lf = loader1.load(adapter=adapter, **polars_kwargs)
maude_lf


üìñ /home/dataiku/eric/Sparta/Project3/data/maude.parquet Î°úÎî© Ï§ë... (adapter=polars)


In [29]:
# Load the UDI dataset
udi_loader = DataLoader(
    name='udi',
    output_file=DATA_DIR/'udi.parquet',
)

# `adapter` and `polars_kwargs` are the module-level settings assigned in the MAUDE loading cell
udi_lf = udi_loader.load(adapter, **polars_kwargs)
udi_lf


üìñ /home/dataiku/eric/Sparta/Project3/data/udi.parquet Î°úÎî© Ï§ë... (adapter=polars)


## UDI Dataset 전처리

### Primary 추가

In [30]:
# Field suffixes of the flattened `identifiers_<n>_<field>` columns
_IDENTIFIER_FIELDS = (
    "id",
    "issuing_agency",
    "package_discontinue_date",
    "package_status",
    "package_type",
    "quantity_per_package",
    "type",
    "unit_of_use_id",
)

# Regexes for the brand-name column plus every identifier field
IDENTIFIER_PATTERNS = [r"^device_\d+_brand_name$"] + [
    rf"identifiers_\d+_{field}" for field in _IDENTIFIER_FIELDS
]
# Regex for the identifier id columns only (fully anchored)
UDI_DI_PATTERNS = [r'^identifiers_\d+_id$']
# Regex for the identifier type columns
TYPE_PATTERNS = [r'identifiers_\d+_type']

# Column-name prefixes of the groups that are dropped in the next cell
CUSTOMER_PATTERNS = [r'^customer']
DEVICE_SIZE_PATTERNS = [r'^device_sizes']
STORAGE_PATTERNS = [r'^storage']

## Drop 필요없는 열

In [31]:
# Columns matching any of these pattern groups are removed from the UDI frame
drop_patterns = [*CUSTOMER_PATTERNS, *DEVICE_SIZE_PATTERNS, *STORAGE_PATTERNS]
regex = "|".join(drop_patterns)

# Keep every column whose name does NOT match the combined regex
unwanted_cols = cs.matches(regex)
udi_lf = udi_lf.select(~unwanted_cols)

udi_lf.collect_schema().names()

['brand_name',
 'catalog_number',
 'commercial_distribution_end_date',
 'commercial_distribution_status',
 'company_name',
 'device_count_in_base_package',
 'device_description',
 'gmdn_terms_0_code',
 'gmdn_terms_0_code_status',
 'gmdn_terms_0_definition',
 'gmdn_terms_0_implantable',
 'gmdn_terms_0_name',
 'gmdn_terms_10_code',
 'gmdn_terms_10_code_status',
 'gmdn_terms_10_definition',
 'gmdn_terms_10_implantable',
 'gmdn_terms_10_name',
 'gmdn_terms_11_code',
 'gmdn_terms_11_code_status',
 'gmdn_terms_11_definition',
 'gmdn_terms_11_implantable',
 'gmdn_terms_11_name',
 'gmdn_terms_12_code',
 'gmdn_terms_12_code_status',
 'gmdn_terms_12_definition',
 'gmdn_terms_12_implantable',
 'gmdn_terms_12_name',
 'gmdn_terms_13_code',
 'gmdn_terms_13_code_status',
 'gmdn_terms_13_definition',
 'gmdn_terms_13_implantable',
 'gmdn_terms_13_name',
 'gmdn_terms_14_code',
 'gmdn_terms_14_code_status',
 'gmdn_terms_14_definition',
 'gmdn_terms_14_implantable',
 'gmdn_terms_14_name',
 'gmdn_terms_15_

In [6]:
# NOTE(review): get_pattern_cols is a project helper; presumably it returns the
# column names of udi_lf that match the given regex patterns — confirm.
udi_di_cols = get_pattern_cols(udi_lf, UDI_DI_PATTERNS)
identifiers_cols = get_pattern_cols(udi_lf, IDENTIFIER_PATTERNS)
type_cols = get_pattern_cols(udi_lf, TYPE_PATTERNS)

In [7]:
# Build type / udi_di column pairs (matched through their numeric index)
def extract_index(col_name):
    """Return the integer ``<n>`` from an ``identifiers_<n>_...`` column name.

    Returns ``None`` when ``col_name`` contains no such segment.
    """
    found = re.search(r'identifiers_(\d+)_', col_name)
    if found is None:
        return None
    return int(found.group(1))

# Pair each type column with the id column carrying the same index,
# keeping only pairs whose id column actually exists
candidate_pairs = (
    (col, f'identifiers_{extract_index(col)}_id') for col in type_cols
)
type_id_pairs = [pair for pair in candidate_pairs if pair[1] in udi_di_cols]

len(type_id_pairs)

132

In [8]:
# Collect every distinct id whose paired type column says "Primary".
primary_udi_unique = set()
# Type columns whose query raised, so an incomplete result is visible afterwards
failed_type_cols = []

for type_col, id_col in tqdm(type_id_pairs, desc="Processing columns", unit="col"):
    try:
        # One scan per column pair: a pair without any "Primary" row simply
        # yields an empty list, so no separate counting pass is needed.
        ids = (
            udi_lf
            .filter(pl.col(type_col).eq("Primary"))
            .select(pl.col(id_col).drop_nulls().unique())
            .collect()
            .to_series()
            .to_list()
        )
    except Exception as e:
        # Best effort: report the failing column and keep going
        tqdm.write(f"Error processing {type_col}: {e}")
        failed_type_cols.append(type_col)
        continue

    primary_udi_unique.update(ids)

print(f"\n{'='*50}")
print(f"UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† Primary udi Í∞úÏàò: {len(primary_udi_unique):,}")
print(f"{'='*50}")

if failed_type_cols:
    # The count above is only a lower bound when some columns could not be read
    print(f"WARNING: {len(failed_type_cols)} column(s) failed and were skipped: {failed_type_cols}")

Processing columns: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 132/132 [00:01<00:00, 80.48col/s] 


UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† Primary udi Í∞úÏàò: 4,903,764





In [9]:
# Output files of the two UDI preprocessing steps below.
# NOTE: both names are re-bound to different files in the MAUDE cleansing
# section later in this notebook, so they depend on cell execution order.
step1_path = DATA_DIR / 'silver' / 'udi_primary.parquet'
step2_path = DATA_DIR / 'silver' / 'udi_clean.parquet'

In [None]:
# Extract the id that is flagged as Primary
def primary_transform(lf: pl.LazyFrame):
    """Add a ``primary_udi_di`` column to ``lf``.

    For every (type column, id column) pair in the module-level
    ``type_id_pairs``, the id is kept only where the type equals "Primary";
    the first non-null candidate, in pair order, becomes ``primary_udi_di``.
    """
    primary_candidates = []
    for kind_col, di_col in type_id_pairs:
        is_primary = pl.col(kind_col).eq("Primary")
        primary_candidates.append(pl.when(is_primary).then(pl.col(di_col)))

    primary_expr = pl.coalesce(primary_candidates).alias('primary_udi_di')
    return lf.with_columns(primary_expr)

# NOTE(review): process_lazyframe_in_chunks is a project helper; presumably it
# applies primary_transform to udi_lf in chunks of 10_000 rows and writes the
# result to step1_path — confirm against code.utils.
process_lazyframe_in_chunks(
    udi_lf, 
    primary_transform, 
    step1_path, 
    10_000,
    desc="Primary extraction"
)

# Load the UDI data again, this time from the step-1 output file
udi_loader = DataLoader(
    name='udi',
    output_file=step1_path,
)

primary_udi_lf = udi_loader.load(adapter, **polars_kwargs)

In [None]:
# Clean the company names
# NOTE(review): apply_to_lazyframe is a project helper; presumably it writes the
# frame with a cleaned 'company_name' to step2_path in chunks of 10_000 — confirm.
create_company_preprocessor().apply_to_lazyframe(
    primary_udi_lf,
    'company_name',
    step2_path,
    10_000,
)

# Delete the intermediate step-1 file (no error if it is already gone).
# NOTE(review): primary_udi_lf was loaded from step1_path; if that load is lazy,
# any later collect on primary_udi_lf will fail once this file is removed — confirm.
step1_path.unlink(missing_ok=True)

In [12]:

# Load the UDI data from the step-2 (company-name-cleaned) output file
udi_loader = DataLoader(
    name='udi',
    output_file=step2_path,
)

cleaned_udi_lf = udi_loader.load(adapter, **polars_kwargs)


üìñ /home/dataiku/eric/Sparta/Project3/data/silver/udi_clean.parquet Î°úÎî© Ï§ë... (adapter=polars)


In [None]:
# maude_lf.filter(
#     pl.col('device_0_udi_di').is_not_null()
# ).group_by(
#     ['device_0_manufacturer_d_name', 'device_0_brand_name', 'device_0_model_number', 'device_0_catalog_number']
# ).agg(
#     pl.col('device_0_udi_di').n_unique().alias('udi_nunique'),
#     pl.col('device_0_udi_di').unique().alias('udi_unique'),
#     pl.col('device_0_udi_di').mode().alias('udi_mode')
# ).sort('udi_nunique', descending=True).head(10).collect().to_pandas()

Unnamed: 0,device_0_manufacturer_d_name,device_0_brand_name,device_0_model_number,device_0_catalog_number,udi_nunique,udi_unique,udi_mode
0,STAAR SURGICAL COMPANY,IMPLANTABLE COLLAMER LENS (ICL),VTICM5_13.2,,148,"[00841542116633, 00840311311538, 0084031131172...",[00840311312313]
1,STAAR SURGICAL COMPANY,IMPLANTABLE COLLAMER LENS (ICL),VTICM5_13.2,,141,"[00840311312887, 00840311312337, 0084031131122...","[00840311312030, 00840311311224, 0084031131111..."
2,ZOLL MEDICAL CORPORATION,X SERIES,X SERIES,X SERIES,141,"[00847946019297, 00847946019143, 0084794601925...",[00847946019259]
3,STAAR SURGICAL COMPANY,IMPLANTABLE COLLAMER LENS (ICL),VTICM5_12.6,,140,"[00841542115421, 00841542114905, 0084154211520...","[00840311308385, 00840311307807, 00840311308682]"
4,STAAR SURGICAL COMPANY,IMPLANTABLE COLLAMER LENS (ICL),VTICM5_12.6,,119,"[00841542114950, 00841542115544, 0084031130807...",[00840311307821]
5,BOSTON SCIENTIFIC CORPORATION,SYNERGY XD,,,99,"[08714729981084, 08714729985204, 0871472998071...",[08714729985242]
6,BOSTON SCIENTIFIC CORPORATION,MUSTANG,24674,24674,93,"[08714729793731, 08714729793854, 0871472979332...",[08714729793335]
7,STAAR SURGICAL COMPANY,IMPLANTABLE COLLAMER LENS (ICL),VTICMO12.6,,81,"[00840311323111, 00840311323517, 0084154212036...",[00840311322763]
8,ZOLL MEDICAL CORPORATION,R SERIES DEFIBRILLATOR,R SERIES,R SERIES,80,"[00847946017521, 00847946017163, 0084794601727...",[00847946017521]
9,ONKOS SURGICAL,ELEOS LIMB SALVAGE SYSTEM,,,75,"[B278CB1515203M0, B27825001210E0, B27825001208...",[B27825002111E0]


## UDI 클린징

In [None]:
# Output files of the two MAUDE cleansing steps.
# NOTE: this re-binds step1_path / step2_path, which earlier cells used for the
# UDI files; re-running those earlier cells afterwards would target these files.
step1_path = DATA_DIR / 'silver' / 'clean_step1.parquet'
step2_path = DATA_DIR / 'silver' / 'clean_step2.parquet'

# Step 1: UDI cleansing of the 'device_0_udi_di' column
# NOTE(review): presumably the result is written to step1_path in chunks of 10_000 rows — confirm.
create_udi_preprocessor().apply_to_lazyframe(
    maude_lf, 'device_0_udi_di', step1_path, chunk_size=10_000
)

In [None]:

# Step 2: company name cleansing, applied to the step-1 output
maude_lf2 = pl.scan_parquet(step1_path)
# NOTE(review): presumably the result is written to step2_path in chunks of 10_000 rows — confirm.
create_company_preprocessor().apply_to_lazyframe(
    maude_lf2, 'device_0_manufacturer_d_name', step2_path, chunk_size=10_000
)

In [None]:
# Load the MAUDE data again, this time from the step-2 (cleansed) file.
# NOTE: this re-binds maude_lf, so cells above that read maude_lf see the
# cleansed frame if they are re-run after this cell.
loader3 = DataLoader(
    start=2020,
    end=2025,
    output_file=step2_path,
)

# Same reader settings as in the first MAUDE loading cell
adapter = 'polars'
polars_kwargs = dict(
    use_statistics=True,
    parallel='auto',
    low_memory=False,
    rechunk=False,
    cache=True,
)
maude_lf = loader3.load(adapter=adapter, **polars_kwargs)
maude_lf

## 작업에 필요한 컬럼

In [None]:
# Column names shared by both datasets
common_cols = [
    'product_code', 'manufacturer', 'brand', 'model_number',
    'udi_di', 'device_class', 'device_name',
]

# MAUDE: the shared columns plus its own two
maude_cols = [*common_cols, 'mdr_report_key', 'catalog_number']

# UDI: the shared columns plus every identifier id / type column
udi_cols = [*common_cols, *udi_di_cols, *type_cols]

## 고유값 추출

In [None]:
# Single column group: all identifier id columns under the key 'udi'
cols_group = {
    'udi': udi_di_cols,
}

# Unique udi_di values of the UDI dataset
# NOTE(review): get_unique_by_cols_safe is a project helper; presumably it
# returns a dict keyed like cols_group whose values are sets of unique values,
# and the two keyword arguments tune its memory budgeting — confirm.
udi_udi_unique = get_unique_by_cols_safe(
    udi_lf, 
    cols_group,
    memory_safety_ratio=0.3,
    calibration_factor = 1
)['udi']

In [None]:
# NOTE(review): get_unique is a project helper; the set operators below assume
# it returns a set of the unique 'device_0_udi_di' values — confirm.
maude_udi_unique = get_unique(maude_lf, ['device_0_udi_di'])
# MAUDE UDIs that do not occur in the UDI dataset
angry_udi_unique = maude_udi_unique - udi_udi_unique
# MAUDE UDIs that also occur in the UDI dataset
survive_udi_unique = maude_udi_unique & udi_udi_unique

# Print the size of each of the four sets
print(f'UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(udi_udi_unique)}Í∞ú')
print(f'MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(maude_udi_unique)}Í∞ú')
print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏóÜÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(angry_udi_unique)}Í∞ú')
print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏûàÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(survive_udi_unique)}Í∞ú')

In [None]:
# Split the MAUDE UDIs found in the UDI dataset by whether they are Primary ids
maude_primary_udi_unique = survive_udi_unique & primary_udi_unique
maude_secondary_udi_unique = survive_udi_unique - primary_udi_unique

# Print the size of both subsets
print(f'Primary UDIÏù∏ MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(maude_primary_udi_unique)}Í∞ú')
print(f'Primary UDIÍ∞Ä ÏïÑÎãå MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(maude_secondary_udi_unique)}Í∞ú')

In [None]:
class UniqueUDIDI:
    """Holder for the unique UDI-DI sets compared between the UDI and MAUDE data.

    Every argument is stored as given. An omitted (``None``) argument is stored
    as an empty set, so ``print_stats`` never fails on ``len(None)``.

    Attributes:
        udi: unique ids of the UDI dataset.
        maude: unique ids of the MAUDE dataset.
        angry: MAUDE ids that are absent from the UDI dataset.
        survive: MAUDE ids that are present in the UDI dataset.
        primary: ids of the UDI dataset that are flagged Primary.
        maude_primary: ``survive`` ids that are Primary.
        maude_secondary: ``survive`` ids that are not Primary.
    """

    def __init__(
        self,
        udi_udi_unique: Optional[set] = None,
        maude_udi_unique: Optional[set] = None,
        angry_udi_unique: Optional[set] = None,
        survive_udi_unique: Optional[set] = None,
        primary_udi_unique: Optional[set] = None,
        maude_primary_udi_unique: Optional[set] = None,
        maude_secondary_udi_unique: Optional[set] = None
    ):
        self.udi = self._or_empty(udi_udi_unique)
        self.maude = self._or_empty(maude_udi_unique)
        self.angry = self._or_empty(angry_udi_unique)
        self.survive = self._or_empty(survive_udi_unique)
        self.primary = self._or_empty(primary_udi_unique)
        self.maude_primary = self._or_empty(maude_primary_udi_unique)
        self.maude_secondary = self._or_empty(maude_secondary_udi_unique)

    @staticmethod
    def _or_empty(values: Optional[set]) -> set:
        """Return ``values`` unchanged, or a fresh empty set when it is ``None``."""
        return set() if values is None else values

    def print_stats(self):
        """Print the number of elements of each stored set, one line per set."""
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.udi)}Í∞ú')
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† Primary udi Í∞úÏàò: {len(self.primary)}Í∞ú')
        print(f'MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.maude)}Í∞ú')
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏûàÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.survive)}Í∞ú')
        print(f'Primary UDIÏù∏ MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.maude_primary)}Í∞ú')
        print(f'Primary UDIÍ∞Ä ÏïÑÎãå MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.maude_secondary)}Í∞ú')
        print(f'UDI Îç∞Ïù¥ÌÑ∞Ïóê ÏóÜÎäî MAUDE Îç∞Ïù¥ÌÑ∞Ïùò Í≥†Ïú† udi Í∞úÏàò: {len(self.angry)}Í∞ú')

# Bundle the seven sets computed above into one object
unique_udi_di = UniqueUDIDI(
    udi_udi_unique=udi_udi_unique,
    maude_udi_unique=maude_udi_unique,
    angry_udi_unique=angry_udi_unique,
    survive_udi_unique=survive_udi_unique,
    primary_udi_unique=primary_udi_unique,
    maude_primary_udi_unique=maude_primary_udi_unique,
    maude_secondary_udi_unique=maude_secondary_udi_unique,
)

unique_udi_di.print_stats()

## UDI 데이터셋에 Primary 컬럼 생성

In [None]:
# MAUDE UDIs as a LazyFrame: one 'udi_di' column holding the MAUDE ids that
# exist in the UDI dataset but are not Primary ids
maude_udi_lf = pl.LazyFrame({
    'udi_di': list(unique_udi_di.maude_secondary)
})

In [None]:
# UDI-side attributes carried along with every matched id
necessary_cols = [
    'company_name',
    'brand_name',
    'version_or_model_number',
    'catalog_number',
    'primary_udi_di',
]

# Unpivot, then filter right away (keeps memory growth small).
#
# The source is cleaned_udi_lf rather than primary_udi_lf: primary_udi_lf was
# loaded from the step-1 parquet file, which the company-name cleaning cell
# deletes, so collecting a query built on it would fail for a lazy scan.
# NOTE(review): assumes the step-2 file keeps every step-1 column (including
# 'primary_udi_di') and only cleans 'company_name' — confirm.
udi_mapping_lf = (
    cleaned_udi_lf
    .with_row_index('row_idx')
    .select(['row_idx'] + necessary_cols + udi_di_cols)
    .unpivot(
        index=['row_idx'] + necessary_cols,
        on=udi_di_cols,
        variable_name='matched_col',
        value_name='udi_di'
    )
    .filter(
        pl.col('udi_di').is_not_null() &  # drop nulls
        pl.col('udi_di').is_in(unique_udi_di.maude_secondary)  # keep matching ids only
    )
    # One row per (id, source row), even if the id sits in several id columns
    .unique(subset=['udi_di', 'row_idx'])
    .select(['udi_di', 'row_idx'] + necessary_cols)
)

In [None]:
# Execute the lazy mapping query and convert the result to a pandas DataFrame
udi_mapping_df = udi_mapping_lf.collect().to_pandas()

In [None]:
# Look at the summary numbers first: row count and number of distinct ids
print(f"Total matches: {len(udi_mapping_df):,}")
print(f"Unique UDIs: {udi_mapping_df['udi_di'].nunique():,}")

In [None]:
# Spot check: all mapping rows for one example UDI-DI
udi_mapping_df[udi_mapping_df['udi_di'] == '00021292007706']

In [None]:
# Spot check: up to 15 MAUDE rows reported with the same example UDI-DI,
# transposed so that each column becomes a row
maude_lf.filter(
    pl.col('device_0_udi_di').eq('00021292007706')
).head(15).collect().to_pandas().transpose()

# 전처리 함수 설계

In [None]:
# Function that scores how well records match
# 1. Load only the mapped rows
# 2. For company_name, brand_name, version_or_model_number, catalog_number:
# 2-1. Convert everything to upper case
# 2-2. Compute the match score
# 2-3. Assign the primary_udi_di of the highest-scoring candidate
# 2-4. Fetch the required information


# Tidy up the udi
# primary: match against primary_lf as-is and fetch the required info
# secondary: among the primary_lf rows matched on udi, take the info with the highest score
# null: take the info with the highest score from primary_lf

## MAUDE Îç∞Ïù¥ÌÑ∞Ïùò UDIÎ•º ÍπîÎÅîÌïòÍ≤å

In [None]:
# Write each UDI set, pretty-printed, to its own text file in the working directory
udi_dumps = {
    'primary.txt': unique_udi_di.maude_primary,
    'secondary.txt': unique_udi_di.maude_secondary,
    'notfound.txt': unique_udi_di.angry,
}

for dump_name, udi_values in udi_dumps.items():
    with open(dump_name, 'w', encoding='utf-8') as f:
        f.write(pformat(udi_values, indent=4, width=80))

In [None]:
# Columns used as the grouping key in the cells below
group_cols = ['device_0_manufacturer_d_name', 'device_0_brand_name','device_0_model_number', 'device_0_lot_number']
# Matches values that start with UNK, or that are exactly one of the listed codes or '-'
na_pattern = r'^UNK|^(NI|NA|N/A|ASKU|NAV|NAVU|MSK|QS|TRC|DER|INV|PINF|NINF|-)$'
# NOTE(review): replace_pattern_with_null is a project helper; presumably it
# replaces values matching na_pattern with null in the given columns — confirm.
maude_lf = replace_pattern_with_null(maude_lf, group_cols + ['device_0_udi_di'], na_pattern)

In [None]:
# Number of distinct UDI-DIs reported for every device key.
# Nulls are dropped before counting because n_unique() treats null as a value
# of its own, which would flag "one real UDI plus missing values" as two.
group_lf = (
    maude_lf
    .select(['device_0_udi_di', *group_cols])
    .group_by(group_cols)
    .agg(
        pl.col('device_0_udi_di').drop_nulls().n_unique().alias('udi_di_unique'),
    )
)

# Device keys that were reported with more than one distinct UDI-DI
outlier = group_lf.filter(
    pl.col('udi_di_unique').gt(1)
).select(pl.len()).collect().item()

print(f'UDI-DI Ïù¥ÏÉÅÏπòÎäî {outlier}Í∞ú ÏûàÏäµÎãàÎã§.')
# Top 100 fully populated keys; sorting and truncating stay lazy so only the
# final 100 rows are materialized
group_lf.drop_nulls().sort('udi_di_unique', descending=True).head(100).collect().to_pandas()

In [None]:
# Spot check one device key: up to 1000 rows without any null for this
# manufacturer / model number / brand name combination
maude_lf.select(group_cols + ['device_0_udi_di']).filter(
    pl.col('device_0_manufacturer_d_name').eq('DEXCOM, INC.'),
    pl.col('device_0_model_number').eq('9500-161'),
    pl.col('device_0_brand_name').eq('DEXCOM G7 CONTINUOUS GLUCOSE MONITORING SYSTEM'),
).drop_nulls().head(1000).collect().to_pandas()