In [1]:
from typing import Tuple, List
import polars as pl
from pprint import pprint
import sys
from pathlib import Path
import psutil
import pandas as pd
import time

In [2]:
# Load data.
# `df` is a lazy scan of the Parquet file: later cells execute it explicitly
# via .collect() or .sink_parquet().
df = pl.scan_parquet('data/maude_sample.parquet')
# Row count: runs a query that materialises only the single count value.
total_rows = df.select(pl.len()).collect().item()
# Column count: taken from the schema's column names.
total_cols = len(df.collect_schema().names())
print(f"전체 행: {total_rows:,}개, 전체 컬럼: {total_cols}개")

전체 행: 14,132,321개, 전체 컬럼: 2247개


In [3]:
# Global constants.
# Column names that get_use_cols() always includes (its `base_cols` default).
BASE_COLS = [
    'mdr_report_key', 'adverse_event_flag', 'product_problems',
    'product_problem_flag', 'date_of_event', 'date_report', 
    'date_received', 'device_date_of_manufacturer', 'event_type',
    'previous_use_code', 'single_use_flag', 'report_source_code',
    'reprocessed_and_reused_flag', 'date_facility_aware', 'report_date',
    'report_to_fda', 'date_report_to_fda', 'report_to_manufacturer',
    'date_report_to_manufacturer', 'event_location', 
    'date_manufacturer_received', 'manufacturer_link_flag',
    'date_added', 'date_changed', 'pma_pmn_number',
    'suppl_dates_fda_received', 'suppl_dates_mfr_received', 'report_number'
]

# Fragments selecting specific device_* columns.
# get_use_cols() matches these as SUBSTRINGS (`pattern in col`) of columns
# whose name starts with 'device_'.
DEVICE_PATTERNS = [
    '_brand_name', '_udi_di', '_device_report_product_code', 
    '_model_number', '_expiration_date_of_device', '_device_age_text',
    '_device_operator', '_implant_flag', '_manufacturer_d_name', 
    '_openfda_device_class', '_openfda_device_name', '_generic_name'
]

# Fragments selecting specific patient_* columns.
# Also matched as substrings, on columns whose name starts with 'patient_'.
PATIENT_PATTERNS = [
    '_patient_sequence_number', '_patient_age', '_patient_sex',
    '_patient_weight', '_patient_race', '_patient_problems',
    '_sequence_number_outcome'
]

# Suffixes selecting specific mdr_text_* columns.
# Unlike the two lists above, get_use_cols() matches these with
# `col.endswith(pattern)` on columns whose name starts with 'mdr_text_'.
MDR_TEXT_PATTERNS = ['_text', '_text_type_code']

In [4]:
def get_use_cols(
    df: pl.LazyFrame,
    base_cols: List[str] = BASE_COLS,
    device_patterns: List[str] = DEVICE_PATTERNS,
    patient_patterns: List[str] = PATIENT_PATTERNS,
    mdr_text_patterns: List[str] = MDR_TEXT_PATTERNS
) -> Tuple[List[str], List[str], List[str], List[str]]:
    """
    Pick the columns of `df` that are used for analysis.

    Args:
        df: Frame whose schema is inspected (only column names are read).
        base_cols: Column names that are always part of the result.
        device_patterns: Substrings; a 'device_*' column is kept when it
            contains at least one of them.
        patient_patterns: Substrings; a 'patient_*' column is kept when it
            contains at least one of them.
        mdr_text_patterns: Suffixes; a 'mdr_text_*' column is kept when it
            ends with at least one of them.

    Returns:
        (device_cols, patient_cols, mdr_text_cols, analysis_cols) where the
        first three lists keep schema order and `analysis_cols` is the
        de-duplicated union with `base_cols`, sorted in descending order.

    Side effects:
        Prints the size of every group.
    """
    device_cols: List[str] = []
    patient_cols: List[str] = []
    mdr_text_cols: List[str] = []

    # The three prefixes cannot overlap, so one pass with an if/elif chain
    # fills all three buckets while preserving schema order.
    for name in df.collect_schema().names():
        if name.startswith('device_'):
            if any(fragment in name for fragment in device_patterns):
                device_cols.append(name)
        elif name.startswith('patient_'):
            if any(fragment in name for fragment in patient_patterns):
                patient_cols.append(name)
        elif name.startswith('mdr_text_'):
            if any(name.endswith(suffix) for suffix in mdr_text_patterns):
                mdr_text_cols.append(name)

    # Union of everything, duplicates removed, descending name order.
    unique_names = set(base_cols + device_cols + patient_cols + mdr_text_cols)
    analysis_cols = sorted(unique_names, reverse=True)

    print(f"총 컬럼: {len(analysis_cols)}개")
    for label, group in (
        ('기본 컬럼', base_cols),
        ('device_*', device_cols),
        ('patient_*', patient_cols),
        ('mdr_text_*', mdr_text_cols),
    ):
        print(f"  - {label}: {len(group)}개")

    return device_cols, patient_cols, mdr_text_cols, analysis_cols

# Resolve the column groups once against the lazily scanned frame `df`.
device_cols, patient_cols, mdr_text_cols, analysis_cols = get_use_cols(df)

총 컬럼: 836개
  - 기본 컬럼: 28개
  - device_*: 622개
  - patient_*: 26개
  - mdr_text_*: 160개


In [5]:
# ============================================================
# 새 셀 5: 헬퍼 함수들 (extract_idx, get_mdr_cols)
# ============================================================

def extract_idx(col_name: str, prefix: str = 'mdr_text_') -> int:
    """
    Extract the numeric index that follows `prefix` in a column name.

    Example: 'mdr_text_0_text' -> 0

    Args:
        col_name: Column name to parse.
        prefix: Leading text to strip before reading the index. It is removed
            only when it appears at the START of `col_name`; occurrences
            further inside the name are left untouched.

    Returns:
        The integer found before the first '_' of the remaining text, or -1
        when there is no parsable integer (or `col_name` is not a string).
    """
    # Best-effort contract: anything that is not a string has no index.
    if not isinstance(col_name, str):
        return -1

    # Strip only a leading prefix; str.replace() would also delete later
    # occurrences and could glue unrelated digits together.
    remainder = col_name[len(prefix):] if col_name.startswith(prefix) else col_name
    head = remainder.split('_', 1)[0]

    try:
        return int(head)
    except ValueError:
        # Non-numeric token: signal "no index" instead of raising.
        return -1


def get_mdr_cols(df: pl.LazyFrame, prefix: str = 'mdr_text_') -> Tuple[List[str], List[str]]:
    """
    Split the mdr_text columns of `df` into type-code and text columns.

    Args:
        df: Frame whose schema is inspected (only column names are read).
        prefix: Only columns starting with this prefix are considered.

    Returns:
        type_cols: columns ending in '_text_type_code'
        text_cols: columns ending in '_text' (a name ending in '_text' can
            never also end in '_text_type_code', so the groups are disjoint)
        Both lists are ordered by the index returned by extract_idx().
    """
    def index_of(name: str) -> int:
        return extract_idx(name, prefix)

    # Narrow to the prefixed columns once, then bucket by suffix.
    prefixed = [name for name in df.collect_schema().names() if name.startswith(prefix)]

    type_cols = [name for name in prefixed if name.endswith('_text_type_code')]
    text_cols = [name for name in prefixed if name.endswith('_text')]

    # Stable in-place sort by numeric index.
    type_cols.sort(key=index_of)
    text_cols.sort(key=index_of)

    return type_cols, text_cols

# Confirmation message that the helper definitions above were executed.
print(" 헬퍼 함수 정의 완료!")

 헬퍼 함수 정의 완료!


In [6]:
# ============================================================
# 새 셀 6: combine_mdr_text 함수
# ============================================================

def combine_mdr_text(df: pl.LazyFrame, prefix: str = 'mdr_text_') -> pl.LazyFrame:
    """
    Merge all mdr_text (type code, text) column pairs into one string column.

    For every index that has BOTH a '<prefix>N_text_type_code' and a
    '<prefix>N_text' column, a per-row fragment is built:

        1. type and text present -> "[ReportK: <type>]\\n<text>"
        2. only text present     -> "<text>"
        3. only type present     -> null
        4. neither present       -> null

    K counts the matched pairs starting at 1, in ascending index order.
    The non-null fragments of a row are joined with a blank line.

    NOTE(review): the previous docstring said the columns are combined "in
    reverse order", but the implementation has always iterated in ascending
    index order; that order is kept here. Confirm which one is intended.

    Args:
        df: Lazy frame containing the mdr_text columns.
        prefix: Common prefix of the mdr_text columns.

    Returns:
        `df` with an extra 'mdr_text_combined' column; the value is null for
        rows where no fragment exists (or the joined text is empty).

    Side effects:
        Prints how many type and text columns were found.
    """
    # Locate the columns.
    type_cols, text_cols = get_mdr_cols(df, prefix)

    print(f"type_cols: {len(type_cols)}개")
    print(f"text_cols: {len(text_cols)}개")

    # Pair columns by their numeric index rather than by list position: a
    # positional zip() shifts out of alignment as soon as one index is
    # missing on either side and then silently drops every later pair.
    text_by_idx = {}
    for text_col in text_cols:
        idx = extract_idx(text_col, prefix)
        if idx >= 0:  # names without a parsable index cannot be paired
            text_by_idx.setdefault(idx, text_col)

    # Build one "[ReportN: Type] Content" expression per matched pair.
    parts = []
    report_n = 1

    for type_col in type_cols:
        text_col = text_by_idx.get(extract_idx(type_col, prefix))
        if text_col is None:
            continue

        has_type = pl.col(type_col).is_not_null()
        has_text = pl.col(text_col).is_not_null()

        fragment = (
            # 1. both present
            pl.when(has_type & has_text)
            .then(pl.concat_str([
                pl.lit(f'[Report{report_n}: '),
                pl.col(type_col),
                pl.lit(']\n'),
                pl.col(text_col)
            ]))
            # 2. text only
            .when(pl.col(type_col).is_null() & has_text)
            .then(pl.col(text_col))
            # 3. everything else is null
            .otherwise(None)
        )

        parts.append(fragment)
        report_n += 1

    # Nothing to combine: still return the column so callers can select it.
    if not parts:
        return df.with_columns(
            pl.lit(None, dtype=pl.String).alias('mdr_text_combined')
        )

    # Join all fragments with a blank line, skipping null fragments.
    combined = pl.concat_str(parts, separator='\n\n', ignore_nulls=True)

    return df.with_columns(
        pl.when(combined.str.len_chars() > 0).then(combined).otherwise(None).alias('mdr_text_combined')
    )

# Confirmation message that combine_mdr_text was defined.
print(" combine_mdr_text 함수 정의 완료!")

 combine_mdr_text 함수 정의 완료!


In [7]:
# ============================================================
# Approach A: drop_nulls + streaming (fastest, recommended)
# ============================================================
# Key columns for the duplicate search: the next cell keeps only rows where
# all of them are non-null, and a later cell groups by exactly these columns.
group_cols = [
    'device_0_manufacturer_d_name',
    'device_0_udi_di',
    'device_0_lot_number',
    'date_of_event',
    'report_number',
    'device_0_udi_public'
]


In [8]:
print("필터링 + 저장 중...")
start = time.time()

# Relative path: the file is written to the current working directory.
output_file = 'filtered_data.parquet'

# Use sink_parquet() instead of collect():
# the result is written straight to disk without being loaded into memory.
# Only the key columns are selected, and drop_nulls() (no subset) removes
# every row that has a null in any of them.
df.select(group_cols).drop_nulls().sink_parquet(output_file)

print(f"✓ 완료! {time.time()-start:.1f}초")
print(f"✓ 저장 위치: {output_file}")

# From here on, read the (much smaller) filtered file eagerly when needed.
filtered_df = pl.read_parquet(output_file)
print(f"✓ 행 수: {len(filtered_df):,}")
filtered_df

필터링 + 저장 중...
✓ 완료! 1.0초
✓ 저장 위치: filtered_data.parquet
✓ 행 수: 5,475,167


device_0_manufacturer_d_name,device_0_udi_di,device_0_lot_number,date_of_event,report_number,device_0_udi_public
str,str,str,str,str,str
"""UNOMEDICAL A/S""","""05705244018129""","""UNKNOWN""","""20240813""","""3003442380-2024-25657""","""05705244018129"""
"""MEDTRONIC PUERTO RICO OPERATIO…","""000000763000639273""","""NG3060456H""","""20240828""","""2032227-2024-240254""","""(01)000000763000639273"""
"""DEXCOM, INC.""","""00386270001627""","""5333813""","""20240726""","""3004753838-2024-212264""","""00386270001627"""
"""NOBEL BIOCARE AB""","""07332747161335""","""12226767""","""20240603""","""9611993-2024-104474""","""(01)07332747161335(10)12226767…"
"""NOBEL BIOCARE AB""","""07332747161267""","""12218458""","""20240606""","""2027971-2024-118688""","""(01)07332747161267(10)12218458…"
…,…,…,…,…,…
"""BAXTER HEALTHCARE CORPORATION""","""00085412007731""","""H21L27053""","""20221202""","""1416980-2023-00194""","""(01)00085412007731"""
"""INSULET CORPORATION""","""20385081120033""","""L71359""","""20230124""","""3004464228-2023-03314""","""(01)20385081120033(11)220305(1…"
"""DATASCOPE CORP. - FAIRFIELD""","""10607567108605""","""3000231181""","""20230202""","""2248146-2023-00127""","""10607567108605"""
"""MEDTRONIC PUERTO RICO OPERATIO…","""000000763000545604""","""HG6D96CZZ""","""20230206""","""2032227-2023-168986""","""(01)000000763000545604"""


In [None]:
# Inspect only the key combinations that occur two or more times.
print("2개 이상인 경우 확인 중...")
start = time.time()

# Count rows per key combination, keep groups with count >= 2,
# largest groups first.
count_df = (
    filtered_df
    .group_by(group_cols)
    .agg(pl.len().alias('count'))
    .filter(pl.col('count') >= 2)
    .sort('count', descending=True)
)

print(f"✓ 2개 이상인 그룹 수: {len(count_df):,}개")
print(f"✓ 완료! {time.time()-start:.1f}초")

# Distribution of group sizes: how many groups exist for each count value.
print(f"\n카운트 분포:")
print(count_df.group_by('count').agg(pl.len().alias('그룹수')).sort('count'))

count_df


2개 이상인 경우 확인 중...
✓ 2개 이상인 그룹 수: 8,204개
✓ 완료! 1.7초

카운트 분포:
shape: (7, 2)
┌───────┬────────┐
│ count ┆ 그룹수 │
│ ---   ┆ ---    │
│ u32   ┆ u32    │
╞═══════╪════════╡
│ 2     ┆ 7400   │
│ 3     ┆ 621    │
│ 4     ┆ 130    │
│ 5     ┆ 29     │
│ 6     ┆ 13     │
│ 7     ┆ 8      │
│ 8     ┆ 3      │
└───────┴────────┘


device_0_manufacturer_d_name,device_0_udi_di,device_0_lot_number,date_of_event,report_number,device_0_udi_public,count
str,str,str,str,str,str,u32
"""JJGC S.A.""","""07899878024170""","""ELWG6""","""20240614""","""0001222315-2024-001978""","""07899878024170""",8
"""JJGC S.A.""","""07899878024156""","""KRVA8""","""20250611""","""3008261720-2025-004548""","""07899878024156""",8
"""JJGC S.A.""","""07899878024576""","""JKHE5""","""20250418""","""3008261720-2025-004492""","""07899878024576""",8
"""JJGC S.A.""","""07898237569062""","""MCL63""","""20230525""","""3008261720-2024-001870""","""07898237569062""",7
"""JJGC S.A.""","""07898237569062""","""GFM43""","""20231113""","""3008261720-2024-003029""","""07898237569062""",7
…,…,…,…,…,…,…
"""INSTITUT STRAUMANN AG""","""07630031706720""","""EMNR8""","""20250423""","""0009613348-2025-010237""","""07630031706720""",2
"""INSTITUT STRAUMANN AG""","""07630031707031""","""NHL68""","""20240318""","""0009613348-2024-009754""","""07630031707031""",2
"""INSTITUT STRAUMANN AG""","""07630031707086""","""AERX3""","""20240507""","""0009613348-2024-012693""","""07630031707086""",2
"""INSTITUT STRAUMANN AG""","""07630031700872""","""TR462""","""20250916""","""0009613348-2025-016556""","""07630031700872""",2


In [20]:
# Spell out every key condition explicitly.
# The literals are the key values of the first row of the count table above
# (a group with count 8); the full-width rows are pulled from the lazy `df`.
# NOTE(review): this cell's execution count (20) is out of sequence with the
# surrounding cells, and the same filter-and-transpose logic is repeated in
# the last cell — a parameterised helper would remove the duplication.
filtered = df.filter(
      (pl.col('device_0_udi_di') == "07899878024170")
    & (pl.col('device_0_lot_number') == "ELWG6")
    & (pl.col('report_number') == "0001222315-2024-001978")
    & (pl.col('device_0_udi_public') == "07899878024170")
    & (pl.col('device_0_manufacturer_d_name') == "JJGC S.A.")
    & (pl.col('date_of_event') == "20240614")
).collect()

# Convert to pandas and transpose so each matching record becomes a column
# and each field becomes a row (easier to compare side by side).
df_pandas = filtered.to_pandas().T
df_pandas

Unnamed: 0,0,1,2,3,4,5,6,7
adverse_event_flag,Y,Y,Y,Y,Y,Y,Y,Y
date_added,20240725,20240725,20240725,20240725,20240725,20240725,20240725,20240725
date_changed,20240726,20240726,20240726,20240726,20240726,20240726,20240726,20240726
date_facility_aware,20240725,20240725,20240725,20240725,20240725,20240725,20240725,20240725
date_manufacturer_received,,,,,,,,
...,...,...,...,...,...,...,...,...
source_type,,,,,,,,
summary_report_flag,N,N,N,N,N,N,N,N
suppl_dates_fda_received,,,,,,,,
suppl_dates_mfr_received,,,,,,,,


In [11]:
# # ============================================================
# # 새 셀 7: 함수 테스트
# # ============================================================

# # 특정 report에 적용해보기
# target_report = "0009613348-2024-014874"

# filtered_combined = df.filter(
#     pl.col('report_number') == target_report
# )

# # mdr_text 합치기 적용
# filtered_combined = combine_mdr_text(filtered_combined)

# # 결과 확인
# result = filtered_combined.select(['report_number', 'mdr_text_combined']).collect()

# print(f"\n결과 확인:")
# print(result)

# # 합쳐진 텍스트 내용 확인
# for row in result.to_dicts():
#     print(f"\n{'='*80}")
#     print(f"Report: {row['report_number']}")
#     print('='*80)
#     print(row['mdr_text_combined'][:500] if row['mdr_text_combined'] else "None")
#     print("...")

In [12]:
# value_counts = df.group_by('report_number').agg(
#     pl.len().alias('count')
# ).collect().sort('count', descending=True)

In [13]:
# value_counts

In [14]:
# target_report = "0009613348-2024-014874"
# report_col = 'report_number'



In [15]:
# filtered = df.filter(
#     pl.col(report_col) == target_report
# ).collect()

In [16]:
# filtered

In [17]:
# # ============================================================
# # 특정 컬럼들이 모두 NULL이 아니면서 같은 그룹 찾기
# # ============================================================

# # 비교할 컬럼들
# group_cols = [
#     'device_0_manufacturer_d_name',
#     'device_0_udi_di',
#     'device_0_lot_number',
#     'date_of_event',
#     'report_number',
#     'device_0_udi_public'
# ]

# # 방법 3: 스트리밍 모드 (매우 큰 데이터용)
# filtered_df = df.lazy().filter(
#     pl.all_horizontal([pl.col(c).is_not_null() for c in group_cols])
# ).collect(streaming=True)

# filtered_df.head(10)

# # filtered_df = df.filter(
# #     pl.col('device_0_manufacturer_d_name').is_not_null() &
# #     pl.col('device_0_udi_di').is_not_null() &
# #     pl.col('device_0_lot_number').is_not_null() &
# #     pl.col('date_of_event').is_not_null() &
# #     pl.col('report_number').is_not_null() &
# #     pl.col('device_0_udi_public').is_not_null()
# # ).collect().to_pandas()

# filtered_df

In [18]:
# # ============================================================
# # 방법 A: drop_nulls + streaming (가장 빠르고 권장!)
# # ============================================================
# group_cols = [
#     'device_0_manufacturer_d_name',
#     'device_0_udi_di',
#     'device_0_lot_number',
#     'date_of_event',
#     'report_number',
#     'device_0_udi_public'
# ]

# # 1. 먼저 개수 확인
# count = df.drop_nulls(subset=group_cols).select(pl.len()).collect().item()
# print(f"필터링될 행 수: {count:,}")

In [None]:
# # 여러 조건을 명확하게 나열
# filtered = df.filter(
#     pl.col(report_col) == target_report
# ).filter(
#     (pl.col('device_0_udi_di') == "07630031706713")
#     & (pl.col('device_0_lot_number') == "WMC68")
# ).collect()

# print(f"'{target_report}'의 전체 행 개수: {len(filtered)}개\n")

# # Pandas로 변환 후 transpose
# df_pandas = filtered.to_pandas().T
# df_pandas

NameError: name 'report_col' is not defined

In [None]:
# Key columns available for filtering:
# device_0_manufacturer_d_name
# device_0_udi_di
# device_0_lot_number
# date_of_event
# report_number
# device_0_udi_public

# Filter the Polars LazyFrame on all six key columns.
# NOTE(review): every comparison value is an empty string — presumably
# placeholders to be filled in with a real key combination. As written, only
# rows whose six key columns all equal "" can match.
filtered = df.filter(
    (pl.col('device_0_manufacturer_d_name') == "")
    & (pl.col('device_0_udi_di') == "")
    & (pl.col('device_0_lot_number') == "")
    & (pl.col('date_of_event') == "")
    & (pl.col('report_number') == "")
    & (pl.col('device_0_udi_public') == "") 
).collect()

# Convert to pandas and transpose (fields as rows, records as columns).
df_pandas = filtered.to_pandas().T
df_pandas


adverse_event_flag
date_added
date_changed
date_facility_aware
date_manufacturer_received
...
source_type
summary_report_flag
suppl_dates_fda_received
suppl_dates_mfr_received
type_of_report
