In [None]:
import polars as pl
import pandas as pd
import numpy as np

In [2]:
# Open the MAUDE sample parquet file as a polars LazyFrame; the cells below
# call .collect() whenever they need concrete results.
df = pl.scan_parquet('../data/maude_sample.parquet')

### 함수 정의

In [3]:
def convert_report_source_code(df):
    """
    Cast the report_source_code column to the Categorical dtype.

    The column keeps its name; every other column is left as it is.
    Rationale noted by the original author: the column holds a limited set
    of category values, so a Categorical representation should use less
    memory.

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing a report_source_code column.

    Returns:
    --------
    polars.LazyFrame
        The input with report_source_code cast to pl.Categorical.
    """
    as_categorical = pl.col('report_source_code').cast(pl.Categorical)
    return df.with_columns(as_categorical)

In [4]:
def convert_reprocessed_flag(df):
    """
    Split reprocessed_and_reused_flag into a raw copy and a numeric flag.

    Two columns are added and the original column is dropped:
    - reprocessed_and_reused_flag_raw : the original values, unchanged
    - reprocessed_and_reused_flag_bin : Int8 encoding of the flag
        'Y'            -> 1
        'N'            -> 0
        anything else  -> null (this includes null input)

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing a reprocessed_and_reused_flag column.

    Returns:
    --------
    polars.LazyFrame
        The input with the _raw and _bin columns added and the original
        column removed.
    """
    flag = pl.col('reprocessed_and_reused_flag')

    # Copy of the original values under the _raw name
    raw_column = flag.alias('reprocessed_and_reused_flag_raw')

    # Y/N mapped to 1/0; unmatched values fall through to null
    bin_column = (
        pl.when(flag == pl.lit('Y'))
        .then(pl.lit(1))
        .when(flag == pl.lit('N'))
        .then(pl.lit(0))
        .otherwise(None)
        .cast(pl.Int8)
        .alias('reprocessed_and_reused_flag_bin')
    )

    with_new_columns = df.with_columns([raw_column, bin_column])
    # The original column is no longer needed once _raw exists
    return with_new_columns.drop('reprocessed_and_reused_flag')

In [5]:
def convert_manufacturer_link_flag(df):
    """
    Split manufacturer_link_flag into a raw copy and a numeric flag.

    Two columns are added and the original column is dropped:
    - manufacturer_link_flag_raw : the original values, unchanged
    - manufacturer_link_flag_bin : Int8 encoding of the flag
        'Y'            -> 1
        'N'            -> 0
        anything else  -> null (this includes null input)

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing a manufacturer_link_flag column.

    Returns:
    --------
    polars.LazyFrame
        The input with the _raw and _bin columns added and the original
        column removed.
    """
    flag = pl.col('manufacturer_link_flag')

    is_yes = flag == pl.lit('Y')
    is_no = flag == pl.lit('N')

    # Y/N mapped to 1/0; unmatched values fall through to null
    encoded = pl.when(is_yes).then(pl.lit(1)).when(is_no).then(pl.lit(0)).otherwise(None)

    new_columns = [
        # Copy of the original values under the _raw name
        flag.alias('manufacturer_link_flag_raw'),
        encoded.cast(pl.Int8).alias('manufacturer_link_flag_bin'),
    ]

    # The original column is no longer needed once _raw exists
    return df.with_columns(new_columns).drop('manufacturer_link_flag')

In [6]:
def convert_previous_use_code(df):
    """
    Derive a readable previous_use_category column from previous_use_code.

    Mapping:
    - 'I'           -> 'Initial'
    - 'R'           -> 'Reuse'
    - anything else -> 'Unknown' (this includes null input)

    The original previous_use_code column is kept; the result is written to
    a new column named previous_use_category.

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing a previous_use_code column.

    Returns:
    --------
    polars.LazyFrame
        The input with the previous_use_category column added.
    """
    code = pl.col('previous_use_code')

    category = (
        pl.when(code == pl.lit('I'))
        .then(pl.lit('Initial'))
        .when(code == pl.lit('R'))
        .then(pl.lit('Reuse'))
        .otherwise(pl.lit('Unknown'))
    )

    return df.with_columns(category.alias('previous_use_category'))

In [7]:
def convert_single_use_flag(df):
    """
    Split single_use_flag into a raw copy and a numeric flag.

    Two columns are added and the original column is dropped:
    - single_use_flag_raw : the original values, unchanged
    - single_use_flag_bin : Int8 encoding of the flag
        'Y' or '1'     -> 1
        'N' or '0'     -> 0
        anything else  -> null

    Before matching, the value is cast to a string (non-strict) and
    stripped of surrounding whitespace.

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing a single_use_flag column.

    Returns:
    --------
    polars.LazyFrame
        The input with the _raw and _bin columns added and the original
        column removed.
    """
    flag = pl.col('single_use_flag')

    # Normalised text form used for both membership tests below
    cleaned = flag.cast(pl.Utf8, strict=False).str.strip_chars()

    means_yes = cleaned.is_in(['Y', '1'])
    means_no = cleaned.is_in(['N', '0'])

    bin_column = (
        pl.when(means_yes)
        .then(pl.lit(1))
        .when(means_no)
        .then(pl.lit(0))
        .otherwise(None)  # unmatched values become null
        .cast(pl.Int8)
        .alias('single_use_flag_bin')
    )

    # Copy of the original values under the _raw name
    raw_column = flag.alias('single_use_flag_raw')

    with_new_columns = df.with_columns([raw_column, bin_column])
    # The original column is no longer needed once _raw exists
    return with_new_columns.drop('single_use_flag')

### 일관성 검사 함수

In [8]:
def check_single_use_consistency(df, verbose=True):
    """
    Check single_use_flag_bin against previous_use_category.

    The combination single_use_flag_bin == 1 together with
    previous_use_category == 'Reuse' is treated as contradictory, and the
    number of such rows is reported.

    The lazy frame is collected exactly once (as a grouped cross table);
    the total row count and the contradictory-row count are both derived
    from that table instead of triggering additional scans of the data.

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing single_use_flag_bin and
        previous_use_category columns.
    verbose : bool
        When True, print the cross table, per-combination percentages and
        the contradictory-row summary.

    Returns:
    --------
    dict
        'total_count' : int, total number of rows
        'weird_count' : int, number of contradictory rows
        'weird_pct'   : contradictory rows as a percentage of the total
                        (0 when the input is empty)
        'cross_table' : polars.DataFrame with one row per
                        (single_use_flag_bin, previous_use_category) pair
                        and its count in the 'len' column
    """
    if verbose:
        print("\nsingle_use_flag_bin vs previous_use_category 일관성 검사")
        print("-" * 60)

    # Single pass over the data; everything below is computed from this table
    cross_table = df.group_by(['single_use_flag_bin', 'previous_use_category']).len().sort(
        ['single_use_flag_bin', 'len'],
        descending=[False, True]
    ).collect()

    combos = list(cross_table.iter_rows(named=True))

    # Every row belongs to exactly one group (null keys form their own group),
    # so the group sizes add up to the total row count
    total_count = sum(row['len'] for row in combos)

    if verbose:
        print(f"전체 데이터: {total_count:,}개")
        print("\n전체 교차표:")
        print(cross_table)
        print("\n비율:")
        for row in combos:
            single_use = row['single_use_flag_bin']
            prev_use = row['previous_use_category']
            cnt = row['len']
            pct = (cnt / total_count * 100) if total_count > 0 else 0
            print(f"  single_use_bin={single_use} + previous_use={prev_use}: {cnt:,}개 ({pct:.2f}%)")

    # Contradictory case: flagged as single use (1) yet categorised as 'Reuse'
    if verbose:
        print("\n논리적으로 이상한 케이스:")
        print("single_use_flag_bin=1 (일회용)인데 previous_use_category='Reuse' (재사용)")

    weird_count = sum(
        row['len']
        for row in combos
        if row['single_use_flag_bin'] == 1 and row['previous_use_category'] == 'Reuse'
    )
    weird_pct = (weird_count / total_count * 100) if total_count > 0 else 0

    if verbose:
        print(f"총 {weird_count:,}개 ({weird_pct:.2f}%)")

    return {
        'total_count': total_count,
        'weird_count': weird_count,
        'weird_pct': weird_pct,
        'cross_table': cross_table
    }

In [9]:
def check_adverse_event_consistency(df, verbose=True):
    """
    Check adverse_event_flag against event_type.

    Rows whose event_type is 'Death' or 'Injury' while adverse_event_flag
    is 'N' are treated as contradictory; the same event types with
    adverse_event_flag 'Y' are reported as the normal case.

    The lazy frame is collected exactly once (as a grouped cross table);
    the total, the contradictory/normal counts and their breakdowns are all
    derived from that table instead of triggering additional scans.

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing event_type and adverse_event_flag columns
        holding their original (unconverted) values.
    verbose : bool
        When True, print the summaries, breakdowns and the cross table.

    Returns:
    --------
    dict
        'total_count'  : int, total number of rows
        'weird_count'  : int, Death/Injury rows with adverse_event_flag 'N'
        'weird_pct'    : weird_count as a percentage of the total
        'normal_count' : int, Death/Injury rows with adverse_event_flag 'Y'
        'normal_pct'   : normal_count as a percentage of the total
        'cross_table'  : polars.DataFrame with one row per
                         (event_type, adverse_event_flag) pair and its
                         count in the 'len' column
        Percentages are 0 when the input is empty.
    """
    severe_events = ('Death', 'Injury')

    if verbose:
        print("\nadverse_event_flag vs event_type 일관성 검사")
        print("-" * 60)

    # Single pass over the data; everything below is computed from this table
    cross_table = df.group_by(['event_type', 'adverse_event_flag']).len().sort(
        ['event_type', 'len'],
        descending=[False, True]
    ).collect()

    combos = list(cross_table.iter_rows(named=True))

    # Every row belongs to exactly one group (null keys form their own group),
    # so the group sizes add up to the total row count
    total_count = sum(row['len'] for row in combos)

    def pct_of_total(count):
        # Share of all rows, guarding against an empty input
        return (count / total_count * 100) if total_count > 0 else 0

    def severe_rows_with_flag(flag):
        # Death/Injury combinations carrying the given flag, largest first
        picked = [
            row for row in combos
            if row['event_type'] in severe_events and row['adverse_event_flag'] == flag
        ]
        picked.sort(key=lambda row: row['len'], reverse=True)
        return picked

    def print_breakdown(rows):
        # One line per (event_type, adverse_event_flag) combination
        for row in rows:
            event = row['event_type']
            flag = row['adverse_event_flag']
            cnt = row['len']
            print(f"  {event} + {flag}: {cnt:,}개 ({pct_of_total(cnt):.2f}%)")

    if verbose:
        print(f"전체 데이터: {total_count:,}개")

    # Contradictory case: Death/Injury but adverse_event_flag == 'N'
    if verbose:
        print("\n논리적으로 이상한 케이스:")
        print("event_type이 'Death' 또는 'Injury'인데 adverse_event_flag='N'")

    weird_rows = severe_rows_with_flag('N')
    weird_count = sum(row['len'] for row in weird_rows)
    weird_pct = pct_of_total(weird_count)

    if verbose:
        print(f"총 {weird_count:,}개 ({weird_pct:.2f}%)")

    if weird_count > 0 and verbose:
        print("\n상세 분석:")
        print_breakdown(weird_rows)

    # Normal case: Death/Injury and adverse_event_flag == 'Y'
    if verbose:
        print("\n정상 케이스:")
        print("event_type이 'Death' 또는 'Injury'인데 adverse_event_flag='Y'")

    normal_rows = severe_rows_with_flag('Y')
    normal_count = sum(row['len'] for row in normal_rows)
    normal_pct = pct_of_total(normal_count)

    if verbose:
        print(f"총 {normal_count:,}개 ({normal_pct:.2f}%)")

    if normal_count > 0 and verbose:
        print("\n상세 분석:")
        print_breakdown(normal_rows)

    # Full cross table with per-combination percentages
    if verbose:
        print("\n전체 교차표:")
        print(cross_table)
        print("\n비율:")
        print_breakdown(combos)

    return {
        'total_count': total_count,
        'weird_count': weird_count,
        'weird_pct': weird_pct,
        'normal_count': normal_count,
        'normal_pct': normal_pct,
        'cross_table': cross_table
    }

In [10]:
def analyze_manufacturer_link_vs_report_source(df, verbose=True):
    """
    Report how often each manufacturer_link_flag_bin value coincides with a
    'User facility report' or 'Distributor report' source.

    The lazy frame is collected exactly once (as a grouped cross table) and
    every count is derived from that table. Besides avoiding one scan per
    flag value, this also counts the null flag group correctly: comparing a
    column against a null literal yields null, so a filter built that way
    matches no rows and would always report 0 for the null group.

    Parameters:
    -----------
    df : polars.LazyFrame
        Input data containing manufacturer_link_flag_bin and
        report_source_code columns.
    verbose : bool
        When True, print the cross table, the per-flag ratios and the
        overall summary.

    Returns:
    --------
    dict
        'total_count'        : int, total number of rows
        'total_target_count' : int, rows whose report_source_code is one of
                               the two target sources
        'total_target_pct'   : total_target_count as a percentage of the
                               total (0 when the input is empty)
        'results'            : list of dicts, one per flag value (null
                               first, then ascending), with keys
                               'manufacturer_link_flag_bin', 'total_count',
                               'target_count' and 'target_pct'
        'cross_table'        : polars.DataFrame with one row per
                               (manufacturer_link_flag_bin,
                               report_source_code) pair and its count in
                               the 'len' column
    """
    if verbose:
        print("\nmanufacturer_link_flag_bin vs report_source_code 비율 분석")
        print("-" * 60)

    # Sources of interest
    target_sources = ['User facility report', 'Distributor report']

    # Single pass over the data; everything below is computed from this table
    cross_table = df.group_by(['manufacturer_link_flag_bin', 'report_source_code']).len().sort(
        ['manufacturer_link_flag_bin', 'len'],
        descending=[False, True]
    ).collect()

    # Accumulate per-flag totals and target counts; None is an ordinary dict
    # key here, so the null flag group is tallied like any other value
    totals_by_flag = {}
    targets_by_flag = {}
    for row in cross_table.iter_rows(named=True):
        flag_value = row['manufacturer_link_flag_bin']
        cnt = row['len']
        totals_by_flag[flag_value] = totals_by_flag.get(flag_value, 0) + cnt
        if row['report_source_code'] in target_sources:
            targets_by_flag[flag_value] = targets_by_flag.get(flag_value, 0) + cnt

    total_count = sum(totals_by_flag.values())

    if verbose:
        print(f"전체 데이터: {total_count:,}개")
        print("\n전체 교차표:")
        print(cross_table)
        print("\nmanufacturer_link_flag_bin별 User facility/Distributor report 비율:")

    # Null group first, then ascending flag values
    ordered_flags = sorted(
        totals_by_flag,
        key=lambda value: (value is not None, value if value is not None else 0)
    )

    results = []
    for flag_value in ordered_flags:
        flag_total = totals_by_flag[flag_value]
        target_count = targets_by_flag.get(flag_value, 0)
        target_pct = (target_count / flag_total * 100) if flag_total > 0 else 0

        # Human-readable label for the flag value
        if flag_value == 1:
            flag_label = '1 (제조사 연결)'
        elif flag_value == 0:
            flag_label = '0 (연결 안됨)'
        else:
            flag_label = 'null'

        if verbose:
            print(f"\nmanufacturer_link_flag_bin = {flag_value} ({flag_label}):")
            print(f"  전체: {flag_total:,}개")
            print(f"  User facility/Distributor report: {target_count:,}개 ({target_pct:.2f}%)")

        results.append({
            'manufacturer_link_flag_bin': flag_value,
            'total_count': flag_total,
            'target_count': target_count,
            'target_pct': target_pct
        })

    # Overall summary
    total_target = sum(targets_by_flag.values())
    total_target_pct = (total_target / total_count * 100) if total_count > 0 else 0

    if verbose:
        print("\n전체 요약:")
        print(f"전체 데이터: {total_count:,}개")
        print(f"User facility/Distributor report: {total_target:,}개 ({total_target_pct:.2f}%)")

    return {
        'total_count': total_count,
        'total_target_count': total_target,
        'total_target_pct': total_target_pct,
        'results': results,
        'cross_table': cross_table
    }

### 함수 코드 실행

In [11]:
# Show the dtype of each of the five columns that will be preprocessed
schema = df.collect_schema()
for column_name in (
    'report_source_code',
    'reprocessed_and_reused_flag',
    'manufacturer_link_flag',
    'previous_use_code',
    'single_use_flag',
):
    print(f"{column_name}: {schema[column_name]}")

report_source_code: String
reprocessed_and_reused_flag: String
manufacturer_link_flag: String
previous_use_code: String
single_use_flag: String


In [12]:
# Preview the first five rows of the target columns before preprocessing
print("\n전처리 전 샘플 데이터 (5개):")
columns_before = [
    'report_source_code',
    'reprocessed_and_reused_flag',
    'manufacturer_link_flag',
    'previous_use_code',
    'single_use_flag'
]
sample_before = df.select(columns_before).head(5).collect()
print(sample_before)


전처리 전 샘플 데이터 (5개):
shape: (5, 5)
┌────────────────────┬───────────────────┬───────────────────┬───────────────────┬─────────────────┐
│ report_source_code ┆ reprocessed_and_r ┆ manufacturer_link ┆ previous_use_code ┆ single_use_flag │
│ ---                ┆ eused_flag        ┆ _flag             ┆ ---               ┆ ---             │
│ str                ┆ ---               ┆ ---               ┆ str               ┆ str             │
│                    ┆ str               ┆ str               ┆                   ┆                 │
╞════════════════════╪═══════════════════╪═══════════════════╪═══════════════════╪═════════════════╡
│ Manufacturer       ┆ N                 ┆ Y                 ┆ U                 ┆ Y               │
│ report             ┆                   ┆                   ┆                   ┆                 │
│ Manufacturer       ┆ N                 ┆ Y                 ┆ R                 ┆ N               │
│ report             ┆                   ┆               

In [13]:
# Apply the five conversions in order, announcing each step before it is
# added to the lazy query:
#   1. report_source_code          -> Categorical
#   2. reprocessed_and_reused_flag -> _raw, _bin
#   3. manufacturer_link_flag      -> _raw, _bin
#   4. previous_use_code           -> previous_use_category (string)
#   5. single_use_flag             -> _raw, _bin
conversion_steps = (
    ("1. report_source_code -> Categorical",
     convert_report_source_code),
    ("2. reprocessed_and_reused_flag -> _raw (Y/N/null), _bin (1/0/null)",
     convert_reprocessed_flag),
    ("3. manufacturer_link_flag -> _raw (Y/N/null), _bin (1/0/null)",
     convert_manufacturer_link_flag),
    ("4. previous_use_code -> previous_use_category (I=Initial, R=Reuse, 나머지=Unknown)",
     convert_previous_use_code),
    ("5. single_use_flag -> _raw (Y/N/1/0/null), _bin (1/0/null)",
     convert_single_use_flag),
)
for step_message, convert in conversion_steps:
    print(step_message)
    df = convert(df)

1. report_source_code -> Categorical
2. reprocessed_and_reused_flag -> _raw (Y/N/null), _bin (1/0/null)
3. manufacturer_link_flag -> _raw (Y/N/null), _bin (1/0/null)
4. previous_use_code -> previous_use_category (I=Initial, R=Reuse, 나머지=Unknown)
5. single_use_flag -> _raw (Y/N/1/0/null), _bin (1/0/null)


In [14]:
# Show the dtype of every column touched or created by the preprocessing
schema_after = df.collect_schema()
for column_name, suffix in (
    ('report_source_code', ''),
    ('reprocessed_and_reused_flag_raw', ''),
    ('reprocessed_and_reused_flag_bin', ''),
    ('manufacturer_link_flag_raw', ''),
    ('manufacturer_link_flag_bin', ''),
    ('previous_use_code', ''),
    ('previous_use_category', ' (새로 생성)'),  # the only newly derived category column
    ('single_use_flag_raw', ''),
    ('single_use_flag_bin', ''),
):
    print(f"{column_name}: {schema_after[column_name]}{suffix}")

report_source_code: Categorical
reprocessed_and_reused_flag_raw: String
reprocessed_and_reused_flag_bin: Int8
manufacturer_link_flag_raw: String
manufacturer_link_flag_bin: Int8
previous_use_code: String
previous_use_category: String (새로 생성)
single_use_flag_raw: String
single_use_flag_bin: Int8


In [15]:
# Preview the first ten rows of the converted columns after preprocessing
print("\n전처리 후 샘플 데이터 (10개):")
columns_after = [
    'report_source_code',
    'reprocessed_and_reused_flag_raw',
    'reprocessed_and_reused_flag_bin',
    'manufacturer_link_flag_raw',
    'manufacturer_link_flag_bin',
    'previous_use_code',
    'previous_use_category',
    'single_use_flag_raw',
    'single_use_flag_bin'
]
sample_after = df.select(columns_after).head(10).collect()
print(sample_after)


전처리 후 샘플 데이터 (10개):
shape: (10, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ report_so ┆ reprocess ┆ reprocess ┆ manufactu ┆ … ┆ previous_ ┆ previous_ ┆ single_us ┆ single_u │
│ urce_code ┆ ed_and_re ┆ ed_and_re ┆ rer_link_ ┆   ┆ use_code  ┆ use_categ ┆ e_flag_ra ┆ se_flag_ │
│ ---       ┆ used_flag ┆ used_flag ┆ flag_raw  ┆   ┆ ---       ┆ ory       ┆ w         ┆ bin      │
│ cat       ┆ _ra…      ┆ _bi…      ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ ---      │
│           ┆ ---       ┆ ---       ┆ str       ┆   ┆           ┆ str       ┆ str       ┆ i8       │
│           ┆ str       ┆ i8        ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Manufactu ┆ N         ┆ 0         ┆ Y         ┆ … ┆ U         ┆ Unknown   ┆ Y         ┆ 1        │
│ rer       ┆           ┆           ┆           ┆   ┆  

In [16]:
# Check 1: single_use_flag_bin against previous_use_category
print("\n[검사 1] single_use_flag_bin vs previous_use_category")
result1 = check_single_use_consistency(df, verbose=True)

# Check 2: adverse_event_flag against event_type
print("\n[검사 2] adverse_event_flag vs event_type")
result2 = check_adverse_event_consistency(df, verbose=True)

# Check 3: manufacturer_link_flag_bin against report_source_code
print("\n[검사 3] manufacturer_link_flag_bin vs report_source_code")
result3 = analyze_manufacturer_link_vs_report_source(df, verbose=True)


[검사 1] single_use_flag_bin vs previous_use_category

single_use_flag_bin vs previous_use_category 일관성 검사
------------------------------------------------------------
전체 데이터: 14,132,321개

전체 교차표:
shape: (9, 3)
┌─────────────────────┬───────────────────────┬─────────┐
│ single_use_flag_bin ┆ previous_use_category ┆ len     │
│ ---                 ┆ ---                   ┆ ---     │
│ i8                  ┆ str                   ┆ u32     │
╞═════════════════════╪═══════════════════════╪═════════╡
│ null                ┆ Unknown               ┆ 1747877 │
│ null                ┆ Initial               ┆ 71012   │
│ null                ┆ Reuse                 ┆ 16067   │
│ 0                   ┆ Unknown               ┆ 3297165 │
│ 0                   ┆ Reuse                 ┆ 1875236 │
│ 0                   ┆ Initial               ┆ 910083  │
│ 1                   ┆ Initial               ┆ 5658429 │
│ 1                   ┆ Unknown               ┆ 548714  │
│ 1                   ┆ Reuse       