In [1]:
import pandas as pd

cohort_df = pd.read_csv('cohort_final.csv')
tof_drug_df = pd.read_csv('tof_final_cohort_drug.csv')

# Attach each person's tof_group to the drug records with a left join on
# 'person_id', so every drug row is kept even when the cohort has no match.
# NOTE(review): if cohort_final.csv repeats a person_id, this join multiplies
# that person's drug rows - confirm person_id is unique in the cohort file.
group_lookup = cohort_df[['person_id', 'tof_group']]
merged_df = tof_drug_df.merge(group_lookup, on='person_id', how='left')

# Save the merged table
merged_df.to_csv('tof_final_cohort_drug_with_group.csv', index=False)


In [4]:
# Reload the merged drug table written by the earlier cell
merged_df = pd.read_csv('tof_final_cohort_drug_with_group.csv')

# Count distinct persons per group: keep the first row seen for each
# person_id, then tally the tof_group values of those rows
one_row_per_person = merged_df.drop_duplicates(subset=['person_id'])
group_distinct_counts = one_row_per_person['tof_group'].value_counts()

group_distinct_counts

tof_group
3    925
2    182
1    140
Name: count, dtype: int64

In [5]:
# TOF total: number of distinct person_id values across groups 1, 2 and 3 together
# (missing ids are not counted, which is also nunique's default)
tof_total = merged_df['person_id'].nunique(dropna=True)

# Display the total
tof_total

1247

In [7]:
import pandas as pd

# Load the drug table that already carries the tof_group column
final_cohort_df = pd.read_csv('tof_final_cohort_drug_with_group.csv')

# Assign a drug category based on drug_source_value
def map_drug_category(value):
    """Return the drug category label for a ``drug_source_value`` code.

    Categories are tried in the order listed below and the first category
    whose code list contains ``value`` wins. The CVD* codes appear under both
    'beta blocker' and 'Class II'; because 'beta blocker' is tried first,
    those codes always map to 'beta blocker'.

    Returns:
        The category label, or None when ``value`` is in none of the lists.
    """
    # (label, codes) pairs; the order is significant because the first match wins
    ordered_categories = (
        ('diuretics', ['FRS4', 'SRN2', 'HCT']),
        ('RAAS', ['ENL', 'ENL5', 'LST100', 'LST25']),
        ('beta blocker', ['CVD3', 'CVD6', 'CVD1', 'CVD', 'CVDS8', 'CVDS16', 'CVDS64', 'CVDS32']),
        ('antiplatelet', ['ASAT1', 'ASAE', 'ASA500', 'CPD', 'CPDN']),
        ('anticoagulation', ['WFR2', 'WFR5']),
        ('etc', ['SCBVS50', 'SCBVS100', 'SCBVS200', 'TVT15', 'TVT30', 'DPGZ']),
        ('Class I', ['FCND50', 'QND', 'MXLT', 'LDC2I', 'LDC1I', 'LDC2I5']),
        ('Class II', ['NDL', 'ATN', 'ATN2', 'PPN1', 'PPN4', 'PPNS', 'MTP', 'BSPL', 'BSPL1K', 'BSPL2', 'BSPL10',
                      'CVD3', 'CVD6', 'CVD1', 'CVD', 'CVDS8', 'CVDS16', 'CVDS64', 'CVDS32']),
        ('Class III', ['AMO', 'AMOI', 'STL4']),
        ('Class IV', ['VRP18', 'VRP4', 'VRP8', 'VRPI', 'DTA3IR', 'DTA3', 'DTA9', 'DTAI', 'DTA18']),
        ('miscellaneous', ['DGXL', 'DGX25', 'DGXI', 'ADN90I', 'ADNI']),
    )

    for label, codes in ordered_categories:
        if value in codes:
            return label

    # Unrecognised code; returning 'Other' here would be an alternative
    return None

# Create the new column: tag every drug record with its category
# (map_drug_category returns None for codes it does not recognise)
final_cohort_df['drug_category'] = final_cohort_df['drug_source_value'].apply(map_drug_category)

# Save the categorised table to a CSV file
final_cohort_df.to_csv('tof_final_cohort_drug_with_group_and_category.csv', index=False)

# Preview the result. This must be the last expression of the cell: Jupyter only
# renders a cell's final expression, so a bare .head() placed before the save is
# evaluated and silently thrown away.
final_cohort_df.head()



In [9]:
import pandas as pd

# Read the ECG measurements and the cohort table (in that order)
ecg_data, cohort_data = (
    pd.read_csv(csv_name) for csv_name in ('tof_ecg.csv', 'cohort_final.csv')
)


In [14]:
# Parse the date columns into datetimes, in place, so they can be compared with each other
for frame, date_col in ((ecg_data, 'measurement_date'), (cohort_data, 'condition_start_date')):
    frame[date_col] = pd.to_datetime(frame[date_col])

# For one cohort row, pick that person's earliest ECG dated on or after the condition start date; the no-match case returns one None per ECG column so every row yields the same columns
def get_closest_measurement(row):
    """Return the first ECG record on or after a cohort row's condition start date.

    Reads the module-level ``ecg_data`` frame.

    Args:
        row: a cohort record providing 'person_id' and 'condition_start_date'.

    Returns:
        The ``ecg_data`` row (as a Series) with the smallest 'measurement_date'
        among this person's measurements dated on or after
        'condition_start_date'. When no such measurement exists, a Series of
        None values indexed by the ``ecg_data`` column labels.
    """
    # Narrow to this person's ECGs first, then to those not earlier than the start date
    same_person = ecg_data['person_id'] == row['person_id']
    candidates = ecg_data[same_person]
    on_or_after_start = candidates['measurement_date'] >= row['condition_start_date']
    candidates = candidates[on_or_after_start]

    if candidates.empty:
        # Nothing qualifies: hand back a placeholder with one None per ECG column
        return pd.Series([None] * len(ecg_data.columns), index=ecg_data.columns)

    # The earliest remaining date is the one closest to the condition start date
    earliest_label = candidates['measurement_date'].idxmin()
    return candidates.loc[earliest_label]

# Run the lookup once per cohort row; the per-row results are collected into a
# frame that is joined to the cohort as new columns in a later step
closest_measurements = cohort_data.apply(get_closest_measurement, axis='columns')
closest_measurements.tail()


Unnamed: 0,person_id,measurement_date,VentricularRate,PRInterval,QRSDuration,QTInterval,QTCorrected,Paxis,Raxis,Taxis,Qonset,Qoffset,Ponset,Poffset,QTcFrederica
1549,,NaT,,,,,,,,,,,,,
1550,,NaT,,,,,,,,,,,,,
1551,,NaT,,,,,,,,,,,,,
1552,,NaT,,,,,,,,,,,,,
1553,,NaT,,,,,,,,,,,,,


In [13]:
# Join the closest measurements to the cohort dataframe on the row index;
# column names that exist in both frames get an '_ecg' suffix on the ECG side
augmented_cohort_data = cohort_data.join(closest_measurements, rsuffix='_ecg')

# Save the augmented cohort data to a new CSV file
augmented_cohort_data.to_csv('cohort_final_ecg.csv', index=False)

# Show the first few rows and the columns of the updated dataframe to verify the join.
# This must be the last expression of the cell: Jupyter only renders a cell's final
# expression, so placed before the save it is evaluated and silently thrown away.
augmented_cohort_data.head(), augmented_cohort_data.columns


In [2]:
import pandas as pd

# Load the cohort table and the ECG-augmented cohort table
cohort_data = pd.read_csv('cohort_final.csv')
cohort_ecg2_data = pd.read_csv('cohort_final_ecg2.csv')

# Distinct person_id values present in cohort_final_ecg2.csv
ecg2_person_ids = cohort_ecg2_data['person_id'].unique()

# Keep only the cohort_final.csv rows whose person_id appears in cohort_final_ecg2.csv
has_ecg2_record = cohort_data['person_id'].isin(ecg2_person_ids)
filtered_cohort_data = cohort_data.loc[has_ecg2_record]

# Write the filtered cohort to a new CSV file
filtered_cohort_data.to_csv('cohort_final2.csv', index=False)
