In [1]:
import pandas as pd

cohort_df = pd.read_csv('cohort_final.csv')
tof_drug_df = pd.read_csv('tof_final_cohort_drug.csv')

# 'person_id'를 기준으로 데이터 병합 (left join)
merged_df = pd.merge(tof_drug_df, cohort_df[['person_id', 'tof_group']], on='person_id', how='left')

# 결과 저장
merged_df.to_csv('tof_final_cohort_drug_with_group.csv', index=False)


In [4]:
# 데이터 로드
merged_df = pd.read_csv('tof_final_cohort_drug_with_group.csv')

# 각 그룹별 고유 person_id 수 계산
group_distinct_counts = merged_df.drop_duplicates(subset=['person_id'])['tof_group'].value_counts()

group_distinct_counts

tof_group
3    925
2    182
1    140
Name: count, dtype: int64

In [5]:
# TOF Total 계산 (group 1, 2, 3 모두 포함)
tof_total = merged_df['person_id'].nunique()

# 결과 출력
tof_total

1247

In [7]:
import pandas as pd

# 데이터 로드
final_cohort_df = pd.read_csv('tof_final_cohort_drug_with_group.csv')

# drug_source_value에 따라 새로운 컬럼 값 할당
def map_drug_category(value):
    diuretics = ['FRS4', 'SRN2', 'HCT']
    raas = ['ENL', 'ENL5', 'LST100', 'LST25']
    beta_blocker = ['CVD3', 'CVD6', 'CVD1', 'CVD', 'CVDS8', 'CVDS16', 'CVDS64', 'CVDS32']
    antiplatelet = ['ASAT1', 'ASAE', 'ASA500', 'CPD', 'CPDN']
    anticoagulation = ['WFR2', 'WFR5']
    etc = ['SCBVS50', 'SCBVS100', 'SCBVS200', 'TVT15', 'TVT30', 'DPGZ']
    class_i = ['FCND50', 'QND', 'MXLT', 'LDC2I', 'LDC1I', 'LDC2I5']
    class_ii = ['NDL', 'ATN', 'ATN2', 'PPN1', 'PPN4', 'PPNS', 'MTP', 'BSPL', 'BSPL1K', 'BSPL2', 'BSPL10', 'CVD3', 'CVD6', 'CVD1', 'CVD', 'CVDS8', 'CVDS16', 'CVDS64', 'CVDS32']
    class_iii = ['AMO', 'AMOI', 'STL4']
    class_iv = ['VRP18', 'VRP4', 'VRP8', 'VRPI', 'DTA3IR', 'DTA3', 'DTA9', 'DTAI', 'DTA18']
    miscellaneous = ['DGXL', 'DGX25', 'DGXI', 'ADN90I', 'ADNI']

    if value in diuretics:
        return 'diuretics'
    elif value in raas:
        return 'RAAS'
    elif value in beta_blocker:
        return 'beta blocker'
    elif value in antiplatelet:
        return 'antiplatelet'
    elif value in anticoagulation:
        return 'anticoagulation'
    elif value in etc:
        return 'etc'
    elif value in class_i:
        return 'Class I'
    elif value in class_ii:
        return 'Class II'
    elif value in class_iii:
        return 'Class III'
    elif value in class_iv:
        return 'Class IV'
    elif value in miscellaneous:
        return 'miscellaneous'
    else:
        return None  # 또는 'Other'를 반환할 수도 있습니다.

# 새 컬럼 생성
final_cohort_df['drug_category'] = final_cohort_df['drug_source_value'].apply(map_drug_category)

# 결과 데이터 확인
final_cohort_df.head()

# 결과를 CSV 파일로 저장
final_cohort_df.to_csv('tof_final_cohort_drug_with_group_and_category.csv', index=False)



In [1]:
import pandas as pd

# Load the datasets
ecg_data = pd.read_csv('tof_ecg.csv')
cohort_data = pd.read_csv('cohort_final2.csv')


In [2]:
# Convert date columns to datetime type for proper comparison and manipulation
ecg_data['measurement_date'] = pd.to_datetime(ecg_data['measurement_date'])
cohort_data['condition_start_date'] = pd.to_datetime(cohort_data['condition_start_date'])

# Adjust the return statement for missing data case to include the correct number of None for all columns
def get_closest_measurement(row):
    # Filter ECG records for the same person
    person_measurements = ecg_data[ecg_data['person_id'] == row['person_id']]
    # Filter out measurements before the condition start date
    valid_measurements = person_measurements[person_measurements['measurement_date'] >= row['condition_start_date']]
    # Find the measurement closest to the condition start date
    if not valid_measurements.empty:
        closest_measurement = valid_measurements.loc[valid_measurements['measurement_date'].idxmin()]
        return closest_measurement
    else:
        return pd.Series([None] * len(ecg_data.columns), index=ecg_data.columns)

# Apply the function to each row in cohort data and join the results as new columns
closest_measurements = cohort_data.apply(get_closest_measurement, axis=1)
closest_measurements.tail()


Unnamed: 0,person_id,measurement_date,VentricularRate,PRInterval,QRSDuration,QTInterval,QTCorrected,Paxis,Raxis,Taxis,Qonset,Qoffset,Ponset,Poffset,QTcFrederica
1442,3789407,2021-11-22,162.0,94.0,66.0,268.0,439.0,27.0,105.0,76.0,232.0,265.0,185.0,222.0,373.0
1443,3789561,2022-03-29,143.0,122.0,58.0,258.0,398.0,31.0,150.0,107.0,223.0,252.0,162.0,207.0,344.0
1444,3790205,2021-10-11,137.0,88.0,44.0,262.0,395.0,38.0,85.0,59.0,234.0,256.0,190.0,217.0,344.0
1445,3792364,2021-11-05,116.0,90.0,60.0,272.0,378.0,39.0,139.0,51.0,221.0,251.0,176.0,204.0,339.0
1446,3793126,2021-12-31,176.0,88.0,56.0,252.0,431.0,48.0,98.0,84.0,221.0,249.0,177.0,203.0,360.0


In [3]:
# Join the closest measurements to the cohort dataframe
augmented_cohort_data = cohort_data.join(closest_measurements, rsuffix='_ecg')

# Show the columns and first few rows of the updated dataframe to verify the join
augmented_cohort_data.head(), augmented_cohort_data.columns

# Save the augmented cohort data to a new CSV file
augmented_cohort_data.to_csv('cohort_final_ecg.csv', index=False)


In [2]:
import pandas as pd

# 데이터 로드
cohort_data = pd.read_csv('cohort_final.csv')
cohort_ecg2_data = pd.read_csv('cohort_final_ecg2.csv')

# cohort_final_ecg2.csv에 있는 person_id 추출
ecg2_person_ids = cohort_ecg2_data['person_id'].unique()

# cohort_final.csv에서 cohort_final_ecg2.csv에 있는 person_id만 남기기
filtered_cohort_data = cohort_data[cohort_data['person_id'].isin(ecg2_person_ids)]

# 결과를 새 CSV 파일로 저장
filtered_cohort_data.to_csv('cohort_final2.csv', index=False)


In [None]:
import pandas as pd

# 데이터 불러오기
cohort_data = pd.read_csv('cohort_final2.csv')
cohort_ecg_data = pd.read_csv('tof_ecg.csv')

# 날짜 형식 변환
cohort_data['condition_start_date'] = pd.to_datetime(cohort_data['condition_start_date'])
cohort_ecg_data['measurement_date'] = pd.to_datetime(cohort_ecg_data['measurement_date'])

# cohort_data에 있는 person_id만을 기반으로 cohort_ecg_data 필터링
filtered_cohort_ecg_data = cohort_ecg_data[cohort_ecg_data['person_id'].isin(cohort_data['person_id'])]

# person_id를 기준으로 조건부 날짜 필터링
# 각 person_id에 대해 condition_start_date 이후의 measurement_date만 남김
filtered_cohort_ecg_data = filtered_cohort_ecg_data.merge(cohort_data[['person_id', 'condition_start_date']], on='person_id', how='left')
filtered_cohort_ecg_data = filtered_cohort_ecg_data[filtered_cohort_ecg_data['measurement_date'] > filtered_cohort_ecg_data['condition_start_date']]

# 결과 확인
print(filtered_cohort_ecg_data.head())

# 필요한 경우 결과를 csv 파일로 저장
filtered_cohort_ecg_data.to_csv('filtered_cohort_final_ecg.csv', index=False)


In [38]:
import pandas as pd

# 데이터 불러오기 및 날짜 형식 변환
data = pd.read_csv('filtered_cohort_final_ecg.csv')
data['measurement_date'] = pd.to_datetime(data['measurement_date'])

# 환자별로 데이터 정렬
data = data.sort_values(by=['person_id', 'measurement_date'])

# 각 환자별로 검사 간의 간격 계산
data['Interval_To_Next'] = data.groupby('person_id')['measurement_date'].transform(lambda x: x.diff().shift(-1))

# 결과 확인
data

Unnamed: 0,person_id,measurement_date,VentricularRate,PRInterval,QRSDuration,QTInterval,QTCorrected,Paxis,Raxis,Taxis,Qonset,Qoffset,Ponset,Poffset,QTcFrederica,condition_start_date,Interval_To_Next
0,342,2006-04-13,79.0,178.0,104.0,424.0,486.0,47.0,220.0,-22.0,209.0,261.0,120.0,175.0,,3/10/2006,25 days
1,342,2006-05-08,90.0,166.0,104.0,396.0,484.0,52.0,236.0,-1.0,209.0,261.0,126.0,181.0,,3/10/2006,3 days
2,342,2006-05-11,127.0,202.0,112.0,282.0,409.0,89.0,184.0,-44.0,227.0,283.0,126.0,171.0,,3/10/2006,40 days
3,342,2006-06-20,77.0,,138.0,412.0,466.0,,203.0,-14.0,205.0,274.0,,,,3/10/2006,8 days
4,342,2006-06-28,127.0,,116.0,322.0,467.0,,191.0,-37.0,206.0,264.0,,,,3/10/2006,1 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23819,3793126,2022-01-20,148.0,80.0,84.0,346.0,543.0,11.0,-76.0,-16.0,218.0,260.0,,,467.0,11/25/2021,1 days
23820,3793126,2022-01-21,131.0,70.0,104.0,400.0,590.0,-6.0,210.0,-12.0,216.0,268.0,181.0,203.0,519.0,11/25/2021,7 days
23821,3793126,2022-01-28,149.0,98.0,84.0,292.0,459.0,50.0,126.0,51.0,218.0,260.0,169.0,198.0,395.0,11/25/2021,18 days
23822,3793126,2022-02-15,162.0,96.0,74.0,270.0,443.0,42.0,98.0,37.0,220.0,257.0,172.0,202.0,376.0,11/25/2021,29 days


In [39]:
# Interval_To_Next 컬럼을 기준으로 내림차순 정렬
data_sorted = data.sort_values(by='Interval_To_Next', ascending=False)

# 상위 10개 데이터 추출
top_10_intervals = data_sorted.head(10)

# 결과 출력
top_10_intervals

Unnamed: 0,person_id,measurement_date,VentricularRate,PRInterval,QRSDuration,QTInterval,QTCorrected,Paxis,Raxis,Taxis,Qonset,Qoffset,Ponset,Poffset,QTcFrederica,condition_start_date,Interval_To_Next
3888,2164267,2004-11-23,88.0,208.0,150.0,402.0,486.0,36.0,-38.0,61.0,226.0,301.0,122.0,181.0,,11/22/2004,6288 days
946,41523,2006-03-13,53.0,160.0,152.0,484.0,454.0,21.0,-79.0,27.0,187.0,263.0,107.0,155.0,,12/20/2004,5436 days
12747,2242029,2007-10-24,99.0,132.0,66.0,354.0,454.0,66.0,94.0,78.0,231.0,264.0,165.0,205.0,,3/30/2005,4863 days
9884,2202743,2006-03-27,62.0,136.0,96.0,390.0,395.0,3.0,88.0,51.0,229.0,277.0,161.0,197.0,,12/8/2004,4658 days
16875,2364626,2009-05-13,159.0,106.0,62.0,274.0,445.0,25.0,94.0,59.0,230.0,261.0,177.0,214.0,,9/8/2008,4027 days
13666,2256084,2005-06-08,150.0,106.0,62.0,274.0,432.0,35.0,127.0,46.0,219.0,250.0,166.0,207.0,,2/4/2005,3725 days
11858,2230146,2011-10-26,56.0,196.0,130.0,460.0,443.0,87.0,-81.0,82.0,219.0,284.0,121.0,176.0,,11/22/2004,3581 days
16259,2345344,2010-04-15,124.0,114.0,72.0,320.0,459.0,17.0,16.0,58.0,215.0,251.0,158.0,194.0,,6/18/2008,3528 days
6999,2182098,2012-07-30,139.0,98.0,140.0,360.0,547.0,26.0,114.0,38.0,223.0,293.0,174.0,196.0,,4/1/2005,3522 days
14060,2270459,2012-03-15,78.0,130.0,88.0,358.0,408.0,33.0,98.0,117.0,224.0,268.0,159.0,215.0,,4/14/2010,3451 days


In [44]:
# 각 환자별로 검사 간의 간격 계산
data_sorted['Interval_To_Next'] = data_sorted.groupby('person_id')['measurement_date'].diff().shift(-1).dt.days

# 평균, 중앙값, 표준편차, 최소값, 최대값 계산
average_interval = data_sorted['Interval_To_Next'].mean()
median_interval = data_sorted['Interval_To_Next'].median()
std_dev_interval = data_sorted['Interval_To_Next'].std()
min_interval = data_sorted['Interval_To_Next'].min()
max_interval = data_sorted['Interval_To_Next'].max()

average_interval, median_interval, std_dev_interval, min_interval, max_interval


(136.09168862489392, -1.0, 1774.5911572724438, -6181.0, 6323.0)

In [20]:
import pandas as pd

# 데이터 불러오기 및 날짜 형식 변환
data = pd.read_csv('tof_ecg.csv')
data['measurement_date'] = pd.to_datetime(data['measurement_date'])

# 환자별 검사 횟수 계산
test_counts = data.groupby('person_id').size()

# 전체 평균 검사 횟수
average_test_count = test_counts.mean()

# 최대 검사 횟수
max_test_count = test_counts.max()
min_test_count = test_counts.min()

# 최대 검사 횟수를 가진 person_id 찾기
max_test_person_ids = test_counts[test_counts == max_test_count].index.tolist()

# 결과 출력
print("Average test count:", average_test_count)
print("Maximum test count:", max_test_count)
print("Person IDs with the maximum test count:", max_test_person_ids)
print("Minumum test count:", min_test_count)

Average test count: 15.846985210466439
Maximum test count: 100
Person IDs with the maximum test count: [2158270, 2230912]
Minumum test count: 1


In [22]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Load the CSV file
file_path = 'C:\\Users\\SNUH\\Desktop\\tof_final_cohort_baseline.csv'
data = pd.read_csv(file_path)

# Convert the birthdate column to datetime
data['birth'] = pd.to_datetime(data['birth'], errors='coerce')

# Calculate current age
today = datetime.today()
data['age'] = data['birth'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

# Drop rows with invalid birthdates (if any)
data = data.dropna(subset=['age'])

# Separate the data into age groups under 18 and 18 or older
data_under_18 = data[data['age'] < 18]
data_over_18 = data[data['age'] >= 18]

# Count the number of patients in each group by gender
count_under_18 = data_under_18['gender'].value_counts().reindex(['F', 'M'], fill_value=0).to_dict()
count_over_18 = data_over_18['gender'].value_counts().reindex(['F', 'M'], fill_value=0).to_dict()

# Add total counts
count_under_18['Total'] = sum(count_under_18.values())
count_over_18['Total'] = sum(count_over_18.values())

# Create a summary table
age_summary = pd.DataFrame({
    'Age Group': ['Under 18', '18 and older'],
    'Female': [count_under_18['F'], count_over_18['F']],
    'Male': [count_under_18['M'], count_over_18['M']],
    'Total': [count_under_18['Total'], count_over_18['Total']]
})

# Display the summary table
age_summary


Unnamed: 0,Age Group,Female,Male,Total
0,Under 18,260,329,589
1,18 and older,462,679,1141


In [23]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Load the CSV file
file_path = 'C:\\Users\\SNUH\\Desktop\\tof_final_cohort_baseline.csv'
data = pd.read_csv(file_path)

# Convert the birthdate column to datetime
data['birth'] = pd.to_datetime(data['birth'], errors='coerce')

# Calculate current age
today = datetime.today()
data['current_age'] = data['birth'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

# Drop rows with invalid birthdates (if any)
data = data.dropna(subset=['current_age'])

# Add a column for patients 18 or older
data['18_or_older'] = data['current_age'].apply(lambda x: 'Y' if x >= 18 else '')

# Save the updated DataFrame to a new CSV file
output_path = 'C:\\Users\\SNUH\\Desktop\\updated_tof_cohort.csv'
data.to_csv(output_path, index=False)

# Display the first few rows of the updated data
print(data.head())

     pt_no      birth gender  current_age 18_or_older
0  7733457 1967-08-15      F           56           Y
1  7866340 1964-10-15      F           59           Y
2  7895177 1960-03-15      F           64           Y
3  8296331 1969-12-15      F           54           Y
4  8471613 1964-02-15      F           60           Y


In [13]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Load the CSV file
file_path = 'C:\\Users\\SNUH\\Desktop\\tof_final_cohort_baseline.csv'
data = pd.read_csv(file_path)

# Convert the birthdate column to datetime
data['birth'] = pd.to_datetime(data['birth'], errors='coerce')

# Calculate current age
today = datetime.today()
data['age'] = data['birth'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

# Drop rows with invalid birthdates (if any)
data = data.dropna(subset=['age'])

# Define age bins and labels
bins = [0, 20, 40, 60, float('inf')]
labels = ['0-19', '20-39', '40-59', '60+']

# Create age group column
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)

# Count the number of patients in each group by gender and overall
summary_table = data.pivot_table(index='age_group', columns='gender', aggfunc='size', fill_value=0)

# Add a total column
summary_table['Total'] = summary_table.sum(axis=1)

# Rename columns for clarity
summary_table.columns = ['Female', 'Male', 'Total']

# Reset index to make the age_group a column
summary_table = summary_table.reset_index()

# Display the summary table
summary_table


Unnamed: 0,age_group,Female,Male,Total
0,0-19,292,375,667
1,20-39,325,524,849
2,40-59,92,95,187
3,60+,13,14,27
