In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import numpy as np
import pickle


## 심부전 고위험군 및 저위험군 데이터프레임 생성

In [2]:
# Start from the labeled participant table and keep only the ID column of the
# rows whose `risk` is not -1 (presumably -1 marks participants without a
# high/low risk label — confirm against the labeling step).
edf = pd.read_csv("data/Participant_table_labeled.csv")
edf = edf[edf["risk"] != -1]
edf = edf[["Participant ID"]]

# glob() yields matches in arbitrary, filesystem-dependent order. Sorting makes
# the merge order deterministic, so the resulting column order, any `_x`/`_y`
# suffixes on overlapping column names, and which of several identical columns
# is kept by a later de-duplication step are reproducible across machines.
EHRs = sorted(glob("data/EHR_original_source/EHR_*.csv"))

# Left-join every EHR table onto the participant list so each selected
# participant is kept even when a table has no record for them.
for ehr in EHRs:
    df = pd.read_csv(ehr)
    edf = pd.merge(edf, df, on='Participant ID', how='left')
print(edf.shape)

(7826, 73)


## EHR 데이터 전처리

In [3]:
# Remove duplicated columns: a column is dropped when its values repeat those
# of an earlier column (detected as duplicated rows of the transposed frame).
is_repeated_column = edf.T.duplicated()
edf = edf.drop(columns=edf.columns[is_repeated_column])
print(edf.shape)

# Label-encode categorical variables: every object-dtype column is replaced by
# an integer-coded `<name>_encoded` column appended at the end of the frame.
for name in list(edf.columns):
    if edf[name].dtype != 'object':
        continue
    # factorize() returns (codes, uniques); only the integer codes are kept.
    codes, _uniques = pd.factorize(edf[name])
    edf[name + '_encoded'] = codes
    edf = edf.drop(columns=name)

(7826, 61)


## 데이터프레임 저장

In [4]:
# Write the merged, label-encoded EHR table to CSV without the row index.
ehr_output_path = "data/EHR.csv"
edf.to_csv(ehr_output_path, index=False)

## SNPs 데이터 시퀀스 형태로 저장

In [5]:
def load_exome_alt(pid, positionFile):
    """Look up one participant's ALT value at each requested SNP position.

    Reads ``data/filtered_exome_seq/{pid}.csv``, which must contain the columns
    ``snps_position`` and ``ALT``.

    Parameters
    ----------
    pid
        Participant identifier; it is interpolated into the CSV file name.
    positionFile : iterable
        Positions to look up, in the order the output should follow.

    Returns
    -------
    list
        One entry per element of ``positionFile``: the ``ALT`` value recorded
        for that position, or ``0`` when the position is absent from the file.
    """
    variants = pd.read_csv(f"data/filtered_exome_seq/{pid}.csv")
    alt_series = variants.set_index('snps_position')['ALT']
    alt_by_position = alt_series.to_dict()

    # NOTE(review): the lookup is an exact dict match, so the requested
    # positions must have the same type as the parsed `snps_position` column
    # (e.g. strings read from a text file will not match integer positions) —
    # confirm against the real data.
    vector_values = []
    for wanted in positionFile:
        vector_values.append(alt_by_position.get(wanted, 0))
    return vector_values

def transform_exome_vector(vector, n_positions=256):
    """Encode a sequence of ALT values as an ``(n_positions, 5)`` integer matrix.

    Row ``i`` of the result describes ``vector[i]``:

    * column 0 is 1 when the entry is anything other than ``0``;
    * columns 1-4 one-hot encode the entry when it is exactly ``'A'``, ``'C'``,
      ``'G'`` or ``'T'`` (in that column order).

    Entries equal to ``0`` leave their row all zeros. A non-zero entry that is
    not one of those four single letters sets only column 0.

    Parameters
    ----------
    vector : iterable
        ALT values, one per position; ``0`` means no value at that position.
    n_positions : int, default 256
        Number of rows in the output. Rows past ``len(vector)`` stay zero.
        The default keeps the previously hard-coded size.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_positions, 5)``.

    Raises
    ------
    IndexError
        If a non-zero entry sits at an index ``>= n_positions``.
    """
    # Column index of each recognised nucleotide; column 0 is the presence flag.
    nucleotide_column = {'A': 1, 'C': 2, 'G': 3, 'T': 4}

    transformed = np.zeros((n_positions, 5), dtype=int)
    for i, v in enumerate(vector):
        if v != 0:
            transformed[i, 0] = 1
            # A single lookup replaces comparing every entry against all four
            # letters; zero entries can never match a letter, so they are
            # skipped entirely.
            column = nucleotide_column.get(v)
            if column is not None:
                transformed[i, column] = 1
    return transformed

# Selected SNP positions, one per line of the text file.
with open("positions/selected_positions/HFS.txt", "r") as position_handle:
    position = position_handle.read().splitlines()

# Participants to process: every row of the labeled table whose risk is not -1.
pdf = pd.read_csv("data/Participant_table_labeled.csv")
pdf = pdf.loc[pdf["risk"] != -1]
pids = pdf["Participant ID"].tolist()

# Per participant: first the raw ALT value at each selected position (with a
# progress bar over participants), then the 5-column encoded matrix.
exome_vectors = dict(
    (participant, load_exome_alt(participant, position))
    for participant in tqdm(pids)
)
transformed_vectors = {
    participant: transform_exome_vector(alt_values)
    for participant, alt_values in exome_vectors.items()
}


100%|███████████████████████████████████████████████████████████████████████████████| 7826/7826 [00:51<00:00, 150.54it/s]


In [6]:
# Persist the participant -> encoded SNP matrix mapping as a single pickle file.
pickle_file_path = 'data/SNPs.pkl'
with open(pickle_file_path, 'wb') as pickle_out:
    pickle.dump(transformed_vectors, pickle_out)