In [2]:
import pandas as pd
import cudf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

%cd /data/dhk/physionet.org/files/mimiciv/2.2/hosp

/data/dhk/physionet.org/files/mimiciv/2.2/hosp


# 작업디렉토리 설정

In [None]:
# %cd C:\mimic-iv-2.2\mimic-iv-2.2\hosp

In [None]:
%cd /data/dhk/physionet.org/files/mimiciv/2.2/hosp

# d_icd_diagnoses파일

In [None]:
df_d_icd_diagnoses = pd.read_csv('d_icd_diagnoses.csv')
df_d_icd_diagnoses.head()

# diagnoses_icd파일

In [None]:
df_diagnoses_icd = pd.read_csv("diagnoses_icd.csv")
df_diagnoses_icd.head()

# 고혈압&암환자

In [None]:
# 고혈압 진단 코드 정규식
high_bp_regex = r'^I1[0-5]$|^401'

#암 진단 코드 정규식  
#ICD-9 코드 분류 (140-239)
#ICD-10 코드 분류 (C00-C96, D00-D09)
cancer_regex = r'^(D(0[0-9])|C([0-8][0-9]|9[0-6])|(1[4-9][0-9]|2[0-3][0-9])).*'


# 암 진단 코드 추출
cancer_df = df_diagnoses_icd[df_diagnoses_icd['icd_code'].str.match(cancer_regex)]

# 암 진단을 받은 환자 ID 추출
cancer_patient_ids = cancer_df['subject_id'].unique()

# 암 진단을 받은 환자들의 데이터 추출
cancer_diagnoses_merged = pd.merge(df_diagnoses_icd, df_d_icd_diagnoses, on=['icd_code'])
cancer_patients_data = cancer_diagnoses_merged[cancer_diagnoses_merged['subject_id'].isin(cancer_patient_ids)]



# 암 및 고혈압을 동시에 가진 환자들의 데이터 추출 (첫 번째 진단만 포함)
cancer_high_bp_ids = cancer_patients_data[cancer_patients_data['icd_code'].str.match(high_bp_regex)]['subject_id'].unique()
cancer_high_bp_df = cancer_diagnoses_merged[(cancer_diagnoses_merged['subject_id'].isin(cancer_high_bp_ids)) & 
                                            (cancer_diagnoses_merged['seq_num'] == 1) & 
                                            (cancer_diagnoses_merged['icd_code'].str.match(cancer_regex))]

cancer_high_bp_df

# 첫입원기록만 가져옴

In [None]:
# subject_id로 그룹화하고 각 그룹에서 hadm_id가 가장 작은 행 선택
result = cancer_high_bp_df.loc[cancer_high_bp_df.groupby('subject_id')['hadm_id'].idxmin()]

# 필요시 인덱스 리셋
result.reset_index(drop=True, inplace=True)
result

# patients파일

In [None]:
df_patients = pd.read_csv("patients.csv")
df_patients.head(3)

# result와 patients 병합

In [None]:
merge2 = pd.merge(result,df_patients, on=['subject_id'])
merge2

# 환자나이 계산하기 위해 merge

In [None]:
df_admission = pd.read_csv("admissions.csv")
merge2_admission = pd.merge(merge2,df_admission, on=['subject_id','hadm_id'])
merge2_admission

# 입원시점에서 환자나이 계산

In [None]:
# 출생 연도 계산
print(merge2_admission[['anchor_year','anchor_age']].head(5))
print("~~")

##adgmit_age가 이상한 경우
#  환자의 나이가 89세 이상인 경우, 데이터 보호를 위해 모든 연령이 89세로 고정되어 있다는 것입니다.
merge2_admission = merge2_admission[merge2_admission['anchor_age']<89]

# admittime 열을 datetime 형식으로 변환
merge2_admission['birth_year'] = merge2_admission['anchor_year'] - merge2_admission['anchor_age']

# 입원 연도 추출
admit_year = pd.to_datetime(merge2_admission['admittime']).dt.year

# 입원 시점의 나이 계산
merge2_admission['admit_age'] = admit_year - merge2_admission['birth_year']

#인덱스 초기화
merge2_admission.reset_index(drop=True, inplace=True)

#결과확인
print(merge2_admission[['subject_id', 'admittime', 'admit_age','birth_year','anchor_year','anchor_age']].head())

# 예측했을때 써야할 피쳐들만 가지고왔는데 의논이 필요한듯

In [None]:
merge2_admission = merge2_admission[['subject_id','hadm_id','admit_age','seq_num','gender','dod','deathtime','insurance','language','marital_status','race','hospital_expire_flag','birth_year',]]
merge2_admission

# death비율과 death하지 않은비율

In [None]:
merge2_admission['hospital_expire_flag'].value_counts()

# death 종양환자 평균나이 비교

In [None]:
print(f"종양환자의 평균나이 :{ merge2_admission['admit_age'].mean() }")

C_death = merge2_admission[merge2_admission['hospital_expire_flag']==1]
C_death_mean_age = C_death['admit_age'].mean()
print(f"death한 종양환자의 평균나이 :{C_death_mean_age}")

C_not_death = merge2_admission[merge2_admission['hospital_expire_flag']== 0]
C_not_death_mean_age = C_not_death['admit_age'].mean()
print(f"death하지않은 종양환자의 평균나이 :{C_not_death_mean_age}")

# omr(환자 몸무게 혈압 등등) merge

In [None]:
omr = pd.read_csv("omr.csv")
# omr
merge3_omr = pd.merge(merge2_admission,omr, on=['subject_id'])
merge3_omr

# pharmacy(환자 약물 처방 및 투약 내역) merge

In [None]:
pharmacy = pd.read_csv("pharmacy.csv")
pharmacy = pharmacy[['subject_id', 'hadm_id','poe_id','medication','frequency','doses_per_24_hrs']]
# pharmacy
merge4_pharmacy = pd.merge(merge3_omr,pharmacy, on=['subject_id','hadm_id'])
merge4_pharmacy

# prescriptions(약물 처방 기록) merge

In [None]:
prescriptions = pd.read_csv("prescriptions.csv")
prescriptions = prescriptions[['subject_id', 'hadm_id','poe_id','drug','dose_val_rx','form_unit_disp','doses_per_24_hrs']]
merge5_prescriptions = pd.merge(merge4_pharmacy,prescriptions, on=['subject_id','hadm_id','poe_id'])
merge5_prescriptions

# 전처리한 csv파일 저장

In [None]:
# merge5_prescriptions.to_csv('machine_dohun.csv', index=False)

# csv파일 불러오기

In [3]:
dh = pd.read_csv("machine_dohun.csv")
# dh = dh.to_pandas()

  dh = pd.read_csv("machine_dohun.csv")


# 머신러닝에 내가 쓸 칼럼만 사용

In [4]:
dh = dh[['subject_id', 'hadm_id', 'admit_age', 'gender','insurance', 'language', 'marital_status', 'race',
       'hospital_expire_flag', 'medication', 'frequency','drug']]

# marital_status칼럼 NaN값 핸들링

In [5]:
dh['marital_status'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dh['marital_status'].fillna('Unknown', inplace=True)


# medication 칼럼 NaN값 핸들링

In [6]:
dh['medication'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dh['medication'].fillna('Unknown', inplace=True)


# frequency 칼럼 NaN값 핸들링

In [7]:
dh['frequency'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dh['frequency'].fillna('Unknown', inplace=True)


In [8]:
# 라벨인코딩 후 스케일링

# 범주형데이터 라벨인코딩(범주형 데이터는 스케일 안해줄거임)

In [9]:
# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# marital_status 열 라벨 인코딩
dh['gender'] = label_encoder.fit_transform(dh['gender'])
dh['insurance'] = label_encoder.fit_transform(dh['insurance'])
dh['language'] = label_encoder.fit_transform(dh['language'])
dh['marital_status'] = label_encoder.fit_transform(dh['marital_status'])
dh['race'] = label_encoder.fit_transform(dh['race'])
dh['medication'] = label_encoder.fit_transform(dh['medication'])
dh['frequency'] = label_encoder.fit_transform(dh['frequency'])
dh['drug'] = label_encoder.fit_transform(dh['drug'])

# 스케일러 해줄 칼럼 : admit_age, 

In [10]:
# RobustScaler 인스턴스 생성
scaler = RobustScaler()

# 'feature1' 칼럼 스케일링
dh['admit_age'] = scaler.fit_transform(dh[['admit_age']])

In [11]:
dh['hospital_expire_flag'].value_counts()

hospital_expire_flag
0    84023796
1     5175148
Name: count, dtype: int64

# under sampling

In [12]:
from sklearn.utils import resample
import cudf

In [13]:
df = cudf.DataFrame(dh)

df_majority = df[df['hospital_expire_flag'] == 0]
df_minority = df[df['hospital_expire_flag'] == 1]

# 다수 클래스 언더샘플링
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # 샘플을 복원하지 않음
                                   n_samples=len(df_minority),  # 소수 클래스와 동일한 수로 샘플링
                                   random_state=123)  # 재현성을 위한 랜덤 시드

# 소수 클래스와 결합
df_dh = cudf.concat([df_majority_downsampled, df_minority])

# 라벨 분포 확인
print(df_dh['hospital_expire_flag'].value_counts())

hospital_expire_flag
0    5175148
1    5175148
Name: count, dtype: int64


# 모델링

In [None]:
import cupy
from cuml.ensemble import RandomForestClassifier

from cuml.model_selection import train_test_split
from cuml.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

## RandomForestClassifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_dh, 'hospital_expire_flag', train_size = 0.8)

X_train = X_train.to_cupy()
X_test = X_test.to_cupy()
y_train = y_train.to_cupy()
y_test = y_test.to_cupy()

# 모델 및 그리드 서치 설정
cuml_randomForest = RandomForestClassifier(random_state=42, n_streams=1)

param_grid = {
    'n_estimators': [100, 200, 300],  # 정수형 값
    'max_depth': [10, 20, 30],        # 정수형 값
    'min_samples_split': [2, 5, 10]   # 정수형 값
}
randomForest_grid_search = GridSearchCV(
    cuml_randomForest,
    param_grid=param_grid,
    scoring ='accuracy',
    cv=3,
    return_train_score=True,
    verbose=2
)


#-------------------------------------------------------------
### 랜덤포레스트
# 모델 학습
print("random_forest 시작!")
randomForest_grid_search.fit(X_train.get(), y_train.get())

# 최적 모델로 예측
randomForest_best_model = randomForest_grid_search.best_estimator_
randomForest_y_pred = randomForest_best_model.predict(X_test)
print("random_forest 끝!")

random_forest 시작!
Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [None]:
# 정확도 계산
accuracy = accuracy_score(y_test, randomForest_y_pred)
print("Accuracy:", accuracy)