In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay, roc_curve,auc, precision_recall_curve, average_precision_score, f1_score, auc 
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from imblearn.metrics import geometric_mean_score
from copy import deepcopy
import json
import seaborn as sns
import warnings
import pickle
import shap
# Notebook-wide setup: silence every Python warning, then apply seaborn's default theme.
# NOTE(review): the blanket "ignore" filter also hides pandas warnings raised by later cells.
warnings.filterwarnings(action="ignore")
sns.set_theme()

from tqdm import tqdm

In [2]:
import os

# Folder that holds the analysis dataset. The notebook reads its input relative to the
# working directory, so the location is made configurable: set the OV_DATA_DIR environment
# variable to point elsewhere (an earlier run used '/data/camin/chlee/jupyter/ML project/Data/').
# Without the variable the original local path is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'OV_DATA_DIR',
    'C:/Users/chaehyun/Dropbox/Work/PIPET/과제/산부인과/난소암 연구/Analysis dataset',
)
os.chdir(DATA_DIR)

In [5]:
# Read the raw analysis dataset; the file name is resolved against the current working directory.
raw_excel_name = 'OVSTORY_ADS.xlsx'
ov_cancer = pd.read_excel(raw_excel_name)

In [6]:
# Work on an independent deep copy so the frame loaded from Excel stays untouched.
ov_cancer2 = ov_cancer.copy(deep=True)

In [7]:
# Keep only the columns used downstream: the predictors, the outcome source (GROUP_OV)
# and the four GROUPE* flag columns from which FAMILY_HISTORY is derived later.
analysis_columns = [
    'AGE', 'MP_OXC', 'FHXNG', 'BRCACC', 'BMIG', 'DMC', 'PHX_CAC', 'DELI_OXC_2GROUP',
    'GROUP_OV',
    'GROUPEHY', 'GROUPEHN', 'GROUPENONHY', 'GROUPENONHN',
]
ov_cancer2 = ov_cancer2[analysis_columns]

In [8]:
# Missing BRCACC values are coded as 0.
ov_cancer2['BRCACC'] = ov_cancer2['BRCACC'].fillna(0)

In [9]:
# Store BRCACC as a categorical whose levels are nullable integers
# (cast to Int64 first so the category labels are integers rather than floats).
ov_cancer2['BRCACC'] = ov_cancer2['BRCACC'].astype("Int64").astype("category")

In [None]:
# Frequency table (missing values included) for each GROUPE* flag column,
# followed by the joint frequency table of all four columns.
group_flag_columns = ['GROUPEHY', 'GROUPEHN', 'GROUPENONHY', 'GROUPENONHN']

for flag_column in group_flag_columns:
    print(ov_cancer2[flag_column].value_counts(dropna=False))

print(ov_cancer2[group_flag_columns].value_counts(dropna=False))

In [None]:
# Distribution of the outcome source column, counting missing values as their own row.
group_ov_counts = ov_cancer2['GROUP_OV'].value_counts(dropna=False)
print(group_ov_counts)

In [12]:
# Derive FAMILY_HISTORY from the GROUPE* flags:
#   1 when GROUPEHN or GROUPEHY holds a value,
#   0 when GROUPENONHN or GROUPENONHY holds a value.
# The 0 assignment runs last, so it wins for a row matching both conditions;
# rows matching neither keep a missing value.
in_history_group = ov_cancer2['GROUPEHN'].notna() | ov_cancer2['GROUPEHY'].notna()
in_no_history_group = ov_cancer2['GROUPENONHN'].notna() | ov_cancer2['GROUPENONHY'].notna()

ov_cancer2.loc[in_history_group, 'FAMILY_HISTORY'] = 1
ov_cancer2.loc[in_no_history_group, 'FAMILY_HISTORY'] = 0

In [13]:
# Binary outcome derived from GROUP_OV: 1 = OV ca (GROUP_OV == 1), 0 = Non OV ca (GROUP_OV == 2).
# Rows with any other GROUP_OV value are left without a label.
is_ov_cancer = ov_cancer2['GROUP_OV'].eq(1)
is_non_ov_cancer = ov_cancer2['GROUP_OV'].eq(2)

ov_cancer2.loc[is_ov_cancer, 'label'] = 1
ov_cancer2.loc[is_non_ov_cancer, 'label'] = 0

# The source column is no longer needed once the label exists.
ov_cancer2.drop(columns=['GROUP_OV'], inplace=True)

In [14]:
def _merge_low_bmi_groups(bmi_group):
    """Map BMIG codes of 2 or below to 1; return every other value (including NaN) unchanged."""
    if bmi_group <= 2:
        return 1
    return bmi_group


ov_cancer2['BMIG'] = ov_cancer2['BMIG'].apply(_merge_low_bmi_groups)

In [15]:
def _cap_fhxng_at_two(fhx_group):
    """Map FHXNG codes of 2 or above to 2; return every other value (including NaN) unchanged."""
    if fhx_group >= 2:
        return 2
    return fhx_group


ov_cancer2['FHXNG'] = ov_cancer2['FHXNG'].apply(_cap_fhxng_at_two)

In [16]:
# Recode BMIG to a 0/1 indicator: first 1 -> 0, then 3 -> 1 (applied in that order).
ov_cancer2['BMIG'] = (
    ov_cancer2['BMIG']
    .replace(1, 0)
    .replace(3, 1)
)

In [17]:
# Recode FHXNG to a 0/1 indicator: first 1 -> 0, then 2 -> 1 (applied in that order).
ov_cancer2['FHXNG'] = (
    ov_cancer2['FHXNG']
    .replace(1, 0)
    .replace(2, 1)
)

In [18]:
# The GROUPE* flag columns have served their purpose (FAMILY_HISTORY) and are removed.
ov_cancer2 = ov_cancer2.drop(
    columns=['GROUPEHY', 'GROUPEHN', 'GROUPENONHY', 'GROUPENONHN']
)

In [19]:
# Separate the outcome (y) from the predictor matrix (X).
y = ov_cancer2['label']
X = ov_cancer2.drop(columns=['label'])

In [20]:
def categorical(data):
    """Return a copy of ``data`` whose coded predictor columns use ``category`` dtype.

    The input frame is left untouched; all conversions are applied to a deep copy.

    Conversions performed:
      * ``BRCACC`` -> nullable integer -> category.
      * ``MP_OXC``, ``DMC``, ``PHX_CAC``, ``DELI_OXC_2GROUP`` -> nullable integer,
        code 2 rewritten as 0, then category.
      * ``FAMILY_HISTORY`` -> nullable integer -> category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns listed above. Other columns are
        copied through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame, independent of ``data``, with the converted columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Copy first so that the caller's frame is never modified as a side effect.
    converted = data.copy(deep=True)

    category_col = ['BRCACC']
    # Going through Int64 makes the category labels integers even when the
    # column was stored as float because of missing values.
    converted[category_col] = converted[category_col].astype('Int64').astype('category')

    binary_factors = ['MP_OXC', 'DMC', 'PHX_CAC', 'DELI_OXC_2GROUP']
    converted[binary_factors] = (
        converted[binary_factors]
        .astype('Int64')
        .replace(2, 0)  # code 2 is rewritten to 0, leaving a 0/1 coding
        .astype('category')
    )

    converted['FAMILY_HISTORY'] = (
        converted['FAMILY_HISTORY'].astype('Int64').astype('category')
    )

    return converted

In [21]:
# Column roles are identical for every split, so they are defined once up front.
continuous_col = ['AGE']  # imputed with the training median
categorical_col = ['MP_OXC','FHXNG','BRCACC','BMIG','DMC','PHX_CAC','DELI_OXC_2GROUP']  # imputed with the training mode
category_col = ['BRCACC']  # dummy-encoded below
binary_factors = ['MP_OXC','FHXNG','BMIG','DMC','PHX_CAC','DELI_OXC_2GROUP','FAMILY_HISTORY']  # cast to Int64 after encoding

# Build 50 stratified 70/30 train/test splits; the loop counter is also used as the
# random seed of each split.
for num in range(1, 50+1):

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, shuffle=True, stratify=y, random_state=num
    )

    X_train_ = categorical(X_train)
    X_test_ = categorical(X_test)

    # categorical() derives the category levels from the values present in the frame it
    # receives, so a rare level can be absent from one of the two splits. Pinning both
    # splits to the union of levels makes get_dummies emit the same dummy columns (and drop
    # the same reference level) for train and test. When both splits already contain every
    # level this is a no-op.
    for col in category_col:
        shared_levels = X_train_[col].cat.categories.union(X_test_[col].cat.categories)
        X_train_[col] = X_train_[col].cat.set_categories(shared_levels)
        X_test_[col] = X_test_[col].cat.set_categories(shared_levels)

    # Imputation values are computed on the training split only.
    medians = X_train_[continuous_col].median()
    modes = pd.Series(
        [X_train_[col].mode(dropna=True)[0] for col in categorical_col],
        index=categorical_col,
    )

    X_train_[continuous_col] = X_train_[continuous_col].fillna(medians)
    X_train_[categorical_col] = X_train_[categorical_col].fillna(modes)

    ### Apply the same (training-derived) imputation values to X_test ###
    X_test_[continuous_col] = X_test_[continuous_col].fillna(medians)
    X_test_[categorical_col] = X_test_[categorical_col].fillna(modes)

    ### Dummy encoding ###
    X_train_ = pd.get_dummies(X_train_, columns=category_col, drop_first=True, dtype='int')
    X_train_[binary_factors] = X_train_[binary_factors].astype('Int64')

    X_test_ = pd.get_dummies(X_test_, columns=category_col, drop_first=True, dtype='int')
    X_test_[binary_factors] = X_test_[binary_factors].astype('Int64')

    # Publish the split as numbered module-level variables (X_train_1 ... y_test_50).
    globals()[f'X_train_{num}'] = X_train_
    globals()[f'X_test_{num}'] = X_test_
    globals()[f'y_train_{num}'] = y_train
    globals()[f'y_test_{num}'] = y_test

In [22]:
# Root folder of the exported splits; each artefact type goes into its own sub-folder.
output_root = 'C:/Users/chaehyun/Dropbox/Work/PIPET/과제/산부인과/난소암 연구/Analysis dataset/Data for WS/[24-12-12]'

# Create the target folders up front so that to_csv does not fail on a fresh machine.
for part in ('X_train', 'X_test', 'y_train', 'y_test'):
    os.makedirs(f'{output_root}/{part}', exist_ok=True)

# Write every numbered split (X_train_1 ... y_test_50) to CSV.
for i in range(1, 50+1):
    # Look the numbered variables up directly instead of building code strings for exec().
    # The plain names are still rebound, so after the loop they refer to split 50 as before.
    X_train = globals()[f'X_train_{i}']
    X_test = globals()[f'X_test_{i}']
    y_train = globals()[f'y_train_{i}']
    y_test = globals()[f'y_test_{i}']

    X_train.to_csv(f'{output_root}/X_train/X_train_{i}.csv')
    X_test.to_csv(f'{output_root}/X_test/X_test_{i}.csv')
    y_train.to_csv(f'{output_root}/y_train/y_train_{i}.csv')
    y_test.to_csv(f'{output_root}/y_test/y_test_{i}.csv')