# Intro

 * Target : 중증질환 (뇌경색, 뇌출혈, 복부손상, 심근경색)
 * 데이터 분석 결과를 바탕으로 Target에 영향을 주는 Feature 전처리 (함수 정의)
 * 머신러닝/딥러닝 모델링 후 성능 비교
 * 최적 AI 모델 선정 및 저장
 * 새로운 출동 이력에 제시된 환자의 증상을 바탕으로 중증 질환 예측 함수 정의

# 0. import

In [1]:
import pandas as pd    # pandas 데이터프레임을 생성/편집하기 위해 사용 합니다.
import matplotlib.pyplot as plt   
from wordcloud import WordCloud    
from collections import Counter   
import re    
from PIL import Image 
import plotly.express as px
import plotly.io as pio
# Register two plotly renderers as the default (renderer names are joined
# with '+').
pio.renderers.default = "plotly_mimetype+notebook_connected"

# matplotlib font / minus-sign settings; presumably chosen so Korean labels
# render correctly (assumes the "Malgun Gothic" font is installed -- verify
# on non-Windows machines).
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# 1. 데이터 로드

In [2]:
# Load the raw dispatch records and keep only the rows whose label is one of
# the four target diseases; .copy() detaches the subset from `data`.
data = pd.read_csv("./119_emergency_dispatch.csv", encoding="cp949")
desease = data[data['중증질환'].isin(['심근경색', '복부손상', '뇌경색', '뇌출혈'])].copy()

# Shuffle the rows. The shuffle is seeded (same seed as the train/test split
# further down) so every run produces the same row order; an unseeded shuffle
# would change which rows the seeded split selects and make the reported
# scores impossible to reproduce.
desease = desease.sample(frac=1, random_state=2023).reset_index(drop=True)

# Table whose "var" column is read by preprocessing() to choose the model
# input columns.
total_result = pd.read_csv("변수선택.csv")

# 2. 학습용, 평가용 데이터 준비

In [3]:
def preprocessing(desease: pd.DataFrame, features=None) -> pd.DataFrame:
    """Derive binary vital-sign flags and return the model input columns.

    Three 0/1 indicator columns are added to a copy of the input (the
    caller's frame is never modified):

    * ``발열``   -- 1 when ``체온`` >= 37
    * ``고혈압`` -- 1 when ``수축기 혈압`` >= 140
    * ``저혈압`` -- 1 when ``수축기 혈압`` <= 90

    Missing (NaN) readings compare as False and therefore yield 0.

    Args:
        desease: DataFrame containing at least the ``체온`` and
            ``수축기 혈압`` columns plus every column named in ``features``.
        features: Iterable of column names to return. When omitted, the
            module-level ``total_result["var"]`` selection is used, which is
            the original behaviour.

    Returns:
        A new DataFrame holding only the selected feature columns, in the
        order given by ``features``.

    Raises:
        KeyError: If a required column is missing from the input.
    """
    frame = desease.copy()

    # Vectorised comparisons replace the per-row Python loops; the boolean
    # masks are cast to integers so the flags stay 0/1.
    systolic = frame["수축기 혈압"]
    frame["발열"] = (frame["체온"] >= 37).astype("int64")
    frame["고혈압"] = (systolic >= 140).astype("int64")
    frame["저혈압"] = (systolic <= 90).astype("int64")

    if features is None:
        # Fall back to the notebook-wide variable-selection table.
        features = total_result["var"]

    # list() makes any iterable of names (Series, tuple, ...) select columns.
    return frame[list(features)]

In [4]:
# Name of the label column.
target = "중증질환"

# Model inputs come from preprocessing(); the labels are read straight from
# the same frame, so X and Y share one row order.
X = preprocessing(desease)
Y = desease[target]

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed pins which
# positions the splitter draws.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2023,
)

# 3. 모델링

 * 활용 모델 : DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, DNN
 * 성능 평가 : accuracy_score

## (1) tree

In [6]:
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): wildcard import kept because other cells may rely on names it
# provides; only accuracy_score is used in this cell.
from sklearn.metrics import *

# Baseline: a single depth-limited decision tree. random_state is fixed
# because scikit-learn permutes the candidate features at every split, so
# tied splits could otherwise be resolved differently from run to run and
# the printed accuracy would not be repeatable.
tree = DecisionTreeClassifier(max_depth=5, random_state=2023)
tree.fit(train_x, train_y)

# Accuracy on the held-out rows.
tree_pred = tree.predict(test_x)
tree_acc = accuracy_score(test_y, tree_pred)
print(tree_acc)

0.8482772122161315


## (2) RandomForest

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Small forest of 5 depth-limited trees. A random forest grows its trees from
# randomly drawn samples and feature subsets, so with only 5 trees the score
# swings noticeably between runs; the seed is fixed to make the fitted model
# and its accuracy repeatable.
rf = RandomForestClassifier(max_depth=5, n_estimators=5, random_state=2023)
rf.fit(train_x, train_y)

# Accuracy on the held-out rows.
rf_pred = rf.predict(test_x)
rf_acc = accuracy_score(test_y, rf_pred)
print(rf_acc)

0.8784259984338293


## (3) XGBoost

In [8]:
from xgboost import XGBClassifier

# Integer-encode the string labels (presumably because XGBClassifier expects
# numeric class ids -- confirm for the installed version). The identical
# mapping is applied to the training and the test labels.
train_y1, test_y1 = (
    labels.map({'뇌경색':0, '뇌출혈':1, '복부손상':2, '심근경색':3})
    for labels in (train_y, test_y)
)

# Default-parameter gradient-boosted classifier trained on the encoded labels.
xg = XGBClassifier()
xg.fit(train_x, train_y1)

# Accuracy on the held-out rows, measured against the encoded test labels.
xg_pred = xg.predict(test_x)
xg_acc = accuracy_score(test_y1, xg_pred)
print(xg_acc)

0.9193422083007048


## (4) DNN 

In [9]:
# Number of input features = column count of the training frame. The bare
# name on the last line makes the notebook display the value.
nf = len(train_x.columns)
nf

19

In [10]:
# NOTE(review): Dropout, EarlyStopping and ModelCheckpoint are imported but
# not used in this cell.
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam
from keras.backend import clear_session
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Reset Keras' global state before building the network.
clear_session()

# One hidden ReLU layer of 128 units feeding a 4-unit softmax output.
model = Sequential([
    Dense(128, input_shape=(nf,), activation ="relu"),
    Dense(4,  activation ="softmax"),
])

# The sparse categorical loss is paired with the integer-encoded labels
# (train_y1) used below.
model.compile(
    loss=sparse_categorical_crossentropy,
    optimizer=Adam(0.001),
    metrics=["accuracy"],
)

# Train silently for 20 epochs with 20% of the training rows held out for
# validation; only the per-epoch metric history is kept.
history = model.fit(
    train_x,
    train_y1,
    validation_split=0.2,
    epochs=20,
    batch_size=16,
    verbose=0,
).history

# Predicted class = index of the largest output along axis 1.
dl_pred = model.predict(test_x).argmax(axis = 1)
dl_acc = accuracy_score(test_y1, dl_pred)



In [11]:
# Display the DNN's accuracy on the test rows (computed in the previous cell).
dl_acc

0.8539545810493344

## (5) 최적 모델 선정 및 저장

In [12]:
## Question: which model is selected as the best one?
# Collect every model's test accuracy in one table for a side-by-side
# comparison. The first column is labelled "model" with no trailing space,
# so that result["model"] resolves instead of raising KeyError.
result = pd.DataFrame({
    "model": ["tree", "RF", "XGBoost", "DL"],
    "acc": [tree_acc, rf_acc, xg_acc, dl_acc],
})
result

Unnamed: 0,model,acc
0,tree,0.848277
1,RF,0.878426
2,XGBoost,0.919342
3,DL,0.853955


In [13]:
import joblib

# Persist the XGBoost classifier as the chosen best model. The call is the
# cell's last expression, so the returned list of written file names is shown.
# NOTE(review): this model was fitted on integer-encoded labels (0-3), so
# whoever loads it needs the same label mapping to decode predictions.
joblib.dump(value=xg, filename="best_model.pkl")

['best_model.pkl']