In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch
import sklearn

In [3]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# 1. Inspect the data with DataFrame.info() and DataFrame.describe() (an object dtype is a sign that something is off).
# 2. Use np.unique(array) to see the distinct values.
# 3. Filling wrongly entered values with the most frequent value via an imputer can lead to incorrect training!
# 4. Filling with a constant instead of the most frequent value is also an option.
# 5. Use astype to convert numeric data stored as strings into floats.

In [5]:
# Fix the random seeds so that repeated runs start from the same state.
seed = 1
torch.manual_seed(seed)
if device == "cuda":
    # Also seed every CUDA device when training on the GPU.
    torch.cuda.manual_seed_all(seed)

In [6]:
# Load the CSV data: training set, test set and the sample submission file.
DATA_DIR = "2021-ai-midterm-p3"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/submit_sample.csv")

# Inspect the training data.
print(train)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train.info()
print(train.describe())

     index  age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  \
0        0   63    1   1       145   233    1        2      150      0   
1        1   67    1   4       160   286    0        2      108      1   
2        2   67    1   4       120   229    0        2      129      1   
3        4   41    0   2       130   204    0        2      172      0   
4        5   56    1   2       120   236    0        0      178      0   
..     ...  ...  ...  ..       ...   ...  ...      ...      ...    ...   
237    295   41    1   2       120   157    0        0      182      0   
238    296   59    1   4       164   176    1        2       90      0   
239    299   68    1   4       144   193    1        0      141      0   
240    300   57    1   4       130   131    0        0      115      1   
241    301   57    0   2       130   236    0        2      174      0   

     oldpeak  slope ca thal  target  
0        2.3      3  0    6       0  
1        1.5      2  3    3       2

In [7]:
# One-hot encode thal and ca to reveal that both columns contain a "?" value.
for column in ("thal", "ca"):
    print(pd.get_dummies(train[column]))

     3  6  7  ?
0    0  1  0  0
1    1  0  0  0
2    0  0  1  0
3    1  0  0  0
4    1  0  0  0
..  .. .. .. ..
237  1  0  0  0
238  0  1  0  0
239  0  0  1  0
240  0  0  1  0
241  1  0  0  0

[242 rows x 4 columns]
     0  1  2  3  ?
0    1  0  0  0  0
1    0  0  0  1  0
2    0  0  1  0  0
3    1  0  0  0  0
4    1  0  0  0  0
..  .. .. .. .. ..
237  1  0  0  0  0
238  0  0  1  0  0
239  0  0  1  0  0
240  0  1  0  0  0
241  0  1  0  0  0

[242 rows x 5 columns]


In [8]:
# Data preprocessing
# NOTE: every statement in this cell is commented out, so the cell does nothing when run.
# It appears to record an earlier approach (most-frequent imputation / one-hot encoding).

# Preprocess the object-typed ca and thal columns (missing-value marker = ?)
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(missing_values="?",strategy="most_frequent")
# x_train = imputer.fit_transform(x_train)
# x_test = imputer.transform(x_test)

# # Handle thal
# train = pd.concat([train,pd.get_dummies(train['thal']).drop(['?'],axis=1)],axis=1)
# # ca also produces a column named 3, so rename this one
# train['3t'] = train['3']
# train = train.drop(['3'],axis=1)
# # Handle ca
# train = pd.concat([train,pd.get_dummies(train['ca']).drop(['?'],axis=1)],axis=1)

# # Handle thal
# test = pd.concat([test,pd.get_dummies(test['thal']).drop(['?'],axis=1)],axis=1)
# # ca also produces a column named 3, so rename this one
# test['3t'] = test['3']
# test = test.drop(['3'],axis=1)
# # Handle ca
# test = pd.concat([test,pd.get_dummies(test['ca']).drop(['?'],axis=1)],axis=1)

In [9]:
# Build the train/test feature matrices and the label vector, dropping the
# columns that are not model inputs.
# (An earlier, now disabled variant additionally dropped "thal" and "ca".)
train_features = train.drop(columns=["index", "target"])
test_features = test.drop(columns=["index"])
x_train = np.array(train_features)
x_test = np.array(test_features)
y_train = np.array(train["target"])

In [10]:
# Data preprocessing
# The "?" placeholders (seen in the object-typed ca and thal columns) are treated
# as missing values and replaced by the constant 8; the imputer is fitted on the
# training matrix and then applied to the test matrix.
# NOTE: StandardScaler scaling was tried at this point but is disabled, so the
# features are passed on unscaled.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values="?", strategy="constant", fill_value=8)

# Impute, then cast everything (including numbers stored as strings) to float.
x_train = imputer.fit_transform(x_train).astype(float)
x_test = imputer.transform(x_test).astype(float)

In [11]:
# Check which distinct class labels occur in the training target.
np.unique(y_train)

array([0, 1, 2, 3, 4], dtype=int64)

In [12]:
# Sanity-check the shapes of the training features, training labels and test features.
for array in (x_train, y_train, x_test):
    print(array.shape)

(242, 13)
(242,)
(61, 13)


In [13]:
# Convert the arrays to tensors and move them to the selected device:
# features as float32, labels as int64 (the dtype CrossEntropyLoss expects for class indices).
x_train = torch.tensor(x_train, dtype=torch.float32).to(device)
x_test = torch.tensor(x_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)

In [14]:
# Create the layers: three 64-unit linear layers followed by a 5-way output layer.
# The linear layers are created in a fixed order (layer1..layer4) so that the
# seeded weight initialisation is reproducible.
n_features = x_train.shape[1]
hidden_units = 64
n_classes = 5

layer1 = torch.nn.Linear(n_features, hidden_units).to(device)
layer2 = torch.nn.Linear(hidden_units, hidden_units).to(device)
layer3 = torch.nn.Linear(hidden_units, hidden_units).to(device)
layer4 = torch.nn.Linear(hidden_units, n_classes).to(device)

relu = torch.nn.ReLU().to(device)
# Dropout module with drop probability 0.2.
dropout = torch.nn.Dropout(p=0.2).to(device)

In [15]:
# Layer weight initialisation
# NOTE: the Xavier-normal calls below are commented out, so the layers keep the
# weights they were given when torch.nn.Linear constructed them.
# torch.nn.init.xavier_normal_(layer1.weight)
# torch.nn.init.xavier_normal_(layer2.weight)
# torch.nn.init.xavier_normal_(layer3.weight)
# torch.nn.init.xavier_normal_(layer4.weight)

In [16]:
# Assemble the network: Linear -> ReLU three times, then the output layer.
# The single ReLU module is reused between layers (it holds no parameters).
model = torch.nn.Sequential(
    layer1,
    relu,
    layer2,
    relu,
    layer3,
    relu,
    layer4,
)
model = model.to(device)

In [17]:
# Training hyperparameters: learning rate and number of epochs.
lr, epochs = 1e-3, 700

In [18]:
# Define the loss function (cross entropy) and the optimiser (Adam over all model parameters).
loss = torch.nn.CrossEntropyLoss()
loss = loss.to(device)
optim = torch.optim.Adam(params=model.parameters(), lr=lr)

In [19]:
# Training: every step runs the whole training set through the model at once.
model.train()

# Log roughly ten times over the run. Integer division keeps the modulo test
# exact (the float form `epochs / 10` only hits zero reliably when epochs is a
# multiple of 10), and max(1, ...) avoids a modulo-by-zero when epochs < 10.
log_every = max(1, epochs // 10)

for epoch in range(epochs + 1):  # +1 so that the final epoch is reached and logged
    optim.zero_grad()
    h = model(x_train)
    cost = loss(h, y_train)
    cost.backward()
    optim.step()
    if epoch % log_every == 0:
        # Metrics come from the forward pass made before this step's update.
        predict = torch.argmax(h, dim=1)
        # Vectorised mean instead of the builtin sum(), which would iterate
        # over the tensor one element at a time.
        acc = (predict == y_train).float().mean()
        print(epoch, cost.item(), acc.item())

0 1.8719284534454346 0.05371900647878647
70 1.0628666877746582 0.5909090638160706
140 0.9015001654624939 0.6694214344024658
210 0.7570487260818481 0.6859503984451294
280 0.6831952333450317 0.7231404781341553
350 0.6244838237762451 0.7438015937805176
420 0.5651431083679199 0.7644627690315247
490 0.5031994581222534 0.8016528487205505
560 0.4547404646873474 0.8305785059928894
630 0.4059681296348572 0.85537189245224
700 0.359203577041626 0.8760330080986023


In [20]:
# Inference on the test set, without gradient tracking and in evaluation mode.
model.eval()
with torch.no_grad():
    test_logits = model(x_test)
    test_classes = test_logits.argmax(dim=1).cpu().numpy()

# Collapse the 5-class prediction into a binary label: class 0 -> 0, any other class -> 1.
predict = (test_classes != 0).astype(int)

submission["target"] = predict
print(submission)

    index  target
0       3       0
1       6       1
2      21       0
3      24       1
4      31       1
..    ...     ...
56    293       1
57    294       0
58    297       0
59    298       1
60    302       1

[61 rows x 2 columns]


In [21]:
# Write the submission file without the DataFrame's row index.
submission.to_csv(path_or_buf="submission.csv", index=False)