In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2022-ai-w4p1/submit.csv
/kaggle/input/2022-ai-w4p1/train.csv
/kaggle/input/2022-ai-w4p1/test.csv


In [2]:
import random
import torch
import torch.optim as optim
import torch.nn as nn

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
train = pd.read_csv('../input/2022-ai-w4p1/train.csv')
train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
0,1,95,60,18,58,23.9,0.260,22,0
1,5,105,72,29,325,36.9,0.159,28,0
2,0,135,68,42,250,42.3,0.365,24,1
3,4,131,68,21,166,33.1,0.160,28,0
4,1,103,30,38,83,43.3,0.183,33,0
...,...,...,...,...,...,...,...,...,...
532,5,139,64,35,140,28.6,0.411,26,0
533,1,96,122,0,0,22.4,0.207,27,0
534,10,101,86,37,0,45.6,1.136,38,1
535,0,141,0,0,0,42.4,0.205,29,1


In [4]:
x_train = train.iloc[:,:-1]
y_train = train.iloc[:,-1]

In [5]:
print(x_train.shape)
print(y_train.shape)

(537, 8)
(537,)


# 데이터 정규화

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_train

array([[-0.8362943 , -0.80005088, -0.53576428, ..., -1.06015343,
        -0.61421636, -0.94861028],
       [ 0.39072767, -0.49054341,  0.12804365, ...,  0.64646721,
        -0.90973787, -0.43466673],
       [-1.14304979,  0.43797901, -0.09322566, ...,  1.35537117,
        -0.30699103, -0.77729576],
       ...,
       [ 1.92450513, -0.6143464 ,  0.90248622, ...,  1.78859026,
         1.94892066,  0.42190587],
       [-1.14304979,  0.62368349, -3.8548039 , ...,  1.36849903,
        -0.77514391, -0.34900947],
       [-1.14304979,  0.12847154,  1.45565949, ..., -1.24394334,
        -0.60836445, -1.03426754]])

In [7]:
x_train = torch.FloatTensor(x_train)
x_train

tensor([[-0.8363, -0.8001, -0.5358,  ..., -1.0602, -0.6142, -0.9486],
        [ 0.3907, -0.4905,  0.1280,  ...,  0.6465, -0.9097, -0.4347],
        [-1.1430,  0.4380, -0.0932,  ...,  1.3554, -0.3070, -0.7773],
        ...,
        [ 1.9245, -0.6143,  0.9025,  ...,  1.7886,  1.9489,  0.4219],
        [-1.1430,  0.6237, -3.8548,  ...,  1.3685, -0.7751, -0.3490],
        [-1.1430,  0.1285,  1.4557,  ..., -1.2439, -0.6084, -1.0343]])

In [8]:
y_train = torch.FloatTensor(y_train.to_numpy())
y_train[:5]

tensor([0., 0., 1., 0., 0.])

# 모델 학습하기

In [9]:
class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(8, 120),
            nn.Linear(120, 84),
            nn.Linear(84, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.layer(x)
        return x

In [10]:
model = CustomModel()

In [11]:
# optimzier 선택
optimizer = optim.Adam(model.parameters(),lr=1e-5)

nb_epochs = 100000
for epoch in range(nb_epochs+1):

    
    #H(x)
    hypothesis = model(x_train)
    
    #cost
    loss = nn.BCELoss()
    cost = loss(hypothesis, y_train.unsqueeze(1))
    
    
    #
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    
    if epoch%10000==0:
        print('Epoch: {:4d}/{}  Cost:{:.6f}'.format(
            epoch, nb_epochs, cost.item()
        ))

Epoch:    0/100000  Cost:0.725053
Epoch: 10000/100000  Cost:0.459388
Epoch: 20000/100000  Cost:0.459388
Epoch: 30000/100000  Cost:0.459388
Epoch: 40000/100000  Cost:0.459388
Epoch: 50000/100000  Cost:0.459388
Epoch: 60000/100000  Cost:0.459388
Epoch: 70000/100000  Cost:0.459388
Epoch: 80000/100000  Cost:0.459388
Epoch: 90000/100000  Cost:0.459388
Epoch: 100000/100000  Cost:0.459388


In [12]:
hypothesis = model(x_train)
hypothesis[:5]

prediction = hypothesis >= torch.FloatTensor([0.5])

correct_prediction = prediction.float() == y_train.unsqueeze(1)
accuracy = correct_prediction.sum().item() / len(correct_prediction)
accuracy

0.7821229050279329

# 모델 테스트하기

In [13]:
x_test = pd.read_csv('../input/2022-ai-w4p1/test.csv')
x_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,98,58,33,190,34.0,0.430,43
1,2,112,75,32,0,35.7,0.148,21
2,2,108,64,0,0,30.8,0.158,21
3,8,107,80,0,0,24.6,0.856,34
4,7,136,90,0,0,29.9,0.210,50
...,...,...,...,...,...,...,...,...
226,0,119,0,0,0,32.4,0.141,24
227,4,109,64,44,99,34.8,0.905,26
228,0,127,80,37,210,36.3,0.804,23
229,6,105,70,32,68,30.8,0.122,37


# 정규화

In [14]:
x_test = scaler.transform(x_test)
x_test

array([[ 0.69748316, -0.70719864, -0.64639893, ...,  0.26575953,
        -0.11680393,  0.85019217],
       [-0.52953881, -0.27388818,  0.29399563, ...,  0.488933  ,
        -0.94192338, -1.03426754],
       [-0.52953881, -0.39769117, -0.31449497, ..., -0.1543317 ,
        -0.91266382, -1.03426754],
       ...,
       [-1.14304979,  0.19037303,  0.57058226, ...,  0.56770011,
         0.97750343, -0.86295302],
       [ 0.69748316, -0.49054341,  0.01740899, ..., -0.1543317 ,
        -1.01799822,  0.33624861],
       [ 0.39072767, -1.35716433,  0.68121692, ...,  0.50206085,
        -0.91851573,  0.16493409]])

In [15]:
x_test = torch.FloatTensor(x_test)
x_test

tensor([[ 0.6975, -0.7072, -0.6464,  ...,  0.2658, -0.1168,  0.8502],
        [-0.5295, -0.2739,  0.2940,  ...,  0.4889, -0.9419, -1.0343],
        [-0.5295, -0.3977, -0.3145,  ..., -0.1543, -0.9127, -1.0343],
        ...,
        [-1.1430,  0.1904,  0.5706,  ...,  0.5677,  0.9775, -0.8630],
        [ 0.6975, -0.4905,  0.0174,  ..., -0.1543, -1.0180,  0.3362],
        [ 0.3907, -1.3572,  0.6812,  ...,  0.5021, -0.9185,  0.1649]])

In [16]:
hypothesis = model(x_test)
hypothesis[:5]

tensor([[0.2659],
        [0.1878],
        [0.1224],
        [0.1486],
        [0.5023]], grad_fn=<SliceBackward0>)

In [17]:
prediction = hypothesis >= torch.FloatTensor([0.5])
prediction = prediction.int()
prediction[:5]

tensor([[0],
        [0],
        [0],
        [0],
        [1]], dtype=torch.int32)

# 제출용 파일 만들기

In [18]:
prediction = np.array(prediction)
prediction = pd.DataFrame(prediction,columns=['Diabetes'])
prediction

Unnamed: 0,Diabetes
0,0
1,0
2,0
3,0
4,1
...,...
226,0
227,0
228,0
229,0


In [19]:
submit = pd.read_csv('../input/2022-ai-w4p1/submit.csv')
submit

Unnamed: 0,ID,Diabetes
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
226,226,0
227,227,0
228,228,0
229,229,0


In [20]:
submit.update(prediction)
submit

Unnamed: 0,ID,Diabetes
0,0,0
1,1,0
2,2,0
3,3,0
4,4,1
...,...,...
226,226,0
227,227,0
228,228,0
229,229,0


In [21]:
submit.to_csv('submit.csv',index=False)
submit

Unnamed: 0,ID,Diabetes
0,0,0
1,1,0
2,2,0
3,3,0
4,4,1
...,...,...
226,226,0
227,227,0
228,228,0
229,229,0
