In [1]:
import numpy as np
import pandas as pd

In [2]:
import torch
import random

In [3]:
# set device
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# set seed
torch.manual_seed(1)
random.seed(1)
if device == "cuda":
    torch.cuda.manual_seed_all(1)

In [5]:
# Custom Dataset Class to use data_loader(batch_size)
from torch.utils.data import Dataset

class TrainDataset(Dataset):
    def __init__(self,x,y):
        self.x = torch.Tensor(x).to(device)
        self.y = torch.LongTensor(y).to(device)
        self.len = len(x)
    def __getitem__(self,index):
        return self.x[index],self.y[index]
    def __len__(self):
        return self.len

In [6]:
# data load
train = pd.read_csv("./data1/mnist_train.csv")
test = pd.read_csv("./data1/mnist_test.csv")
submission = pd.read_csv("./data1/submission.csv")

In [18]:
# check data
print(train.head())
print(test.head())
print(submission.head())

   Unnamed: 0    0    1    2    3    4    5    6    7    8  ...  775  776  \
0           0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
1           1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2           2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
3           3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
4           4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   

   777  778  779  780  781  782  783  784  
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  5.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  9.0  

[5 rows x 786 columns]
   Unnamed: 0    0    1    2    3    4    5    6    7    8  ...  774  775  \
0           0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
1           1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   
2           2  0.0

In [8]:
# preprocessing and set x_train, y_train ,x_test
x_train = np.array(train.drop(['0','784'],axis=1))
y_train = np.array(train['784']).astype(int)
x_test = np.array(test.drop(['0'],axis=1))

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

# x_train = torch.Tensor(x_train).to(device)
# y_train = torch.LongTensor(y_train).to(device)
x_test = torch.Tensor(x_test).to(device)
# print(x_train.shape)
# print(y_train.shape)
train_dataset= TrainDataset(x_train,y_train)


In [9]:
# set layer
layer1 = torch.nn.Linear(784,784,bias=True).to(device)
layer2 = torch.nn.Linear(784,512,bias=True).to(device)
layer3 = torch.nn.Linear(512,512,bias=True).to(device)
layer4 = torch.nn.Linear(512,512,bias=True).to(device)
# layer5 = torch.nn.Linear(512,512,bias=True).to(device)
# layer6 = torch.nn.Linear(512,512,bias=True).to(device)
layer5 = torch.nn.Linear(512,len(le.classes_),bias=True).to(device)

relu = torch.nn.ReLU().to(device)
dropout = torch.nn.Dropout(0.5).to(device)

In [10]:
# init layers
torch.nn.init.xavier_normal_(layer1.weight)
torch.nn.init.xavier_normal_(layer2.weight)
torch.nn.init.xavier_normal_(layer3.weight)
torch.nn.init.xavier_normal_(layer4.weight)
torch.nn.init.xavier_normal_(layer5.weight)
# torch.nn.init.xavier_normal_(layer6.weight)
# torch.nn.init.xavier_normal_(layer7.weight)

Parameter containing:
tensor([[ 0.0428,  0.0986, -0.0521,  ..., -0.1125,  0.0049,  0.0007],
        [-0.0517,  0.0146, -0.0199,  ...,  0.0611, -0.0283, -0.0044],
        [-0.0786,  0.0395,  0.0706,  ..., -0.1171, -0.1067, -0.0755],
        ...,
        [ 0.0520,  0.0433,  0.0006,  ...,  0.0198,  0.0292,  0.0357],
        [ 0.0593, -0.1006,  0.0465,  ...,  0.0177,  0.0913, -0.0426],
        [-0.1846, -0.0373, -0.0379,  ...,  0.0046, -0.0413, -0.0390]],
       device='cuda:0', requires_grad=True)

In [11]:
# set model
model = torch.nn.Sequential(layer1,relu,dropout,
                            layer2,relu,dropout,
                            layer3,relu,dropout,
                            layer4,relu,dropout,
                            # layer5,relu,dropout,
                            # layer6,relu,dropout,
                            layer5).to(device)

In [12]:
# set Learning params and function
# 목표 !! cost 0.04 정도 
# layer 5
# epochs 15 , batch_size 100 -> cost 0.4/// 0.9566
# epochs 50, batch_size 600 -> cost 0.17// 0.9706
# epochs 100, batch_size 600 -> cost 0.073// -> overfitting? 0.9796
# epochs 100, batch_size 1000 -> cost 0.094 // 0.9785
# epochs 100, batch_size 500 -> cost  0.067 // 0.9797
# epochs 100, batch_size 300 -> cost  0.114676796 // 0.9782
# 실습 7번 세팅대로 -> cost 0.24 // 0.96
# layer 5->7, epochs 100, batch_size 500 ->  cost 0.08// 0.9657
# layer 5->7, epochs 150, batch_size 500 ->  cost 0.08// 0.9657

## 정답 csv를 다운 받아서 로컬에서 모델 평가를 해보자 
epochs = 150
lr = 0.001
batch_size = 500
optim = torch.optim.Adam(model.parameters(),lr=lr)
loss = torch.nn.CrossEntropyLoss().to(device)
data_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                         batch_size=batch_size,
                                         shuffle = True,
                                         drop_last=True)

In [13]:
# learning
model.train()
batch_num = len(data_loader)
for epoch in range(epochs+1):
    avg_cost = 0
    for x,y in data_loader:
        optim.zero_grad()
        h = model(x)
        cost = loss(h,y)
        cost.backward()
        optim.step()
        avg_cost += cost

    print("epoch = {:02d} || cost = {:.9f}".format(epoch,avg_cost/batch_num))

epoch = 00 || cost = 135.666061401
epoch = 01 || cost = 2.308957100
epoch = 02 || cost = 2.276726484
epoch = 03 || cost = 2.203403950
epoch = 04 || cost = 1.993398905
epoch = 05 || cost = 1.626382709
epoch = 06 || cost = 1.435201287
epoch = 07 || cost = 1.376437545
epoch = 08 || cost = 1.322339773
epoch = 09 || cost = 1.287824035
epoch = 10 || cost = 1.242817402
epoch = 11 || cost = 1.249763131
epoch = 12 || cost = 1.215005159
epoch = 13 || cost = 1.226069450
epoch = 14 || cost = 1.231827617
epoch = 15 || cost = 1.220015645


In [14]:
# accuracy 
with torch.no_grad():
    model.eval()
    predict = model(train_dataset.x)
predict = torch.softmax(predict,dim = 1)
predict = torch.argmax(predict,dim = 1)
# print(predict)
# print(dataset.y)
accuracy = (predict == train_dataset.y).float().mean()
print("Accuracy = {:f}".format(accuracy.item()))

Accuracy = 0.778750


In [15]:
# submission
with torch.no_grad():
    model.eval()
    predict = model(x_test)
predict = torch.softmax(predict,dim=1)
predict = torch.argmax(predict,dim=1)
predict = le.inverse_transform(predict.cpu().detach().numpy())
submission['Label'] = predict
print(submission)

        id  Label
0        0      7
1        1      2
2        2      1
3        3      0
4        4      9
...    ...    ...
9995  9995      2
9996  9996      3
9997  9997      4
9998  9998      8
9999  9999      6

[10000 rows x 2 columns]


In [16]:
submission.to_csv("submission.csv",index=False)
# 왜 학습 이후에 계속 한가지 정수로만 결과가 나올까?(batch_size 적용 X)
# batch size가 없으면 한가지 정수로만 예측 결과가 나온다. -> batch size 적용이 필요 
# batch size를 반복문으로 구현하여도 제대로된 예측 결과가 나오지 않음
# Dataloader를 통해 batch size를 사용해야함 
# Data loader 사용을 위해 Dataset Class 의 인스턴스가 필요함 
# CSV 파일에서 읽어온 데이터를 Dataset Class로 만들기 위해 Dataset을 상속받은 클래스 생성
# 생성자, __len__, __getitem__ 함수 구현해야함 


### 위의 추론은 틀렸다. 
# batch_size = 600 기준 , 신경망의 개수가 5개보다 많아지면 Gradient Vanishing이 발생하는 것 같다.
# 6개 부터는 위와 같이 한가지 정수로만 예측결과를 얻게 된다.
# 우선!!제대로된 batch_size의 적용을 위해 Dataset 클래스를 상속하는 커스텀 클래스를 생성해야함!!
# + learning late 조정 와 batch_size 조정 