In [1]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [2]:
#preprocessing() 함수를 만들어서 전처리를 다 묶어버림

def preprocessing():
    dataset = pd.read_csv("data/car_evaluation.csv")
    categorical_columns = ["price", "maint", "doors", "persons", "lug_capacity", "safety"]
    for category in categorical_columns:
        dataset[category] = dataset[category].astype("category")
        #카테고리화 한다고 해서 데이터 형태는 변화 없음. 숫자로 인코딩하기 위한 데이터 타입이 'cateogory'가 되는것
        #원래는 아무것도 할 수 없는 문자열 형태였음을 기억하자.
    price = dataset["price"].cat.codes.values
    maint = dataset["maint"].cat.codes.values
    doors = dataset["doors"].cat.codes.values
    persons = dataset["persons"].cat.codes.values
    lug_capacity = dataset["lug_capacity"].cat.codes.values
    safety = dataset["safety"].cat.codes.values
    
    categorical_data = np.stack([price, maint, doors, persons, lug_capacity, safety],1)
    #데이터나 형태 변경 없이 타입 자체를 np에서 tensor로 변경하는거
    categorical_data = torch.tensor(categorical_data, dtype=torch.int64)

    #원래 사이즈가 필요한건 아님
    categorical_colum_sizes = [len(dataset[column].cat.categories) for column in categorical_columns]
    categorical_embedding_sizes = [(col_size, min(50, (col_size)+1) // 2) for col_size in categorical_colum_sizes]
    
    #dataset에 'output' column이 있기 때문에 전처리 과정에서 이렇게 쉽게 가능한 것
    outputs = dataset["output"].astype("category").cat.codes.values
    outputs = torch.tensor(outputs, dtype=torch.int64).flatten()

    return categorical_data, outputs, categorical_embedding_sizes
    
#호출
categorical_data, outputs, categorical_embedding_sizes=preprocessing()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(categorical_data, outputs, test_size=0.2, random_state=42)

In [4]:
#nn.Module 하듯이 dataset 클래스 만들때 Dataset 상속 받아야 함

class CarEvaluationDataset(Dataset):
    def __init__(self, categorical_data, outputs):
        self.categorical_data=categorical_data
        self.outputs=outputs
        
    #pytorch가 몇개냐고 물어보는건 학습 데이터가 몇개냐고 물어보는 것
    #len은 길이가 아니라 갯수라는 걸 받아들일 것
    def __len__(self):
        return len(self.categorical_data)

    def __getitem__(self, idx):
        X = self.categorical_data[idx]
        y = self.outputs[idx]
        return X,y

In [5]:
train_dataset = CarEvaluationDataset(X_train, y_train)
test_dataset = CarEvaluationDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [6]:
total_records = 1728
test_records = int(total_records * 0.2)

In [7]:
class Model(nn.Module):
    def __init__(self, embedding_size, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)
        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols
        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i
        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_categorical):
        embeddings = []
        for i, e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
            
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)
        x = self.layers(x)
        return x          

In [8]:
model = Model(categorical_embedding_sizes, 4, [200, 100, 50], p=0.4)
model

Model(
  (all_embeddings): ModuleList(
    (0-2): 3 x Embedding(4, 2)
    (3-5): 3 x Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)

In [9]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
epochs = 500
aggregated_losses = []

for epoch in range(epochs):
    model.train()
    for x_batch, y_batch in train_loader:
        y_pred = model(x_batch)
        single_loss = loss_function(y_pred, y_batch)
        aggregated_losses.append(single_loss.item())

        optimizer.zero_grad() #내 배치크기가 64잖아 그럼 1배치의 기울기 채우고 zero 만들고, step으로 다음배치 기울기로 넘어간다 
        single_loss.backward() #손실 기준 기울기 계산
        optimizer.step()

    #매 25에폭마다의 손실을 출력해서 추이 볼라고
    if (epoch+1) % 25 == 0:
        print(single_loss.item())

0.5518639087677002
0.6477234363555908
0.36091455817222595
0.7964324355125427
0.31091761589050293
0.404360830783844
0.32642608880996704
0.5188575387001038
0.621932864189148
0.6079772710800171
0.5263095498085022
0.40141311287879944
0.407059907913208
0.3303210139274597
0.4676009714603424
0.4527946710586548
0.45885592699050903
0.4913432002067566
0.36803796887397766
0.3121527135372162
