In [25]:
import torch
import torch.nn as nn

In [5]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [15]:
dataset = pd.read_csv("data/car_evaluation.csv")
dataset

Unnamed: 0,price,maint,doors,persons,lug_capacity,safety,output
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   price         1728 non-null   object
 1   maint         1728 non-null   object
 2   doors         1728 non-null   object
 3   persons       1728 non-null   object
 4   lug_capacity  1728 non-null   object
 5   safety        1728 non-null   object
 6   output        1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [18]:
categorical_columns = ["price", "maint", "doors", "persons", "lug_capacity", "safety"]
for category in categorical_columns:
    dataset[category] = dataset[category].astype('category')

In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   price         1728 non-null   category
 1   maint         1728 non-null   category
 2   doors         1728 non-null   category
 3   persons       1728 non-null   category
 4   lug_capacity  1728 non-null   category
 5   safety        1728 non-null   category
 6   output        1728 non-null   object  
dtypes: category(6), object(1)
memory usage: 24.7+ KB


In [20]:
price = dataset["price"].cat.codes.values
maint = dataset["maint"].cat.codes.values
doors = dataset["doors"].cat.codes.values
persons = dataset["persons"].cat.codes.values
lug_capacity = dataset["lug_capacity"].cat.codes.values
safety = dataset["safety"].cat.codes.values

In [21]:
categorical_data = np.stack([price, maint, doors, persons, lug_capacity, safety], 1)
categorical_data[:10]

array([[3, 3, 0, 0, 2, 1],
       [3, 3, 0, 0, 2, 2],
       [3, 3, 0, 0, 2, 0],
       [3, 3, 0, 0, 1, 1],
       [3, 3, 0, 0, 1, 2],
       [3, 3, 0, 0, 1, 0],
       [3, 3, 0, 0, 0, 1],
       [3, 3, 0, 0, 0, 2],
       [3, 3, 0, 0, 0, 0],
       [3, 3, 0, 1, 2, 1]], dtype=int8)

In [23]:
outputs = pd.get_dummies(dataset["output"]).values
outputs

array([[False, False,  True, False],
       [False, False,  True, False],
       [False, False,  True, False],
       ...,
       [False, False,  True, False],
       [False,  True, False, False],
       [False, False, False,  True]])

In [26]:
categorical_data = torch.tensor(categorical_data, dtype=torch.int64)
outputs = torch.tensor(outputs).flatten()
#flatten을 안 하면 원핫인코딩 완료된 [n, 4] 배열로 추출됨


In [31]:
print( categorical_data.size(), outputs.size())
#X와 y 사이즈가 안맞는 걸 볼 수 있음, X를 어떻게든 튀겨서 맞추는걸로 시작해야 계층을 줄여가면서 하든 기기 안에 들어가지

torch.Size([1728, 6]) torch.Size([6912])


In [34]:
categorical_column_sizes = [len(dataset[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size)+1) // 2) for col_size in categorical_column_sizes ]
categorical_embedding_sizes

[(4, 2), (4, 2), (4, 2), (3, 2), (3, 2), (3, 2)]

In [35]:
total_records = 1728 
test_records = int(total_records * 0.2)

In [41]:
categorical_train_data=categorical_data[:total_records-test_records]
categorical_test_data = categorical_data[total_records-test_records:total_records]
train_outputs = outputs[:total_records-test_records]
test_outputs = outputs[total_records-test_records:total_records]
print(len(categorical_train_data), len(categorical_test_data), len(train_outputs), len(test_outputs))
#train, test 나누는 과정 예스럽지만 한번 해봄 ; 

1383 345 1383 345


In [50]:
class Model(nn.Module):
    def __init__(self, embedding_size, output_size, layers, p=0.4): #self. 이하는 다른데서 사용하게끔 만드는 변수
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)

        all_layers=[]
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = 1

        all_layers.append(nn.Linear(layers[-1], output_size))
        self.layers=nn.Sequential(*all_layers)

    def forward(self, x_categorical):
        embeddings = [] 
        for i,e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
            x = torch.cat(embeddings, 1)
            x = self.embedding_dropout(x)
            x = self.layers(x)
            return x

In [51]:
model = Model(categorical_embedding_sizes, 4, [200, 100, 50], p=0.4) #4는 계층 갯수
print(model)
#Linear- 처음 인풋레이어는 12개 넣는건 진짜 데이터셋과 동일해야함, 200개는 알아서 튀겨내는거고.
#input layer의 인풋과 output layer의 아웃풋 피쳐는 정확해야 함 
#중간에 [200, 100, 50] 몇개씩 통과할건지. 
#12피쳐를 200개 피쳐로 튀겨내고, ReLU-가중치 알아서 정하고, Batch-reLu를 통과한 최대 200개를 받아서 , 거기서 임의로 40%를 비활성화=0으로 만든다.
#븐류 - linear 중간에 몇개가 들어오고 나가고는 상관없지만, 마지막에는 반드시 이진이면 1개의 값, 다진이면 확률값이 나와야 함.
#마지막 12번 linear에서 out_features=4로 아웃풋 레이어가 나왔기 때문에 그 뒤의 relU 등등을 생략함. -히든레이어 계층

Model(
  (all_embeddings): ModuleList(
    (0-2): 3 x Embedding(4, 2)
    (3-5): 3 x Embedding(3, 2)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=1, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=1, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=4, bias=True)
  )
)


In [52]:
loss_function = nn.CrossEntropyLoss() 
#손실함수 - 계층의 features가 200,100,50되는 동안 손실되는 함수가 있겠지만 loss_function을 최대한 작게 나오게. 
#loss는 내가 풀려고 하는 문제에 따라 다름.
#회귀처럼 값을 내주는 방정식을 구하는 회귀문제에서는 MSE를 쓰고, 분류에서는 CrossEntropyLoss를 써줌

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#오솔길 찾는 Optimizer는 Adam이 모를때 가장 무난한 옵티마이저 유형
#lr 얼만큼움직일건데 ~ 요만큼. 

In [56]:
epochs = 500
aggregated_losses = []
train_outputs = train_outputs.to(device='cpu', dtype=torch.int64)
for i in range(epochs):
    i += 1
    y_pred = model(categorical_train_data)
    single_loss = loss_function(y_pred, train_outputs)
    aggregated_losses.append(single_loss)

    if i % 25 == 1:
        print(f"epoch: {i:3} loss: {single_loss.item():10.8f}")

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print(f"epoch: {i:3} loss: {single_loss.item():10.10f}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1383x2 and 12x200)