# 1. 문서 분류하는 자연어 분류기
 - Input: 문서 --> ["현재 금리상태는 ...]
 - Output: 시제 --> [""현재, "미래, "과거"']
 - model: 글자 encoding = TF-idf, 학습- Logistic Regression

##### 0) 라이브러리 불러오기

In [40]:
# (1) 기본 라이브러리
import numpy as np
import pandas as pd

# (2) 머신러닝 라이브러리
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

##### 1) 데이터 불러오기

In [41]:
df = pd.read_csv("./data/train_data.csv")[:1000]

In [42]:
df.head(2)

Unnamed: 0,index,premise,hypothesis,label
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,contradiction
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,contradiction


##### 2) Output Encoding

(1) 시제 Unique List 생성

In [43]:
df["label"].value_counts()

entailment       339
contradiction    334
neutral          327
Name: label, dtype: int64

In [44]:
df["label"].value_counts().keys()

Index(['entailment', 'contradiction', 'neutral'], dtype='object')

In [45]:
label_list = df["label"].value_counts().keys().tolist() # !!!
num_label = len(label_list)

In [46]:
num_label

3

(2) 시제 Label Encoding

In [47]:
def label_encoder(x):
  result = 0
  if str(x) == "entailment":
    result = 0
  elif str(x) == "contradiction":
    result = 1
  elif str(x) =="neutral":
    result = 2
  else:
    print("error")
  return result

In [48]:
label_encoder("entailment")

0

In [49]:
df["label_num"] = df["label"].apply(lambda x: label_encoder(x))

In [50]:
df.head(2)

Unnamed: 0,index,premise,hypothesis,label,label_num
0,0,"씨름은 상고시대로부터 전해져 내려오는 남자들의 대표적인 놀이로서, 소년이나 장정들이...",씨름의 여자들의 놀이이다.,contradiction,1
1,1,"삼성은 자작극을 벌인 2명에게 형사 고소 등의 법적 대응을 검토 중이라고 하였으나,...",자작극을 벌인 이는 3명이다.,contradiction,1


##### 3) Input encoding

(1) Corpus 추출

In [51]:
corpus = df["premise"]+df["hypothesis"]

(2) TF-IDF 추출

In [52]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [53]:
vectorizer.get_feature_names() # !!!

['000대',
 '000만원까지',
 '000여대로',
 '05월',
 '0시부터',
 '100',
 '1000개',
 '1000만원을',
 '1000명의',
 '100년의',
 '100대',
 '100만대',
 '100번째',
 '101빌딩',
 '104만',
 '10개',
 '10개년',
 '10년',
 '10년전',
 '10만원',
 '10명이',
 '10명이다',
 '10분',
 '10분도',
 '10분동안',
 '10시',
 '10시경',
 '10시쯤',
 '10억점',
 '10여',
 '10월',
 '10위권',
 '10일',
 '10점',
 '10점뤽',
 '10점을',
 '10점짜리',
 '10점평가들한듯큐브릭의',
 '113개',
 '113만대',
 '117만',
 '11만',
 '11명이',
 '11시',
 '11월',
 '11월까지',
 '11일',
 '11일까지이다',
 '12월',
 '12월까지의',
 '12일까지',
 '12층에',
 '12층이긴',
 '13개',
 '13년간',
 '13명이',
 '13일',
 '14',
 '14일',
 '14일부터',
 '15',
 '1517명이',
 '15만',
 '15분',
 '15일',
 '161명을',
 '163명이',
 '163억',
 '16년',
 '16만원',
 '16세기',
 '16일',
 '16일이었다',
 '17',
 '1792년',
 '17일',
 '1805년',
 '1893년',
 '18일',
 '18일부터',
 '1912년',
 '1930년대에',
 '1941명이',
 '1950년부터',
 '1953년',
 '1953년에',
 '1957년에',
 '1960년',
 '1960년대에',
 '1964년',
 '1964년에',
 '1968년',
 '1971년',
 '1978년',
 '1980년도에',
 '1980년에',
 '1981년',
 '1993년',
 '1993년에',
 '19세',
 '19세기',
 '19세기에는',
 '19세는',
 '19일부터',
 '1기당',
 '1남2녀

In [54]:
num_inputs =  len(vectorizer.get_feature_names())

In [55]:
vectorizer

TfidfVectorizer()

In [56]:
type(X)

scipy.sparse.csr.csr_matrix

In [57]:
print(X.shape)

(1000, 9527)


In [58]:
X.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [59]:
X.todense()[0].tolist()

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [60]:
X.todense()[0].tolist()[0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

(3) Content Vector 추출

In [61]:
len(X.todense().tolist())

1000

In [62]:
df["doc_vec"] = X.todense().tolist()

In [63]:
df["doc_vec"]

0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                             ...                        
995    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
996    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
997    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
998    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
999    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: doc_vec, Length: 1000, dtype: object

In [64]:
for i in df['doc_vec']:
    print(len(i))

9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527
9527


##### 4) DataLoader

(1) CustomDataset

In [65]:
df["label_num"]

0      1
1      1
2      0
3      2
4      2
      ..
995    0
996    0
997    2
998    0
999    1
Name: label_num, Length: 1000, dtype: int64

In [66]:
class CustomDataset(torch.utils.data.Dataset):
  def __init__(self, df):
    self.df = df
    self.x_data = df["doc_vec"]
    self.y_data = df["label_num"]

  def __len__(self):
    return len(df)

  def __getitem__(self, idx):

    a_sample = torch.FloatTensor(self.x_data[idx])
    a_target = torch.LongTensor([self.y_data[idx]])

    result = {"samples": a_sample, "targets": a_target }
    
    return result
  
train_dataset = CustomDataset(df)

(2) DataLoader

In [67]:
train_loader = DataLoader(train_dataset, batch_size=20)

In [68]:
batch = next(iter(train_loader))

In [69]:
batch

{'samples': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'targets': tensor([[1],
         [1],
         [0],
         [2],
         [2],
         [0],
         [2],
         [1],
         [1],
         [1],
         [0],
         [2],
         [0],
         [1],
         [1],
         [2],
         [0],
         [1],
         [2],
         [0]])}

In [70]:
len(batch)

2

##### 5) 모델 정의 (MultivariableLinearRegressionModel)

In [71]:
class MultivariableLinearRegressionModel(nn.Module):
  def __init__(self, num_feature, num_class):
    super().__init__()
    self.linear = nn.Linear(num_feature, num_class)

  def forward(self, X):
    logit = self.linear(X)
    return logit
  
model = MultivariableLinearRegressionModel(num_inputs, num_label)

##### 6) Optimizer 정의

In [72]:
optimizer = torch.optim.SGD(model.parameters(), lr=1)
# loss_fn = torch.nn.functional.cross_entropy()

##### 7) 모델 학습 + 성능 평가 (과제)

In [73]:
for idx, batch in enumerate(train_loader):
  print(batch)
  break

{'samples': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'targets': tensor([[1],
        [1],
        [0],
        [2],
        [2],
        [0],
        [2],
        [1],
        [1],
        [1],
        [0],
        [2],
        [0],
        [1],
        [1],
        [2],
        [0],
        [1],
        [2],
        [0]])}


In [74]:
# (1) Epoch 순회
nb_epochs = 20
for epoch in range(nb_epochs + 1):
    for batch_idx, batch in enumerate(train_loader):
        # 1] train 데이터 불러오기
        # print(f"batch_idx, batch : {batch_idx, batch}")
        x_train, y_train = batch["samples"], batch["targets"].squeeze(dim=-1)
        # print(f"x_train, y_train : {x_train, y_train}")
        # 2] 모델 예측
        y_pred = model(x_train)
        # 3] Cost 계산
        cost = F.cross_entropy(y_pred, y_train)
        # print(f"y_pred, y_train : {y_pred, y_train}")
        
        # 4] 예측값 = 최대 확률
        # print(y_pred)
        prediction = torch.argmax(y_pred, dim=1) # !!!
        # print(prediction, y_train)
        # 5] Accuracy 계산
        check_prediction = prediction == y_train
        # print(f"len(check_prediction), check_prediction.sum().item() : {len(check_prediction), check_prediction.sum().item()}")
        accuracy = check_prediction.sum().item() / len(check_prediction) # !!!
        
        # 6] 모델 역전파
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # 7] 결과 출력
        print('Epoch {:4d}/{} Batch {}/{} Cost:{:.6f} Accruacy:{:.6f}'.format(epoch, nb_epochs, batch_idx+1, len(train_loader), cost.item(), accuracy * 100)) #  hypothesis: {} , pred.squeeze().detach()

Epoch    0/20 Batch 1/50 Cost:1.097637 Accruacy:40.000000
Epoch    0/20 Batch 2/50 Cost:1.107732 Accruacy:25.000000
Epoch    0/20 Batch 3/50 Cost:1.096871 Accruacy:40.000000
Epoch    0/20 Batch 4/50 Cost:1.135464 Accruacy:25.000000
Epoch    0/20 Batch 5/50 Cost:1.102299 Accruacy:40.000000
Epoch    0/20 Batch 6/50 Cost:1.120929 Accruacy:25.000000
Epoch    0/20 Batch 7/50 Cost:1.102057 Accruacy:20.000000
Epoch    0/20 Batch 8/50 Cost:1.103846 Accruacy:35.000000
Epoch    0/20 Batch 9/50 Cost:1.107496 Accruacy:30.000000
Epoch    0/20 Batch 10/50 Cost:1.094276 Accruacy:45.000000
Epoch    0/20 Batch 11/50 Cost:1.082783 Accruacy:35.000000
Epoch    0/20 Batch 12/50 Cost:1.114474 Accruacy:45.000000
Epoch    0/20 Batch 13/50 Cost:1.081138 Accruacy:40.000000
Epoch    0/20 Batch 14/50 Cost:1.100005 Accruacy:45.000000
Epoch    0/20 Batch 15/50 Cost:1.115338 Accruacy:35.000000
Epoch    0/20 Batch 16/50 Cost:1.139461 Accruacy:25.000000
Epoch    0/20 Batch 17/50 Cost:1.110271 Accruacy:35.000000
Epoch 