# 2. 문서 분류 자연어 분류기
 - Input: 문서 --> ["현재 금리상태는 ..."]
 - Output: 시제 --> ["현재", "미래", "과거"]
 - Model: 글자 encoding = TF-IDF, 학습 = Logistic Regression

##### 0) 라이브러리 불러오기

In [172]:
# (1) 기본 라이브러리
import numpy as np
import pandas as pd

# (2) 머신러닝 라이브러리
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

##### 1) 데이터 불러오기

In [173]:
# Load the training CSV and keep only the first 1000 rows
# (presumably to keep this walkthrough fast — confirm before training for real).
df = pd.read_csv("./data/train.csv")[:1000]

In [174]:
df.head(2)

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실


##### 2) Output Encoding

(1) 시제 Unique List 생성

In [175]:
df["시제"].value_counts()

과거    475
현재    421
미래    104
Name: 시제, dtype: int64

In [176]:
df["시제"].value_counts().keys()

Index(['과거', '현재', '미래'], dtype='object')

In [177]:
# Distinct tense labels in descending-frequency order (['과거', '현재', '미래'] for this sample).
# NOTE(review): this order differs from the ids assigned by label_encoder
# (현재=0, 과거=1, 미래=2), so label_list[i] must not be used to decode label_num.
label_list = df["시제"].value_counts().keys().tolist()
num_label = len(label_list)  # number of output classes (3 for this sample)

In [178]:
num_label

3

(2) 시제 Label Encoding

In [179]:
def label_encoder(x):
    """Map a tense label to its integer class id.

    "현재" -> 0, "과거" -> 1, "미래" -> 2. The argument is converted with
    ``str()`` before the lookup. Any other value prints "error" and falls
    back to 0.
    """
    tense_to_id = {"현재": 0, "과거": 1, "미래": 2}
    key = str(x)
    if key not in tense_to_id:
        # Unknown label: report it and fall back to class 0.
        print("error")
        return 0
    return tense_to_id[key]

In [180]:
label_encoder("과거")

1

In [181]:
# Encode every tense label as its integer class id.
df["label_num"] = df["시제"].apply(label_encoder)

In [182]:
df.head(2)

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label,label_num
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실,0
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1


##### 3) Input encoding

(1) Corpus 추출

In [183]:
# Raw sentence column; this is the text that gets TF-IDF encoded below.
corpus = df["문장"]

(2) TF-IDF 추출

In [184]:
# Fit a TF-IDF vocabulary on the corpus and encode every sentence as one row of a sparse matrix.
# NOTE(review): with the default settings, inflected forms such as '힘으로', '힘을', '힘이'
# end up as separate features; a Korean-aware tokenizer may shrink the vocabulary — confirm.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [185]:
vectorizer.get_feature_names_out() # !!!

array(['000kg', '000kg인', '000대', ..., '힘으로', '힘을', '힘이'], dtype=object)

In [186]:
# Vocabulary size; later passed to the model as its input feature dimension.
num_inputs =  len(vectorizer.get_feature_names_out())

In [187]:
vectorizer

TfidfVectorizer()

In [188]:
type(X)

scipy.sparse._csr.csr_matrix

In [189]:
print(X.shape)

(1000, 10005)


In [190]:
X.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [191]:
X.todense()[0].tolist()

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [192]:
X.todense()[0].tolist()[0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3772896039588328,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 

(3) Content Vector 추출

In [193]:
len(X.todense().tolist())

1000

In [194]:
# Store each sentence's TF-IDF vector as a plain Python list in a DataFrame column.
# NOTE(review): this materializes the whole sparse matrix (1000 x 10005 here) as dense Python floats.
df["doc_vec"] = X.todense().tolist()

In [195]:
df["doc_vec"]

0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                             ...                        
995    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
996    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
997    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
998    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
999    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: doc_vec, Length: 1000, dtype: object

In [196]:
# Sanity check: print the length of every document vector, one line per row.
for i in df['doc_vec']:
    print(len(i))

10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
1000

##### 4) DataLoader

(1) CustomDataset

In [197]:
df["label_num"]

0      0
1      1
2      2
3      1
4      0
      ..
995    1
996    0
997    1
998    1
999    0
Name: label_num, Length: 1000, dtype: int64

In [198]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset yielding one document vector and its class id per row.

    Args:
        df: DataFrame with a "doc_vec" column (one list of floats per row)
            and a "label_num" column (one integer class id per row).
    """

    def __init__(self, df):
        self.df = df
        self.x_data = df["doc_vec"]
        self.y_data = df["label_num"]

    def __len__(self):
        """Return the number of rows in the frame given to the constructor."""
        # Read the instance's own frame, not a module-level ``df`` that may
        # be missing or may have been reassigned since construction.
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at zero-based position ``idx``.

        Returns:
            dict with "samples" (float tensor holding the document vector)
            and "targets" (long tensor of shape (1,) holding the class id).
        """
        # Positional lookup, so frames whose index is not 0..n-1 (for
        # example after filtering or a train/test split) still work.
        a_sample = torch.FloatTensor(self.x_data.iloc[idx])
        a_target = torch.LongTensor([int(self.y_data.iloc[idx])])

        return {"samples": a_sample, "targets": a_target}
  
# Wrap the encoded DataFrame so a DataLoader can draw samples from it.
train_dataset = CustomDataset(df)

(2) DataLoader

In [199]:
# Serve the dataset in mini-batches of 20 rows (1000 rows -> 50 batches per epoch).
# NOTE(review): shuffle is left at its default, so every epoch appears to visit the rows
# in the same order — confirm this is intended for training.
train_loader = DataLoader(train_dataset, batch_size=20)

In [200]:
# Pull the first mini-batch to inspect its structure.
batch = next(iter(train_loader))

In [201]:
batch

{'samples': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'targets': tensor([[0],
         [1],
         [2],
         [1],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [2],
         [0],
         [1],
         [1],
         [0],
         [0],
         [1],
         [0]])}

In [202]:
len(batch)

2

##### 5) 모델 정의 (MultivariableLinearRegressionModel)

In [203]:
class MultivariableLinearRegressionModel(nn.Module):
    """Single linear layer mapping a feature vector to one raw score per class."""

    def __init__(self, num_feature, num_class):
        super().__init__()
        # One affine map: num_feature inputs -> num_class outputs.
        self.linear = nn.Linear(in_features=num_feature, out_features=num_class)

    def forward(self, X):
        """Return the layer's raw output for ``X``; no softmax is applied here."""
        return self.linear(X)
  
# One input per TF-IDF feature, one output score per tense class.
model = MultivariableLinearRegressionModel(num_inputs, num_label)

##### 6) Optimizer 정의

In [204]:
# Plain SGD over all model parameters.
# NOTE(review): lr=1 is unusually large for SGD — confirm it was chosen deliberately.
optimizer = torch.optim.SGD(model.parameters(), lr=1)
# No loss object is created here; the training loop calls F.cross_entropy directly.

##### 7) 모델 학습 + 성능 평가 (과제)

In [205]:
# Print only the first mini-batch, then stop iterating.
for idx, batch in enumerate(train_loader):
  print(batch)
  break

{'samples': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'targets': tensor([[0],
        [1],
        [2],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [2],
        [0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [0]])}


In [206]:
# (1) Iterate over epochs.
# NOTE: range(nb_epochs + 1) runs nb_epochs + 1 passes (epoch 0 through nb_epochs inclusive).
nb_epochs = 20
for epoch in range(nb_epochs + 1):
    for batch_idx, batch in enumerate(train_loader):
        # 1] Unpack the batch; drop the trailing size-1 dimension of the targets
        #    so they are a flat vector of class ids for cross_entropy.
        x_train, y_train = batch["samples"], batch["targets"].squeeze(dim=-1)
        # 2] Forward pass: one raw score (logit) per class for every sample.
        y_pred = model(x_train)
        # 3] Cross-entropy cost between the logits and the true class ids.
        cost = F.cross_entropy(y_pred, y_train)

        # 4] Predicted class = index of the largest logit in each row.
        prediction = torch.argmax(y_pred, dim=1)
        # 5] Batch accuracy: fraction of predictions equal to the target.
        #    It is computed on the training batch itself, before this step's weight update.
        check_prediction = prediction == y_train
        accuracy = check_prediction.sum().item() / len(check_prediction)

        # 6] Backward pass: clear old gradients, backpropagate, update the weights.
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # 7] Log progress for every batch; accuracy is printed as a percentage.
        # NOTE(review): "Accruacy" in the format string is a typo for "Accuracy";
        # it is left as-is here because changing it alters the printed output.
        print('Epoch {:4d}/{} Batch {}/{} Cost:{:.6f} Accruacy:{:.6f}'.format(epoch, nb_epochs, batch_idx+1, len(train_loader), cost.item(), accuracy * 100))

Epoch    0/20 Batch 1/50 Cost:1.098382 Accruacy:40.000000
Epoch    0/20 Batch 2/50 Cost:1.072074 Accruacy:30.000000
Epoch    0/20 Batch 3/50 Cost:1.010095 Accruacy:50.000000
Epoch    0/20 Batch 4/50 Cost:0.949347 Accruacy:60.000000
Epoch    0/20 Batch 5/50 Cost:0.912079 Accruacy:55.000000
Epoch    0/20 Batch 6/50 Cost:0.934386 Accruacy:45.000000
Epoch    0/20 Batch 7/50 Cost:0.883005 Accruacy:55.000000
Epoch    0/20 Batch 8/50 Cost:1.054116 Accruacy:30.000000
Epoch    0/20 Batch 9/50 Cost:1.140160 Accruacy:40.000000
Epoch    0/20 Batch 10/50 Cost:0.987625 Accruacy:20.000000
Epoch    0/20 Batch 11/50 Cost:0.975522 Accruacy:45.000000
Epoch    0/20 Batch 12/50 Cost:0.963012 Accruacy:65.000000
Epoch    0/20 Batch 13/50 Cost:0.857783 Accruacy:60.000000
Epoch    0/20 Batch 14/50 Cost:0.997543 Accruacy:35.000000
Epoch    0/20 Batch 15/50 Cost:0.954964 Accruacy:35.000000
Epoch    0/20 Batch 16/50 Cost:0.927525 Accruacy:60.000000
Epoch    0/20 Batch 17/50 Cost:0.923720 Accruacy:55.000000
Epoch 