# 1. 문서 분류하는 자연어 분류기
 - Input: 문서 --> ["현재 금리상태는 ...]
 - Output: 시제 --> [""현재, "미래, "과거"']
 - model: 글자 encoding = TF-idf, 학습- Logistic Regression

##### 0) 라이브러리 불러오기

In [17]:
# (1) 기본 라이브러리
import numpy as np
import pandas as pd

# (2) 머신러닝 라이브러리
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

##### 1) 데이터 불러오기

In [18]:
df = pd.read_csv("./data/train.csv")[:1000]

In [19]:
df.head(2)

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실


##### 2) Output Encoding

(1) 시제 Unique List 생성

In [20]:
df["시제"].value_counts()

과거    475
현재    421
미래    104
Name: 시제, dtype: int64

In [21]:
df["시제"].value_counts().keys()

Index(['과거', '현재', '미래'], dtype='object')

In [22]:
label_list = df["시제"].value_counts().keys().tolist() # !!!
num_label = len(label_list)

In [23]:
num_label

3

(2) 시제 Label Encoding

In [24]:
def label_encoder(x):
  result = 0
  if str(x) == "현재":
    result = 0
  elif str(x) == "과거":
    result = 1
  elif str(x) =="미래":
    result = 2
  else:
    print("error")
  return result

In [25]:
label_encoder("과거")

1

In [26]:
df["label_num"] = df["시제"].apply(lambda x: label_encoder(x))

In [27]:
df.head(2)

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label,label_num
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실,0
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1


##### 3) Input encoding

(1) Corpus 추출

In [28]:
corpus = df["문장"]

(2) TF-IDF 추출

In [29]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [30]:
vectorizer.get_feature_names() # !!!

['000kg',
 '000kg인',
 '000대',
 '000피트',
 '01',
 '075',
 '10',
 '100',
 '1000개',
 '1000달러가',
 '1000억원',
 '100g에는',
 '100여',
 '100은',
 '100일',
 '100주년',
 '101일',
 '1020회',
 '10개월을',
 '10년',
 '10년도',
 '10만1500원을',
 '10만km',
 '10만개',
 '10명',
 '10배',
 '10번째',
 '10분',
 '10시',
 '10시를',
 '10월',
 '10위권',
 '10위는',
 '10일',
 '10점을',
 '10조원이',
 '10주년을',
 '11',
 '112',
 '11개',
 '11개월',
 '11만1265',
 '11시께',
 '11시부터',
 '11억1600만원에서',
 '11월',
 '11일',
 '11회의',
 '12',
 '123',
 '124조9000억원으로',
 '1292',
 '1296개를',
 '1296억원과',
 '12년',
 '12만1000명으로',
 '12월',
 '12월말까지',
 '12월중',
 '12일',
 '12편',
 '13',
 '13억7000만달러를',
 '13위',
 '13일',
 '13일까지',
 '13일에는',
 '14',
 '141',
 '141개',
 '1467',
 '147명의',
 '147억원을',
 '1481곳에서',
 '1482',
 '1488점을',
 '14개월',
 '14년',
 '14시간가량의',
 '14억3000만원',
 '14일',
 '14편',
 '15',
 '150',
 '1500',
 '1509',
 '150명으로',
 '1510은',
 '1519는',
 '153번',
 '1540만달러로',
 '1553',
 '155번',
 '1570',
 '1572',
 '1573',
 '1574',
 '1599',
 '15k',
 '15개가',
 '15개월',
 '15년간',
 '15년만에',
 '15만원만',
 '15일',
 '16',

In [31]:
num_inputs =  len(vectorizer.get_feature_names())

In [32]:
vectorizer

TfidfVectorizer()

In [33]:
type(X)

scipy.sparse.csr.csr_matrix

In [34]:
print(X.shape)

(1000, 10005)


In [35]:
X.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
X.todense()[0].tolist()

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [37]:
X.todense()[0].tolist()[0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3772896039588328,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 

(3) Content Vector 추출

In [38]:
len(X.todense().tolist())

1000

In [39]:
df["doc_vec"] = X.todense().tolist()

In [40]:
df["doc_vec"]

0      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                             ...                        
995    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
996    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
997    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
998    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
999    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: doc_vec, Length: 1000, dtype: object

In [41]:
for i in df['doc_vec']:
    print(len(i))

10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
10005
1000

##### 4) DataLoader

(0) Train / val Split

In [None]:
train_ratio = 0.8

In [53]:
df

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label,label_num,doc_vec
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,TRAIN_00002,정부가 고유가 대응을 위해 7월부터 연말까지 유류세 인하 폭을 30%에서 37%까지...,사실형,긍정,미래,확실,사실형-긍정-미래-확실,2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,TRAIN_00003,"서울시는 올해 3월 즉시 견인 유예시간 60분을 제공하겠다고 밝혔지만, 하루 만에 ...",사실형,긍정,과거,확실,사실형-긍정-과거-확실,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,TRAIN_00004,익사한 자는 사다리에 태워 거꾸로 놓고 소금으로 코를 막아 가득 채운다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...
995,TRAIN_00995,＇TIGER 골드선물＇과 ＇KINDEX골드선물 레버리지＇도 매수세가 유입되며 순자산...,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
996,TRAIN_00996,"KF-X 개발 및 양산 주 사업자인 한국항공우주산업㈜(이하, KAI)은 지난 4월부...",사실형,긍정,현재,확실,사실형-긍정-현재-확실,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
997,TRAIN_00997,이날 ＇기생충＇의 작품상 수상으로 무대에 오른 조여정은 행복한 표정을 보였다.,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
998,TRAIN_00998,김 지사장은 ＂한국에서도 유아이패스의 비전인 ＇1인 1로봇＇을 시범 도입하는 한 해...,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


(1) CustomDataset

In [42]:
df["label_num"]

0      0
1      1
2      2
3      1
4      0
      ..
995    1
996    0
997    1
998    1
999    0
Name: label_num, Length: 1000, dtype: int64

In [44]:
class CustomDataset(torch.utils.data.Dataset):
  def __init__(self, df):
    self.df = df
    self.x_data = df["doc_vec"]
    self.y_data = df["label_num"]

  def __len__(self):
    return len(df)

  def __getitem__(self, idx):

    a_sample = torch.FloatTensor(self.x_data[idx])
    a_target = torch.LongTensor([self.y_data[idx]])

    result = {"samples": a_sample, "targets": a_target }
    
    return result
  
train_dataset = CustomDataset(df)

(2) DataLoader

In [45]:
train_loader = DataLoader(train_dataset, batch_size=20)

In [46]:
batch = next(iter(train_loader))

In [47]:
batch

{'samples': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'targets': tensor([[0],
         [1],
         [2],
         [1],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [2],
         [0],
         [1],
         [1],
         [0],
         [0],
         [1],
         [0]])}

In [48]:
len(batch)

2

##### 5) 모델 정의 (MultivariableLinearRegressionModel)

In [49]:
class MultivariableLinearRegressionModel(nn.Module):
  def __init__(self, num_feature, num_class):
    super().__init__()
    self.linear = nn.Linear(num_feature, num_class)

  def forward(self, X):
    logit = self.linear(X)
    return logit
  
model = MultivariableLinearRegressionModel(num_inputs, num_label)

##### 6) Optimizer 정의

In [50]:
optimizer = torch.optim.SGD(model.parameters(), lr=1)
# loss_fn = torch.nn.functional.cross_entropy()

##### 7) 모델 학습 + 성능 평가 (과제)

In [51]:
for idx, batch in enumerate(train_loader):
  print(batch)
  break

{'samples': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'targets': tensor([[0],
        [1],
        [2],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [2],
        [0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [0]])}


In [52]:
# (1) Epoch 순회
nb_epochs = 20
for epoch in range(nb_epochs + 1):
    for batch_idx, batch in enumerate(train_loader):
        # 1] train 데이터 불러오기
        # print(f"batch_idx, batch : {batch_idx, batch}")
        x_train, y_train = batch["samples"], batch["targets"].squeeze(dim=-1)
        # print(f"x_train, y_train : {x_train, y_train}")
        # 2] 모델 예측
        y_pred = model(x_train)
        # 3] Cost 계산
        cost = F.cross_entropy(y_pred, y_train)
        # print(f"y_pred, y_train : {y_pred, y_train}")
        
        # 4] 예측값 = 최대 확률
        # print(y_pred)
        prediction = torch.argmax(y_pred, dim=1) # !!!
        # print(prediction, y_train)
        # 5] Accuracy 계산
        check_prediction = prediction == y_train
        # print(f"len(check_prediction), check_prediction.sum().item() : {len(check_prediction), check_prediction.sum().item()}")
        accuracy = check_prediction.sum().item() / len(check_prediction) # !!!
        
        # 6] 모델 역전파
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        # 7] 결과 출력
        print('Epoch {:4d}/{} Batch {}/{} Cost:{:.6f} Accruacy:{:.6f}'.format(epoch, nb_epochs, batch_idx+1, len(train_loader), cost.item(), accuracy * 100)) #  hypothesis: {} , pred.squeeze().detach()

Epoch    0/20 Batch 1/50 Cost:1.095841 Accruacy:55.000000
Epoch    0/20 Batch 2/50 Cost:1.076380 Accruacy:30.000000
Epoch    0/20 Batch 3/50 Cost:1.014155 Accruacy:50.000000
Epoch    0/20 Batch 4/50 Cost:0.947580 Accruacy:60.000000
Epoch    0/20 Batch 5/50 Cost:0.913540 Accruacy:55.000000
Epoch    0/20 Batch 6/50 Cost:0.932990 Accruacy:45.000000
Epoch    0/20 Batch 7/50 Cost:0.880742 Accruacy:55.000000
Epoch    0/20 Batch 8/50 Cost:1.051711 Accruacy:30.000000
Epoch    0/20 Batch 9/50 Cost:1.139846 Accruacy:40.000000
Epoch    0/20 Batch 10/50 Cost:0.985576 Accruacy:20.000000
Epoch    0/20 Batch 11/50 Cost:0.974567 Accruacy:45.000000
Epoch    0/20 Batch 12/50 Cost:0.962228 Accruacy:65.000000
Epoch    0/20 Batch 13/50 Cost:0.856779 Accruacy:60.000000
Epoch    0/20 Batch 14/50 Cost:0.995689 Accruacy:35.000000
Epoch    0/20 Batch 15/50 Cost:0.956692 Accruacy:35.000000
Epoch    0/20 Batch 16/50 Cost:0.930165 Accruacy:60.000000
Epoch    0/20 Batch 17/50 Cost:0.922125 Accruacy:55.000000
Epoch 