# 문서 분류를 하는 자연어 분류기를 만들어 보자
 - Input: 문서 --> ["현재 금리상태는 ..."]
 - Output: 시제 --> ["현재", "미래", "과거"]
 - Model: 텍스트 encoding = TF-IDF, 학습 = Logistic Regression

## 1. 패키지 불러오기

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer

## 2. Data preprocessing

In [3]:
# Read the training CSV and keep only its first 1000 rows as the working set.
df = pd.read_csv("./data/train.csv").iloc[:1000]

### 2-1. label indexing

In [4]:
# Distinct values of the tense column as a plain list, and how many there are.
label_list = list(df["시제"].value_counts().index)
num_label = len(label_list)

In [5]:
num_label

3

In [6]:
def label_encoder(x):
  """Map a tense label to its class index.

  "현재" -> 0, "과거" -> 1, "미래" -> 2. Any other value prints "error"
  and falls back to index 0.
  """
  tense_to_index = {"현재": 0, "과거": 1, "미래": 2}
  key = str(x)
  if key not in tense_to_index:
    print("error")
    return 0
  return tense_to_index[key]

In [7]:
# Encode every tense string as its integer class index.
df["label_num"] = df["시제"].apply(label_encoder)

In [8]:
df.head(2)

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label,label_num
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실,0
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1


### 2-2. Input encoding

In [9]:
# Sentences that will be encoded as TF-IDF vectors.
corpus = df["문장"]

# Learn the vocabulary from the corpus and encode every sentence in one pass.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# Last expression of the cell: displays the learned vocabulary terms.
vectorizer.get_feature_names_out()

array(['000kg', '000kg인', '000대', ..., '힘으로', '힘을', '힘이'], dtype=object)

In [10]:
# Input dimension of the model = number of terms in the fitted vocabulary.
num_inputs = vectorizer.get_feature_names_out().shape[0]


In [11]:
len(X.todense().tolist())

1000

In [12]:
# Store each document's dense TF-IDF vector as a plain list in its row.
df["doc_vec"] = X.toarray().tolist()

### 2-3. Dataset 구축

In [13]:
class MyDocClassData(torch.utils.data.Dataset):
  """Dataset that serves pre-computed document vectors with their label indices.

  Args:
    df: DataFrame with a "doc_vec" column (one list of floats per row) and a
      "label_num" column (one integer class index per row).
  """

  def __init__(self, df):
    self.df = df
    self.x_data = df["doc_vec"]
    self.y_data = df["label_num"]

  def __len__(self):
    # Use the frame this dataset was built from, not whatever global `df`
    # happens to exist, so split/filtered datasets report their own size.
    return len(self.df)

  def __getitem__(self, idx):
    """Return {"samples": float vector, "targets": 1-element long tensor} for row `idx`."""
    # Positional lookup: loader indices run 0..len-1 and need not match the
    # DataFrame's index labels (e.g. after filtering or a train/val split).
    a_sample = torch.FloatTensor(self.x_data.iloc[idx])
    a_target = torch.LongTensor([int(self.y_data.iloc[idx])])

    return {"samples": a_sample, "targets": a_target}


In [14]:
from torch.utils.data import DataLoader

# Wrap the whole DataFrame as a dataset and iterate over it two rows per batch.
train_dataset = MyDocClassData(df=df)
train_loader = DataLoader(dataset=train_dataset, batch_size=2)

In [15]:
# Pull the first batch from the loader to inspect what one batch looks like.
batch = next(iter(train_loader))

In [16]:
len(batch)

2

## 3. Build models
 - 모델 클래스를 define한다

In [20]:
class MyLinear(nn.Module):
  """Single fully connected layer mapping feature vectors to per-class logits."""

  def __init__(self, num_feature, num_class):
    super().__init__()
    self.linear = nn.Linear(in_features=num_feature, out_features=num_class)

  def forward(self, X):
    """Return the raw (un-normalised) class scores for the batch `X`."""
    return self.linear(X)



## 4. loss function, optimizer, learning_rate, Model, dataloader를 선언

In [21]:
# Linear classifier over the TF-IDF features, optimised with plain SGD.
# The loss is computed inline in the training loop with
# torch.nn.functional.cross_entropy, so no loss object is created here.
model = MyLinear(num_feature=num_inputs, num_class=num_label)
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

## 5. 학습 진행

In [26]:
num_epoch = 100
for epoch in range(num_epoch):
  # Per-batch loss values collected over this epoch.
  total_loss = []

  for batch in train_loader:
    features = batch["samples"]
    # Targets arrive as (batch, 1); drop only that trailing axis. A bare
    # squeeze() would also collapse the batch axis for a 1-sample batch
    # and make cross_entropy fail on the shape mismatch.
    target = batch["targets"].squeeze(1)

    predicted = model(features)
    loss = torch.nn.functional.cross_entropy(predicted, target)

    # Standard update: clear stale gradients, backpropagate, take one step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss.append(loss.item())

  # Reported value is the sum (not the mean) of the batch losses in the epoch.
  print("Loss: "+ str(sum(total_loss)))

Loss: 443.9899854660034
Loss: 427.9508224129677
Loss: 412.75651636719704
Loss: 398.3495507836342
Loss: 384.6768404841423
Loss: 371.689561188221
Loss: 359.34290531277657
Loss: 347.59581384062767
Loss: 336.41062438488007
Loss: 325.7527909874916
Loss: 315.5905500650406
Loss: 305.89465752243996
Loss: 296.63812574744225
Loss: 287.7959841787815
Loss: 279.34509086608887
Loss: 271.26393005251884
Loss: 263.5324697494507
Loss: 256.1320215910673
Loss: 249.0450955927372
Loss: 242.25533290207386
Loss: 235.74738305807114
Loss: 229.50683403015137
Loss: 223.52013905346394
Loss: 217.77454361319542
Loss: 212.2580415904522
Loss: 206.95931558310986
Loss: 201.8676919043064
Loss: 196.9730887711048
Loss: 192.26599888503551
Loss: 187.73743200302124
Loss: 183.37889394164085
Loss: 179.18235357105732
Loss: 175.1402133256197
Loss: 171.2452842593193
Loss: 167.49076510965824
Loss: 163.87021382898092
Loss: 160.37752982228994
Loss: 157.00692999362946
Loss: 153.7529430091381
Loss: 150.61037032306194
Loss: 147.57429270

## 6. 학습 성능 평가 (과제)