# 문서 분류를 하는 자연어 분류기를 만들어 보자
 - Input: 문서 --> ["현재 금리상태는 ...]
 - Output: 시제 --> [""현재, "미래, "과거"']
 - model: 글자 encoding = TF-idf, 학습- Logistic Regression

## 1. 패키지 설치

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer

## 2. Data preprocessing

In [31]:
df = pd.read_csv("train.csv")[:1000]

### 2-1. label indexing

In [32]:
label_list = df["시제"].value_counts().keys().tolist()
num_label = len(label_list)

In [33]:
num_label

3

In [34]:
def label_encoder(x):
  result = 0
  if str(x) == "현재":
    result = 0
  elif str(x) == "과거":
    result = 1
  elif str(x) =="미래":
    result = 2
  else:
    print("error")
  return result

In [35]:
df["label_num"] = df["시제"].apply(lambda x: label_encoder(x))

In [36]:
df.head(2)

Unnamed: 0,ID,문장,유형,극성,시제,확실성,label,label_num
0,TRAIN_00000,0.75%포인트 금리 인상은 1994년 이후 28년 만에 처음이다.,사실형,긍정,현재,확실,사실형-긍정-현재-확실,0
1,TRAIN_00001,이어 ＂앞으로 전문가들과 함께 4주 단위로 상황을 재평가할 예정＂이라며 ＂그 이전이...,사실형,긍정,과거,확실,사실형-긍정-과거-확실,1


### 2-2. Input encoding

In [37]:
corpus = df["문장"]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

array(['000kg', '000kg인', '000대', ..., '힘으로', '힘을', '힘이'], dtype=object)

In [39]:
num_inputs =  len(vectorizer.get_feature_names_out())


In [46]:
len(X.todense().tolist())

1000

In [47]:
df["doc_vec"] = X.todense().tolist()

### 2-3. Dataset 구축

In [59]:
class MyDocClassData(torch.utils.data.Dataset):
  def __init__(self, df):
    self.df = df
    self.x_data = df["doc_vec"]
    self.y_data = df["label_num"]

  def __len__(self):
    return len(df)

  def __getitem__(self, idx):

    a_sample = self.x_data[idx]
    a_target = self.y_data[idx]
    print(a_sample)
    print(a_target)

    result = {"samples": a_sample, "targets": a_target }
    return result


In [63]:
from torch.utils.data import DataLoader
train_dataset = MyDocClassData(df)
train_loader = DataLoader(train_dataset, batch_size=2)

In [64]:
batch = next(iter(train_loader))

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3772896039588328, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [65]:
len(batch)

2