# 텍스트 분류 - 뉴스
---
- scikit-learn의 dataset인 20 Newsgroups(20개 주제의 뉴스그룹) 데이터 분류
 

## [1] 데이터 준비
---

In [123]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [124]:
# Load the training split of 20 Newsgroups with headers and footers stripped.
news_data = fetch_20newsgroups(subset="train", remove=("headers", "footers"))

# Show which fields the returned bunch exposes.
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [125]:
# Report how many documents were loaded.
print(f'data = {len(news_data["data"])}')

data = 11314


In [126]:
# Keep the raw document texts under a shorter name.
news_data20 = news_data['data']

In [127]:
# Inspect the first document.
print(news_data20[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [128]:
# Target data: the class label of every document.
target = news_data["target"]

# Print the number of classes, each class name on its own line, and finally
# the label array itself.
class_names = news_data["target_names"]
print(f"target => {len(class_names)}개")
for class_name in class_names:
    print(class_name)
print(f"target -> {target}")

target => 20개
alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc
target -> [7 4 4 ... 3 1 8]


In [129]:
# Features are the raw texts; labels are the class ids.
X, y = news_data20, target

In [130]:
# List the distinct label values present in y.
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

## [2] 데이터 전처리
---
- 수집 데이터 기반 단어사전 생성
- 텍스트 데이터 => 수치 데이터 변환
- 데이터 길이 설정
- 2진 정수형 변환

### [2-1] 토큰화와 단어사전 생성

In [131]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [132]:
# spaCy-backed tokenizer using the small English pipeline.
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

def yield_tokens(data: list):
    """Lazily yield the tokenizer's output for every document in ``data``."""
    yield from (tokenizer(document) for document in data)

# Build the vocabulary from every tokenized document, reserving "<unk>" as a
# special token.
vocab = build_vocab_from_iterator(iterator=yield_tokens(X), specials=["<unk>"])
# Tokens missing from the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

In [133]:
def text_pipeline(x):
    """Tokenize the raw text ``x`` and map its tokens through ``vocab``."""
    return vocab(tokenizer(x))

### [2-2] 데이터 배치와 반복자 생성

In [134]:
from torch.utils.data import Dataset, DataLoader, random_split
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [135]:
class CustomDataset(Dataset):
    """Map-style dataset pairing raw texts with their class labels.

    Args:
        X: Indexable collection of documents.
        y: Indexable collection of labels aligned with ``X``. Only read when
            ``train`` is True.
        train: When True, ``__getitem__`` returns the label of each sample;
            otherwise the label slot is ``None``.
    """

    def __init__(self, X, y, train=True):
        self.train = train
        self.X = X
        self.y = y
        # Class names are taken from the module-level ``news_data`` bunch.
        self.classes = news_data["target_names"]

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(label, text)`` for sample ``idx``.

        The label is ``None`` when the dataset was built with ``train=False``.
        """
        text = self.X[idx]
        label = self.y[idx] if self.train is True else None
        return label, text

    def split_dataset(self, val_ratio=0.2):
        """Randomly split the dataset into training and validation subsets.

        Args:
            val_ratio: Fraction of the samples assigned to the validation
                subset; must lie in ``[0, 1]``.

        Returns:
            Tuple ``(train_set, val_set)`` produced by ``random_split``.

        Raises:
            ValueError: If ``val_ratio`` is outside ``[0, 1]``.
        """
        if not 0 <= val_ratio <= 1:
            raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio}")

        data_size = len(self)
        # Honour the caller's ratio instead of a hard-coded 0.2.
        val_set_size = int(data_size * val_ratio)
        train_set_size = data_size - val_set_size

        train_set, val_set = random_split(self, [train_set_size, val_set_size])
        return train_set, val_set

In [136]:
# Wrap texts and labels, then split using split_dataset()'s default ratio.
dataset = CustomDataset(X, y, train=True)
train_dataset, val_dataset = dataset.split_dataset()

# Peek at the first training sample, a (label, text) pair.
next(iter(train_dataset))

(0,
 'In article 28833@monu6.cc.monash.edu.au,  darice@yoyo.cc.monash.edu.au (Fred Rice) writes:\n#In <1993Apr14.143121.26376@bmw.mayo.edu> vdp@mayo.edu (Vinayak Dutt) writes:\n#>So instead of calling it interest on deposits, you call it *returns on investements*\n#>and instead of calling loans you call it *investing in business* (that is in other words\n#>floating stocks in your company). \n#\n#No, interest is different from a return on an investment.  For one\n#thing, a return on an investment has greater risk, and not a set return\n#(i.e. the amount of money you make can go up or down, or you might even\n#lose money).  The difference is, the risk of loss is shared by the\n#investor, rather than practically all the risk being taken by the\n#borrower when the borrower borrows from the bank.\n#\n\nBut is it different from stocks ?  If you wish to call an investor in stocks as\na banker, well then its your choice .....\n\n#>Relabeling does not make it interest free !!\n#\n#It is not jus

In [137]:
def collate_batch(batch):
    """Collate ``(label, text)`` samples into flat, offset-indexed tensors.

    Args:
        batch: Iterable of ``(label, text)`` pairs.

    Returns:
        Tuple ``(labels, tokens, offsets)``, each moved to ``device``:
        ``labels`` is an int64 tensor with one entry per sample, ``tokens``
        is the concatenation of every sample's token ids, and ``offsets``
        holds the start position of each sample inside ``tokens``.
    """
    label_list, text_list, offsets = [], [], [0]

    for _label, _text in batch:
        label_list.append(_label)
        # Force int64: an empty document would otherwise produce a float32
        # tensor, and concatenating it with the others would no longer give
        # an integer index tensor.
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    # Drop the final length and accumulate the rest so that each entry is the
    # start index of its sample in the concatenated token tensor.
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)


# Reshuffle the training samples every epoch; keep the validation order fixed
# so evaluation is repeatable.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

In [138]:
# Fetch one collated batch: (labels, token ids, offsets).
next(iter(train_dataloader))

(tensor([ 0,  9,  8, 13,  9,  4, 12,  7,  9,  7,  4, 14,  8, 19, 17,  5, 17,  6,
          0, 11, 13, 14, 13,  6, 15, 14, 14,  1,  5, 14,  8,  1],
        device='cuda:0'),
 tensor([   52,    70, 72251,  ...,    26,   112,     3], device='cuda:0'),
 tensor([    0,   336,   506,   642,   913,   973,  1065,  1267,  1372,  1401,
          1461,  1640,  1671,  1863,  2790,  3243, 12606, 13194, 13331, 13559,
         13761, 13841, 14088, 15223, 15610, 15703, 15745, 15752, 15871, 16184,
         16243, 16466], device='cuda:0'))

## [3] 모델 생성
---

In [139]:
import torch.nn as nn

In [143]:
class TextClassifier(nn.Module):
    """Text classifier: pooled embeddings -> RNN -> linear classification head.

    Each document is reduced to a single pooled embedding by
    ``nn.EmbeddingBag``, passed to the RNN as a sequence of length 1, and
    projected to class logits.

    Args:
        vocab_size: Number of entries in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_classes):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_classes = num_classes

        self.embedding = nn.EmbeddingBag(self.vocab_size, self.embed_dim, sparse=True)
        self.rnn = nn.RNN(self.embed_dim, self.hidden_dim, self.num_layers, batch_first=True)
        self.linear = nn.Linear(self.hidden_dim, self.num_classes)

    def forward(self, text, offsets):
        """Compute class logits for a batch of documents.

        Args:
            text: 1-D tensor holding the token ids of all documents,
                concatenated.
            offsets: 1-D tensor with the start index of each document in
                ``text``.

        Returns:
            Tensor of shape ``(len(offsets), num_classes)``.
        """
        # One pooled vector per document: (batch, embed_dim).
        pooled = self.embedding(text, offsets)
        # Add a length-1 sequence axis for the batch_first RNN. The batch
        # size comes from the tensor itself rather than a global, so the
        # final, smaller batch of an epoch works too.
        sequence = pooled.unsqueeze(1)
        # Without an explicit initial state nn.RNN starts from zeros, so no
        # global device is needed to allocate one.
        rnn_out, _ = self.rnn(sequence)
        return self.linear(rnn_out[:, -1])

## [4] 모델 학습
---

In [144]:
# Model hyperparameters.
vocab_size = len(vocab)
embed_dim = 64
hidden_dim = 32
num_layers = 1
num_classes = 20

# Move the model to the device that collate_batch sends every batch to;
# otherwise the forward pass fails with a device mismatch on CUDA machines.
model = TextClassifier(vocab_size, embed_dim, hidden_dim, num_layers, num_classes).to(device)

In [147]:
# Optimisation settings.
learning_rate = 0.01
epochs = 20

# Cross-entropy loss on the class logits, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

# Halve the learning rate once every 20 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

In [None]:
def train(dataloader, epoch):
    model.train()
    train_acc = 0
    train_count = 0
    log_interval = 2000
    for idx, (labels, texts, offsets) in enumerate(dataloader):
        optimizer.zero_grad()

        outs = model(texts, offsets)
        predicts = torch.argmax(outs, dim=1)
        loss = criterion(outs, labels)
        loss.backward()
        optimizer.step()
        
        train_acc += (predicts == labels).sum().item()
        train_count += labels.size

        if idx % log_interval == 0 and idx > 0:
