# 데이터 살펴보기

In [1]:
import os
import string

import pandas as pd

In [2]:
# Load the April 2017 article metadata and show which columns it provides.
articles_csv = "../input/New York Times Comments/ArticlesApril2017.csv"
df = pd.read_csv(articles_csv)
df.columns

Index(['abstract', 'articleID', 'articleWordCount', 'byline', 'documentType',
       'headline', 'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')

사람이 직접 작성한 기사 제목이 들어 있는 headline 열만 사용한다.

# 학습용 데이터 만들기

In [3]:
import glob

import numpy as np
from torch.utils.data.dataset import Dataset

In [5]:
class TextGeneration(Dataset):
    """Next-word prediction dataset built from article headlines.

    Every headline is lower-cased, stripped of ASCII punctuation and split on
    whitespace.  Each distinct word receives an integer id in order of first
    appearance, and every run of three consecutive words yields one sample:
    the ids of the first two words as input and the id of the third word as
    the label.
    """

    # Translation table that deletes every character in string.punctuation.
    _PUNCT_TABLE = str.maketrans("", "", string.punctuation)

    def clean_text(self, txt):
        """Return ``txt`` lower-cased with all ASCII punctuation removed."""
        return txt.translate(self._PUNCT_TABLE).lower()

    def __init__(self, data_dir="../input/New York Times Comments"):
        """Load headlines from one article CSV and build the training samples.

        Args:
            data_dir: Directory searched for ``*.csv`` files.  Only files whose
                path contains ``"Articles"`` are considered, and only the
                first of them (in sorted order) is loaded.  If none matches,
                the dataset is empty.
        """
        all_headlines = []

        # glob.glob returns paths in arbitrary order.  Sorting makes the file
        # picked below -- and therefore the word ids -- identical on every run.
        for filename in sorted(glob.glob(f"{data_dir}/*.csv")):
            if "Articles" in filename:
                article_df = pd.read_csv(filename)
                all_headlines.extend(article_df.headline.values)
                # Only a single article file is used.
                break

        # Drop the "Unknown" placeholder and any non-string entry (e.g. NaN
        # for an empty CSV cell), which clean_text could not process.
        all_headlines = [
            h for h in all_headlines if isinstance(h, str) and h != "Unknown"
        ]

        # Cleaned headlines, one string per headline.
        self.corpus = [self.clean_text(x) for x in all_headlines]

        # Word -> integer id, assigned in order of first appearance.
        self.BOW = {}
        for line in self.corpus:
            for word in line.split():
                # setdefault leaves the id of an already-known word untouched.
                self.BOW.setdefault(word, len(self.BOW))

        # List of ([id, id], id) samples.
        self.data = self.generate_sequence(self.corpus)

    def generate_sequence(self, txt):
        """Turn cleaned headlines into ``([id_i, id_i+1], id_i+2)`` samples.

        Args:
            txt: Iterable of cleaned headline strings whose words are all
                keys of ``self.BOW``.

        Returns:
            A list with one ``([first_id, second_id], target_id)`` tuple per
            window of three consecutive words.  Headlines with fewer than
            three words contribute nothing.
        """
        seq = []
        for line in txt:
            ids = [self.BOW[word] for word in line.split()]

            # Two consecutive words are the input, the next word is the label.
            seq.extend(
                ([first, second], target)
                for first, second, target in zip(ids, ids[1:], ids[2:])
            )

        return seq

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return sample ``i`` as ``(input_ids, label)`` NumPy arrays.

        The label is returned as a float32 scalar array; callers that need a
        class index must cast it to an integer type.
        """
        inputs, target = self.data[i]
        data = np.array(inputs)
        label = np.array(target).astype(np.float32)

        return data, label

# LSTM 모델 정의하기

In [6]:
import torch.nn as nn

In [7]:
class LSTM(nn.Module):
    """Embedding -> stacked LSTM -> two linear layers scoring the next word.

    The input is a ``(batch, seq_len)`` tensor of integer word ids and the
    output is a ``(batch, num_embeddings)`` tensor of unnormalised scores,
    one per vocabulary word.
    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim=16,
        hidden_size=64,
        num_layers=5,
        seq_len=2,
    ):
        """Build the layers.

        Args:
            num_embeddings: Vocabulary size; also the size of the output.
            embedding_dim: Size of each word embedding.
            hidden_size: Hidden size of every LSTM layer.
            num_layers: Number of stacked LSTM layers.
            seq_len: Number of input words per sample.  ``forward`` flattens
                the LSTM output over time, so this fixes ``fc1``'s input size.
        """
        super().__init__()

        self.embed = nn.Embedding(
            num_embeddings=num_embeddings, embedding_dim=embedding_dim
        )

        # With batch_first=True the LSTM output has shape
        # (batch_size, sequence_length, hidden_size).
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        # The flattened LSTM output has seq_len * hidden_size features
        # (2 * 64 = 128 with the defaults).
        self.fc1 = nn.Linear(seq_len * hidden_size, num_embeddings)
        self.fc2 = nn.Linear(num_embeddings, num_embeddings)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return next-word scores for a ``(batch, seq_len)`` tensor of ids."""
        x = self.embed(x)

        x, _ = self.lstm(x)
        # Flatten (batch, seq_len, hidden) -> (batch, seq_len * hidden).
        # Tensor.reshape avoids relying on a module-level ``torch`` name.
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)

        return x

# 학습하기

In [10]:
import torch
import tqdm
from torch.optim.adam import Adam
from torch.utils.data.dataloader import DataLoader

In [11]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
device

'cuda'

In [12]:
# Build the dataset first: its vocabulary size determines the model's
# embedding table and output layer.
dataset = TextGeneration()
vocab_size = len(dataset.BOW)

model = LSTM(num_embeddings=vocab_size)
model = model.to(device)

loader = DataLoader(dataset=dataset, batch_size=64)
optim = Adam(params=model.parameters(), lr=0.001)

In [14]:
# The loss module is stateless, so build it once instead of on every batch.
criterion = nn.CrossEntropyLoss()

for epoch in range(200):
    iterator = tqdm.tqdm(loader)
    for data, label in iterator:
        optim.zero_grad()

        # The DataLoader already yields tensors.  Tensor.to() converts the
        # dtype and moves them to the device without the copy-construct
        # UserWarning that torch.tensor(<tensor>) emits.
        inputs = data.to(device=device, dtype=torch.long)
        # CrossEntropyLoss expects integer class indices as targets.
        targets = label.to(device=device, dtype=torch.long)

        pred = model(inputs)

        loss = criterion(pred, targets)
        loss.backward()
        optim.step()

        iterator.set_description(f"epoch {epoch} loss: {loss.item()}")

  pred = model(torch.tensor(data, dtype=torch.long).to(device))
  pred, torch.tensor(label, dtype=torch.long).to(device)
epoch 0 loss: 7.37069845199585: 100%|██████████████████████████████████████████████████| 63/63 [00:00<00:00, 75.57it/s]
epoch 1 loss: 7.011480808258057: 100%|████████████████████████████████████████████████| 63/63 [00:00<00:00, 138.30it/s]
epoch 2 loss: 6.760417938232422: 100%|████████████████████████████████████████████████| 63/63 [00:00<00:00, 134.30it/s]
epoch 3 loss: 6.566870212554932: 100%|████████████████████████████████████████████████| 63/63 [00:00<00:00, 136.29it/s]
epoch 4 loss: 6.428116798400879: 100%|████████████████████████████████████████████████| 63/63 [00:00<00:00, 134.19it/s]
epoch 5 loss: 6.278395652770996: 100%|████████████████████████████████████████████████| 63/63 [00:00<00:00, 138.47it/s]
epoch 6 loss: 6.092379570007324: 100%|████████████████████████████████████████████████| 63/63 [00:00<00:00, 136.88it/s]
epoch 7 loss: 5.989106178283691: 100%|█

In [15]:
# Persist the trained weights to disk.
weights = model.state_dict()
torch.save(weights, "lstm.pth")

# 모델 성능 평가하기

In [16]:
def generate(model, BOW, string="finding an ", strlen=10):
    """Extend a prompt word by word with the model, print it and return it.

    At every step the ids of the last two words are fed to ``model`` and the
    highest-scoring index is taken as the next word.

    Args:
        model: Callable that maps a ``(1, 2)`` long tensor of word ids to a
            tensor of scores, one per vocabulary entry.
        BOW: Dict mapping each word to its integer id.  The predicted index is
            turned back into a word by position in the dict's key order.
        string: Prompt of whitespace-separated words, each of which must be a
            key of ``BOW``.
        strlen: Number of words to generate.

    Returns:
        The prompt followed by the generated words, each followed by a space.

    Raises:
        KeyError: If a word of the prompt is not in ``BOW``.
    """
    # Run on whatever device the model lives on.  Fall back to the
    # CUDA-if-available rule for callables that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"input word: {string}")

    # Index -> word lookup, built once instead of on every step.
    vocab = list(BOW)
    token_ids = [BOW[w] for w in string.split()]

    # Without a trailing separator the first predicted word would be glued
    # onto the last prompt word.
    if string and not string[-1].isspace():
        string += " "

    with torch.no_grad():
        for _ in range(strlen):
            # Last two word ids, wrapped in a list to add the batch dimension.
            input_tensor = torch.tensor(
                [token_ids[-2:]], dtype=torch.long, device=device
            )
            output = model(input_tensor)
            next_word = vocab[int(torch.argmax(output))]

            token_ids.append(BOW[next_word])
            string += next_word + " "

    print(f"predicted string: {string}")
    return string

In [19]:
# Read the saved weights onto the active device, then load them into the model.
state_dict = torch.load("lstm.pth", weights_only=True, map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [20]:
# Continue the default prompt using the dataset's vocabulary.
pred = generate(model=model, BOW=dataset.BOW)

input word: finding an 
predicted string: finding an expansive view of a quest or bush no ad insurers 
