Colab 
-----

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/공모전/Factcheck

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/공모전/Factcheck


In [2]:
# Mecab
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab190912.sh
%cd ..

fatal: destination path 'Mecab-ko-for-Google-Colab' already exists and is not an empty directory.
/content/drive/MyDrive/Colab Notebooks/공모전/Factcheck/Mecab-ko-for-Google-Colab
Installing konlpy.....
Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.2 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 69.3 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.7 MB/s 
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulso

In [3]:
!pip install -q konlpy gluonnlp mxnet

[K     |████████████████████████████████| 344 kB 8.9 MB/s 
[K     |████████████████████████████████| 46.9 MB 75 kB/s 
[?25h  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone


In [4]:
!pip install dill



In [5]:
import time
import gc
from contextlib import contextmanager

# nice way to report running times
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.4f} s')

Utils
-----

In [6]:
import json
import torch
from pathlib import Path
from typing import Union


class Config:
    """Config class"""

    def __init__(self, json_path_or_dict: Union[str, dict]) -> None:
        """Instantiating Config class
        Args:
            json_path_or_dict (Union[str, dict]): filepath of config or dictionary which has attributes
        """
        if isinstance(json_path_or_dict, dict):
            self.__dict__.update(json_path_or_dict)
        else:
            with open(json_path_or_dict, mode="r") as io:
                params = json.loads(io.read())
            self.__dict__.update(params)

    def save(self, json_path: Union[str, Path]) -> None:
        """Saving config to json_path

        Args:
            json_path (Union[str, Path]): filepath of conifg
        """
        with open(json_path, mode="w") as io:
            json.dump(self.__dict__, io, indent=4)

    def update(self, json_path_or_dict) -> None:
        """Updating Config instance
        Args:
            json_path_or_dict (Union[str, dict]): filepath of config or dictionary which has attributes
        """
        if isinstance(json_path_or_dict, dict):
            self.__dict__.update(json_path_or_dict)
        else:
            with open(json_path_or_dict, mode="r") as io:
                params = json.loads(io.read())
            self.__dict__.update(params)

    @property
    def dict(self) -> dict:
        return self.__dict__

In [7]:
from konlpy.tag import Mecab

split_morphs = Mecab().morphs

Build dataset
-------------

In [23]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# loading dataset
snufc_dir = Path("snufc")
filepath = snufc_dir / "processed_data.csv"
dataset = pd.read_csv(filepath, encoding='CP949').loc[:, ["document", "label"]]
dataset = dataset.loc[dataset["document"].isna().apply(lambda elm: not elm), :]
train, validation = train_test_split(dataset, test_size=0.2, random_state=777)
train, test = train_test_split(train, test_size=0.25, random_state=777)

train.to_csv(snufc_dir / "train.txt", sep="\t", index=False)
validation.to_csv(snufc_dir / "validation.txt", sep="\t", index=False)
test.to_csv(snufc_dir / "test.txt", sep="\t", index=False)

config = Config(
    {
        "train": str(snufc_dir / "train.txt"),
        "validation": str(snufc_dir / "validation.txt"),
        "test": str(snufc_dir / "test.txt"),
    }
)
config.save("conf/dataset/snufc.json")

Build vocab
-----------

### Vocab Class

In [24]:
from typing import List, Callable, Union, Dict


class Vocab:
    """Vocab class"""

    def __init__(
        self,
        list_of_tokens: List[str] = None,
        padding_token: str = "<pad>",
        unknown_token: str = "<unk>",
        bos_token: str = "<bos>",
        eos_token: str = "<eos>",
        reserved_tokens: List[str] = None,
        token_to_idx: Dict[str, int] = None,
    ):
        """Instantiating Vocab class
        Args:
            list_of_tokens (List[str]): list of tokens is source of vocabulary. each token is not duplicate
            padding_token (str): the representation for padding token
            unknown_token (str): the representation for any unknown token
            bos_token (str): the representation for the special token of beginning-of-sequence token
            eos_token (str): the representation for the special token of end-of-sequence token
            reserved_tokens (List[str]): a list specifying additional tokens to be added to the vocabulary
            token_to_idx (Dict[str, int]): If not `None`, specifies the indices of tokens to be used by the vocabulary.
                                           Each token in `token_to_index` must be part of the Vocab and each index can
                                           only be associated with a single token. `token_to_idx` is not required to
                                           contain a mapping for all tokens. For example, it is valid to only set the
                                            `unknown_token` index to 10 (instead of the default of 0) with
                                           `token_to_idx = {'<unk>': 10}`, assuming that there are at least 10 tokens in
                                            the vocabulary.
        """
        self._unknown_token = unknown_token
        self._padding_token = padding_token
        self._bos_token = bos_token
        self._eos_token = eos_token
        self._reserved_tokens = reserved_tokens
        self._special_tokens = []

        for tkn in [
            self._unknown_token,
            self._padding_token,
            self._bos_token,
            self._eos_token,
        ]:
            if tkn:
                self._special_tokens.append(tkn)

        if self._reserved_tokens:
            self._special_tokens.extend(self._reserved_tokens)

        if list_of_tokens:
            self._special_tokens.extend(
                list(
                    filter(lambda elm: elm not in self._special_tokens, list_of_tokens)
                )
            )
        
        self._token_to_idx, self._idx_to_token = self._build(self._special_tokens)

        if token_to_idx:
            self._sort_index_according_to_user_specification(token_to_idx)

        self._embedding = None

    def to_indices(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """Looks up indices of text tokens according to the vocabulary

        Args:
            tokens (Union[str, List[str]]): a source token or tokens to be converted
        Returns:
            Union[int, List[int]]: a token index or a list of token indices according to the vocabulary
        """
        if isinstance(tokens, list):
            return [
                self._token_to_idx[tkn]
                if tkn in self._token_to_idx
                else self._token_to_idx[self._unknown_token]
                for tkn in tokens
            ]
        else:
            return (
                self._token_to_idx[tokens]
                if tokens in self._token_to_idx
                else self._token_to_idx[self._unknown_token]
            )
    
    def to_tokens(self, indices: Union[int, List[int]]) -> Union[str, List[str]]:
        """Converts token indices to tokens according to the vocabulary
        Args:
            indices (Union[int, List[int]]): a source token index or token indices to be converted
        Returns:
            Union[str, List[str]]: a token or a list of tokens according to the vocabulary
        """
        if isinstance(indices, list):
            return [self._idx_to_token[idx] for idx in indices]
        else:
            return self._idx_to_token[indices]

    def _build(self, list_of_tokens):
        token_to_idx = {tkn: idx for idx, tkn in enumerate(list_of_tokens)}
        idx_to_token = list_of_tokens
        return token_to_idx, idx_to_token

    def _sort_index_according_to_user_specification(self, token_to_idx):
        # Sanity checks
        if not set(token_to_idx.keys()).issubset(self._token_to_idx.keys()):
            raise ValueError(
                "User-specified token_to_idx mapping can only contain "
                "tokens that will be part of the vocabulary."
            )
        if len(set(token_to_idx.values())) != len(token_to_idx):
            raise ValueError("User-specified indices must not contain duplicates.")
        if min(token_to_idx.values()) < 0 or max(token_to_idx.values()) >= len(
            self._token_to_idx
        ):
            raise ValueError(
                "User-specified indices must not be < 0 or >= the number of tokens "
                "that will be in the vocabulary. The current vocab contains {}"
                "tokens.".format(len(self._token_to_idx))
            )

        # Update index ordering
        for token, new_idx in token_to_idx.items():
            old_idx = self._token_to_idx[token]
            ousted_token = self._idx_to_token[new_idx]

            self._token_to_idx[token] = new_idx
            self._token_to_idx[ousted_token] = old_idx
            self._idx_to_token[old_idx] = ousted_token
            self._idx_to_token[new_idx] = token

    def __len__(self):
        return len(self._token_to_idx)

    @property
    def token_to_idx(self):
        return self._token_to_idx

    @property
    def idx_to_token(self):
        return self._idx_to_token

    @property
    def padding_token(self):
        return self._padding_token

    @property
    def unknown_token(self):
        return self._unknown_token

    @property
    def bos_token(self):
        return self._bos_token

    @property
    def eos_token(self):
        return self._eos_token

    @property
    def embedding(self):
        return self._embedding

    @embedding.setter
    def embedding(self, array):
        self._embedding = array


### Implement

In [25]:
import itertools
import pickle
import gluonnlp as nlp
import pandas as pd
from pathlib import Path
from collections import Counter

# loading dataset
snufc_dir = Path("snufc")
config = Config("conf/dataset/snufc.json")
tr = pd.read_csv(config.train, sep="\t").loc[:, ["document", "label"]]

# extracting morph in sentences
list_of_tokens = tr["document"].apply(split_morphs).tolist()

# generating the vocab
token_counter = Counter(itertools.chain.from_iterable(list_of_tokens))
tmp_vocab = nlp.Vocab(
    counter=token_counter, min_freq=10, bos_token=None, eos_token=None
)

# connecting SISG embedding with vocab
ptr_embedding = nlp.embedding.create("fasttext", source="wiki.ko")
tmp_vocab.set_embedding(ptr_embedding)
array = tmp_vocab.embedding.idx_to_vec.asnumpy()

vocab = Vocab(
    tmp_vocab.idx_to_token,
    padding_token="<pad>",
    unknown_token="<unk>",
    bos_token=None,
    eos_token=None,
)
vocab.embedding = array

Embedding file wiki.ko.npz is not found. Downloading from Gluon Repository. This may take some time.
Downloading /root/.mxnet/embedding/fasttext/wiki.ko.npz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/embeddings/fasttext/wiki.ko.npz...


In [26]:
import dill

# saving vocab
with open(snufc_dir / "vocab.pkl", mode="wb") as io:
    dill.dump(vocab, io)

config.update({"vocab": str(snufc_dir / "vocab.pkl")})
config.save("conf/dataset/snufc.json")

Model
----

### Utils - `PadSequence`, `Tokenizer`

In [27]:
class PadSequence:
    """PadSequence class"""

    def __init__(self, length: int, pad_val: int = 0, clip: bool = True) -> None:
        """Instantiating PadSequence class
        Args:
            length (int): the maximum length to pad/clip the sequence
            pad_val (int): the pad value
            clip (bool): whether to clip the length, if sample length is longer than maximum length
        """
        self._length = length
        self._pad_val = pad_val
        self._clip = clip

    def __call__(self, sample):
        sample_length = len(sample)
        if sample_length >= self._length:
            if self._clip and sample_length > self._length:
                return sample[: self._length]
            else:
                return sample
        else:
            return sample + [self._pad_val for _ in range(self._length - sample_length)]

In [28]:
class Tokenizer:
    """Tokenizer class"""

    def __init__(
        self,
        vocab: Vocab,
        split_fn: Callable[[str], List[str]],
        pad_fn: Callable[[List[int]], List[int]] = None,
    ):
        """Instantiating Tokenizer class
        Args:
            vocab (model.utils.Vocab): the instance of model.utils.Vocab created from specific split_fn
            split_fn (Callable): a function that can act as a splitter
            pad_fn (Callable): a function that can act as a padder
        """
        self._vocab = vocab
        self._split = split_fn
        self._pad = pad_fn

    def split(self, string: str) -> List[str]:
        list_of_tokens = self._split(string)
        return list_of_tokens

    def transform(self, list_of_tokens: List[str]) -> List[int]:
        list_of_indices = self._vocab.to_indices(list_of_tokens)
        list_of_indices = self._pad(list_of_indices) if self._pad else list_of_indices
        return list_of_indices

    def split_and_transform(self, string: str) -> List[int]:
        return self.transform(self.split(string))

    @property
    def vocab(self):
        return self._vocab

### `Corpus`

In [29]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from typing import Tuple, List, Callable


class Corpus(Dataset):
    """Corpus class"""

    def __init__(self, filepath: str, transform_fn: Callable[[str], List[int]]) -> None:
        """Instantiating Corpus class

        Args:
            filepath (str): filepath
            transform_fn (Callable): a function that can act as a transformer
        """
        self._corpus = pd.read_csv(filepath, sep="\t").loc[:, ['document', 'label']]
        self._transform = transform_fn

    def __len__(self) -> int:
        return len(self._corpus)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        tokens2indices = torch.tensor(self._transform(self._corpus.iloc[idx]['document']))
        label = torch.tensor(self._corpus.iloc[idx]['label'])
        return tokens2indices, label


def batchify(data: List[Tuple[torch.tensor, torch.tensor, torch.tensor]]) ->\
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """custom collate_fn for DataLoader
    Args:
        data (list): list of torch.Tensors
    Returns:
        data (tuple): tuple of torch.Tensors
    """
    indices, labels = zip(*data)
    indices = pad_sequence(indices, batch_first=True, padding_value=1, )
    labels = torch.stack(labels, 0)
    return indices, labels

### `Net`

In [30]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple

In [31]:
class MultiChannelEmbedding(nn.Module):
    """MultiChannelEmbedding class"""

    def __init__(self, vocab: Vocab) -> None:
        """Instantiating MultiChannelEmbedding class
        Args:
            vocab (model.utils.Vocab): the instance of model.utils.Vocab
        """
        super(MultiChannelEmbedding, self).__init__()
        self._static = nn.Embedding.from_pretrained(
            torch.from_numpy(vocab.embedding),
            freeze=True,
            padding_idx=vocab.to_indices(vocab.padding_token),
        )
        self._non_static = nn.Embedding.from_pretrained(
            torch.from_numpy(vocab.embedding),
            freeze=False,
            padding_idx=vocab.to_indices(vocab.padding_token),
        )

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        static = self._static(x).permute(0, 2, 1)
        non_static = self._non_static(x).permute(0, 2, 1)
        return static, non_static

In [32]:
class ConvolutionLayer(nn.Module):
    """ConvolutionLayer class"""

    def __init__(self, in_channels: int, out_channels: int) -> None:
        """Instantiating ConvolutionLayer class
        Args:
            in_channels (int): the number of channels from input feature map
            out_channels (int): the number of channels from output feature map
        """
        super(ConvolutionLayer, self).__init__()
        self._tri_gram = nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels // 3, kernel_size=3
        )
        self._tetra_gram = nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels // 3, kernel_size=4
        )
        self._penta_gram = nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels // 3, kernel_size=5
        )

    def forward(
        self, x: Tuple[torch.Tensor, torch.Tensor]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        static, non_static = x
        tri_fmap = F.relu(self._tri_gram(static)) + F.relu(self._tri_gram(non_static))
        tetra_fmap = F.relu(self._tetra_gram(static)) + F.relu(
            self._tetra_gram(non_static)
        )
        penta_fmap = F.relu(self._penta_gram(static)) + F.relu(
            self._penta_gram(non_static)
        )
        return tri_fmap, tetra_fmap, penta_fmap

In [33]:
class MaxOverTimePooling(nn.Module):
    """MaxOverTimePooling class"""

    def forward(
        self, x: Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
    ) -> torch.Tensor:
        tri_fmap, tetra_fmap, penta_fmap = x
        fmap = torch.cat(
            [
                tri_fmap.max(dim=-1)[0],
                tetra_fmap.max(dim=-1)[0],
                penta_fmap.max(dim=-1)[0],
            ],
            dim=-1,
        )
        return fmap

### `SenCNN`

In [34]:
class SenCNN(nn.Module):
    """SenCNN class"""

    def __init__(self, num_classes: int, vocab: Vocab) -> None:
        """Instantiating SenCNN class
        Args:
            num_classes (int): the number of classes
            vocab (model.utils.Vocab): the instance of model.utils.Vocab
        """
        super(SenCNN, self).__init__()
        self._embedding = MultiChannelEmbedding(vocab)
        self._convolution = ConvolutionLayer(300, 300)
        self._pooling = MaxOverTimePooling()
        self._dropout = nn.Dropout()
        self._fc = nn.Linear(300, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        fmap = self._embedding(x)
        fmap = self._convolution(fmap)
        feature = self._pooling(fmap)
        feature = self._dropout(feature)
        score = self._fc(feature)

        return score

Train
-----

### Metric - `evaluate`, `acc`

In [35]:
import torch
from tqdm.notebook import tqdm


def evaluate(model, data_loader, metrics, device):
    if model.training:
        model.eval()

    summary = {metric: 0 for metric in metrics}

    for step, mb in tqdm(enumerate(data_loader), desc="steps", total=len(data_loader)):
        x_mb, y_mb = map(lambda elm: elm.to(device), mb)

        with torch.no_grad():
            y_hat_mb = model(x_mb)

            for metric in metrics:
                summary[metric] += (
                    metrics[metric](y_hat_mb, y_mb).item() * y_mb.size()[0]
                )
    else:
        for metric in metrics:
            summary[metric] /= len(data_loader.dataset)

    return summary


def acc(yhat, y):
    with torch.no_grad():
        yhat = yhat.max(dim=1)[1]
        acc = (yhat == y).float().mean()
    return acc

### Utils - `CheckpointManager`, `SummaryManager`

In [36]:
import json
import torch
from pathlib import Path
from typing import Union


class CheckpointManager:
    """CheckpointManager class"""

    def __init__(self, model_dir: Union[str, Path]) -> None:
        """Instantiating CheckpointManager class
        Args:
            model_dir (Union[str, Path]): directory path for saving a checkpoint
        """
        if not isinstance(model_dir, Path):
            model_dir = Path(model_dir)

        if not model_dir.exists():
            model_dir.mkdir(parents=True)

        self._model_dir = model_dir
    
    def save_checkpoint(self, state: dict, filename: str) -> None:
        """Saving a checkpoint
        Args:
            state (dict): a checkpoint
            filename (str): the filename of a checkpoint
        """
        torch.save(state, self._model_dir / filename)
    
    def load_checkpoint(self, filename: str, device: torch.device = None) -> dict:
        """Loading a checkpoint
        Args:
            filename (str): the filename of a checkpoint
            device (torch.device): device where a checkpoint will be stored
        Returns:
            state (dict): a checkpoint
        """
        device = device or (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )
        state = torch.load(self._model_dir / filename, map_location=device)
        return state

In [37]:
class SummaryManager:
    """SummaryManager class"""

    def __init__(self, model_dir: Union[str, Path]) -> None:
        if not isinstance(model_dir, Path):
            model_dir = Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir(parents=True)

        self._model_dir = model_dir
        self._summary = {}

    def save(self, filename: str) -> None:
        """Saving a summary to model_dir
        Args:
            filename (str): the filename of a summary
        """
        with open(self._model_dir / filename, mode="w") as io:
            json.dump(self._summary, io, indent=4)

    def load(self, filename) -> None:
        """Loading a summary from model_dir
        Args:
            filename (str): the filename of a summary
        """
        with open(self._model_dir / filename, mode="r") as io:
            metric = json.loads(io.read())
        self.update(metric)

    def update(self, summary: dict) -> None:
        """Updating a summary
        Args:
            summary (dict): a summary
        """
        self._summary.update(summary)

    def reset(self) -> None:
        """Resetting a summary"""
        self._summary = {}

    @property
    def summary(self):
        return self._summary

### Implement

In [40]:
import dill
import torch
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter


def get_tokenizer(dataset_config, model_config):
    with open(dataset_config.vocab, mode="rb") as io:
        vocab = dill.load(io)
    pad_sequence = PadSequence(
        length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token)
    )
    tokenizer = Tokenizer(vocab=vocab, split_fn=split_morphs, pad_fn=pad_sequence)
    return tokenizer


def get_data_loaders(dataset_config, tokenizer, batch_size):
    tr_ds = Corpus(dataset_config.train, tokenizer.split_and_transform)
    tr_dl = DataLoader(
        tr_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True
    )
    val_ds = Corpus(dataset_config.validation, tokenizer.split_and_transform)
    val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=0)
    return tr_dl, val_dl

In [41]:
## RUN
fix_seed = True
epochs = 20
batch_size = 256
learning_rate = 1e-3
summary_step = 500


dataset_config = Config("conf/dataset/snufc.json")
model_config = Config("conf/model/sencnn.json")

exp_dir = Path("experiments") / model_config.type
exp_dir = exp_dir.joinpath(
    f"epochs_{epochs}_batch_size_{batch_size}_learning_rate_{learning_rate}"
)

if not exp_dir.exists():
    exp_dir.mkdir(parents=True)

if fix_seed:
    torch.manual_seed(777)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

tokenizer = get_tokenizer(dataset_config, model_config)
tr_dl, val_dl = get_data_loaders(dataset_config, tokenizer, batch_size=batch_size)
model = SenCNN(num_classes=model_config.num_classes, vocab=tokenizer.vocab)

loss_fn = nn.CrossEntropyLoss()
opt = optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = ReduceLROnPlateau(opt, patience=5)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

writer = SummaryWriter(f"{exp_dir}/runs")
checkpoint_manager = CheckpointManager(exp_dir)
summary_manager = SummaryManager(exp_dir)
best_val_loss = 1e10

In [42]:
for epoch in tqdm(range(epochs), desc="epochs"):

    tr_loss = 0
    tr_acc = 0

    model.train()
    for step, mb in tqdm(enumerate(tr_dl), desc="steps", total=len(tr_dl)):
        x_mb, y_mb = map(lambda elm: elm.to(device), mb)

        opt.zero_grad()
        y_hat_mb = model(x_mb)
        mb_loss = loss_fn(y_hat_mb, y_mb)
        mb_loss.backward()
        clip_grad_norm_(model._fc.weight, 5)
        opt.step()

        with torch.no_grad():
            mb_acc = acc(y_hat_mb, y_mb)

        tr_loss += mb_loss.item()
        tr_acc += mb_acc.item()

        if (epoch * len(tr_dl) + step) % summary_step == 0:
            val_loss = evaluate(model, val_dl, {"loss": loss_fn}, device)["loss"]
            writer.add_scalars("loss", {"train": tr_loss / (step + 1), "validation": val_loss},
                                epoch * len(tr_dl) + step)
            model.train()
    else:
        tr_loss /= step + 1
        tr_acc /= step + 1

        tr_summary = {"loss": tr_loss, "acc": tr_acc}
        val_summary = evaluate(model, val_dl, {"loss": loss_fn, "acc": acc}, device)
        scheduler.step(val_summary["loss"])
        tqdm.write(f"epoch: {epoch+1}\n"
                    f"tr_loss: {tr_summary['loss']:.3f}, val_loss: {val_summary['loss']:.3f}\n"
                    f"tr_acc: {tr_summary['acc']:.2%}, val_acc: {val_summary['acc']:.2%}")

        val_loss = val_summary["loss"]
        is_best = val_loss < best_val_loss

        if is_best:
            state = {
                "epoch": epoch + 1,
                "model_state_dict": model.state_dict(),
                "opt_state_dict": opt.state_dict(),
            }
            summary = {
                "epoch": epoch + 1,
                "train": tr_summary,
                "validation": val_summary,
            }

            summary_manager.update(summary)
            summary_manager.save("summary.json")
            checkpoint_manager.save_checkpoint(state, "best.tar")

            best_val_loss = val_loss

epochs:   0%|          | 0/20 [00:00<?, ?it/s]

steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 1
tr_loss: 0.715, val_loss: 0.552
tr_acc: 59.72%, val_acc: 77.33%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 2
tr_loss: 0.545, val_loss: 0.466
tr_acc: 71.78%, val_acc: 79.22%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 3
tr_loss: 0.464, val_loss: 0.454
tr_acc: 77.98%, val_acc: 80.03%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 4
tr_loss: 0.425, val_loss: 0.433
tr_acc: 79.54%, val_acc: 82.05%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 5
tr_loss: 0.386, val_loss: 0.440
tr_acc: 81.40%, val_acc: 81.11%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 6
tr_loss: 0.375, val_loss: 0.443
tr_acc: 81.59%, val_acc: 80.70%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 7
tr_loss: 0.345, val_loss: 0.430
tr_acc: 84.96%, val_acc: 82.59%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 8
tr_loss: 0.320, val_loss: 0.431
tr_acc: 87.45%, val_acc: 81.78%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 9
tr_loss: 0.290, val_loss: 0.431
tr_acc: 89.26%, val_acc: 81.65%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 10
tr_loss: 0.263, val_loss: 0.433
tr_acc: 90.87%, val_acc: 81.78%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 11
tr_loss: 0.241, val_loss: 0.441
tr_acc: 91.11%, val_acc: 80.84%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 12
tr_loss: 0.220, val_loss: 0.444
tr_acc: 92.33%, val_acc: 81.65%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 13
tr_loss: 0.195, val_loss: 0.453
tr_acc: 93.90%, val_acc: 80.03%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 14
tr_loss: 0.171, val_loss: 0.452
tr_acc: 95.70%, val_acc: 80.43%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 15
tr_loss: 0.174, val_loss: 0.454
tr_acc: 95.31%, val_acc: 79.76%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 16
tr_loss: 0.167, val_loss: 0.454
tr_acc: 95.65%, val_acc: 80.16%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 17
tr_loss: 0.160, val_loss: 0.455
tr_acc: 96.09%, val_acc: 80.16%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 18
tr_loss: 0.162, val_loss: 0.458
tr_acc: 95.56%, val_acc: 79.49%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 19
tr_loss: 0.161, val_loss: 0.456
tr_acc: 96.04%, val_acc: 80.43%


steps:   0%|          | 0/8 [00:00<?, ?it/s]

steps:   0%|          | 0/3 [00:00<?, ?it/s]

epoch: 20
tr_loss: 0.150, val_loss: 0.457
tr_acc: 96.24%, val_acc: 80.43%


In [43]:
# save model
torch.save(model.state_dict(), "model/SenCNN.st")

Predict
-------

In [44]:
import torch.nn.functional as F

device = torch.device('cpu')
model = SenCNN(num_classes=2, vocab=tokenizer.vocab)
model.load_state_dict(torch.load("model/SenCNN.st"), strict=False)
model.to(device)

if model.training:
    model.eval()

In [54]:
s = "지난 9월 10일 한 블로그에 방호복을 입은 사람들이 문을 부수는 영상과 함께 한국에서도 경찰력을 동원한 백신 강제접종이 이뤄질 것이라고 주장하는 글이 올라왔다. 해당 게시물은 응급의료에 관한 법률 개정안, 경찰관 직무집행법 개정안이 통과되면 경찰이 가정을 찾아와 백신을 강제로 접종할 것이라고 주장했다. 작성자의 말대로 경찰을 동원한 백신 강제 접종이 가능해질까."
input = torch.tensor([tokenizer.split_and_transform(s)])

In [55]:
with torch.no_grad():
    y_hat = model(input)
    prob = F.softmax(y_hat, dim=1)[0]

In [56]:
if prob[1] >= 0.5:
    print("{:.2f} 확률로 진짜 뉴스".format(prob[1] * 100))
else:
    print("{:.2f} 확률로 가짜 뉴스".format(prob[0] * 100))

94.59 확률로 가짜 뉴스
