# 필요한 라이브러리 임포트

In [88]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd

# 전처리 함수 정의

In [2]:
# 문자 종류 확인
def build_vocab_from_df(df: pd.DataFrame):
    char_vocab = set()
    for line in df["msg"]:
        for c in line:
            char_vocab.add(c)
    return char_vocab

In [605]:
train = pd.read_csv("data/train.csv")
val = pd.read_csv("data/val.csv")
test = pd.read_csv("data/test.csv")

print("train:",train.shape)
print("val:",val.shape)
print("test:",test.shape)

train: (3618, 2)
val: (775, 2)
test: (776, 2)


### 문자 종류 갯수 확인

In [606]:
train_vocab = build_vocab_from_df(train)
val_vocab = build_vocab_from_df(val)
test_vocab = build_vocab_from_df(test)
print(len(train_vocab))
print(sorted(train_vocab))
print(len(val_vocab))
print(sorted(val_vocab))
print(len(test_vocab))
print(sorted(test_vocab))

print(set(test_vocab) - set(train_vocab))
print(set(val_vocab) - set(train_vocab))

94
[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '~', '£', '…']
88
[' ', '!', '"', '#', '$', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '£', '…']
88
[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/

## vocab 생성

In [607]:
char2idx = dict()
char2idx["pad"] = 0
char2idx["unk"] = 1
for index, char in enumerate(train_vocab):
    char2idx[char] = index + 2

idx2char = {value: key for key, value in char2idx.items()}
print(idx2char)
print("vocab size:", len(char2idx))

{0: 'pad', 1: 'unk', 2: 'P', 3: '!', 4: "'", 5: 'L', 6: '"', 7: 'z', 8: 'Z', 9: '.', 10: 'r', 11: '[', 12: 'K', 13: '1', 14: 'w', 15: '…', 16: '&', 17: 'p', 18: 'V', 19: '\\', 20: 'J', 21: '@', 22: '#', 23: 'O', 24: 'S', 25: '?', 26: 'M', 27: '$', 28: 'F', 29: 'b', 30: 'T', 31: 'W', 32: '7', 33: ']', 34: 'G', 35: '=', 36: '5', 37: 'N', 38: 'o', 39: 'B', 40: '%', 41: 'x', 42: '(', 43: 'q', 44: 'D', 45: 'I', 46: 'i', 47: 'C', 48: ',', 49: 'U', 50: ' ', 51: 'h', 52: 'g', 53: 'A', 54: 'c', 55: '9', 56: '~', 57: ':', 58: ')', 59: '*', 60: 'l', 61: '-', 62: 'H', 63: 'y', 64: '|', 65: 'd', 66: 's', 67: '+', 68: 'X', 69: '0', 70: '6', 71: '8', 72: 'v', 73: 'Y', 74: '<', 75: 'k', 76: 'u', 77: '3', 78: 'a', 79: 'f', 80: 'R', 81: 'Q', 82: '4', 83: '_', 84: '/', 85: 'm', 86: 'e', 87: 'n', 88: 'j', 89: '>', 90: '£', 91: '^', 92: 'E', 93: '2', 94: ';', 95: 't'}
vocab size: 96


In [608]:
seq_len = train["msg"].str.len().max()
input_length = len(char2idx)
print(seq_len)

199


# 커스텀 데이터셋 정의

In [609]:
class MyDataset(Dataset):
    def __init__(self, data=train):
        self.data = data

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        x = self.data["msg"][idx]
        y = self.data["label"][idx]

        return x, y

# 토크나이저 정의

In [610]:
def tokenize(string: str):
    x_data = [char2idx[c] for c in string]
    return torch.tensor(x_data)

# Collate function 정의

In [611]:
def collate_fn(samples):
    x = [i[0] for i in samples]
    x = [tokenize(i) for i in x]
    x = pad_sequence(x, batch_first=True)
    
    y = [i[1] for i in samples]
    y = torch.tensor(y, dtype=torch.float32)

    return x, y

# RNN Cell과 모델 정의

In [698]:
class RNNCell_Encoder(nn.Module):
    def __init__(self, input_dim, hidden_size):
        super(RNNCell_Encoder, self).__init__()

        self.input_dim = input_dim
        self.hidden_size = hidden_size

        self.rnn = nn.RNNCell(input_dim, hidden_size)

    def forward(self, input):
        ht = torch.zeros(self.hidden_size)

        for character in input:
            ht = self.rnn(character, ht)
        return ht


class Net(nn.Module):
    def __init__(self, embedding_dim, hidden_size):
        super(Net, self).__init__()

        self.em = nn.Embedding(len(char2idx), embedding_dim)  # 임베딩
        self.rnn = RNNCell_Encoder(embedding_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 1)
        self.fc2 = nn.Linear(10, 1)

    def forward(self, x):
        x = self.em(x)
        x = x.squeeze(0)
        x = self.rnn(x)
        x = F.sigmoid(self.fc1(x))
        # x = F.relu(self.fc1(x))
        return x

# 하이퍼파라미터 정의

In [705]:
model = Net(2, 50)

loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# 훈련

In [706]:
def training(epoch, model, trainloader, valloader):
    correct = 0
    total = 0
    running_loss = 0

    model.train()

    for x, y in trainloader:
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            y_pred = torch.argmax(y_pred, dim=0)
            correct += (y_pred == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()

    epoch_loss = running_loss / len(trainloader)
    epoch_acc = correct / total

    val_correct = 0
    val_total = 0
    val_running_loss = 0

    model.eval()
    with torch.no_grad():
        for x, y in valloader:
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            y_pred = torch.argmax(y_pred, dim=0)
            val_correct += (y_pred == y).sum().item()
            val_total += y.size(0)
            val_running_loss += loss.item()

    epoch_val_loss = val_running_loss / len(valloader)
    epoch_val_acc = val_correct / val_total

    print(f"train_loss :{epoch_loss:.6f} train_acc :{epoch_acc:.6f} val_loss :{epoch_val_loss:.6f} val_acc :{epoch_val_acc:.6f}")

    return epoch_loss, epoch_acc, epoch_val_loss, epoch_val_acc

In [707]:
import time

train_ds = MyDataset(train)
val_ds = MyDataset(val)

epochs = 5
train_loss = []
train_acc = []
val_loss = []
val_acc = []

start = time.time()
for epoch in range(epochs):
    epoch_start = time.time()

    train_iterator = iter(
        DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=collate_fn)
    )
    val_iterator = iter(
        DataLoader(val_ds, batch_size=1, shuffle=True, collate_fn=collate_fn)
    )

    epoch_loss, epoch_acc, epoch_val_loss, epoch_val_acc = training(
        epoch, model, train_iterator, val_iterator
    )

    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    val_loss.append(epoch_val_loss)
    val_acc.append(epoch_val_acc)

    epoch_end = time.time()
    print(f"epoch {epoch} finished in {epoch_end-epoch_start:.2f} seconds")

end = time.time()
print(f"train_loss :{sum(train_loss)/epochs:.6f}\
      train_acc :{sum(train_acc)/epochs:.6f}\
        val_loss :{sum(val_loss)/epochs:.6f}\
        val_acc :{sum(val_acc)/epochs:.6f}")

print(f"training finished in {end-start:.2f} seconds")

train_loss :0.433509 train_acc :0.873687 val_loss :0.374697 val_acc :0.873548
epoch 0 finished in 21.77 seconds
train_loss :0.375332 train_acc :0.873687 val_loss :0.373275 val_acc :0.873548
epoch 1 finished in 21.59 seconds
train_loss :0.374057 train_acc :0.873687 val_loss :0.371781 val_acc :0.873548
epoch 2 finished in 21.57 seconds
train_loss :0.372934 train_acc :0.873687 val_loss :0.370693 val_acc :0.873548
epoch 3 finished in 21.56 seconds
train_loss :0.371854 train_acc :0.873687 val_loss :0.369035 val_acc :0.873548
epoch 4 finished in 21.60 seconds
train_loss :0.385537      train_acc :0.873687        val_loss :0.371896        val_acc :0.873548
training finished in 108.09 seconds


# 테스트

In [712]:
test_ds = MyDataset(test)
test_iterator = iter(DataLoader(test_ds, batch_size=1, collate_fn=collate_fn))

test_correct = 0
test_total = 0
test_loss = 0

model.eval()
with torch.no_grad():
    for x, y in test_iterator:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        y_pred = torch.argmax(y_pred, dim=0)
        if y_pred == 1:
            print('y is spam')
        test_correct += (y_pred == y).sum().item()
        test_total += y.size(0)
        test_loss += loss.item()

test_loss = test_loss / len(test_iterator)
test_acc = test_correct / test_total
print(f'test loss: {test_loss}\
      test accuracy: {test_acc}')

test loss: 0.3739169838924691      test accuracy: 0.8737113402061856


# 결과

In [713]:
(test["label"] == 0).sum() / test.shape[0]

0.8737113402061856

모든 데이터포인트에 대해 ham을 예측했다.  
낮은 성능의 이유로는 문자 레벨 토큰화로 인해 너무 길어진 시퀀스 길이, RNNCell 사용으로 배치 훈련 불가 등으로 추측된다.