In [27]:
import torch
import pandas as pd
import numpy as np

from collections import Counter
from torch import nn, optim
from torch.utils.data import DataLoader

In [28]:
class Dataset(torch.utils.data.Dataset):
    def __init__(
        self, sequence_length, documents, labels,
    ):
        self.sequence_length = sequence_length
        self.words = self.load_words(documents)
        self.uniq_words = self.get_uniq_words()

        # id vocab mulai dari 1, bukan 0; 0 untuk [PAD]
        self.index_to_word = {(index + 1): word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: (index + 1) for index, word in enumerate(self.uniq_words)}
        self.index_to_word[0] = "[PAD]"
        self.word_to_index["[PAD]"] = 0

        self.labels = labels
        self.docs = []
        for doc in documents:
            self.docs.append(self.to_ids(doc))

    def to_ids(self, doc):
        doc = [self.word_to_index[w] for w in self.tokenize(doc)]
        if len(doc) >= self.sequence_length:
            doc = doc[:self.sequence_length]
        else:
            doc += [0] * (self.sequence_length - len(doc))
        return doc

    def tokenize(self, text):
        return text.split(' ')

    def load_words(self, documents):
        text = ""
        for doc in documents:
          text += doc + " "
        return self.tokenize(text)

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, index):
        return (
            torch.tensor(self.docs[index]),
            torch.tensor(self.labels[index]),
        )

In [29]:
class Conv1DNet(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, kernel_width):

        super(Conv1DNet,self).__init__()

        # Embedding layer
        # padding_idx (int, optional) – If specified, the entries at padding_idx do
        # not contribute to the gradient; therefore, the embedding vector at
        # padding_idx is not updated during training, i.e. it remains as a fixed “pad”.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # Conv1D layer process the vector sequences
        self.conv1D = nn.Conv1d(embedding_dim, hidden_dim, kernel_width)

        # Dense layer to predict
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Prediction activation function
        self.sigmoid = nn.Sigmoid()


    def forward(self, text):
        embedded = self.embedding(text)

        embedded = embedded.permute(0, 2, 1) # Input shape: (batch_size, embedding_dim, sequence_length)

        output = self.conv1D(embedded)
        output = torch.mean(output, dim=2)
        output = self.fc(output)
        output = self.sigmoid(output)
        return output

In [30]:
def train(dataset, model, batch_size, max_epochs=400):
    model.train()

    dataloader = DataLoader(dataset, batch_size=batch_size)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(max_epochs):
        for batch, (x, y) in enumerate(dataloader):
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

In [31]:
documents = ["buku bagus rapih cerdas dan menarik",
             "rumah rapih cantik dan bersih",
             "hotel kotor berisik dan bau",
             "kantin jorok kotor mahal dan panas"]
labels = [[1.], [1.], [0.], [0.]]

dataset = Dataset(8, documents, labels)
model = Conv1DNet(len(dataset.index_to_word), 16, 16, 1, 3)
train(dataset, model, 2, max_epochs=200)

{'epoch': 0, 'batch': 0, 'loss': 0.5971552133560181}
{'epoch': 0, 'batch': 1, 'loss': 0.8034646511077881}
{'epoch': 1, 'batch': 0, 'loss': 0.5876234173774719}
{'epoch': 1, 'batch': 1, 'loss': 0.7947719097137451}
{'epoch': 2, 'batch': 0, 'loss': 0.5835554599761963}
{'epoch': 2, 'batch': 1, 'loss': 0.7847976684570312}
{'epoch': 3, 'batch': 0, 'loss': 0.5802994966506958}
{'epoch': 3, 'batch': 1, 'loss': 0.7743726968765259}
{'epoch': 4, 'batch': 0, 'loss': 0.5773488283157349}
{'epoch': 4, 'batch': 1, 'loss': 0.7637473940849304}
{'epoch': 5, 'batch': 0, 'loss': 0.5745291709899902}
{'epoch': 5, 'batch': 1, 'loss': 0.7530231475830078}
{'epoch': 6, 'batch': 0, 'loss': 0.5717544555664062}
{'epoch': 6, 'batch': 1, 'loss': 0.7422456741333008}
{'epoch': 7, 'batch': 0, 'loss': 0.5689735412597656}
{'epoch': 7, 'batch': 1, 'loss': 0.7314350605010986}
{'epoch': 8, 'batch': 0, 'loss': 0.5661522150039673}
{'epoch': 8, 'batch': 1, 'loss': 0.7205992937088013}
{'epoch': 9, 'batch': 0, 'loss': 0.56326538324

In [32]:
# prediction
model.eval()

with torch.no_grad():
  sent = "hotel dan kantin rapih bersih menarik dan bagus"
  sent = torch.tensor([dataset.to_ids(sent)])
  print(model(sent))

tensor([[0.9925]])
