In [1]:
from bilstm_crf import BiLSTMCRF
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Load material

In [2]:
import json

# load vocab: one token per line; a token's id is its position in this list.
# split('\n') is kept (rather than splitlines()) so the resulting list, and
# therefore every token index and len(vocab), stays exactly what it was.
# All files are opened with an explicit UTF-8 encoding so the result does not
# depend on the platform's locale default.
with open('data/vocab.txt', 'r', encoding='utf-8') as f:
    vocab = f.read().split('\n')

# load tag_to_id (passed to the model as `tag_to_ix`)
with open('data/tag_to_id.json', 'r', encoding='utf-8') as f:
    tag_to_id = json.load(f)

# load train and dev data
TRAIN_PATH = 'data/process_data/train.json'
DEV_PATH = 'data/process_data/dev.json'

with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(DEV_PATH, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

# Both files hold a 'text' mapping and a 'labels' mapping; only the values are
# used, in the mappings' own iteration order.
train_sentences = list(train_data['text'].values())
dev_sentences = list(dev_data['text'].values())

train_labels = list(train_data['labels'].values())
dev_labels = list(dev_data['labels'].values())

In [3]:
# load embedding
# Embedding matrix read from a NumPy .npy file; it is later handed to the
# BiLSTMCRF constructor through its `embedding_maxtrix` keyword.
# NOTE(review): presumably row i is the vector for vocab[i] — confirm against
# the script that produced embedding_matrix.npy.
embedding_maxtrix = np.load('embedding/embedding_matrix.npy')

In [None]:
# train_labels / dev_labels hold, for each sentence, a list of
# [start, end, "tag#sentiment"] spans. Span detection only needs the tag, so
# the sentiment part is dropped for both splits with one shared helper.
def strip_sentiment(labels):
    """Return a copy of ``labels`` with the sentiment removed from every span.

    Args:
        labels: list with one entry per sentence; each entry is a list of
            ``[start, end, "tag#sentiment"]`` triples.

    Returns:
        A new list of the same nesting where every span is
        ``[start, end, tag]``. The input is not modified.

    Raises:
        ValueError: if a span does not have exactly three items, or its label
            does not split into exactly two parts on ``'#'``.
    """
    converted = []
    for spans in labels:
        sentence_spans = []
        for start, end, tag_sentiment in spans:
            # Strict two-way unpack: a malformed label fails loudly here
            # instead of producing a wrong tag silently.
            tag, _sentiment = tag_sentiment.split('#')
            sentence_spans.append([start, end, tag])
        converted.append(sentence_spans)
    return converted


new_train_labels = strip_sentiment(train_labels)
new_dev_labels = strip_sentiment(dev_labels)



In [5]:
import numpy as np

# Convert data to ids
def convert_to_ids(data, vocab, max_len=256):
    """Encode whitespace-tokenized sentences as arrays of vocabulary ids.

    Args:
        data: iterable of sentence strings; each one is tokenized with
            ``str.split()``.
        vocab: list of tokens. A token's id is the index of its first
            occurrence in this list. Must contain ``'<PAD>'`` and ``'<UNK>'``.
        max_len: minimum length of each output array. Shorter sentences are
            right-padded with the ``'<PAD>'`` id.

    Returns:
        A list with one ``np.ndarray`` of ids per sentence. Tokens missing
        from ``vocab`` map to the ``'<UNK>'`` id. Sentences with more than
        ``max_len`` tokens are NOT truncated, so their arrays are longer than
        ``max_len``.

    Raises:
        ValueError: if ``'<PAD>'`` or ``'<UNK>'`` is not in ``vocab``.
    """
    # list.index is kept for the two special tokens so a missing one still
    # raises ValueError, as before.
    pad_token_id = vocab.index('<PAD>')
    ukn_token_id = vocab.index('<UNK>')

    # Build the token -> id table once instead of scanning the vocab list
    # twice per token. setdefault keeps the FIRST index of a duplicated
    # token, which is what list.index returned.
    word_to_id = {}
    for idx, token in enumerate(vocab):
        word_to_id.setdefault(token, idx)

    id_data = []
    for sentence in data:
        ids = [word_to_id.get(word, ukn_token_id) for word in sentence.split()]

        if len(ids) < max_len:
            ids += [pad_token_id] * (max_len - len(ids))
        id_data.append(np.array(ids))

    return id_data

In [6]:
# Encode both splits with the shared vocabulary, using convert_to_ids' default
# padding length.
train_tokenized, dev_tokenized = (
    convert_to_ids(split_sentences, vocab)
    for split_sentences in (train_sentences, dev_sentences)
)

In [7]:
# Wrap each per-sentence id array in a torch.LongTensor; the lists stay
# one-tensor-per-sentence (no batching).
train_tokenized = list(map(torch.LongTensor, train_tokenized))
dev_tokenized = list(map(torch.LongTensor, dev_tokenized))

# Model

In [8]:
# Build the span-detection tagger from the loaded vocabulary, tag mapping and
# embedding matrix. The keyword is spelled `embedding_maxtrix` to match the
# call as written here — presumably that is the name in BiLSTMCRF's signature.
span_detection_model = BiLSTMCRF(
    vocab_size=len(vocab),
    tag_to_ix=tag_to_id,
    hidden_dim=200,
    embedding_maxtrix=embedding_maxtrix,
)

In [25]:
# Ad-hoc check: push the first training sentence through the model's
# _get_lstm_features (an underscore-prefixed, i.e. private, method of
# BiLSTMCRF). `feats` is not read by any other cell shown in this notebook.
# NOTE(review): this cell's execution count (25) is out of sequence with its
# neighbours, so it looks like leftover debugging — consider removing it so
# the notebook runs cleanly top to bottom.
sentence = train_tokenized[0]
feats = span_detection_model._get_lstm_features(sentence)

In [9]:
# train: plain SGD, one sentence per optimisation step (no batching).
# NOTE(review): the recorded run of this cell printed "epoch: 0" and then
# stopped with "AttributeError: 'list' object has no attribute 'shape'".
# `label` below is a plain Python list of [start, end, tag] spans; presumably
# neg_log_likelihood needs a tensor of per-token tag ids instead — confirm
# against bilstm_crf.BiLSTMCRF and the tag_to_id scheme before re-running.
optimizer = optim.SGD(span_detection_model.parameters(), lr=0.01, weight_decay=1e-4)
epoch_num = 10

for epoch in range(epoch_num):
    print('epoch: {}'.format(epoch))
    for i, sentence in enumerate(train_tokenized):
        label = new_train_labels[i]

        # Clear gradients left over from the previous sentence.
        span_detection_model.zero_grad()
        loss = span_detection_model.neg_log_likelihood(sentence, label)
        loss.backward()
        optimizer.step()

        # Report the loss of every 100th sentence only.
        if not i % 100:
            print('loss: {}'.format(loss.item()))

    # TODO: dev evaluation is not implemented. An earlier commented-out draft
    # looped over the untokenized dev_sentences / dev_labels and called
    # loss.backward() and optimizer.step() inside torch.no_grad(); a real
    # evaluation should use dev_tokenized / new_dev_labels and must not
    # backpropagate or step the optimizer.

# save model
# torch.save(span_detection_model.state_dict(), 'model/span_detection_model.pkl')

epoch: 0


AttributeError: 'list' object has no attribute 'shape'