In [18]:
import torch
from bilstm_crf import BiLSTMCRF

# Load data

In [None]:
import json

# load tag_to_id
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default text encoding.
with open('data/tag_to_id.json', 'r', encoding='utf-8') as f:
    tag_to_id = json.load(f)

# load vocab: one token per line, a token's position in the list is its id
# NOTE: split('\n') keeps a trailing empty entry when the file ends with a
# newline; it is kept as-is so existing token ids do not shift.
with open('data/vocab.txt', 'r', encoding='utf-8') as f:
    vocab = f.read().split('\n')

# Kept as the last expression of the cell so the vocabulary size is displayed
len(vocab)

In [12]:
# load train and dev data
TRAIN_PATH = 'data/process_data/train.json'
DEV_PATH = 'data/process_data/dev.json'

# Read as UTF-8 explicitly: without it open() falls back to the platform's
# default encoding, which can mis-decode non-ASCII text.
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open(DEV_PATH, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

In [None]:
# Pull the 'text' and 'labels' values out of each split as plain lists
train_sentences, dev_sentences = (
    list(split['text'].values()) for split in (train_data, dev_data)
)

train_labels, dev_labels = (
    list(split['labels'].values()) for split in (train_data, dev_data)
)

In [1]:
import numpy as np

# Convert data to ids
def convert_to_ids(data, vocab, max_len=256):
    """Map whitespace-tokenized sentences to fixed-length arrays of vocab ids.

    Args:
        data: iterable of sentence strings; each one is split on whitespace.
        vocab: list of tokens in which a token's position is its id. Must
            contain '<PAD>' and '<UKN>'.
        max_len: length of every returned sequence. Shorter sentences are
            right-padded with the '<PAD>' id; longer ones are truncated.

    Returns:
        A list with one np.ndarray of ids per sentence, each of length
        ``max_len``.

    Raises:
        ValueError: if '<PAD>' or '<UKN>' is missing from ``vocab``.
    """
    pad_token_id = vocab.index('<PAD>')
    ukn_token_id = vocab.index('<UKN>')

    # Build the token -> id table once instead of scanning the vocab list for
    # every word. setdefault keeps the first position of a duplicated token,
    # which is what list.index() would have returned.
    word_to_id = {}
    for idx, token in enumerate(vocab):
        word_to_id.setdefault(token, idx)

    id_data = []
    for sentence in data:
        # Unknown words fall back to the '<UKN>' id; anything beyond max_len
        # is dropped so every sequence has the same length.
        ids = [word_to_id.get(word, ukn_token_id) for word in sentence.split()]
        ids = ids[:max_len]
        ids += [pad_token_id] * (max_len - len(ids))
        id_data.append(np.array(ids))

    return id_data

In [None]:
# Encode both splits with the same vocabulary (default max_len)
train_tokenized, dev_tokenized = (
    convert_to_ids(sentences, vocab)
    for sentences in (train_sentences, dev_sentences)
)

# Model

In [None]:
import fasttext
import numpy as np

# Path to the pre-trained fastText model.
# fasttext.load_model() only reads the binary `.bin` format; pointing it at
# the text `.vec` file fails with a file-format error, so the binary model
# is used here.
weights_path = 'weights/cc.vi.300.bin'

# Load the fastText model
model = fasttext.load_model(weights_path)

# Get the vocabulary and vector dimension
vocabulary = model.words
vector_dim = model.get_dimension()

# Build the embedding matrix: row i holds the vector for vocab[i], so row
# indices line up with the ids produced from `vocab`.
embedding_matrix = np.array([model.get_word_vector(word) for word in vocab])

# Sanity check: one row per vocab entry, one column per embedding dimension
assert embedding_matrix.shape == (len(vocab), vector_dim), embedding_matrix.shape

print("Embedding Matrix Shape:", embedding_matrix.shape)

In [None]:
# Persist the embedding matrix to disk in NumPy's .npy format
np.save(file='data/embedding_matrix.npy', arr=embedding_matrix)

# End