## Gerador de Word Embedding (Gensim) utilizando os datasets disponibilizados

In [2]:
import json
import os
import string
import re

from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import multiprocessing

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gdutr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# ---------------------------------------------------------------------------
# Entity labels
# ---------------------------------------------------------------------------
LABEL_DRUG_PROTEIN, LABEL_CHEMICAL = 'DRUG-PROTEIN', 'CHEMICAL'
LABEL_DISEASE, LABEL_SPECIES = 'DISEASE', 'SPECIES'

# Directory names associated with each label. The insertion order of this
# mapping is also the order of LABEL_LIST below.
LABEL_TO_DIR = {
    LABEL_DRUG_PROTEIN: ['BC2GM', 'JNLPBA'],
    LABEL_CHEMICAL: ['BC4CHEMD', 'BC5CDR-chem'],
    LABEL_DISEASE: ['BC5CDR-disease', 'NCBI-disease'],
    LABEL_SPECIES: ['linnaeus', 's800'],
}

# All labels, in declaration order.
LABEL_LIST = list(LABEL_TO_DIR)

# ---------------------------------------------------------------------------
# Data directories
# ---------------------------------------------------------------------------
DATA_ORIGIN_PATH, DATA_PREPARED_PATH = (
    os.path.join("data", leaf) for leaf in ("origin", "prepared")
)
DATA_AGGREGATE_PATH = os.path.join(DATA_PREPARED_PATH, "aggregate")

# ---------------------------------------------------------------------------
# Word-vector artefacts (all live under WORD_VECTOR_PATH)
# ---------------------------------------------------------------------------
WORD_VECTOR_PATH = "word_vec"
WORD_VECTOR_MODEL_NAME, WORD_VECTOR_GENSIM, WORD_VECTOR_GLOVE = (
    os.path.join(WORD_VECTOR_PATH, file_name)
    for file_name in ("biomed.model", "biomed_word2vec.txt", "glove.840B.300d.txt")
)
WORD_VECTOR_EXPORT_TENSORBOARD_PATH = os.path.join(
    WORD_VECTOR_PATH, "tensorboard", "biomed_ner"
)

# ---------------------------------------------------------------------------
# Model directories
# ---------------------------------------------------------------------------
MODEL_PATH = "model"
MODEL_TRAIN_PATH, MODEL_ACTUAL_PATH = (
    os.path.join(MODEL_PATH, leaf) for leaf in ("train", "actual")
)

# ---------------------------------------------------------------------------
# File extensions
# ---------------------------------------------------------------------------
TSV_EXTENSION, JSON_EXTENSION, SPACY_EXTENSION = ".tsv", ".json", ".spacy"

# ---------------------------------------------------------------------------
# Dataset split names
# ---------------------------------------------------------------------------
TRAIN_DEV_DATASET = "train_dev"
TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET = "train", "devel", "test"

# Note: TRAIN_DEV_DATASET is deliberately not part of this list.
DATASET_TYPE = [TRAIN_DATASET, VALIDATE_DATASET, TEST_DATASET]

In [4]:
# Numeric literals stripped from the text before tokenising: integers,
# decimals and scientific notation (e.g. "42", "1.5", "3e-4"). The decimal
# part and the exponent are independently optional.
_NUMBER_PATTERN = re.compile(r"\b\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\b")
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _tokenize_text(texto, stop_words):
    """Lower-case ``texto``, strip numbers and return the tokens worth keeping.

    A whitespace-delimited token is kept when it is longer than one character,
    is not in ``stop_words`` and is not made up solely of punctuation.
    """
    cleaned = _NUMBER_PATTERN.sub('', texto.lower())
    return [
        token
        for token in cleaned.split()
        if len(token) > 1
        and token not in stop_words
        # Character-wise test: `token in string.punctuation` would be a
        # substring check and miss tokens such as "--" or "...".
        and not all(char in _PUNCTUATION_CHARS for char in token)
    ]


def _load_sentences(directory, stop_words):
    """Read every regular file in ``directory`` as JSON and tokenise its records.

    Each file is expected to hold an iterable of objects with a ``"texto"``
    key. One token list is produced per record (possibly empty).
    """
    sentences = []
    # Sorted so the corpus order does not depend on the OS directory listing.
    for file_name in sorted(os.listdir(directory)):
        file_full_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_full_path):
            continue  # skip sub-directories

        with open(file_full_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        for an in json_data:
            sentences.append(_tokenize_text(an["texto"], stop_words))
    return sentences


def generate_word2vec():
    """Train a Word2Vec model on the aggregated datasets and save it to disk.

    Reads every JSON file directly under ``DATA_AGGREGATE_PATH``, tokenises the
    ``"texto"`` field of each record (lower-cased, numbers / English stop words /
    punctuation-only tokens removed) and trains a gensim ``Word2Vec`` model.

    Two files are written, and their parent directories are created if missing:

    * ``WORD_VECTOR_MODEL_NAME`` - the full gensim model (``Word2Vec.save``);
    * ``WORD_VECTOR_GENSIM`` - the vectors in word2vec text format.

    Raises:
        OSError: if the aggregate directory or one of its files cannot be read.
        json.JSONDecodeError: if a file in the aggregate directory is not valid JSON.
        KeyError: if a record has no ``"texto"`` key.
    """
    stop_words = set(stopwords.words('english'))
    sentences = _load_sentences(DATA_AGGREGATE_PATH, stop_words)

    w2v_model = Word2Vec(min_count=1,
                         window=5,
                         vector_size=500,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=5)

    w2v_model.build_vocab(sentences)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=1)

    # Make sure the output directories exist before saving anything.
    for out_path in (WORD_VECTOR_MODEL_NAME, WORD_VECTOR_GENSIM):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    w2v_model.save(WORD_VECTOR_MODEL_NAME)
    # WORD_VECTOR_GENSIM is the text export read by the word2vec2tensor step;
    # the previously referenced WORD_VECTOR_FILE_NAME was never defined.
    w2v_model.wv.save_word2vec_format(WORD_VECTOR_GENSIM)
        

In [79]:
# Build the corpus from DATA_AGGREGATE_PATH, train Word2Vec and save the model.
generate_word2vec()

#### Exportando arquivos para visualizar no Tensorboard

In [10]:
# Convert the word2vec text export into the tensor/metadata TSV pair used for the TensorBoard visualisation.
# NOTE(review): paths are hard-coded here; the input matches WORD_VECTOR_GENSIM, but the output prefix
# ("biomed") differs from WORD_VECTOR_EXPORT_TENSORBOARD_PATH ("biomed_ner") - confirm which one is intended.
!python -m gensim.scripts.word2vec2tensor -i word_vec/biomed_word2vec.txt -o word_vec/tensorboard/biomed

2021-04-13 05:12:54,706 - word2vec2tensor - INFO - running C:\Users\gdutr\miniconda3\envs\nlp-gpu\lib\site-packages\gensim\scripts\word2vec2tensor.py -i word_vec/biomed_word2vec.txt -o word_vec/tensorboard/biomed
2021-04-13 05:12:54,706 - keyedvectors - INFO - loading projection weights from word_vec/biomed_word2vec.txt
2021-04-13 05:13:36,310 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (90024, 500) matrix of type float32 from word_vec/biomed_word2vec.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2021-04-13T05:13:36.310462', 'gensim': '4.0.1', 'python': '3.8.8 (default, Feb 24 2021, 15:54:32) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'load_word2vec_format'}
2021-04-13 05:14:02,965 - word2vec2tensor - INFO - 2D tensor file saved to word_vec/tensorboard/biomed_tensor.tsv
2021-04-13 05:14:02,965 - word2vec2tensor - INFO - Tensor metadata file saved to word_vec/tensorboard/biomed_metadata.tsv
2021-04-13 05:14:02,965 - word2v