In [1]:
from gensim.models import KeyedVectors
import os

# Make sure scipy v1.12 or earlier is installed; otherwise this cell won't work.
biowordvec_path = os.path.join(os.getcwd(), 'Embedding Models', 'BioWordVec Embedding Model.bin')
model = KeyedVectors.load_word2vec_format(biowordvec_path, binary=True)



In [2]:
# Parent of the current working directory; the processed-triples CSV is located relative to it.
parent_dir = os.path.split(os.getcwd())[0]

In [4]:
#create vector dictionaries for each unique drug, disease, target used in the triples
import pandas as pd
import csv


### Read in processed data from the dataset

# Input triples whose drug / disease / protein names are embedded further down.
triples_csv_path = os.path.join(
    parent_dir,
    'Step 1 Data Processing/ROBOKOP+DrugMechDB/ROBOKOP+DrugmechDB Data/ROBOMechDB Processed Triples.csv',
)
df = pd.read_csv(triples_csv_path)
# 'Unnamed: 0' is presumably a leftover index column from an earlier to_csv();
# errors='ignore' keeps this cell working if the CSV was saved without it.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Sorted unique names per entity type. Missing values are dropped first: a NaN
# mixed with strings makes sorted() raise TypeError, and a missing name cannot
# be embedded as a phrase anyway.
triples_drug = sorted(df['drug_name'].dropna().unique())
triples_disease = sorted(df['disease_name'].dropna().unique())
triples_protein = sorted(df['protein_name'].dropna().unique())

print(len(triples_drug))
print(len(triples_disease))
print(len(triples_protein))

#Vector embeddings

def get_phrase_vec(vocablist, phrase, embeddings=None):
    """Build a phrase embedding by summing the vectors of its words.

    Commas are removed and the phrase is split on whitespace. The phrase is
    embeddable only if every word is found in ``vocablist``; the first missing
    word aborts the lookup.

    Args:
        vocablist: Container of known words, tested with ``in``. A set or dict
            gives constant-time lookups; a list works but is scanned linearly.
        phrase: Name or phrase to embed. It is returned unchanged so callers
            can keep using it as a key.
        embeddings: Optional word -> vector lookup (anything supporting
            ``embeddings[word]``). Defaults to the module-level ``model``.

    Returns:
        A ``(flag, phrase, vector)`` tuple. On success ``flag`` is 0 and
        ``vector`` is the summed word vectors as a list. On failure ``flag``
        is 1 and ``vector`` is the string ``"no embedding"``.
    """
    if embeddings is None:
        embeddings = model

    # split() with no argument ignores repeated, leading and trailing
    # whitespace, so "heart  failure " does not produce empty tokens that
    # would wrongly mark the phrase as unembeddable.
    words = phrase.replace(",", "").split()

    # No words at all, or any out-of-vocabulary word, means no embedding.
    if not words or any(word not in vocablist for word in words):
        return 1, phrase, "no embedding"

    comb_emb = embeddings[words[0]]
    for word in words[1:]:
        comb_emb = embeddings[word] + comb_emb
    return 0, phrase, list(comb_emb)
         

# `vocab` (index -> key list) is kept for any code that still refers to it.
vocab = model.index_to_key
# Membership tests use gensim's key -> index dict instead: `word in dict` is a
# hash lookup, whereas `word in list` rescans the whole vocabulary for every
# word of every phrase.
vocab_lookup = model.key_to_index


def _embed_names(names):
    """Embed every name with get_phrase_vec, skipping names that cannot be embedded.

    Returns two parallel lists: the embeddable names, and their vectors as
    lists of Python floats.
    """
    embeddable_names = []
    vectors = []
    for name in names:
        # The result is deliberately NOT bound to a variable called `str`:
        # doing so at notebook scope shadows the builtin for every later cell.
        _, phrase, vector = get_phrase_vec(vocab_lookup, name)
        if vector == "no embedding":
            continue
        vectors.append([float(component) for component in vector])
        embeddable_names.append(phrase)
    return embeddable_names, vectors


embeddable_drug_list, drug_embeddings = _embed_names(triples_drug)
embeddable_disease_list, disease_embeddings = _embed_names(triples_disease)
embeddable_protein_list, protein_embeddings = _embed_names(triples_protein)

# name -> vector lookups for each entity type
disease_vector_dict = dict(zip(embeddable_disease_list, disease_embeddings))
drug_vector_dict = dict(zip(embeddable_drug_list, drug_embeddings))
protein_vector_dict = dict(zip(embeddable_protein_list, protein_embeddings))








1014
592
899


In [5]:
# For each entity type, print the number of embeddable names and then the number of vectors.
print(len(embeddable_disease_list), len(disease_embeddings), sep="\n")
print(len(embeddable_drug_list), len(drug_embeddings), sep="\n")
print(len(embeddable_protein_list), len(protein_embeddings), sep="\n")

590
590
999
999
836
836


In [6]:
# Number of entries in the drug, disease and protein vector dictionaries, one per line.
print(
    len(drug_vector_dict),
    len(disease_vector_dict),
    len(protein_vector_dict),
    sep="\n",
)

999
590
836


In [7]:
# Show the working directory that the os.getcwd()-based paths in this notebook resolve against.
working_dir = os.getcwd()
print(working_dir)

/Users/eding/PycharmProjects/U24-ROBOKOP-Project-8-7-24/Data Embedding & Model Development


In [8]:
# Export the vector dictionaries to CSV for use in embedding triples.

def _write_vector_dict(csv_path, vector_dict, default_width=200):
    """Write a name -> vector dictionary to ``csv_path``, one row per key.

    The header is ``key, value_1, ..., value_N``. N is taken from the first
    vector so the header always matches the data rows; ``default_width`` is
    used only when the dictionary is empty (or its first vector is).
    """
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        # Create the output folder on first run instead of failing in open().
        os.makedirs(out_dir, exist_ok=True)

    first_vector = next(iter(vector_dict.values()), None)
    width = len(first_vector) if first_vector else default_width

    with open(csv_path, 'w', encoding="utf-8", newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Header row
        writer.writerow(['key'] + [f'value_{i+1}' for i in range(width)])
        # One row per key: the key followed by its vector components
        for key, values in vector_dict.items():
            writer.writerow([key, *values])


vector_dict_dir = os.path.join(os.getcwd(), 'Vector Dictionaries')
for file_name, vector_dict in (
    ('ROBOMechDB Disease Vector Dictionary.csv', disease_vector_dict),
    ('ROBOMechDB Drug Vector Dictionary.csv', drug_vector_dict),
    ('ROBOMechDB Protein Vector Dictionary.csv', protein_vector_dict),
):
    _write_vector_dict(os.path.join(vector_dict_dir, file_name), vector_dict)


    


In [13]:
# Read the disease vector dictionary back from where the export cell writes it
# (<cwd>/Vector Dictionaries) rather than from a hard-coded, machine-specific path.
test = pd.read_csv(os.path.join(os.getcwd(), 'Vector Dictionaries', 'ROBOMechDB Disease Vector Dictionary.csv'))

In [14]:
# NOTE(review): absolute, machine-specific path. The 'ROBOMechDBa' file is not written by
# any cell visible in this notebook — confirm where it comes from before relying on it.
robomechdba_disease_csv = '/Users/eding/Desktop/U24 ROBOKOP Project/Data Embedding/Vector Dictionaries/ROBOMechDBa Disease Vector Dictionary.csv'
test2 = pd.read_csv(robomechdba_disease_csv)

In [15]:
# Print the frame read from the 'ROBOMechDBa' disease CSV; pandas abbreviates the wide frame in the text output.
print(test2)

                                            key   value_1   value_2   value_3  \
0                             abdominal abscess -0.322403  0.135110 -0.764980   
1                           abdominal infection  0.818227 -0.024040 -0.592960   
2                              absence epilepsy  0.067868  1.407110 -0.075173   
3                             acidosis disorder -0.306392  1.126410  0.551575   
4                                          acne  0.341810  0.444530  0.289900   
..                                          ...       ...       ...       ...   
585                    vulvovaginal candidiasis -0.136126  0.820252  0.251530   
586               waldenstrom macroglobulinemia  0.962720  1.390580  0.220943   
587                               west syndrome  1.101950  0.397000  1.842845   
588                                 wilms tumor  0.531680  0.386080  0.119300   
589  x-linked dominant hypophosphatemic rickets  0.647884  1.248490 -0.275390   

      value_4   value_5   v

In [16]:
# Print the frame read from the 'ROBOMechDB' disease CSV — presumably to compare it by eye with test2.
print(test)

                                            key   value_1   value_2   value_3  \
0                             abdominal abscess -0.322403  0.135110 -0.764980   
1                           abdominal infection  0.818227 -0.024040 -0.592960   
2                              absence epilepsy  0.067868  1.407110 -0.075173   
3                             acidosis disorder -0.306392  1.126410  0.551575   
4                                          acne  0.341810  0.444530  0.289900   
..                                          ...       ...       ...       ...   
585                    vulvovaginal candidiasis -0.136126  0.820252  0.251530   
586               waldenstrom macroglobulinemia  0.962720  1.390580  0.220943   
587                               west syndrome  1.101950  0.397000  1.842845   
588                                 wilms tumor  0.531680  0.386080  0.119300   
589  x-linked dominant hypophosphatemic rickets  0.647884  1.248490 -0.275390   

      value_4   value_5   v