### STEP 2. GENERATE INTERNAL VALIDATION SETS & MODEL TRAINING####

### RUN ALL THE CODE BLOCK COMBINATIONS BELOW####
### MAKE SURE TO RESTART THE KERNEL BEFORE EACH RUN 

# In the run lists below, "#N" refers to the code block numbered N in this notebook

## Complete Set 1. positive triples + randomNegative triples (without distanceThreshold) —> run #1, #2, #3, #4
## Complete Set 2. positive triples + restrictedNegative triples (with distanceThreshold) —> run #1, #2, #3, #5


In [1]:
###FUNCTIONS#####
from functions import csvtodict, compute_threshold, generate_restricted_negatives, build_model
import pandas as pd
import numpy as np
import os
import itertools
import random


In [2]:
#####1.
# Load the positive therapeutic triples, sample 80% of them for model
# development, and index every triple that must never be generated as a
# training negative (external validation triples and all known positives).
parent_dir = os.path.dirname(os.getcwd())
print(parent_dir)

# Column labels for the assembled training frame: 3 name columns,
# 3 x 200 embedding columns and 1 label column (604 in total).
unified_columns = [f"{i}" for i in range(604)]
filename = os.path.join(parent_dir, 'Step 1 Data Processing/ROBOKOP+DrugMechDB/ROBOKOP+DrugmechDB Data/ROBOMechDB Processed Triples.csv') ###read in the therapeutic triples
df = pd.read_csv(filename)

# Positional labels: '0' = drug, '1' = disease, '2' = protein.
df.columns = ['0', '1', '2']

df_80 = df.sample(n=int(0.8 * len(df)), random_state=42)  ##take 80% of the positive therapeutic triples
df_80_indices = df_80.index.tolist()

####Find all unique drugs, diseases, and proteins in this 80% set.
# Sorted so that seeded random draws from these lists are reproducible.
unique_triples_drug = sorted(set(df_80['0']))
unique_triples_disease = sorted(set(df_80['1']))
unique_triples_protein = sorted(set(df_80['2']))

###Create a dictionary with all the external validation set triples so they're not accidentally part of model training set
df_randNegs = pd.read_csv(os.path.join(parent_dir, 'Step 3 External Validation and Model Development/External Validation Datasets/randomNegatives External Validation Set.csv'))
df_restrictedNegs = pd.read_csv(os.path.join(parent_dir, 'Step 3 External Validation and Model Development/External Validation Datasets/restrictedNegatives External Validation Set.csv'))
df_diverseNegs = pd.read_csv(os.path.join(parent_dir, 'Step 3 External Validation and Model Development/External Validation Datasets/diverseNegatives External Validation Set.csv'))

# The full positive set `df` is appended too, so a known positive can never be
# drawn as a negative.
# NOTE(review): pd.concat aligns on column labels, so this presumably relies on
# the external CSVs using '0', '1', '2' as their first three headers - confirm.
df_ext = pd.concat([df_randNegs.iloc[:, :3], df_restrictedNegs.iloc[:, :3], df_diverseNegs.iloc[:, :3], df], axis=0)

# One "drug disease protein" key per row, built column-wise instead of with
# three scalar .iloc lookups per row.
keys = [
    drug + " " + disease + " " + protein
    for drug, disease, protein in zip(df_ext.iloc[:, 0], df_ext.iloc[:, 1], df_ext.iloc[:, 2])
]

# Only membership is ever tested on this dictionary; every value is the flag 1.
pos_neg_triples_dictionary = dict.fromkeys(keys, 1)



/Users/eding/PycharmProjects/U24-ROBOKOP-Project-8-21-24


In [3]:
"""2. Importing created protein, disease, and drug vector dictionaries into here"""
# Locations of the three embedding CSVs. Each file is consumed twice below:
# once through csvtodict and once as a plain DataFrame.
protein_vector_csv = os.path.join(
    parent_dir,
    'Step 2 Data Embedding/Vector Dictionaries/ROBOMechDB Protein Vector Dictionary.csv')
disease_vector_csv = os.path.join(
    parent_dir,
    'Step 2 Data Embedding/Vector Dictionaries/ROBOMechDB Disease Vector Dictionary.csv')
drug_vector_csv = os.path.join(
    parent_dir,
    'Step 2 Data Embedding/Vector Dictionaries/ROBOMechDB Drug Vector Dictionary.csv')

# Lookups keyed by entity name.
# NOTE(review): the values are later used as .iloc row positions into the
# matching *_df frame, so csvtodict presumably maps name -> row number - confirm.
protein_dict = csvtodict(protein_vector_csv)
disease_dict = csvtodict(disease_vector_csv)
drug_dict = csvtodict(drug_vector_csv)

# The same files as DataFrames, with the first CSV row treated as the header.
protein_df = pd.read_csv(protein_vector_csv, header=0)
disease_df = pd.read_csv(disease_vector_csv, header=0)
drug_df = pd.read_csv(drug_vector_csv, header=0)

In [4]:
"""3. Create feature embedding dataframe for positive triples"""
# One feature row per sampled positive triple:
# [drug, disease, protein, <drug cols 1..200>, <disease cols 1..200>,
#  <protein cols 1..200>, 1], where the trailing 1 is the positive class label.
positive_triples_vector_array = []
for drug_name, disease_name, protein_name in df_80.itertuples(index=False, name=None):
    try:
        drug_vector = drug_df.iloc[drug_dict[drug_name], 1:201].tolist()
        disease_vector = disease_df.iloc[disease_dict[disease_name], 1:201].tolist()
        protein_vector = protein_df.iloc[protein_dict[protein_name], 1:201].tolist()
    except KeyError:
        # At least one entity has no entry in its vector dictionary: the
        # triple is left out of the training table.
        continue

    positive_triples_vector_array.append(
        [drug_name, disease_name, protein_name,
         *drug_vector, *disease_vector, *protein_vector,
         1]
    )

positive_triples_dataframe = pd.DataFrame(positive_triples_vector_array)

###Initialize random seeds (so we can replicate experiments)
# Seeds 1 through 19 plus 42: one model run per seed.
random_seeds = [*range(1, 20), 42]

# Collects the per-seed statistics returned by build_model.
model_auc_values = []

In [5]:
# Preview the assembled positive-triple feature table (names, embedding columns, label).
print(positive_triples_dataframe)

                   0                                  1    \
0           nintedanib      non-small cell lung carcinoma   
1          vinblastine                   kaposi's sarcoma   
2           alprazolam                            anxiety   
3      methamphetamine                   obesity disorder   
4             pindolol              hypertensive disorder   
...                ...                                ...   
7338      prednisolone                            uveitis   
7339  ethinylestradiol  postmenopausal atrophic vaginitis   
7340         mianserin          major depressive disorder   
7341          nicotine                nicotine dependence   
7342       gemcitabine                     ovarian cancer   

                                                    2        3         4    \
0               pre-mrna processing factor kinase prp4k  0.61658  0.393360   
1                             tubulin beta 2a class iia  0.80437  0.168270   
2     gamma-aminobutyric acid typ

In [7]:
"""4. Complete Set 1 Development"""

# Start from an empty result list so that re-running this block (for example
# after an interrupted run) cannot mix stale per-seed stats into the summary.
model_auc_values = []

for i in random_seeds:
    random.seed(i)

    # ---- Draw random negative triples (no distance threshold) ----
    # Target: 10% more negatives than positives. Some are dropped further
    # down when an entity turns out to have no embedding.
    target_negatives = int(1.1 * len(positive_triples_dataframe))
    negative_triples_array = []
    temp = set()  # keys already drawn for this seed
    while len(negative_triples_array) < target_negatives:  ###create negative triples
        # Keep this draw order (drug, protein, disease) and random.sample:
        # together they define the random stream consumed per seed.
        drug = random.sample(unique_triples_drug, k=1)[0]
        protein = random.sample(unique_triples_protein, k=1)[0]
        disease = random.sample(unique_triples_disease, k=1)[0]

        triple_key = drug + " " + disease + " " + protein
        # Reject known positives / external validation triples and repeats.
        if triple_key in pos_neg_triples_dictionary or triple_key in temp:
            continue
        temp.add(triple_key)
        negative_triples_array.append([drug, disease, protein])

    # ---- Attach embeddings; the trailing 0 is the negative class label ----
    negative_triples_vector_array = []
    for drug_name, disease_name, protein_name in negative_triples_array:
        try:
            drug_vector = drug_df.iloc[drug_dict[drug_name], 1:201].tolist()
            disease_vector = disease_df.iloc[disease_dict[disease_name], 1:201].tolist()
            protein_vector = protein_df.iloc[protein_dict[protein_name], 1:201].tolist()
        except KeyError:
            # No embedding for at least one entity: skip this negative.
            continue

        negative_triples_vector_array.append(
            [drug_name, disease_name, protein_name,
             *drug_vector, *disease_vector, *protein_vector,
             0]
        )

    negative_triples_dataframe = pd.DataFrame(negative_triples_vector_array)
    df = pd.concat([positive_triples_dataframe, negative_triples_dataframe], axis=0, ignore_index=True)
    df.columns = unified_columns

    ###CHECK TO MAKE SURE NO OVERLAP BETWEEN TRAIN AND EXTERNAL VALIDATION SET
    # The two printed counts are equal only if no (drug, disease, protein)
    # row is repeated anywhere in the concatenated frame.
    test = pd.concat([df_diverseNegs, df], axis=0)
    col = ['0', '1', '2']
    print(len(test))
    rem = test.drop_duplicates(subset=col)
    print(len(rem))

    ##Build 20 models generate performance stats
    # NOTE(review): build_model is assumed to return
    # [min_auc, max_auc, fcv_mean_auc, fcv_std_auc] for this seed - confirm.
    stat_list = build_model(df, i, 1)

    model_auc_values.append(stat_list)

fcv_df = pd.DataFrame(model_auc_values, columns=['min_auc', 'max_auc', 'fcv_mean_auc', 'fcv_std_auc'])

# Aggregate across seeds: overall min, overall max, mean of the means, and
# the MEAN of the per-seed standard deviations (not a pooled deviation).
min_auc = np.min(fcv_df.iloc[:, 0])
max_auc = np.max(fcv_df.iloc[:, 1])
avg_auc = np.mean(fcv_df.iloc[:, 2])
std_dev_auc = np.mean(fcv_df.iloc[:, 3])

ensemble_stats = np.array([min_auc, max_auc, avg_auc, std_dev_auc])
ensemble_stats = ensemble_stats.reshape(1, -1)
ensemble_stats_df = pd.DataFrame(ensemble_stats, columns=['min_auc', 'max_auc', 'avg_auc', 'std_dev_auc'])
ensemble_stats_df.to_csv(os.path.join(os.getcwd(), 'Classification Models/Complete Set 1 Models/complete_set_1_ensemble_stats.csv'))


34901
34901
                     0                              1  \
2245         meloxicam           rheumatoid arthritis   
6419        prednisone             nephrotic syndrome   
8541    dimenhydrinate                    tyrosinemia   
533          celecoxib                   dysmenorrhea   
3964         gefitinib  non-small cell lung carcinoma   
...                ...                            ...   
905        perospirone                  schizophrenia   
5192        prilocaine          premature ejaculation   
12172       stanozolol         vitamin b12 deficiency   
235        vincristine   acute lymphoblastic leukemia   
13349  incadronic acid            bacterial pneumonia   

                                                       2         3         4  \
2245               prostaglandin-endoperoxide synthase 2  0.610850 -0.219320   
6419                                             albumin  0.239800 -0.247750   
8541   potassium voltage-gated channel subfamily a me... -0.021


KeyboardInterrupt



In [None]:
###5 Complete Set 2 Development
# NOTE(review): keras is not referenced in this block; presumably it is needed
# by build_model or kept for its import side effects - confirm before removing.
import keras

# Fresh result list for this experiment.
model_auc_values = []

# Embedding matrices (columns 1..200) and the matching entity names (column 0).
unique_protein_vectors = np.array(protein_df.iloc[:,1:201])
unique_disease_vectors = np.array(disease_df.iloc[:,1:201])
unique_drug_vectors = np.array(drug_df.iloc[:,1:201])

unique_protein_dict_names = np.array(protein_df.iloc[:,0])
unique_disease_dict_names = np.array(disease_df.iloc[:,0])
unique_drug_dict_names = np.array(drug_df.iloc[:,0])

# One distance threshold per entity type, computed from that type's embeddings.
drug_threshold = compute_threshold(unique_drug_vectors)
disease_threshold = compute_threshold(unique_disease_vectors)
protein_threshold = compute_threshold(unique_protein_vectors)

for i in random_seeds:
    random.seed(i)
    random_rows = positive_triples_dataframe.sample(int(len(positive_triples_dataframe)/4), random_state = i)
    #we are now going to split these rows into 3 equal chunks.
    #the triples in the first chunk will have its protein parameter randomized,
    #the triples in the second chunk will have its disease parameter randomized, and so on

    # Shared across the three calls below so a generated triple is not repeated.
    temp_set = set()

    # Split the data
    # np.array_split is applied to the row positions rather than to the
    # DataFrame itself: passing a DataFrame goes through the deprecated
    # DataFrame.swapaxes (FutureWarning). The resulting chunks are the same.
    position_chunks = np.array_split(np.arange(len(random_rows)), 3)
    split_rows = [random_rows.iloc[positions] for positions in position_chunks]
    drug_disease_x, drug_x_protein, x_disease_protein = split_rows

    # Generate restrictedNegative triples for each category
    # NOTE(review): the (start, stop) tuples match the column ranges of the
    # protein / disease / drug embeddings in the feature rows, and 2 / 1 / 0
    # match the name column being replaced - confirm against
    # generate_restricted_negatives.
    drug_disease_x_negatives = generate_restricted_negatives(
        drug_disease_x, (403, 603), unique_protein_vectors, unique_protein_dict_names, 2,
        protein_threshold, pos_neg_triples_dictionary, temp_set, max_count=6, portion = 1
    )

    drug_x_protein_negatives = generate_restricted_negatives(
        drug_x_protein, (203, 403), unique_disease_vectors, unique_disease_dict_names, 1,
        disease_threshold, pos_neg_triples_dictionary, temp_set, max_count=6, portion = 1
    )

    x_disease_protein_negatives = generate_restricted_negatives(
        x_disease_protein, (3, 203), unique_drug_vectors, unique_drug_dict_names, 0,
        drug_threshold, pos_neg_triples_dictionary, temp_set, max_count=5, portion = 1
    )

    # Combine all negative triples and create the final dataframe
    negative_df = pd.concat([pd.DataFrame(triples) for triples in [drug_disease_x_negatives,    drug_x_protein_negatives, x_disease_protein_negatives]], ignore_index=True)
    df = pd.concat([positive_triples_dataframe, negative_df], axis=0, ignore_index=True)
    df.columns = unified_columns
    print(df)

    ##check to make sure no duplicates
    # The two printed counts are equal only if no (drug, disease, protein)
    # row is repeated anywhere in the concatenated frame.
    test = pd.concat([df_diverseNegs, df],axis=0)
    col = ['0', '1', '2']
    print(len(test))
    rem = test.drop_duplicates(subset = col)
    print(len(rem))
    print(i)

    # Perform 5-fold cross-validation to test the accuracy of the model######
    # NOTE(review): build_model is assumed to return
    # [min_auc, max_auc, fcv_mean_auc, fcv_std_auc] for this seed - confirm.
    stat_list = build_model(df, i, 2)
    model_auc_values.append(stat_list)

fcv_df = pd.DataFrame(model_auc_values, columns=['min_auc', 'max_auc', 'fcv_mean_auc','fcv_std_auc'])

# Aggregate across seeds: overall min, overall max, mean of the means, and
# the MEAN of the per-seed standard deviations (not a pooled deviation).
min_auc = np.min(fcv_df.iloc[:,0])
max_auc = np.max(fcv_df.iloc[:,1])
avg_auc = np.mean(fcv_df.iloc[:,2])
std_dev_auc = np.mean(fcv_df.iloc[:,3])

ensemble_stats = np.array([min_auc,max_auc,avg_auc,std_dev_auc])
ensemble_stats = ensemble_stats.reshape(1, -1)
ensemble_stats_df = pd.DataFrame(ensemble_stats, columns = ['min_auc', 'max_auc', 'avg_auc', 'std_dev_auc'])
ensemble_stats_df.to_csv(os.path.join(os.getcwd(), 'Classification Models/Complete Set 2 Models/complete_set_2_ensemble_stats.csv'))

min distance: 1.3490766481180738
mean distance: 3.9296513978117704
stdev distances: 1.1837539597778333
min distance: 1.3756730714527872
mean distance: 6.979102681442049
stdev distances: 2.672071221220308
min distance: 1.00340273053115
mean distance: 6.690955269076238
stdev distances: 4.105268395143029


  return bound(*args, **kwds)


                     0                              1  \
0           nintedanib  non-small cell lung carcinoma   
1          vinblastine               kaposi's sarcoma   
2           alprazolam                        anxiety   
3      methamphetamine               obesity disorder   
4             pindolol          hypertensive disorder   
...                ...                            ...   
14431      acamprosate        lennox-gastaut syndrome   
14432      agomelatine        lennox-gastaut syndrome   
14433    amitriptyline        lennox-gastaut syndrome   
14434     aripiprazole        lennox-gastaut syndrome   
14435      armodafinil        lennox-gastaut syndrome   

                                                       2        3        4  \
0                pre-mrna processing factor kinase prp4k  0.61658  0.39336   
1                              tubulin beta 2a class iia  0.80437  0.16827   
2      gamma-aminobutyric acid type a receptor subuni...  0.28355  0.57538   
3  