In [15]:
#Imports

import json
import os
import pickle
import random

import joblib
import numpy as np
import openai
import pandas as pd
import requests
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import classification_report, roc_auc_score
from transformers import set_seed

In [2]:
# Read the task 1 positive and negative test triples; the first CSV column becomes the index.
df_positive = pd.read_csv('data/df_positive_test.csv', index_col=0)
df_negative = pd.read_csv('data/df_negative_1_test.csv', index_col=0)

# Keep only the rows whose link is 'is_a', then shuffle them with a fixed seed.
df_positive_is_a = (
    df_positive
    .loc[df_positive['link'] == 'is_a']
    .sample(frac=1, random_state=101)
)
df_negative_is_a = (
    df_negative
    .loc[df_negative['link'] == 'is_a']
    .sample(frac=1, random_state=101)
)

In [3]:
# Split pos and neg dfs into example and prompt triples

# Prompt (50 pos, 50 neg)
# .assign returns a new frame, so the label column is added without writing into a
# slice of the parent frame (which is what raised SettingWithCopyWarning before).
df_pos_50 = df_positive_is_a.iloc[:50].assign(y=1)
df_neg_50 = df_negative_is_a.iloc[:50].assign(y=0)

# Example (Remainder): rows that are never used as the triple to classify
df_pos_example = df_positive_is_a.iloc[50:]
df_neg_example = df_negative_is_a.iloc[50:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pos_50['y'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_neg_50['y'] = 0


In [4]:
# Stack the labelled positive and negative prompt triples, shuffle the rows with a
# fixed seed, and write the result to disk as the test sample.
labelled_frames = [df_pos_50, df_neg_50]
df_prompting_test = pd.concat(labelled_frames, axis=0)
df_prompting_test = df_prompting_test.sample(frac=1, random_state=101)
df_prompting_test.to_csv('prompting_test_Task_1')

# GPT-4 

In [5]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook; an unset variable yields "" (the previous placeholder).
API_KEY = os.environ.get("OPENAI_API_KEY", "")
API_ENDPOINT = "https://api.openai.com/v1/chat/completions"

In [6]:
# Function to submit prompt

def generate_chat_completion(messages, model="gpt-4", temperature=0, max_tokens=10, timeout=60):
    """Send a chat-completion request and return the text of the first choice.

    Args:
        messages: List of chat message dicts placed in the request body as ``messages``.
        model: Model name placed in the request body.
        temperature: Sampling temperature placed in the request body.
        max_tokens: Completion token cap; left out of the request when ``None``.
        timeout: Seconds ``requests`` waits for the server before raising a timeout error.

    Returns:
        The ``content`` string of ``choices[0]["message"]`` in the JSON response.

    Raises:
        RuntimeError: If the endpoint answers with a status code other than 200.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }

    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    if max_tokens is not None:
        payload["max_tokens"] = max_tokens

    # Without an explicit timeout requests waits indefinitely, which would stall
    # the whole prompting loop on a single hung connection.
    response = requests.post(
        API_ENDPOINT,
        headers=headers,
        data=json.dumps(payload),
        timeout=timeout,
    )

    if response.status_code != 200:
        raise RuntimeError(f"Error {response.status_code}: {response.text}")
    return response.json()["choices"][0]["message"]["content"]

In [7]:
# Empty results table with one column per few-shot example triple (three positive,
# three negative), plus the triple to classify, the raw response and the true label.
prediction_table = pd.DataFrame(
    columns=[f'{kind}_Ex_{k}' for kind in ('Pos', 'Neg') for k in (1, 2, 3)]
    + ['Prompt', 'Response', 'Label']
)

In [8]:
# Function to randomly select 3x positive and 3x negative example triples, take the test triple
# at position run_no as the one to classify, generate/submit the prompt and keep track of results

def _triple_text(head, link, tail):
    """Render a triple as plain text, replacing underscores in the link with spaces."""
    return head + " " + " ".join(link.split('_')) + " " + tail


def run_prompts(run_no):
    """Build one few-shot prompt, submit it and record the outcome in ``prediction_table``.

    Three rows are sampled from ``df_pos_example`` and three from ``df_neg_example`` as
    labelled examples; row ``run_no`` of ``df_prompting_test`` is the triple to classify.

    Args:
        run_no: Positional row index into ``df_prompting_test``.

    Side effects:
        Rebinds the global ``prediction_table`` to a frame with one extra row holding the
        example triples, the prompt triple, the raw model response and the label.
    """
    global prediction_table

    new_row = {}

    prompt = df_prompting_test.iloc[run_no]
    new_row['Label'] = prompt['y']
    prompt_triple = _triple_text(prompt['head_name'], prompt['link'], prompt['tail_name'])
    new_row['Prompt'] = prompt_triple

    # The example frames are read by column position (1, 2, 4), exactly as before.
    # NOTE(review): presumably these are head_name, link and tail_name -- confirm against the CSVs.
    samp_pos = df_pos_example.sample(3)
    pos_examples = [
        _triple_text(samp_pos.iloc[k, 1], samp_pos.iloc[k, 2], samp_pos.iloc[k, 4])
        for k in range(3)
    ]

    samp_neg = df_neg_example.sample(3)
    neg_examples = [
        _triple_text(samp_neg.iloc[k, 1], samp_neg.iloc[k, 2], samp_neg.iloc[k, 4])
        for k in range(3)
    ]

    for i in range(3):
        new_row[f"Pos_Ex_{i+1}"] = pos_examples[i]
        new_row[f"Neg_Ex_{i+1}"] = neg_examples[i]

    full_prompt = f"""
    Your task is to classify triples as True or False. If you do not know the answer, state 'I don't know.'

    <triple>: {neg_examples[0]}
    <classification>: False
    
    <triple>: {pos_examples[0]}
    <classification>: True
    
    <triple>: {neg_examples[1]}
    <classification>: False

    <triple>: {pos_examples[1]}
    <classification>: True

    <triple>: {pos_examples[2]}
    <classification>: True

    <triple>: {neg_examples[2]}
    <classification>: False

    <triple>: {prompt_triple}
    """

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": full_prompt},
    ]

    response_text = generate_chat_completion(messages)
    new_row['Response'] = response_text

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    prediction_table = pd.concat(
        [prediction_table, pd.DataFrame([new_row])],
        ignore_index=True,
    )
    

In [9]:
# DataFrame.sample (called inside run_prompts without random_state) draws from NumPy's
# global random state, so NumPy must be seeded too; seeding the stdlib `random` module
# alone leaves the few-shot example selection different on every run.
random.seed(352)
np.random.seed(352)

# One prompt per row of the 100-row test sample.
for i in range(100):
    run_prompts(run_no=i)

In [10]:
# Turn each raw response into a numeric prediction: 1 if it contains 'True', 0 if it
# contains 'False' (assigned second, so it wins when both occur), 2 if neither occurs.
says_true = prediction_table['Response'].str.contains('True')
says_false = prediction_table['Response'].str.contains('False')
prediction_table.loc[says_true, 'Resp_numeric'] = 1
prediction_table.loc[says_false, 'Resp_numeric'] = 0
prediction_table['Resp_numeric'] = prediction_table['Resp_numeric'].fillna(2)

# Mark the rows whose numeric prediction equals the ground-truth label.
is_match = prediction_table['Label'] == prediction_table['Resp_numeric']
prediction_table['Correct'] = np.where(is_match, 1, 0)

prediction_table.head()

Unnamed: 0,Pos_Ex_1,Pos_Ex_2,Pos_Ex_3,Neg_Ex_1,Neg_Ex_2,Neg_Ex_3,Prompt,Response,Label,Resp_numeric,Correct
0,norsolorinic acid anthrone is a polyketide,"N-[(3R,4R,5S,6R)-5-[(2S,3R,4R,5S,6R)-3-Acetami...","4-hexyl-3-thiophen-2-yl-1H-1,2,4-triazole-5-th...",aryne is a benzenesulfonic acids,"Avenestergenin A2 is a 1-[(3S,9S,10S)-12-[(2R)...",[2-hydroxy-5-(prop-2-en-1-yl)phenyl]oxidanesul...,"N-[(2S,3S,6R)-6-[2-(ethylsulfonylamino)ethyl]-...",<classification>: False,1,0.0,0
1,His-Ser-Asp is a oligopeptide,benazepril is a lactam,lignin cw compound-134 is a phenols,beta-glucosyl 6-beta-glucosyloxy-indole-3-carb...,Gly-Arg-Leu is a Glu-Asp-Tyr,"N-[(2S,3R,4R,5R,6R)-6-[[(2R,3R,4R,5S,6R)-3-Ace...","N-(3,4-dimethoxyphenyl)-6-phenyl-4-thieno[2,3-...",<classification>: False,1,0.0,0
2,"2-ethylacrylic acid is a alpha,beta-unsaturate...",(S)-bitolterol is a bitolterol,2-O-(6-phosphono-alpha-D-mannosyl)-D-glyceric ...,"Mycosanoic acid (C24) is a N-[(5S,6S,9S)-8-[cy...",31-demethylbuxaminol is a 5-(4-methoxyphenyl)-...,diacylglycerol 38:2 is a Putaminoxin,cobalt-precorrin-6B is a cobalt corrinoid,<classification>: True,1,1.0,1
3,platensic acid methyl ester is a methyl ester,7-O-[alpha-L-rhamnosyl-(1->2)-beta-D-glucosyl]...,phytol is a diterpenoid,Adrenochrome o-semiquinone is a 2-(2-chlorophe...,BMS-453 is a pirazofurin,19-Methoxypomolic acid 3-arabinoside is a alph...,2-methoxyethyl 2-(4-tert-butylphenyl)-2-cyano-...,<classification>: False,0,0.0,1
4,N-acyl-O-(3-sn-phosphatidyl)-L-serine is a L-s...,idrocilamide is a cinnamamides,N-[[[(2-hydroxyphenyl)-oxomethyl]hydrazo]-sulf...,"noreugenin(1-) is a 24-methylcholesta-5,24-die...",iron(2+) sulfides is a phosphatidylinositol 40...,"N-[(2R,3R,4R,5S,6R)-2-[(2R,3S,4R,5S)-4-[(2S,3R...","1-(4-fluorophenyl)-3-[(3S,9R,10R)-12-[(2R)-1-h...",<classification>: False,0,0.0,1


In [11]:
# Report how many predictions were correct and the fraction of all rows they represent
n_correct = prediction_table['Correct'].sum()
print(f"Number correct: {n_correct}")

acc = n_correct / len(prediction_table)
print(acc)

Number correct: 85
0.85


In [12]:
# Confusion counts over the rows answered True (1) or False (0); rows with any other
# Resp_numeric value fall into none of the four cells.
said_true = prediction_table['Resp_numeric'] == 1
said_false = prediction_table['Resp_numeric'] == 0
was_right = prediction_table['Correct'] == 1
was_wrong = prediction_table['Correct'] == 0

tp = len(prediction_table[said_true & was_right])
fp = len(prediction_table[said_true & was_wrong])
tn = len(prediction_table[said_false & was_right])
fn = len(prediction_table[said_false & was_wrong])

print(tp, fp, tn, fn, sep='\n')

39
1
46
9


# Random forest models

## Random embeddings

In [13]:
# Read back the test sample that was exported to 'prompting_test_Task_1' earlier in the notebook.
df_test = pd.read_csv('prompting_test_Task_1', index_col=0)

In [16]:
# Load the saved model evaluated in this section (random embeddings).
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
filename = 'models/rf_random_len_task1.joblib'
model = joblib.load(filename)

In [17]:
#%% load emb_dict
# Tokenizer matching runs of word characters (\w+); text_to_vec and link_to_vec read it as a global.
tokenizer = RegexpTokenizer(r'\w+')

# Embedding lookup used by this section (queried with .get(token) in the helpers).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('embeddings/random.pkl','rb') as f:
    emb_dict = pickle.load(f)

In [18]:
def text_to_vec(text, emb_dict):
    """Embed ``text`` as the mean of its token vectors.

    The text is lower-cased and split with the module-level ``tokenizer``. Tokens of
    three or more characters that have an entry in ``emb_dict`` are averaged; when none
    qualify, every token that has an entry is averaged instead.

    Args:
        text: String to embed.
        emb_dict: Mapping from token to vector, queried with ``.get``.

    Returns:
        ``np.mean(vectors, axis=0)`` over the selected vectors, or ``None`` when no
        token has an entry in ``emb_dict`` (the same convention as ``link_to_vec``).
    """
    tokens = tokenizer.tokenize(text.lower())
    known = [(tok, emb_dict.get(tok)) for tok in tokens]
    known = [(tok, vec) for tok, vec in known if vec is not None]

    # Prefer tokens of length >= 3; fall back to every known token otherwise.
    vectors = [vec for tok, vec in known if len(tok) >= 3]
    if not vectors:
        vectors = [vec for _, vec in known]

    # np.mean of an empty list yields a NaN scalar plus a RuntimeWarning, so report
    # "no embedding" explicitly instead.
    if not vectors:
        return None
    return np.mean(vectors, axis=0)

def link_to_vec(text, emb_dict):
    """Embed a link name as the mean of its token vectors.

    The name is split on underscores; each part is lower-cased and tokenized with the
    module-level ``tokenizer``, and every token found in ``emb_dict`` contributes.

    Args:
        text: Link name such as ``'is_a'``.
        emb_dict: Mapping from token to vector, queried with ``.get``.

    Returns:
        ``np.mean(vectors, axis=0)`` over the found vectors, or ``None`` if none is found.
    """
    looked_up = (
        emb_dict.get(tok)
        for part in text.split('_')
        for tok in tokenizer.tokenize(part.lower())
    )
    found = [vec for vec in looked_up if vec is not None]
    if not found:
        return None
    return np.mean(found, axis=0)

In [19]:
# Embed the head name, link and tail name of every test triple with the loaded emb_dict
for emb_col, text_col, embed in (
    ('head_emb', 'head_name', text_to_vec),
    ('link_emb', 'link', link_to_vec),
    ('tail_emb', 'tail_name', text_to_vec),
):
    df_test[emb_col] = df_test[text_col].map(lambda text, embed=embed: embed(text, emb_dict))

In [20]:
# Convert each embedding with .tolist() and join the three per row with + (head, link, tail)
head_part = df_test['head_emb'].apply(lambda vec: vec.tolist())
link_part = df_test['link_emb'].apply(lambda vec: vec.tolist())
tail_part = df_test['tail_emb'].apply(lambda vec: vec.tolist())
df_test['X'] = head_part + link_part + tail_part

In [21]:
# Assemble features and labels, predict with the loaded model, and print per-class metrics
X_test = np.array(df_test['X'].to_list())
y_test = np.array(df_test['y'])

y_pred = model.predict(X_test)
df_test['y_pred'] = y_pred

report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9792    0.9400    0.9592        50
           1     0.9423    0.9800    0.9608        50

    accuracy                         0.9600       100
   macro avg     0.9607    0.9600    0.9600       100
weighted avg     0.9607    0.9600    0.9600       100



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    0.1s finished


## GloVe

In [22]:
# Load the saved model evaluated in this section (GloVe embeddings).
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
filename = 'models/rf_glove_len_task1.joblib'
model = joblib.load(filename)

In [23]:
#%% load emb_dict
# Tokenizer matching runs of word characters (\w+); text_to_vec and link_to_vec read it as a global.
tokenizer = RegexpTokenizer(r'\w+')

# Embedding lookup used by this section (queried with .get(token) in the helpers).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('embeddings/glove_random.pkl','rb') as f:
    emb_dict = pickle.load(f)

In [24]:
# Re-read the exported test sample so this section starts without the columns added to df_test above.
df_test = pd.read_csv('prompting_test_Task_1', index_col=0)

In [25]:
# Embed the head name, link and tail name of every test triple with the loaded emb_dict
for emb_col, text_col, embed in (
    ('head_emb', 'head_name', text_to_vec),
    ('link_emb', 'link', link_to_vec),
    ('tail_emb', 'tail_name', text_to_vec),
):
    df_test[emb_col] = df_test[text_col].map(lambda text, embed=embed: embed(text, emb_dict))

In [26]:
# Convert each embedding with .tolist() and join the three per row with + (head, link, tail)
head_part = df_test['head_emb'].apply(lambda vec: vec.tolist())
link_part = df_test['link_emb'].apply(lambda vec: vec.tolist())
tail_part = df_test['tail_emb'].apply(lambda vec: vec.tolist())
df_test['X'] = head_part + link_part + tail_part

In [27]:
# Assemble features and labels, predict with the loaded model, and print per-class metrics
X_test = np.array(df_test['X'].to_list())
y_test = np.array(df_test['y'])

y_pred = model.predict(X_test)
df_test['y_pred'] = y_pred

report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9583    0.9200    0.9388        50
           1     0.9231    0.9600    0.9412        50

    accuracy                         0.9400       100
   macro avg     0.9407    0.9400    0.9400       100
weighted avg     0.9407    0.9400    0.9400       100



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.0s finished


## PubMed from GloVe

In [28]:
# Load the saved model evaluated in this section (PubMed-from-GloVe embeddings).
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
filename = 'models/rf_pubmedfromglove_len_task1.joblib'
model = joblib.load(filename)

In [29]:
#%% load emb_dict
# Tokenizer matching runs of word characters (\w+); text_to_vec and link_to_vec read it as a global.
tokenizer = RegexpTokenizer(r'\w+')

# Embedding lookup used by this section (queried with .get(token) in the helpers).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('embeddings/pubmed_fromGlove_random.pkl','rb') as f:
    emb_dict = pickle.load(f)

In [30]:
# Re-read the exported test sample so this section starts without the columns added to df_test above.
df_test = pd.read_csv('prompting_test_Task_1', index_col=0)

In [31]:
# Embed the head name, link and tail name of every test triple with the loaded emb_dict
for emb_col, text_col, embed in (
    ('head_emb', 'head_name', text_to_vec),
    ('link_emb', 'link', link_to_vec),
    ('tail_emb', 'tail_name', text_to_vec),
):
    df_test[emb_col] = df_test[text_col].map(lambda text, embed=embed: embed(text, emb_dict))

In [32]:
# Convert each embedding with .tolist() and join the three per row with + (head, link, tail)
head_part = df_test['head_emb'].apply(lambda vec: vec.tolist())
link_part = df_test['link_emb'].apply(lambda vec: vec.tolist())
tail_part = df_test['tail_emb'].apply(lambda vec: vec.tolist())
df_test['X'] = head_part + link_part + tail_part

In [33]:
# Assemble features and labels, predict with the loaded model, and print per-class metrics
X_test = np.array(df_test['X'].to_list())
y_test = np.array(df_test['y'])

y_pred = model.predict(X_test)
df_test['y_pred'] = y_pred

report = classification_report(y_test, y_pred, digits=4)
print(report)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


              precision    recall  f1-score   support

           0     0.9792    0.9400    0.9592        50
           1     0.9423    0.9800    0.9608        50

    accuracy                         0.9600       100
   macro avg     0.9607    0.9600    0.9600       100
weighted avg     0.9607    0.9600    0.9600       100



[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    0.2s finished


## PubMed from scratch

In [34]:
# Load the saved model evaluated in this section (PubMed-from-scratch embeddings).
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
filename = 'models/rf_pubmedfromscratch_len_task1.joblib'
model = joblib.load(filename)

In [35]:
#%% load emb_dict
# Tokenizer matching runs of word characters (\w+); text_to_vec and link_to_vec read it as a global.
tokenizer = RegexpTokenizer(r'\w+')

# Embedding lookup used by this section (queried with .get(token) in the helpers).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('embeddings/pubmed_fromScratch_random.pkl','rb') as f:
    emb_dict = pickle.load(f)

In [36]:
# Re-read the exported test sample so this section starts without the columns added to df_test above.
df_test = pd.read_csv('prompting_test_Task_1', index_col=0)

In [37]:
# Embed the head name, link and tail name of every test triple with the loaded emb_dict
for emb_col, text_col, embed in (
    ('head_emb', 'head_name', text_to_vec),
    ('link_emb', 'link', link_to_vec),
    ('tail_emb', 'tail_name', text_to_vec),
):
    df_test[emb_col] = df_test[text_col].map(lambda text, embed=embed: embed(text, emb_dict))

In [38]:
# Convert each embedding with .tolist() and join the three per row with + (head, link, tail)
head_part = df_test['head_emb'].apply(lambda vec: vec.tolist())
link_part = df_test['link_emb'].apply(lambda vec: vec.tolist())
tail_part = df_test['tail_emb'].apply(lambda vec: vec.tolist())
df_test['X'] = head_part + link_part + tail_part

In [39]:
# Assemble features and labels, predict with the loaded model, and print per-class metrics
X_test = np.array(df_test['X'].to_list())
y_test = np.array(df_test['y'])

y_pred = model.predict(X_test)
df_test['y_pred'] = y_pred

report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9792    0.9400    0.9592        50
           1     0.9423    0.9800    0.9608        50

    accuracy                         0.9600       100
   macro avg     0.9607    0.9600    0.9600       100
weighted avg     0.9607    0.9600    0.9600       100



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed:    0.0s finished


## BioWordVec

In [40]:
# Load the saved model evaluated in this section (BioWordVec embeddings).
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
filename = 'models/rf_Biowordvec_len_task1.joblib'
model = joblib.load(filename)

In [41]:
#%% load emb_dict
# Tokenizer matching runs of word characters (\w+); text_to_vec and link_to_vec read it as a global.
tokenizer = RegexpTokenizer(r'\w+')

# Embedding lookup used by this section (queried with .get(token) in the helpers).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('embeddings/bio_random_extrinsic.pkl','rb') as f:
    emb_dict = pickle.load(f)

In [42]:
# Re-read the exported test sample so this section starts without the columns added to df_test above.
df_test = pd.read_csv('prompting_test_Task_1', index_col=0)

In [43]:
# Embed the head name, link and tail name of every test triple with the loaded emb_dict
for emb_col, text_col, embed in (
    ('head_emb', 'head_name', text_to_vec),
    ('link_emb', 'link', link_to_vec),
    ('tail_emb', 'tail_name', text_to_vec),
):
    df_test[emb_col] = df_test[text_col].map(lambda text, embed=embed: embed(text, emb_dict))

In [44]:
# Convert each embedding with .tolist() and join the three per row with + (head, link, tail)
head_part = df_test['head_emb'].apply(lambda vec: vec.tolist())
link_part = df_test['link_emb'].apply(lambda vec: vec.tolist())
tail_part = df_test['tail_emb'].apply(lambda vec: vec.tolist())
df_test['X'] = head_part + link_part + tail_part

In [45]:
# Assemble features and labels, predict with the loaded model, and print per-class metrics
X_test = np.array(df_test['X'].to_list())
y_test = np.array(df_test['y'])

y_pred = model.predict(X_test)
df_test['y_pred'] = y_pred

report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9592    0.9400    0.9495        50
           1     0.9412    0.9600    0.9505        50

    accuracy                         0.9500       100
   macro avg     0.9502    0.9500    0.9500       100
weighted avg     0.9502    0.9500    0.9500       100



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


## PubmedBERT

In [46]:
# Load the saved model evaluated in this section (PubMedBERT embeddings).
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
filename = 'models/rf_bert_pubmed.joblib'
model = joblib.load(filename)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [47]:
#%% load pubmed bert embeddings - dict key:node/link, value:embeddings
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# Lookup applied to the head_id / tail_id columns below.
with open('embeddings/id2bert_pubmed.pkl','rb') as f:
    dict_id2bert = pickle.load(f)

# Lookup applied to the link column below.
with open('embeddings/link2bert_pubmed.pkl','rb') as f:
    dict_link2bert = pickle.load(f)

In [48]:
# Re-read the exported test sample so this section starts without the columns added to df_test above.
df_test = pd.read_csv('prompting_test_Task_1', index_col=0)

In [49]:
# Look up the stored embedding for each head id, link and tail id
for emb_col, key_col, lookup in (
    ('head_emb', 'head_id', dict_id2bert),
    ('link_emb', 'link', dict_link2bert),
    ('tail_emb', 'tail_id', dict_id2bert),
):
    df_test[emb_col] = df_test[key_col].map(lookup)

In [50]:
# Combine the head, link and tail embeddings of each row into the feature column X.
# NOTE(review): `+` concatenates only if the stored embeddings are Python lists; NumPy arrays
# would be summed element-wise instead -- confirm which type the pickles hold.
df_test['X'] = df_test['head_emb'] + df_test['link_emb'] + df_test['tail_emb']
print("test done")

test done


In [51]:
# Assemble features and labels, predict with the loaded model, and print per-class metrics
X_test = np.array(df_test['X'].to_list())
y_test = np.array(df_test['y'])

y_pred = model.predict(X_test)
df_test['y_pred'] = y_pred

report = classification_report(y_test, y_pred, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9783    0.9000    0.9375        50
           1     0.9074    0.9800    0.9423        50

    accuracy                         0.9400       100
   macro avg     0.9428    0.9400    0.9399       100
weighted avg     0.9428    0.9400    0.9399       100



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished
