In [118]:
%load_ext autoreload
%autoreload 2
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
from copy import deepcopy
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import pandas as pd
from datasets import Dataset
from os.path import join as oj
import pickle as pkl
import os
import sys
import iprompt.data
from transformers import AutoTokenizer, OPTForCausalLM, AutoModelForCausalLM
# from iprompt.data import TASKS_GALACTICA
import transformers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [119]:
# load data
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
results_dir = '/home/chansingh/iprompt/experiments/results'

# Map run name (file name with '.pkl' removed) -> unpickled result object,
# for every file in results_dir named 'uniprot*.pkl'.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point this at result files you trust.
rs = {}
for fname in os.listdir(results_dir):
    if fname.endswith('.pkl') and fname.startswith('uniprot'):
        # The context manager closes each handle as soon as it has been read
        # instead of leaving it open until garbage collection.
        with open(oj(results_dir, fname), 'rb') as fh:
            rs[fname.replace('.pkl', '')] = pkl.load(fh)

# calculate metrics
def cyto(l: list) -> bool:
    """Return True if any string in ``l`` contains 'cyto' or 'membrane'.

    Matching is case-insensitive and done per string, so a keyword can never
    be "found" across the boundary between two adjacent strings (e.g.
    ``['cy', 'to']`` is not a match).

    Args:
        l: list of strings to search.

    Returns:
        True when at least one string contains one of the keywords,
        False otherwise (including for an empty list).
    """
    keywords = ('cyto', 'membrane')
    return any(kw in s.lower() for s in l for kw in keywords)

def rna(l: list) -> bool:
    """Return True if any string in ``l`` contains an RNA/ATP-related keyword.

    The keywords are 'rna', 'atp', 'adenosine' and 'ribonucleic'. Matching is
    a case-insensitive substring test done per string, so a keyword can never
    be "found" across the boundary between two adjacent strings (e.g.
    ``['transfer', 'nal']`` is not a match).

    Args:
        l: list of strings to search.

    Returns:
        True when at least one string contains one of the keywords,
        False otherwise (including for an empty list).
    """
    keywords = ('rna', 'atp', 'adenosine', 'ribonucleic')
    return any(kw in s.lower() for s in l for kw in keywords)

def mrr(x: list, f):
    """Reciprocal rank of the first element of ``x`` accepted by ``f``.

    Args:
        x: ranked list of items, best first.
        f: predicate called with a one-element list ``[item]``; a truthy
           result marks the item as a hit.

    Returns:
        ``1 / rank`` (rank is 1-based) of the first hit, or 0 if no item hits.
    """
    # The generator is lazy, so ``f`` is not evaluated past the first hit.
    hits = (1 / rank for rank, item in enumerate(x, start=1) if f([item]))
    return next(hits, 0)

# Score every run as [top-20 hit, top-5 hit, MRR]; runs whose name contains
# 'cyto' use the cyto matcher, all others use the rna matcher.
r = {}
for run_name, run in rs.items():
    prefixes = run['prefixes']
    matcher = cyto if 'cyto' in run_name else rna
    r[run_name] = [
        matcher(prefixes[:20]),
        matcher(prefixes[:5]),
        mrr(prefixes, matcher),
    ]

# Runs become columns; the three metrics become the rows.
df = pd.DataFrame.from_dict(r)
df.index = ['Top-20 prompt correctness', 'Top-5 prompt correctness', 'MRR']

# Partition run names by the substrings they contain.
ks_baseline = [name for name in df.columns if 'baseline' in name]
ks_cyto = [name for name in df.columns if 'cyto' in name]
ks_rna = [name for name in df.columns if 'cyto' not in name]

# Baseline runs (all, then split by task) and the non-baseline runs per task.
df_baseline = df[ks_baseline]
df_baseline_cyto = df[[name for name in ks_baseline if 'cyto' in name]]
df_baseline_rna = df[[name for name in ks_baseline if 'cyto' not in name]]
df_cyto = df[[name for name in ks_cyto if 'baseline' not in name]]
df_rna = df[[name for name in ks_rna if 'baseline' not in name]]

In [120]:
def _mean_pm_sem(frame):
    """Format each row of ``frame`` as ``'<mean> $\\pm$ <standard error>'``.

    The mean and the standard error (row-wise ``std`` divided by the square
    root of the number of columns) are each rounded to 2 decimals before
    being converted to strings.

    Args:
        frame: DataFrame with one column per run and one row per metric.

    Returns:
        Series of strings indexed like ``frame``'s rows.
    """
    mean_str = frame.mean(axis=1).round(2).astype(str)
    sem_str = (frame.std(axis=1) / np.sqrt(frame.shape[1])).round(2).astype(str)
    # Raw string: '\p' is not a valid escape sequence in a normal literal.
    return mean_str + r' $\pm$ ' + sem_str


scores_cyto = _mean_pm_sem(df_cyto)
scores_rna = _mean_pm_sem(df_rna)
scores_baseline = _mean_pm_sem(df_baseline)
scores_baseline_cyto = _mean_pm_sem(df_baseline_cyto)
scores_baseline_rna = _mean_pm_sem(df_baseline_rna)


In [122]:
# Assemble the summary table: one column per condition, one row per metric.
d = pd.concat((scores_cyto, scores_rna, scores_baseline), axis=1)
d.columns = ['Cyto', 'Binding', 'Null data']
# Reorder the rows, then shorten the row labels for the table.
d = d.loc[['MRR', 'Top-5 prompt correctness', 'Top-20 prompt correctness']]
d = d.rename(index={
    'Top-20 prompt correctness': 'Top-20 correctness',
    'Top-5 prompt correctness': 'Top-5 correctness',
})
# escape=False keeps the '$\pm$' markup intact in the emitted LaTeX.
print(d.to_latex(escape=False))

\begin{tabular}{llll}
\toprule
{} &             Cyto &          Binding &        Null data \\
\midrule
MRR                &   0.2 $\pm$ 0.08 &  0.08 $\pm$ 0.04 &  0.03 $\pm$ 0.01 \\
Top-5 correctness  &  0.25 $\pm$ 0.13 &  0.17 $\pm$ 0.11 &  0.05 $\pm$ 0.05 \\
Top-20 correcntess &  0.83 $\pm$ 0.11 &  0.33 $\pm$ 0.14 &  0.23 $\pm$ 0.09 \\
\bottomrule
\end{tabular}



  print(d.to_latex(escape=False))


In [111]:
# Print each run name followed by its prefixes, one per tab-indented line.
for run_name, run in rs.items():
    print(run_name, '\n\t'.join(run['prefixes']))

uniprot_cytoplasm_membrane_100_4  \(5^{
	 following condition "(O
	 Disease Colorectal cancer, otherwise No
	 development of breast cancer; No
	 KEGG pathway "pancreatic
	 antigen. If No then you
	 [14-3
	 regulation of glucose metabolism. No
	 amino acid glycine, and No
	 disease and No, otherwise.
	 function of iron uptake, No
	 given keyword and No otherwise.
	 nucleus. Otherwise you get No
	 nucleus. Otherwise, answer No
	 phenotype; otherwise, answer No
	 given keyword and No otherwise!
	 given keyword and there are at
	 given keyword and no otherwise,
	 [None](https
	 class I viral fusion activity.
	 following human disease(s)
	 class I viral transcription complex and
	 [None](http
	 mitochondria and has transmembrane domains.
	 class I viral membrane fusion proteins
uniprot_baseline_cytoplasm_membrane_100_2  answer ’YES’,
	 cancer or disease, and no
	 response and No otherwise ?
	 answer ’YES’ and
	 following disease names? P=
	 given tissue; No, otherwise
	 cancer or disease, f

# print example

In [125]:
from imodelsx import explain_dataset_iprompt, get_add_two_numbers_dataset
from iprompt.data import TASKS_GALACTICA

In [126]:
# Key of the task to look up in TASKS_GALACTICA.
task_name = 'tox21_0'

In [128]:
# Look up the task entry by name.
task = TASKS_GALACTICA[task_name]

# Call the task's generator and pull out the input/output columns as arrays.
# NOTE(review): this rebinds `df`, which held the metrics table earlier in
# the notebook.
df = task['gen_func']()
input_strings, output_strings = df['input'].values, df['output'].values

  control = d.loc[(d.sum(axis=1) == 0) & (d.isna().sum(axis=1) == 0)]
  tox = d.loc[(d.sum(axis=1) == 1) & (d.iloc[:, tox_target] == 1)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tox'].iloc[n:] = 0


In [134]:
# Display the first element of output_strings as this cell's result.
output_strings[0]

' No.\n\n'