# Python analysis of Project Rephetio epilepsy predictions

In [1]:
import pandas
import collections

In [2]:
# Pull the PK (Pouya Khankhanian) curation spreadsheet from Google Docs as TSV
doc_id = '1GJvqWp7WkMyboJ49Hts4eSOQCdJc_rcQenmqPLk6Bsw'
url = 'https://docs.google.com/spreadsheets/d/{}/export?format=tsv'.format(doc_id)
# Discard rows that have no category, then renumber the remaining rows from 0
pk_df = pandas.read_table(url).dropna(subset=['category']).reset_index(drop=True)
pk_df.head()

Unnamed: 0,name,prediction,disease_pctl,phcodb,trials,category,comment,sources
0,Topiramate,0.603,1.0,DM,35,AIGD,,
1,Ethotoin,0.589,0.9993,,0,AIGD,,
2,Quazepam,0.57,0.9987,,0,AIGD,,
3,Alprazolam,0.565,0.998,,1,AIGD,,
4,Primidone,0.494,0.9974,DM,0,AIGD,,


In [3]:
# Number of curated rows left after dropping those without a category
pk_df.shape[0]

100

In [4]:
# Write the curation table as tab-separated text: no index column,
# floats formatted with at most 5 significant digits
pk_df.to_csv(
    'data/PK-curation.tsv',
    sep='\t',
    float_format='%.5g',
    index=False,
)

In [5]:
# Distinct curation categories, sorted; reused below when summarizing windows
statuses = sorted(set(pk_df['category']))
statuses

['AIGD', 'IGD', 'UNKD']

In [6]:
def rolling_groups(df, k=5):
    """
    Generate one positional window of `df` per row.

    For the row at position i the window covers positions i - k through
    i + k inclusive. Near the top of the frame the lower bound is clamped
    to 0; near the bottom the slice simply runs out of rows, so edge
    windows contain fewer than 2 * k + 1 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to slide over, in its current row order.
    k : int
        Number of neighboring rows to include on each side of the center.

    Yields
    ------
    pandas.DataFrame
        The slice of `df` for each successive center row.
    """
    n_rows = len(df)
    for center in range(n_rows):
        # Clamp so a negative start is never read as "count from the end"
        lower = center - k if center > k else 0
        upper = center + k + 1
        yield df.iloc[slice(lower, upper), :]

def summarize_window(df, categories=None):
    """
    Summarize one window of predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Window with a `prediction` column and a `category` column.
    categories : iterable of str, optional
        Category labels to report a frequency for. When omitted, the
        notebook-level `statuses` list is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    pandas.Series
        Float series indexed by `min_pred`, `max_pred` and one
        `freq_<category>` entry per requested category (the fraction of
        rows in the window carrying that category; 0 if it is absent).

    Raises
    ------
    ValueError
        If the window contains no rows.
    """
    if categories is None:
        categories = statuses
    n_rows = len(df)
    if n_rows == 0:
        # An empty window has neither extremes nor a frequency denominator
        raise ValueError('cannot summarize an empty window')
    labels = ['min_pred', 'max_pred']
    # Series.min/max skip NaN; the builtin min/max give order-dependent
    # results as soon as a NaN is present
    values = [df.prediction.min(), df.prediction.max()]
    counter = collections.Counter(df.category)
    for status in categories:
        labels.append('freq_' + status)
        # Counter returns 0 for categories that do not occur in the window
        values.append(counter[status] / n_rows)
    # Build the series once with an explicit dtype rather than growing a
    # dtype-less empty Series label by label
    return pandas.Series(values, index=labels, dtype=float)

# One summary record per row of pk_df, each computed over that row's
# window of up to 7 neighbors on either side
rolling_df = pandas.DataFrame.from_records(
    summarize_window(window) for window in rolling_groups(pk_df, k=7)
)

In [7]:
# Attach each drug's rolling-window summary as extra columns next to its
# curation fields ('name' through 'category', label slice is inclusive).
# pk_df was re-indexed from 0 after filtering and rolling_df holds one record
# per pk_df row, so the two frames line up row for row.
# axis must be 'columns': axis='rows' stacks the frames on top of each other
# (twice as many rows, NaN-padded) instead of joining them side by side.
plot_df = pandas.concat([pk_df.loc[:, 'name':'category'], rolling_df], axis='columns')
# Guard against index misalignment silently adding rows
assert len(plot_df) == len(pk_df)
plot_df.tail()

Unnamed: 0,name,prediction,disease_pctl,phcodb,trials,category,min_pred,max_pred,freq_AIGD,freq_IGD,freq_UNKD
95,Dabrafenib,0.0306,0.9376,,0,UNKD,0.0296,0.0329,0.5,0.333333,0.166667
96,Rufinamide,0.0305,0.9369,DM,0,AIGD,0.0296,0.0327,0.454545,0.363636,0.181818
97,Memantine,0.0303,0.9363,,2,IGD,0.0296,0.0327,0.4,0.4,0.2
98,Zolpidem,0.0298,0.9356,,0,AIGD,0.0296,0.0325,0.444444,0.333333,0.222222
99,Acamprosate,0.0296,0.935,,0,UNKD,0.0296,0.0324,0.5,0.25,0.25


In [8]:
# Save the combined curation + window table as tab-separated text:
# no index column, floats formatted with at most 5 significant digits
plot_df.to_csv(
    'data/windows.tsv',
    sep='\t',
    float_format='%.5g',
    index=False,
)

## Stats

In [9]:
# Number of rows in the combined table
plot_df.shape[0]

100

In [10]:
# How many rows fall into each curation category
plot_df['category'].value_counts()

AIGD    77
IGD     15
UNKD     8
Name: category, dtype: int64

In [11]:
# Indications table pinned to a specific commit of dhimmel/learn
# (note: this rebinds `url`, which earlier held the Google Docs link)
url = 'https://github.com/dhimmel/learn/raw/d2251a942813015d0362a90f179c961016336e77/summary/indications.tsv'
# NOTE(review): DOID:1826 is presumably the epilepsy disease identifier and
# TREATS_CtD the compound-treats-disease relation; confirm against the file
aeds_in_phcodb = list(
    pandas.read_table(url)
    .query("rel_type == 'TREATS_CtD'")
    .query("disease_id == 'DOID:1826'")
    ['compound_name']
)
# Number of disease-modifying antiepileptics in PharmacotherapyDB
# (set() collapses any repeated compound names)
len(set(aeds_in_phcodb))

25

In [12]:
# Count of PharmacotherapyDB disease-modifying antiepileptics that also
# appear among the names in plot_df (the top 100 predictions)
len(set(aeds_in_phcodb).intersection(plot_df['name']))

23

In [13]:
# PharmacotherapyDB disease-modifying antiepileptics whose names are absent
# from plot_df (i.e. not among the top 100 predictions)
set(aeds_in_phcodb).difference(plot_df['name'])

{'Propofol', 'Vigabatrin'}