# Python analysis of Project Rephetio epilepsy predictions

In [1]:
import pandas
import collections

In [2]:
# Read the spreadsheet (ignoring its first row), keep the first 100 rows and
# first 11 columns, and give the columns used downstream snake_case names.
pk_df = pandas.read_excel('data/top-5-percent-PK-plots.xlsx', skiprows=1)
pk_df = pk_df.iloc[:100, :11]
pk_df = pk_df.rename(columns={
    'Name': 'name',
    'Prediction score': 'prediction',
    'Disease Pctl': 'disease_pctl',
})

# Collapse the three annotation columns into one label per row: cast them to
# int, take the row-wise maximum and translate that code into a status string.
# NOTE(review): codes missing from the mapping (e.g. 2) become NaN -- confirm
# the spreadsheet only ever yields 4, 3, 1 or 0.
pk_df = pk_df.assign(
    status=lambda frame: (
        frame[['AED', 'Anti-Epileptic properties', 'Induces seizure']]
        .astype(int)
        .max(axis='columns')
        .map({4: 'AED', 3: 'AEP', 1: 'IS', 0: '?'})
    )
)

# Retain only the columns needed by the rest of the notebook.
pk_df = pk_df.loc[:, ['name', 'prediction', 'disease_pctl', 'status']]
pk_df.head(2)

Unnamed: 0,name,prediction,disease_pctl,status
0,Topiramate,0.603,1.0,AED
1,Ethotoin,0.589,0.999,AED


In [3]:
# Distinct status labels in sorted order; summarize_window iterates over this
# list, so it determines which freq_* fields are produced and in what order.
statuses = sorted(pk_df['status'].unique())

In [4]:
def rolling_groups(df, k=5):
    """
    Generate one positional window of rows for every row of `df`.

    The window for the row at position `i` covers positions `i - k` through
    `i + k` inclusive. Near either end of the dataframe the window is simply
    truncated, so edge windows contain fewer than `2 * k + 1` rows.
    """
    n_rows = len(df)
    for center in range(n_rows):
        # Clamp the lower bound at zero; a negative start would make iloc
        # count from the end of the frame instead of truncating the window.
        lower = center - k if center > k else 0
        upper = center + k + 1  # iloc truncates an overshooting upper bound
        yield df.iloc[lower:upper, :]

def summarize_window(df, status_levels=None):
    """
    Summarize one window of predictions as a pandas.Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Window of rows with a `prediction` column and a `status` column.
    status_levels : iterable of str, optional
        Status labels to report a frequency for. When omitted, the
        notebook-level `statuses` list is used, which keeps
        `map(summarize_window, ...)` working unchanged.

    Returns
    -------
    pandas.Series
        `min_pred` and `max_pred` (smallest and largest prediction in the
        window) followed by one `freq_<status>` entry per status level: the
        fraction of the window's rows having that status. Levels absent from
        the window get a frequency of 0.
    """
    if status_levels is None:
        # Fall back to the global defined earlier in the notebook.
        status_levels = statuses
    counter = collections.Counter(df.status)
    n_rows = len(df)
    # Collect the fields in a dict and build the Series in one step, instead
    # of enlarging a dtype-less empty Series item by item (pandas deprecated
    # that constructor form and changed its default dtype between versions).
    summary = {
        'min_pred': min(df.prediction),
        'max_pred': max(df.prediction),
    }
    for status in status_levels:
        summary['freq_' + status] = counter[status] / n_rows
    return pandas.Series(summary)

# One summary record per row of pk_df, each computed over a window reaching
# 7 rows to either side of that row (truncated at the ends of the table).
rolling_df = pandas.DataFrame.from_records(
    summarize_window(window) for window in rolling_groups(pk_df, k=7)
)

In [5]:
# Place the window summaries next to the predictions they describe.
# axis='columns' joins the two frames side by side, aligned on the index;
# axis='rows' would instead stack them into twice as many rows, each half
# filled with NaN in the other frame's columns.
# NOTE(review): assumes pk_df still has the default 0..n-1 index so that it
# lines up with rolling_df, which has one record per pk_df row -- confirm.
plot_df = pandas.concat([pk_df, rolling_df], axis='columns')

In [6]:
# Show the last five rows, where the windows are truncated by the table end.
plot_df.tail(n=5)

Unnamed: 0,name,prediction,disease_pctl,status,min_pred,max_pred,freq_?,freq_AED,freq_AEP,freq_IS
95,Desipramine,0.0308,0.938,IS,0.0298,0.033,0.166667,0.416667,0.083333,0.333333
96,Dabrafenib,0.0306,0.938,?,0.0298,0.0329,0.090909,0.454545,0.090909,0.363636
97,Rufinamide,0.0305,0.937,AED,0.0298,0.0327,0.1,0.4,0.1,0.4
98,Memantine,0.0303,0.936,IS,0.0298,0.0327,0.111111,0.444444,0.0,0.444444
99,Zolpidem,0.0298,0.936,AED,0.0298,0.0325,0.125,0.5,0.0,0.375


In [7]:
# Write the combined table as tab-separated text without the index column,
# formatting floats to at most 5 significant digits.
plot_df.to_csv(
    'data/windows.tsv',
    sep='\t',
    index=False,
    float_format='%.5g',
)