# LC predictions

#### 27 Aug 2024

In [None]:
# To reproduce, run the following to load the matrix package (otherwise ModelWrapper can't be read)
%load_ext kedro.ipython
%reload_kedro  --env cloud


In [None]:
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm

#sys.path.append('matrix/matrix/src')
#from matrix.pipelines.modelling.model import ModelWrapper

## Load input files

In [None]:
# Load embeddings - embeddings from 26 August run, setup 3. Downloaded from GCS. 
EMB_PATH = '/Users/piotrkaniewski/work/run_infer/input/long_covid/rtx_kg2_nodes'
emb_nodes = pd.read_parquet(EMB_PATH)

In [None]:
# ID to embedding mapping.
# Built column-wise: iterrows() constructs a Series for every node row, which is very slow on a large node table.
# As before, if an id occurs more than once the last occurrence wins.
id_to_embedding = dict(zip(emb_nodes['id'], emb_nodes['topological_embedding']))

In [None]:
# Loading the usual RTX-KG 2.7.3 nodes dataset containing node names and ids (using Polars for speed). Can also be downloaded from GCS
NODES_PATH  =  '/Users/piotrkaniewski/work/run_infer/input/rtx_kg2_nodes.tsv'
nodes = pl.read_csv(NODES_PATH, separator='\t')

In [None]:
# Load model - xg_ensemble from 26 August run, setup 3. Downloaded from MLFlow
MODEL_PATH = '/Users/piotrkaniewski/work/run_infer/input/long_covid/ensemble_model.pkl'
with open(MODEL_PATH, 'rb') as f:
    model = joblib.load(f)

In [None]:
# Load ground truth positive dataset. Can be downloaded from GCS
POS_PATH = '/Users/piotrkaniewski/work/run_infer/input/tp_pairs.txt'
tp_df = pd.read_csv(POS_PATH, sep='\t')

## Code for ranking drugs

In [None]:
def give_vectorised_dataset(df, id_to_embedding):
    '''
    Turn a drug-disease pair table into an array of concatenated embeddings.

    Args:
        df (pd.DataFrame): Pairs to convert. Drug ids are read from the 'source' column
            and disease ids from the 'target' column.
        id_to_embedding (dict): Lookup from node id to its embedding vector.
    Returns:
        np.ndarray with one entry per row of df: the drug embedding followed by the
        disease embedding.
    '''
    # Look both ids up and join the two vectors, pair by pair, in row order.
    features = [
        np.concatenate([id_to_embedding[drug_id], id_to_embedding[disease_id]])
        for drug_id, disease_id in zip(df['source'], df['target'])
    ]
    return np.array(features)

    
def get_probabilities(pairs, model, id_to_embedding):
    """
    Score every drug-disease pair in `pairs` with the model.

    The pairs are vectorised with give_vectorised_dataset and handed to
    model.predict_proba; its output is returned unchanged.
    """
    return model.predict_proba(give_vectorised_dataset(pairs, id_to_embedding))


def get_ranked_drugs(model, drugs_lst, disease_id, id_to_embedding):
    """
    Rank a collection of drugs against a single disease by "treat" probability score.

    Returns a DataFrame with columns 'id' (the drug id) and 'treat_score'
    (column 1 of the get_probabilities output), sorted from highest to lowest
    score and re-indexed from 0.
    """
    # disease_id is a scalar, so it is repeated for every drug in drugs_lst
    candidate_pairs = pd.DataFrame({'source': drugs_lst, 'target': disease_id})
    scores = get_probabilities(candidate_pairs, model, id_to_embedding)
    candidate_pairs['treat_score'] = scores[:, 1]
    ranked = (
        candidate_pairs
        .sort_values('treat_score', ascending=False)
        .reset_index(drop=True)
    )
    return ranked.drop(columns='target').rename(columns={'source': 'id'})

def add_names_to_list(preds : pd.DataFrame, nodes : pl.DataFrame) -> pd.DataFrame:
    """Add names and descriptions to the predictions list.

    Args:
        preds: Predictions with an 'id' column. Modified in place.
        nodes: Node table with 'id', 'name' and 'des' columns.

    Returns:
        The same `preds` object with 'name' and 'description' columns added
        (taken from the 'name' and 'des' columns of `nodes`).

    Raises:
        IndexError: if an id in `preds` has no row in `nodes` (the same exception
            type the previous per-id lookup raised).
    """
    wanted_ids = preds['id'].tolist()

    # One vectorised pass over the node table, instead of two full scans of it
    # for every single prediction row.
    lookup = {}
    if wanted_ids:
        matched = nodes.filter(pl.col('id').is_in(wanted_ids)).select(['id', 'name', 'des'])
        for node_id, name, description in matched.iter_rows():
            # setdefault keeps the first matching row, like the previous `[0]` lookup
            lookup.setdefault(node_id, (name, description))

    # Fail before touching `preds`, so a failed call leaves it unmodified.
    missing = [node_id for node_id in wanted_ids if node_id not in lookup]
    if missing:
        raise IndexError(f'{len(missing)} id(s) not found in nodes, e.g. {missing[:5]}')

    preds['name'] = [lookup[node_id][0] for node_id in wanted_ids]
    preds['description'] = [lookup[node_id][1] for node_id in wanted_ids]
    return preds

## Make predictions

For the list of drugs, we use the set of drugs appearing in the ground truth positive dataset. 

In [None]:
ec_metadata =pd.read_parquet('data/03_primary/ec_medical_team/nodes')
ec_metadata

In [None]:
# Finding long covid nodes
covid_nodes = nodes.filter(pl.col('name').str.contains(r'(?i)covid'))
print(covid_nodes)


In [None]:
# DOID definition of Long COVID
LC_ID = 'DOID:0080848'

# MONDO definition of Long COVID
LC_MONDO_ID = 'MONDO:0100233'


In [None]:
# Get list of drugs and collection of nodes
drugs_lst = list(tp_df['source'].unique())

In [None]:
# Predictions for Long COVID DOID definition
preds_doid = add_names_to_list(get_ranked_drugs(model, drugs_lst, LC_ID, id_to_embedding), nodes)
preds_doid.to_csv('output_lc/predictions_LC_doid.csv', index=False)

In [None]:
# Predictions for subtypes of Long COVID
preds_subtypes_lst = [add_names_to_list(get_ranked_drugs(model, drugs_lst, 'EC:'+str(i+1), id_to_embedding), nodes) for i in range(9)]
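# Uncomment to write the per-subtype CSVs that the "Cumulative format" section reads back.
# (Do not assign the result to preds_subtypes_lst: to_csv returns None when given a path.)
# for i, preds in enumerate(preds_subtypes_lst, start=1):
#     preds.to_csv(f'output_lc/predictions_LC_subtype_{i}.csv', index=False)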

In [None]:
preds_subtypes_lst[0]

In [None]:
tp_df[tp_df.target==LC_ID]

In [None]:
tp_df[tp_df.target==LC_ID]

In [None]:
with pd.ExcelWriter('long_covid_treat_scores_v2.xlsx') as writer:  # doctest: +SKIP
    ec_metadata.to_excel(writer, sheet_name='metadata')
    for i in range(0,9):
        preds_subtypes_lst[i].to_excel(writer, sheet_name=f'ec_{i}')

To understand the effect of adding extra edges, we can make predictions for the MONDO definition of Long COVID.

In [None]:
# Predictions for the MONDO definition of Long COVID
preds_mondo = add_names_to_list(get_ranked_drugs(model, drugs_lst, LC_MONDO_ID, id_to_embedding), nodes)


## Cumulative format

In [None]:
#load all subtypes from output_lc dir
lc_subtypes_df = [pd.read_csv(f'output_lc/predictions_LC_subtype_{i}.csv') for i in range(1,10)]

In [None]:
# Keep only the first 500 rows of each subtype list and tag each row with its subtype.
# (Assumes the CSVs are already sorted by descending treat_score - TODO confirm.)
# .assign() returns a new frame, which avoids setting a column on the slice returned by head().
# NOTE: labels are 0-based here (EC0..EC8) while the files are numbered 1..9.
preds_subtypes_lst_500 = [
    subtype_df.head(500).assign(subtype=f'EC{i}')
    for i, subtype_df in enumerate(lc_subtypes_df)
]
total_df = pd.concat(preds_subtypes_lst_500, axis=0)

# One row per distinct (id, name, description); the summary columns are attached to this frame.
final_cumulative_df = total_df.loc[:, ['id', 'name', 'description']].drop_duplicates()

In [None]:
# calc frequency across subtypes
count_id = total_df.groupby(['id']).count()
count_id['frequency'] = count_id.treat_score/9

frequency_dict = count_id['frequency'].to_dict()
final_cumulative_df['freq_across_subtypes_top500'] = final_cumulative_df['id'].map(frequency_dict)

In [None]:
# average across subtypes
count_id['average'] = total_df.groupby(['id']).treat_score.mean()

average_dict = count_id['average'].to_dict()
final_cumulative_df['avg_score_across_subtypes_top500'] = final_cumulative_df['id'].map(average_dict)

In [None]:
# median across subtypes
count_id['median'] = total_df.groupby(['id']).treat_score.median()

median_dict = count_id['median'].to_dict()
final_cumulative_df['median_score_across_subtypes_top500'] = final_cumulative_df['id'].map(median_dict)

In [None]:
# IQR 
def iqr(x):
    """Return the interquartile range of x: its 0.75 quantile minus its 0.25 quantile."""
    upper_quartile = x.quantile(0.75)
    lower_quartile = x.quantile(0.25)
    return upper_quartile - lower_quartile

count_id['iqr'] = total_df.groupby(['id']).treat_score.agg(iqr)

iqr_dict = count_id['iqr'].to_dict()
final_cumulative_df['iqr_score_across_subtypes_top500'] = final_cumulative_df['id'].map(iqr_dict)

In [None]:
# ERROR CELL - this is where the mistake was (kept commented out for reference).
# I assumed that groupby(...).min() / .max() would pick, for each id, the row with the lowest / highest treat_score
# and also return the subtype belonging to that row. In fact every column is aggregated independently, so for the
# string column 'subtype' min() / max() return the alphabetically smallest / largest label within the group.
# Therefore, although the score returned was correct, the subtype reported next to it was incorrect.

# This seems like an obvious mistake to spot (and partially it is), but because we only take the top 500 drugs per subtype,
# the result is not always EC0 for min or EC8 for max - other values show up too. I also did a sanity check on a few drugs,
# but the examples I chose happened to be lucky ones, so I didn't spot any inconsistencies back then.

# min_dict = total_df.groupby(['id']).min().treat_score.to_dict()
# final_cumulative_df['min_score_across_subtypes_top500_score'] = final_cumulative_df['id'].map(min_dict)
# min_subtypes = total_df.groupby(['id']).min().subtype.to_dict()
# final_cumulative_df['min_score_across_subtypes_top500_subtype'] = final_cumulative_df['id'].map(min_subtypes)

# max_dict = total_df.groupby(['id']).max().treat_score.to_dict()
# final_cumulative_df['max_score_across_subtypes_top500_score'] = final_cumulative_df['id'].map(max_dict)
# max_subtypes = total_df.groupby(['id']).max().subtype.to_dict()
# final_cumulative_df['max_score_across_subtypes_top500_subtype'] = final_cumulative_df['id'].map(max_subtypes)

In [None]:
# For every drug id, record the min / max treat_score and the subtype of the row that actually holds that score.
# (Aggregating the 'subtype' column itself would return the alphabetically smallest / largest label instead.)
# A row qualifies when its score equals its group's min (or max); on ties the first such row in total_df order wins,
# which matches the previous per-id lookup but does not re-scan the whole frame for every id.
# Only treat_score is aggregated, rather than every column of total_df.
scores_by_id = total_df.groupby('id').treat_score
# Plain arrays are compared because total_df's index contains duplicates after the concat.
score_values = total_df.treat_score.to_numpy()

min_dict = scores_by_id.min().to_dict()
final_cumulative_df['min_score_across_subtypes_top500_score'] = final_cumulative_df['id'].map(min_dict)
is_min_row = score_values == scores_by_id.transform('min').to_numpy()
min_subtypes = total_df[is_min_row].drop_duplicates(subset='id', keep='first').set_index('id').subtype.to_dict()
final_cumulative_df['min_score_across_subtypes_top500_subtype'] = final_cumulative_df['id'].map(min_subtypes)

max_dict = scores_by_id.max().to_dict()
final_cumulative_df['max_score_across_subtypes_top500_score'] = final_cumulative_df['id'].map(max_dict)
is_max_row = score_values == scores_by_id.transform('max').to_numpy()
max_subtypes = total_df[is_max_row].drop_duplicates(subset='id', keep='first').set_index('id').subtype.to_dict()
final_cumulative_df['max_score_across_subtypes_top500_subtype'] = final_cumulative_df['id'].map(max_subtypes)

#### Save

In [None]:
# Ignore for now

# # mondo preds
# preds_mondo = preds_mondo.sort_values('probs', ascending=False).loc[:,['id','name','description','probs']]
# preds_mondo.rename(columns={'probs':'treat_score'})
# preds_mondo.loc[:,['id','name','description','treat_score']].to_csv('output_lc/mondo_0100233.csv')

# preds_mondo = preds_mondo.sort_values('probs', ascending=False).loc[:,['id','name','description','probs']]
# preds_mondo.rename(columns={'probs':'treat_score'})
# preds_mondo.loc[:,['id','name','description','treat_score']].to_csv('output_lc/mondo_0100233.csv')

In [None]:
final_cumulative_df.to_csv('output_lc/cumulative_list_v1_wip_troubleshoot.csv') #working version

In [None]:
# Ignore for now

# with pd.ExcelWriter('long_covid_treat_scores_v2.xlsx') as writer:  # doctest: +SKIP
#     preds_doid = preds_doid.sort_values('probs', ascending=False).loc[:,['id','name','description','probs']]
#     preds_doid.rename(columns={'probs':'treat_score'}).to_excel(writer, sheet_name='doid_0080848')
#     preds_mondo.loc[:,['id','name','description','treat_score']].to_csv('output_lc/mondo_0100233.csv')
#     preds_mondo = preds_mondo.sort_values('probs', ascending=False).loc[:,['id','name','description','probs']]
#     preds_mondo.rename(columns={'probs':'treat_score'}).to_excel(writer, sheet_name='mondo_0100233')

## Cumulative format v2 (requests)

In [None]:
# can re read but no need

#load all raw subtypes 
lc_subtypes_df = [pd.read_csv(f'output_lc/predictions_LC_subtype_{i}.csv') for i in range(1,10)]

In [None]:
#create a list for all
preds_subtypes_lst_all = lc_subtypes_df
for i in range(0,9):
    preds_subtypes_lst_all[i]['subtype']=f'EC{i}'
total_df_all = pd.concat(preds_subtypes_lst_all, axis=0)
final_cumulative_df_all = total_df_all.loc[:,['id','name','description']].drop_duplicates()

In [None]:
# frequency across subtypes all
count_id = total_df_all.groupby(['id']).count()
count_id['frequency'] = count_id.treat_score/9

frequency_dict = count_id['frequency'].to_dict()
final_cumulative_df_all['freq_across_subtypes_all'] = final_cumulative_df_all['id'].map(frequency_dict)

In [None]:
# average across subtypes
count_id['average'] = total_df_all.groupby(['id']).treat_score.mean()

average_dict = count_id['average'].to_dict()
final_cumulative_df_all['avg_score_across_subtypes_all'] = final_cumulative_df_all['id'].map(average_dict)

In [None]:
# median across subtypes
count_id['median'] = total_df_all.groupby(['id']).treat_score.median()

average_dict = count_id['median'].to_dict()
final_cumulative_df_all['median_score_across_subtypes_all'] = final_cumulative_df_all['id'].map(average_dict)

In [None]:
# IQR 
def iqr(x):
    """Interquartile range of x, i.e. quantile(0.75) - quantile(0.25)."""
    q75 = x.quantile(0.75)
    q25 = x.quantile(0.25)
    return q75 - q25

count_id['iqr'] = total_df_all.groupby(['id'])['treat_score'].agg(iqr)

iqr_dict = count_id['iqr'].to_dict()
final_cumulative_df_all['iqr_score_across_subtypes_all'] = final_cumulative_df_all['id'].map(iqr_dict)

In [None]:
# For every drug id, record the min / max treat_score over all subtypes and the subtype of the row holding it.
# A row qualifies when its score equals its group's min (or max); on ties the first such row in total_df_all order wins,
# which matches the previous per-id lookup but does not re-scan the whole frame for every id.
# Only treat_score is aggregated, rather than every column of total_df_all.
scores_by_id_all = total_df_all.groupby('id').treat_score
# Plain arrays are compared because total_df_all's index contains duplicates after the concat.
score_values_all = total_df_all.treat_score.to_numpy()

min_dict = scores_by_id_all.min().to_dict()
final_cumulative_df_all['min_score_across_subtypes_all_score'] = final_cumulative_df_all['id'].map(min_dict)
is_min_row_all = score_values_all == scores_by_id_all.transform('min').to_numpy()
min_subtypes = total_df_all[is_min_row_all].drop_duplicates(subset='id', keep='first').set_index('id').subtype.to_dict()
final_cumulative_df_all['min_score_across_subtypes_all_subtype'] = final_cumulative_df_all['id'].map(min_subtypes)

max_dict = scores_by_id_all.max().to_dict()
final_cumulative_df_all['max_score_across_subtypes_all_score'] = final_cumulative_df_all['id'].map(max_dict)
is_max_row_all = score_values_all == scores_by_id_all.transform('max').to_numpy()
max_subtypes = total_df_all[is_max_row_all].drop_duplicates(subset='id', keep='first').set_index('id').subtype.to_dict()
final_cumulative_df_all['max_score_across_subtypes_all_subtype'] = final_cumulative_df_all['id'].map(max_subtypes)

In [None]:
# Note: membership is taken from total_df (top-500 rows only), not total_df_all, as we are only interested in the top 500.
# The loop variable is not called `id`, so the builtin id() is not shadowed for the rest of the notebook session.
for subtype_idx in range(0, 9):
    name = f'EC{subtype_idx}'
    # ids that appear in this subtype's top-500 list
    subtype_ids = total_df.loc[total_df.subtype == name].id.values
    final_cumulative_df_all[f'in_{name}_top500'] = final_cumulative_df_all['id'].isin(subtype_ids)

### Merge cumulative v1 with v2

In [None]:
#make sure we have same indices
final_cumulative_df_all = final_cumulative_df_all[final_cumulative_df_all.id.isin(final_cumulative_df.id)].reset_index()

#merge
final_cumulative_df_all = final_cumulative_df_all.merge(final_cumulative_df.drop(['name','description'], axis=1), on='id', how='right')

#remove unwanted 
final_cumulative_df_all 

In [None]:
final_cumulative_df_all.columns

In [None]:
# change format so that it makes sense 
final_cumulative_df_all.loc[:,['id', 'name', 'description', 'freq_across_subtypes_top500',
       'avg_score_across_subtypes_top500',
       'avg_score_across_subtypes_all',
       'median_score_across_subtypes_top500',
       'median_score_across_subtypes_all',
       'iqr_score_across_subtypes_top500',
       'iqr_score_across_subtypes_all',
       'min_score_across_subtypes_top500_score',
       'min_score_across_subtypes_top500_subtype',
       'max_score_across_subtypes_top500_score',
       'max_score_across_subtypes_top500_subtype',
       'min_score_across_subtypes_all_score',
       'min_score_across_subtypes_all_subtype',
       'max_score_across_subtypes_all_score',
       'max_score_across_subtypes_all_subtype',
       'in_EC0_top500', 'in_EC1_top500', 'in_EC2_top500', 'in_EC3_top500',
       'in_EC4_top500', 'in_EC5_top500', 'in_EC6_top500', 'in_EC7_top500',
       'in_EC8_top500']].to_csv('output_lc/final_cumulative_v3_fixed.csv')

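# TODO: replace the 0-based subtype labels (EC0..EC8) with 1-based ones (EC1..EC9); this was forgotten here, so it was done manually in Excel.
# When renaming, start with EC8 -> EC9 and work downwards; going upwards from EC0 -> EC1 would re-match labels that were already renamed.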