In [1]:
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Load the tab-separated table of predicted compounds for CRC.
df = pd.read_csv(
    '/home/jovyan/work/Predicted_Compounds_for_CRC_v0_1 (1).tsv',
    sep='\t',
)

In [3]:
# Preview at most the first 20 rows of the compounds table.
df.head(20)

Unnamed: 0,Canon/Type,CMS1,CMS2,CMS3,CMS4,Unnamed: 5,Unnamed: 6
0,KRAS,No predictions yet!,,,,,
1,APC,Quinapril — 6.355 Rituximab — 4.840,No predictions yet!,Apafant — 6.428,Clofibric Acid (Platinum Enhancer) — 6.193 PD...,,
2,DCC,,,,,,
3,TGFBR2,,,,,,
4,SMAD2,,,,,,
5,SMAD4,,,,,,
6,BAX,,,,,,
7,P53,,,,,,
8,MLH1,,,,,,
9,MLH2,,,,,,


In [4]:
# Load the raw per-therapy publication and trial metrics, then display the frame.
predicted_df = pd.read_csv(
    '/home/jovyan/work/predicted-therapeutics-raw-data.csv',
    sep=',',
)
predicted_df

Unnamed: 0,predicted therapy,n_crc_publications_since_2018,n_cancer_publications_since_2018,%_crc_papers_since_2018,n_crc_publications_before_2018,n_cancer_publications_before_2018,%_crc_papers_before_2018,crc_papers_overall,%_crc_after_2018,cancer_papers_overall,...,n_crc_trials,n_total_cancer_trials,n_other_cancer_trials,%_trials_for_crc,term_frequency_per_year,used_in_any_trial,usage_in_other_cancers,approved,severe_adverse_events,ongoing_development
0,sirolimus,18500.0,29000.0,0.64,18300.0,91500.0,0.2,36800.0,0.5,120500.0,...,21.0,782.0,761.0,0.03,668991.71,1.0,1.0,1.0,,1.0
1,birinapant,9.0,969.0,0.01,8.0,698.0,0.01,17.0,0.53,1667.0,...,1.0,13.0,12.0,0.08,4114.86,1.0,0.0,0.0,,0.0
2,gliquidone,45.0,275.0,0.16,56.0,437.0,0.13,101.0,0.45,712.0,...,0.0,1.0,1.0,0.0,2947.67,1.0,0.0,1.0,,1.0
3,rituximab,16600.0,26500.0,0.63,11100.0,81700.0,0.14,27700.0,0.6,108200.0,...,1.0,1807.0,1806.0,0.0,504741.06,1.0,1.0,1.0,,1.0
4,CI-1040 (PD-184352),428.0,704.0,0.61,1670.0,2910.0,0.57,2098.0,0.2,3614.0,...,2.0,2.0,0.0,1.0,394.15,1.0,0.0,0.0,,0.0
5,U0126,2640.0,7890.0,0.33,7780.0,19700.0,0.39,10420.0,0.25,27590.0,...,0.0,0.0,0.0,0.0,8429.57,0.0,0.0,0.0,,1.0
6,Y27632,964.0,11300.0,0.09,1140.0,13600.0,0.08,2104.0,0.46,24900.0,...,0.0,0.0,0.0,0.0,43154.11,1.0,0.0,0.0,,1.0
7,Clofibrate,431.0,2160.0,0.2,2690.0,15000.0,0.18,3121.0,0.14,17160.0,...,0.0,0.0,0.0,0.0,1694.01,1.0,0.0,1.0,,1.0
8,Fenofibrate,1930.0,9510.0,0.2,9790.0,16400.0,0.6,11720.0,0.16,25910.0,...,0.0,16.0,16.0,0.0,78965.65,1.0,0.0,1.0,,1.0
9,Bezafibrate,479.0,2700.0,0.18,2200.0,8340.0,0.26,2679.0,0.18,11040.0,...,0.0,2.0,2.0,0.0,40.52,1.0,0.0,1.0,,1.0


In [5]:
# Both ratings divide by the overall CRC paper count. A count of zero would
# turn the ratio into +/-inf, and the MinMaxScaler fitted in a later cell
# refuses infinite input, so zero counts are masked to NaN here (NaN is passed
# through by the scaler and skipped by the row-wise means computed afterwards).
crc_paper_counts = predicted_df['crc_papers_overall'].astype(float)
crc_paper_counts = crc_paper_counts.where(crc_paper_counts != 0)

# Specificity index: 1 / (CRC mentions / cancer mentions), i.e. cancer papers
# per CRC paper. The nested form is kept so results for non-zero counts are
# bit-for-bit what the notebook produced before.
# NOTE(review): a larger value means a *smaller* share of CRC papers -- confirm
# that this is the intended direction for a "specificity" score.
predicted_df['crc_specificity'] = 1 / (
    crc_paper_counts / predicted_df['cancer_papers_overall'].astype(float)
)

# Term frequency rating: 1 / (CRC mentions / term frequency per year).
predicted_df['frequency_rating'] = 1 / (
    crc_paper_counts / predicted_df['term_frequency_per_year'].astype(float)
)

# Show the derived ratings next to the counts they were computed from.
predicted_df[['predicted therapy', 'crc_specificity', 'frequency_rating', 'crc_papers_overall', 'cancer_papers_overall']]

Unnamed: 0,predicted therapy,crc_specificity,frequency_rating,crc_papers_overall,cancer_papers_overall
0,sirolimus,3.274457,18.179123,36800.0,120500.0
1,birinapant,98.058824,242.050588,17.0,1667.0
2,gliquidone,7.049505,29.184851,101.0,712.0
3,rituximab,3.906137,18.221699,27700.0,108200.0
4,CI-1040 (PD-184352),1.722593,0.187869,2098.0,3614.0
5,U0126,2.647793,0.80898,10420.0,27590.0
6,Y27632,11.834601,20.510509,2104.0,24900.0
7,Clofibrate,5.498238,0.542778,3121.0,17160.0
8,Fenofibrate,2.210751,6.737683,11720.0,25910.0
9,Bezafibrate,4.120941,0.015125,2679.0,11040.0


Future work will aim to further stratify these scores by CRC subtype where possible and to include trial data on tolerability, toxicity, and efficacy as well.

The ultimate goal is to amass enough data to reasonably generate prospective scores with weights based on successful trials.


In [8]:
# Scaler instances. Only the min-max scaler is wired into the transformers
# below; the standard scaler is created but not used in this cell.
min_max = MinMaxScaler()
std_scaler = StandardScaler()

# Column groups behind the two scores.
# TODO: include additional metrics in the composite score -- in particular SAE
# counts for each severity, number of patients in trials, and response profiles.
trial_columns = [
    'n_other_cancer_trials',
    'used_in_any_trial',
    'usage_in_other_cancers',
    'approved',
]
crc_research_specificity_columns = [
    'crc_specificity',
    'frequency_rating',
    'n_crc_trials',
    'ongoing_development',
]

# One ColumnTransformer per group, each applying min-max scaling to its columns.
ct = ColumnTransformer(transformers=[('test', min_max, trial_columns)])
st = ColumnTransformer(
    transformers=[('specificty', min_max, crc_research_specificity_columns)]
)

# Select the inputs for each transformer.
trial_features = predicted_df[trial_columns]
specificity_features = predicted_df[crc_research_specificity_columns]

# Combined list of the columns that get rescaled (not referenced again in this cell).
scaled_cols = [
    'crc_specificity',
    'frequency_rating',
    'n_crc_trials',
    'n_other_cancer_trials',
    'used_in_any_trial',
    'usage_in_other_cancers',
    'approved',
    'ongoing_development',
]

# Overwrite each column group in place with its scaled values.
predicted_df[trial_columns] = ct.fit_transform(trial_features)
predicted_df[crc_research_specificity_columns] = st.fit_transform(specificity_features)

predicted_df

Unnamed: 0,predicted therapy,n_crc_publications_since_2018,n_cancer_publications_since_2018,%_crc_papers_since_2018,n_crc_publications_before_2018,n_cancer_publications_before_2018,%_crc_papers_before_2018,crc_papers_overall,%_crc_after_2018,cancer_papers_overall,...,n_other_cancer_trials,%_trials_for_crc,term_frequency_per_year,used_in_any_trial,usage_in_other_cancers,approved,severe_adverse_events,ongoing_development,crc_specificity,frequency_rating
0,sirolimus,18500.0,29000.0,0.64,18300.0,91500.0,0.2,36800.0,0.5,120500.0,...,0.421373,0.03,668991.71,1.0,1.0,1.0,,1.0,0.016109,0.075105
1,birinapant,9.0,969.0,0.01,8.0,698.0,0.01,17.0,0.53,1667.0,...,0.006645,0.08,4114.86,1.0,0.0,0.0,,0.0,1.0,1.0
2,gliquidone,45.0,275.0,0.16,56.0,437.0,0.13,101.0,0.45,712.0,...,0.000554,0.0,2947.67,1.0,0.0,1.0,,1.0,0.055295,0.120573
3,rituximab,16600.0,26500.0,0.63,11100.0,81700.0,0.14,27700.0,0.6,108200.0,...,1.0,0.0,504741.06,1.0,1.0,1.0,,1.0,0.022666,0.075281
4,CI-1040 (PD-184352),428.0,704.0,0.61,1670.0,2910.0,0.57,2098.0,0.2,3614.0,...,0.0,1.0,394.15,1.0,0.0,0.0,,0.0,0.0,0.000776
5,U0126,2640.0,7890.0,0.33,7780.0,19700.0,0.39,10420.0,0.25,27590.0,...,0.0,0.0,8429.57,0.0,0.0,0.0,,1.0,0.009604,0.003342
6,Y27632,964.0,11300.0,0.09,1140.0,13600.0,0.08,2104.0,0.46,24900.0,...,0.0,0.0,43154.11,1.0,0.0,0.0,,1.0,0.104966,0.084736
7,Clofibrate,431.0,2160.0,0.2,2690.0,15000.0,0.18,3121.0,0.14,17160.0,...,0.0,0.0,1694.01,1.0,0.0,1.0,,1.0,0.039192,0.002242
8,Fenofibrate,1930.0,9510.0,0.2,9790.0,16400.0,0.6,11720.0,0.16,25910.0,...,0.008859,0.0,78965.65,1.0,0.0,1.0,,1.0,0.005067,0.027836
9,Bezafibrate,479.0,2700.0,0.18,2200.0,8340.0,0.26,2679.0,0.18,11040.0,...,0.001107,0.0,40.52,1.0,0.0,1.0,,1.0,0.024896,6.2e-05


In [9]:
def weighted_average(row, values, weights):
    """Return the weighted mean of the entries of ``row`` selected by ``values``.

    Weights are matched to the selected entries by position. This avoids the
    label alignment pandas performs when two Series are multiplied, which
    silently yields NaN if ``weights`` is a Series whose index differs from
    the selected labels.

    Parameters
    ----------
    row : pandas.Series or similar
        Object supporting ``row[values]`` and iteration over the result.
    values : list
        Labels of the entries of ``row`` to average.
    weights : iterable of numbers
        One weight per label in ``values``, in the same order.

    Returns
    -------
    float
        ``sum(w_i * x_i) / sum(w_i)``. NaN entries propagate to the result.

    Raises
    ------
    ValueError
        If the number of weights differs from the number of selected entries,
        or if the weights sum to zero (the average is undefined).
    """
    selected = [float(entry) for entry in row[values]]
    weight_list = [float(weight) for weight in weights]

    if len(selected) != len(weight_list):
        raise ValueError(
            "weights and values must have the same length "
            f"(got {len(weight_list)} weights for {len(selected)} values)"
        )

    total_weight = sum(weight_list)
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    weighted_sum = sum(weight * entry for weight, entry in zip(weight_list, selected))
    return weighted_sum / total_weight

In [10]:
# Ideally we would derive these weights using PCA given enough data; however, sufficient data is not yet available.

# weights = [0.225, 0.05, 0.075, 0.125, 0.075, 0.15, 0.20, 0.1]

In [12]:
# Disabled weighted variant, kept for reference (`scored_columns` is not defined in the cells shown here):
# predicted_df['weighted_score'] = predicted_df[scaled_cols].apply(lambda row: weighted_average(row, scored_columns, weights), axis=1)

# Each score is the unweighted row-wise mean of its column group.
predicted_df['crc_specificity_score'] = (
    predicted_df[crc_research_specificity_columns].mean(axis='columns')
)
predicted_df['evidence_score'] = (
    predicted_df[trial_columns].mean(axis='columns')
)

In [13]:
# Rank therapies by CRC specificity score, breaking ties on approval and then
# on evidence score; every key is sorted in descending order.
(
    predicted_df[['predicted therapy', 'crc_specificity_score', 'evidence_score', 'approved']]
    .sort_values(
        by=['crc_specificity_score', 'approved', 'evidence_score'],
        ascending=False,
    )
)

Unnamed: 0,predicted therapy,crc_specificity_score,evidence_score,approved
0,sirolimus,0.522803,0.855343,1.0
1,birinapant,0.511905,0.251661,0.0
15,Licarbazepine,0.320961,0.5,1.0
6,Y27632,0.297426,0.25,0.0
2,gliquidone,0.293967,0.500138,1.0
14,Bendamustine,0.292125,0.799557,1.0
3,rituximab,0.286391,1.0,1.0
7,Clofibrate,0.260359,0.5,1.0
8,Fenofibrate,0.258226,0.502215,1.0
9,Bezafibrate,0.25624,0.500277,1.0


In [14]:
# Persist the scored table next to the notebook; the row index is written as
# the first (unnamed) column.
predicted_df.to_csv('./predicted_drug_scoring.csv', index=True)