In [4]:
import pandas as pd 
import numpy as np

In [5]:
# Load the mutation table. Later cells read its 'class' column and treat every
# column after the first as a feature that is compared against 1 and 0.
df = pd.read_csv('mutations_1.csv')

### Calculate the total number of samples and the probability of occurrence

In [25]:
def Sample_sum(df):
    """Count cancer and non-cancer samples in ``df``.

    A row is counted as cancer when its 'class' value starts with 'C';
    every remaining row is counted as non-cancer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued 'class' column.

    Returns
    -------
    tuple
        ``(cancer_total, non_cancer_total)``.
    """
    labels = df['class']
    n_cancer = labels.str.startswith('C').sum()
    # Non-cancer is defined as the complement, not matched by its own prefix.
    n_other = len(df) - n_cancer
    return n_cancer, n_other


In [21]:
def Probability(df):
    """Return the class proportions of the samples in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``Sample_sum`` (it reads the 'class' column).

    Returns
    -------
    tuple
        ``(probability_C, probability_NC)``: the cancer and non-cancer counts
        divided by ``len(df)``. ``(0, 0)`` when ``df`` has no rows.
    """
    sample_total = len(df)
    # Guard first: an empty frame may have no 'class' column at all, so
    # counting before this check could raise instead of returning zeros.
    if sample_total == 0:
        return 0, 0

    cancer_total, non_cancer_total = Sample_sum(df)
    probability_C = cancer_total / sample_total
    probability_NC = non_cancer_total / sample_total

    return probability_C, probability_NC

In [57]:
# Root-node summary: sample counts and class proportions over the whole frame.
total_df = len(df)
t_c, t_nc = Sample_sum(df)
pC, pNC = Probability(df)
# NOTE(review): "root note" in the message below looks like a typo for
# "root node"; it is runtime output, so it is left unchanged here.
print(f"""For the root note, t \n
Total Number of Samples, n(t): {total_df} \n
Total Cancer Samples, n(t, C): {t_c} \n
Total NonCancer Samples, n(t, NC): {t_nc} \n
Probability of Selecting C Sample at Node, p(C|t): {pC} \n
Probability of Selecting NC Sample at Node, p(NC|t): {pNC}
""")

For the root note, t 

Total Number of Samples, n(t): 250 

Total Cancer Samples, n(t, C): 108 

Total NonCancer Samples, n(t, NC): 142 

Probability of Selecting C Sample at Node, p(C|t): 0.432 

Probability of Selecting NC Sample at Node, p(NC|t): 0.568



### Splitting and Computations

In [53]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    return (numerator / denominator) if denominator != 0 else 0


def Split_and_Compute(df):
    """Compute split statistics for every feature column of ``df``.

    Every column after the first is treated as a candidate split: rows where
    the column equals 1 form the left child (tL) and rows where it equals 0
    form the right child (tR).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``Sample_sum``; the first column is skipped.

    Returns
    -------
    dict
        Maps each feature column name to a 13-element list, in this order:
        n(tL), n(tR), n(tL,C), n(tL,NC), PL, PR, P(C|tL), P(NC|tL),
        P(C|tR), P(NC|tR), 2*PL*PR, Q, 2*PL*PR*Q, where
        Q = |P(C|tL) - P(C|tR)| + |P(NC|tL) - P(NC|tR)|.
        Any ratio whose denominator is 0 (empty child, or empty ``df``)
        is reported as 0.
    """
    sample_total = len(df)
    result = {}

    for column in df.columns[1:]:
        # Split data based on column feature into two groups
        left_mask = df[column] == 1
        right_mask = df[column] == 0

        # Calculate total samples, cancer and non-cancer in each split
        left_sum = left_mask.sum()
        right_sum = right_mask.sum()

        left_cancer_tot, left_non_cancer_tot = Sample_sum(df[left_mask])
        right_cancer_tot, right_non_cancer_tot = Sample_sum(df[right_mask])

        # Proportion of the node's samples sent to each child. Guarded so an
        # empty node yields 0 rather than NaN, matching the child
        # probabilities below.
        PL = _safe_ratio(left_sum, sample_total)
        PR = _safe_ratio(right_sum, sample_total)

        # Class probabilities inside each child (0 for an empty child)
        Prob_C_left = _safe_ratio(left_cancer_tot, left_sum)
        Prob_NC_left = _safe_ratio(left_non_cancer_tot, left_sum)
        Prob_C_right = _safe_ratio(right_cancer_tot, right_sum)
        Prob_NC_right = _safe_ratio(right_non_cancer_tot, right_sum)

        Q = abs(Prob_C_left - Prob_C_right) + abs(Prob_NC_left - Prob_NC_right)

        result[column] = [left_sum, right_sum, left_cancer_tot, left_non_cancer_tot, PL, PR, Prob_C_left,
                          Prob_NC_left, Prob_C_right, Prob_NC_right, 2*PL*PR, Q, 2*PL*PR*Q]

    return result

In [45]:
# Each dict key becomes a column (one per feature); each row is one of the
# 13 statistics, in the order Split_and_Compute stores them.
result_df = pd.DataFrame(Split_and_Compute(df))

In [47]:
# Name the 13 statistic rows, then flip the table so each feature is a row
# and each statistic is a column.
# Note: this cell expects result_df in its freshly built orientation;
# re-running it after the transpose has already happened would try to
# apply 13 labels to the per-feature axis.
result_df = result_df.set_axis(
    ['n(tL)', 'n(tR)', 'n(tL,C)', 'n(tL,NC)', 'PL', 'PR',
     'P(C|tL)', 'P(NC|tL)', 'P(C|tR)', 'P(NC|tR)', '2PLPR', 'Q', 'Phi(s|t)'],
    axis=0,
).T

In [51]:
# Rank the candidate splits from highest to lowest Phi(s|t) and show the top 10.
feature_table = result_df.sort_values(by='Phi(s|t)', ascending=False)
feature_table.head(10)

Unnamed: 0,n(tL),n(tR),"n(tL,C)","n(tL,NC)",PL,PR,P(C|tL),P(NC|tL),P(C|tR),P(NC|tR),2PLPR,Q,Phi(s|t)
RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T--,28.0,222.0,27.0,1.0,0.112,0.888,0.964286,0.035714,0.364865,0.635135,0.198912,1.198842,0.238464
DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--,23.0,227.0,22.0,1.0,0.092,0.908,0.956522,0.043478,0.378855,0.621145,0.167072,1.155334,0.193024
RNF43_GRCh37_17:56435161-56435161_Frame-Shift-Del_DEL_C-C--,23.0,227.0,22.0,1.0,0.092,0.908,0.956522,0.043478,0.378855,0.621145,0.167072,1.155334,0.193024
PPP2R1A_GRCh37_19:52715971-52715971_Missense-Mutation_SNP_C-C-G_C-C-T,24.0,226.0,0.0,24.0,0.096,0.904,0.0,1.0,0.477876,0.522124,0.173568,0.955752,0.165888
KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G,28.0,222.0,22.0,6.0,0.112,0.888,0.785714,0.214286,0.387387,0.612613,0.198912,0.796654,0.158464
NCOA3_GRCh37_20:46282985-46282985_3'Flank_DEL_T-T--,15.0,235.0,15.0,0.0,0.06,0.94,1.0,0.0,0.395745,0.604255,0.1128,1.208511,0.13632
DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C,14.0,236.0,14.0,0.0,0.056,0.944,1.0,0.0,0.398305,0.601695,0.105728,1.20339,0.127232
CTNNA2_GRCh37_2:80875615-80875615_3'UTR_DEL_T-T--,13.0,237.0,13.0,0.0,0.052,0.948,1.0,0.0,0.400844,0.599156,0.098592,1.198312,0.118144
ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A--,13.0,237.0,13.0,0.0,0.052,0.948,1.0,0.0,0.400844,0.599156,0.098592,1.198312,0.118144
CSNK1G1_GRCh37_15:64461260-64461260_3'UTR_DEL_A-A--,13.0,237.0,13.0,0.0,0.052,0.948,1.0,0.0,0.400844,0.599156,0.098592,1.198312,0.118144
