In [1]:
import numpy as np
import scipy
from matplotlib import pyplot as plt
import pandas as pd

In [2]:
from sklearn.metrics import jaccard_score

In [3]:
df=pd.DataFrame()

In [4]:
#sample an age
ages=np.random.normal(65,10,size=23).astype(int)
#0 is MDS, 1 is AML
disease=np.random.binomial(size=23, n=1, p= 0.5)
ANC=np.random.uniform(0.5,6,size=23)
HB=np.random.uniform(0.5,6,size=23)
PLT=np.random.uniform(50,400,size=23)
blasts=np.random.poisson(10,size=23)
#0 is no, 1 is yes
RBC=np.random.binomial(size=23, n=1, p= 0.5)
ESA=np.random.binomial(size=23, n=1, p= 0.5)
chemo=np.random.binomial(size=23, n=1, p= 0.5)
HMA=np.random.binomial(size=23, n=1, p= 0.5)
HSCT=np.random.binomial(size=23, n=1, p= 0.5)
genetic_info=np.random.binomial(size=(23,31),n=1,p=0.1)

In [5]:
df['age']=ages
df['disease']=disease
df['ANC']=np.around(ANC,decimals=1)
df['HB']=np.around(HB,decimals=1)
df['PLT']=np.around(PLT,decimals=1)
df['blasts']=blasts
df['RBC']=RBC
df['ESA']=ESA
df['chemo']=chemo
df['HMA']=HMA
df['HSCT']=HSCT
#df['genetic_info']=genetic_info

In [6]:
# Convert the matrix to a dataframe
matrix_df = pd.DataFrame(genetic_info)

# Concatenate the original dataframe with the matrix dataframe
df = pd.concat([df, matrix_df], axis=1)


In [7]:
dict_df = df.T.to_dict()

In [21]:
dict_df

{0: {'age': 50.0,
  'disease': 1.0,
  'ANC': 1.2,
  'HB': 2.2,
  'PLT': 94.2,
  'blasts': 8.0,
  'RBC': 0.0,
  'ESA': 0.0,
  'chemo': 1.0,
  'HMA': 1.0,
  'HSCT': 1.0,
  0: 1.0,
  1: 0.0,
  2: 0.0,
  3: 0.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 0.0,
  19: 1.0,
  20: 0.0,
  21: 0.0,
  22: 0.0,
  23: 0.0,
  24: 0.0,
  25: 0.0,
  26: 0.0,
  27: 0.0,
  28: 0.0,
  29: 1.0,
  30: 0.0},
 1: {'age': 71.0,
  'disease': 0.0,
  'ANC': 5.4,
  'HB': 1.6,
  'PLT': 226.3,
  'blasts': 9.0,
  'RBC': 0.0,
  'ESA': 1.0,
  'chemo': 1.0,
  'HMA': 0.0,
  'HSCT': 1.0,
  0: 0.0,
  1: 0.0,
  2: 0.0,
  3: 1.0,
  4: 0.0,
  5: 0.0,
  6: 0.0,
  7: 0.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 0.0,
  17: 0.0,
  18: 1.0,
  19: 0.0,
  20: 0.0,
  21: 0.0,
  22: 1.0,
  23: 0.0,
  24: 0.0,
  25: 1.0,
  26: 0.0,
  27: 1.0,
  28: 0.0,
  29: 0.0,
  30: 0.0},

In [9]:
dict_df[0]['age']

50.0

In [31]:
def gaussian_kernel(a,b,std=10):
    return np.exp(-(a-b)**2/(2*std**2))
def kernel(patient1_idx,patient2_idx,n_genes=31,ignore=None):
    output_dict={}
    output_dict['age']=gaussian_kernel(dict_df[patient1_idx]['age'],dict_df[patient2_idx]['age'])
    output_dict['disease']=int(dict_df[patient1_idx]['disease']==dict_df[patient2_idx]['disease'])
    output_dict['ANC']=gaussian_kernel(dict_df[patient1_idx]['ANC'],dict_df[patient2_idx]['ANC'],2)
    output_dict['HB']=gaussian_kernel(dict_df[patient1_idx]['HB'],dict_df[patient2_idx]['HB'],2)
    output_dict['PLT']=gaussian_kernel(dict_df[patient1_idx]['PLT'],dict_df[patient2_idx]['PLT'],50)
    output_dict['blasts']=gaussian_kernel(dict_df[patient1_idx]['blasts'],dict_df[patient2_idx]['blasts'],10)
    output_dict['RBC']=int(dict_df[patient1_idx]['RBC']==dict_df[patient2_idx]['RBC'])
    output_dict['ESA']=int(dict_df[patient1_idx]['ESA']==dict_df[patient2_idx]['ESA'])
    output_dict['chemo']=int(dict_df[patient1_idx]['chemo']==dict_df[patient2_idx]['chemo'])
    output_dict['HMA']=int(dict_df[patient1_idx]['HMA']==dict_df[patient2_idx]['HMA'])
    output_dict['HSCT']=int(dict_df[patient1_idx]['HSCT']==dict_df[patient2_idx]['HSCT'])
    for i in range(n_genes):
        output_dict[i]=int(dict_df[patient1_idx][i]==dict_df[patient2_idx][i]==1)
    return output_dict

In [41]:
mat=np.random.rand(23,23)
similarity_kernel = (mat + mat.T) / 2
np.fill_diagonal(similarity_kernel, 1)

In [52]:
X=[]
y=[]
for i in range(1,len(mat)):
    for j in range(i, len(mat)):
        X.append(np.array(list(kernel(i,j).values())))
        y.append(similarity_kernel[i,j])

In [55]:
X=np.array(X)
y=np.array(y)

In [60]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=1.0)
clf.fit(X, y)
clf.score(X,y)

0.2981293207472183