In [1]:
from sklearn.manifold import MDS
import numpy as np
import torch
import pandas as pd
import random

from sklearn.metrics import euclidean_distances
import utils as ut
import TumorDecon as td


In [2]:
J = 10 #number of patients
N = 100 #dimensions of clinical data
b = np.random.random_sample(size=(J,J))*2-1
S= (b + b.T)/2

In [3]:
np.fill_diagonal(S, 0)

# Start from the basics
### From a similarity matrix get some points with that similarity

In [4]:
og_points=np.random.poisson(size=[J,N])
matrix=euclidean_distances(og_points,og_points)

In [5]:
# perform MDS
X_mds = ut.fake_points_MDS(matrix,N)




In [6]:
matrix_empirical=euclidean_distances(X_mds,X_mds)

In [7]:
np.sum(matrix_empirical-matrix)

0.15911431133138976

# Forward model

In [8]:
J = 10 #number of patients
N = 30 #dimensions of clinical data
C = 3 #number of cell types
G = 100 #genes number
sc_samples=1000 #samples to create signature matrix

In [9]:
features=np.random.normal(scale=1/np.sqrt(N),size=[J,N])
weights=np.random.normal(scale=1/np.sqrt(N),size=[N,G])

In [10]:
patient_correction=np.exp(np.matmul(features,weights))
#get a genexpatient
patient_correction = patient_correction.T
cell_type_correction=np.exp(np.random.normal(scale=0.5,size=[G,C]))

In [11]:
print(patient_correction.shape)

(100, 10)


In [12]:
shape=10
scale=1
average_mu=np.random.gamma(shape,scale,size=[G])
# Multiply M and V, broadcasting happens automatically
average_mu = average_mu[:, np.newaxis]

# Broadcast multiplication
mu_cell_type = average_mu * cell_type_correction

mu = mu_cell_type[:, :, np.newaxis] * patient_correction[:, np.newaxis, :]
print(mu.shape)

(100, 3, 10)


In [13]:
T_torch = torch.from_numpy(mu)

# Use the Poisson function to generate samples
sc_counts= torch.poisson(T_torch.unsqueeze(-1).expand(*T_torch.shape, sc_samples))

# Now I have to create the mixtures
# permute the dimensions to have patients- cell types-samples-genes
sc_counts = sc_counts.permute(2, 1, 3, 0)


In [14]:
cell_type_profiles = torch.mean(sc_counts, dim=(0,2))
cell_type_profiles_per_patient=torch.mean(sc_counts,dim=2)
bulk_counts_per_patient=torch.sum(sc_counts, dim=(1,2))

In [15]:
#tumordecon requires to use some real genes names
# Load the data
df = pd.read_csv('genes_list.txt', sep='\t')

# Extract the 'symbol' column, which contains the gene names
gene_names = df['Approved symbol'].tolist()

# Choose 5 random gene names
random_gene_names = random.sample(gene_names, 100)
genes=random_gene_names
genes_entrez=["gene " + str(i) for i in range(1, 101)]
patients = ["patient " + str(i) for i in range(1, 11)]

In [63]:
# Create a DataFrame
df = pd.DataFrame(cell_type_profiles.T, index=genes, columns=["cell type 1", "cell type 2" , "cell type 3"])
df.index.name="Gene_Symbol"
df=df.reset_index()
#df['Entrez_Gene_Id']=genes_entrez
# Save the DataFrame to a CSV file
#df.to_csv("signature_matrix.csv")
df.to_csv('data/signature_matrix.tsv', sep='\t',index=False)

In [47]:
df

Unnamed: 0,Hugo_Symbol,cell type 1,cell type 2,cell type 3,Entrez_Gene_Id
0,ZNRF3,85.1370,13.7235,15.1833,gene 1
1,TARS3,1.3809,2.6789,4.1686,gene 2
2,TCIRG1,47.3944,13.9389,17.8215,gene 3
3,KIF26A,13.1958,6.4639,25.0781,gene 4
4,SLC25A2,8.9253,20.3371,32.4341,gene 5
...,...,...,...,...,...
95,HSPA8,7.8466,10.3099,13.8147,gene 96
96,COL4A2,7.3030,30.2128,16.9989,gene 97
97,ODAD4,4.3629,10.9232,16.0526,gene 98
98,GLI2,5.8833,21.0743,45.0472,gene 99


In [17]:
# Create a DataFrame
df = pd.DataFrame(cell_type_profiles_per_patient[0].T, index=genes, columns=["cell type 1", "cell type 2" , "cell type 3"])

# Save the DataFrame to a CSV file
df.to_csv("signature_matrix_one_patient.csv")

In [45]:
df

Unnamed: 0,Hugo_Symbol,cell type 1,cell type 2,cell type 3,Entrez_Gene_Id
0,ZNRF3,85.1370,13.7235,15.1833,gene 1
1,TARS3,1.3809,2.6789,4.1686,gene 2
2,TCIRG1,47.3944,13.9389,17.8215,gene 3
3,KIF26A,13.1958,6.4639,25.0781,gene 4
4,SLC25A2,8.9253,20.3371,32.4341,gene 5
...,...,...,...,...,...
95,HSPA8,7.8466,10.3099,13.8147,gene 96
96,COL4A2,7.3030,30.2128,16.9989,gene 97
97,ODAD4,4.3629,10.9232,16.0526,gene 98
98,GLI2,5.8833,21.0743,45.0472,gene 99


In [39]:
# Create a DataFrame
df = pd.DataFrame(bulk_counts_per_patient.T, index=genes, columns=patients)
df.index.name="Hugo_Symbol"
df=df.reset_index()
df['Entrez_Gene_Id']=genes_entrez
#df.set_index('Entrez_Gene_Id', inplace=True)
# Save the DataFrame to a CSV file
#df.to_csv("data/bulk_counts.csv")
df.to_csv('data/bulk_counts.tsv', sep='\t',index=False)

# Try TumorDecon

### As in the tutorial in https://people.math.umass.edu/~aronow/TumorDecon/quickstart.html#tutorial

In [64]:
# Location of sample data (included with the TumorDecon package):
data_loc = "./data/"
# Read in sample data (original source - Colorectal Adenocarcinoma RNA Seq v2 from cBioPortal.org):
rna = td.read_rna_file(data_loc+'bulk_counts.tsv',identifier='hugo',fetch_missing_hugo=False)


In [65]:
signature=td.read_sig_file(data_loc+"signature_matrix.tsv")

In [67]:
ciber_freqs = td.tumor_deconvolve(rna, 'cibersort',  patient_IDs='ALL', sig_matrix=signature, args={'nu':'best', 'scaling':'minmax'})


Running CiberSort...
CiberSort has completed!


In [68]:
ciber_freqs

Patient_ID,cell type 1,cell type 2,cell type 3
patient 1,0.523464,0.220355,0.256182
patient 2,0.458384,0.272144,0.269472
patient 3,0.512667,0.205613,0.281721
patient 4,0.519566,0.207037,0.273396
patient 5,0.496914,0.231775,0.271311
patient 6,0.527356,0.228447,0.244197
patient 7,0.505382,0.240672,0.253946
patient 8,0.496009,0.248513,0.255479
patient 9,0.469061,0.243827,0.287113
patient 10,0.51756,0.224312,0.258128


# Mix them together -> Try to fake a pipeline 