Load modules

In [1]:
import pandas as pd
import cell2cell as c2c
import numpy as np

import scipy
import os

Directories

In [2]:
# Project root, input data folder and results folder (absolute, machine-specific paths).
wk_dir = '/home/qdai8/projects/Projects/STDCC/'
data_folder = wk_dir + 'Data/RDA/SLE/'
output_folder =  wk_dir + 'Results/'
# makedirs creates any missing parent directories as well and, with
# exist_ok=True, does nothing when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

## 1.1 Load subject-level information 

In [3]:
# Read the tab-separated subject-level metadata table.
subject_info_path = data_folder + 'subject_info.txt'
subject_info = pd.read_csv(subject_info_path, sep='\t')

check ancestry:

In [4]:
# Number of subjects per ancestry label (pop_cov column).
subject_info['pop_cov'].value_counts()

European            149
Asian               107
African American      3
Hispanic              2
Name: pop_cov, dtype: int64

check processing batch by SLE status:

In [5]:
# Count subjects for every (processing cohort, SLE status) combination to see
# how processing batch and disease status overlap.
subject_info[['Processing_Cohort', 'SLE_status']].value_counts()

Processing_Cohort  SLE_status
2.0                SLE           105
4.0                Healthy        44
1.0                Healthy        42
4.0                SLE            40
3.0                SLE            17
                   Healthy         9
2.0                Healthy         4
dtype: int64

Filter out the 5 samples of African American or Hispanic ancestry:

In [6]:
# Keep only subjects whose ancestry label is Asian or European.
keep_ancestries = ['Asian', 'European']
is_kept_ancestry = subject_info['pop_cov'].isin(keep_ancestries)
subject_info = subject_info[is_kept_ancestry]

In [7]:
# Re-check the ancestry counts after the filter.
subject_info['pop_cov'].value_counts()

European    149
Asian       107
Name: pop_cov, dtype: int64

sort subjects by SLE status:

In [8]:
# Reorder rows by SLE status. Later cells iterate subject_info in row order,
# so this also fixes the order in which samples are loaded and filtered.
subject_info = subject_info.sort_values(by='SLE_status')

## 1.2 cell-type-based expression matrices 

In [9]:
# One expression matrix per subject, keyed by subject ID (ind_cov).
# The first CSV column is used as the row index of each matrix.
exp_matrices = {
    subject_id: pd.read_csv(
        data_folder + 'CellnnFraction_Validation/{}.CellnnFraction.csv.gz'.format(subject_id),
        index_col=0,
    )
    for subject_id in subject_info.ind_cov.values
}

check cell types across samples:

In [10]:
# Flatten the column names (cell types) of every sample matrix into one list,
# keeping repeats so they can be tallied per cell type afterwards.
cell_types = [
    cell_type
    for matrix in exp_matrices.values()
    for cell_type in matrix.columns
]

In [11]:
from collections import Counter
# Tally how often each cell type occurs among the sample matrices' columns.
Counter(cell_types)

Counter({'B': 254,
         'NK': 256,
         'PB': 198,
         'Progen': 206,
         'Prolif': 255,
         'T4': 256,
         'T8': 256,
         'cDC': 254,
         'cM': 256,
         'ncM': 256,
         'pDC': 254})

Reduce the dataset to the samples that contain all of the 'B', 'NK', 'Prolif', 'T4', 'T8', 'cM', 'ncM', 'cDC' and 'pDC' cell types:

In [12]:
# Cell types every retained sample must contain.
select_cells = ['B', 'NK', 'Prolif', 'T4', 'T8', 'cM', 'ncM', 'cDC', 'pDC']
filter_exp_matrices = []
filter_samples = []

# Walk the samples in subject_info row order so that both output lists follow
# that same order.
for sample in subject_info.ind_cov.values:
    matrix = exp_matrices[sample]
    if not set(select_cells).issubset(matrix.columns):
        # At least one required cell type is missing: drop the sample.
        continue
    # Missing values are replaced by 0 in the copy that is kept.
    filter_exp_matrices.append(matrix.fillna(0))
    filter_samples.append(sample)

remaining samples:

In [13]:
# Number of samples that contain every cell type listed in select_cells.
len(filter_samples)

251

subject-level information for the remaining samples:

In [14]:
# Subject metadata restricted to the retained samples; a boolean mask keeps
# the existing row order of subject_info.
is_retained = subject_info['ind_cov'].isin(filter_samples)
filter_subject_info = subject_info.loc[is_retained]

In [15]:
# SLE vs. healthy counts among the retained samples.
filter_subject_info['SLE_status'].value_counts()

SLE        154
Healthy     97
Name: SLE_status, dtype: int64

In [16]:
# Ancestry counts among the retained samples.
filter_subject_info['pop_cov'].value_counts()

European    147
Asian       104
Name: pop_cov, dtype: int64

In [17]:
# Write the retained-subject metadata as a tab-separated file with a header
# row and without the DataFrame index. Note it goes into data_folder.
filter_subject_info_path = data_folder + 'filter_subject_info.txt'
filter_subject_info.to_csv(filter_subject_info_path, sep='\t', index=False)

In [18]:
# Echo the folder that filter_subject_info.txt was just written to.
data_folder

'/home/qdai8/projects/Projects/STDCC/Data/RDA/SLE/'

## Ligand-Receptor Pairs

In [19]:
# Ligand-receptor pair table, downloaded from GitHub (requires network access;
# the URL points at the master branch, so the content is not pinned).
lr_pairs_url = 'https://raw.githubusercontent.com/LewisLabUCSD/Ligand-Receptor-Pairs/master/Human/Human-2020-Jin-LR-pairs.csv'
# Cast every column to str so all values can be handled as text downstream.
lr_pairs = pd.read_csv(lr_pairs_url).astype(str)

In [20]:
# Columns of lr_pairs that hold the two interaction partners (ligand first,
# receptor second), identified by gene symbol.
int_columns = ('ligand_symbol', 'receptor_symbol')

In [21]:
# Remove bidirectional entries from the LR table, judged on the symbol columns
# in int_columns. NOTE(review): presumably this drops rows where (A, B) also
# appears as (B, A) - confirm against the cell2cell documentation.
lr_pairs = c2c.preprocessing.ppi.remove_ppi_bidirectionality(ppi_data=lr_pairs, 
                                                             interaction_columns=int_columns)

Removing bidirectionality of PPI network


In [22]:
# (rows, columns) of the LR table after removing bidirectionality.
lr_pairs.shape

(1988, 17)

In [23]:
# Map each 'ligand^receptor' label (built from the symbol columns) to the
# annotation of that pair; for repeated labels the last row wins.
ppi_functions = {
    ligand + '^' + receptor: annotation
    for ligand, receptor, annotation in zip(
        lr_pairs[int_columns[0]],
        lr_pairs[int_columns[1]],
        lr_pairs['annotation'],
    )
}

In [24]:
# Lookup from the Ensembl-ID interaction label to the gene-symbol interaction
# label; for repeated Ensembl labels the last row wins.
ensembl_symbol = dict(
    zip(lr_pairs['interaction_ensembl'], lr_pairs['interaction_symbol'])
)

## Build Tensor

**Note: the order of the samples in the tensor is the same as their order in the sample metadata.**

In [25]:
# Context (sample) labels for the tensor, taken from the filtered metadata.
context_labels = filter_subject_info['ind_cov'].tolist()
# filter_exp_matrices was built in filter_samples order; the labels must line
# up with it one-to-one, otherwise every context would be mislabelled.
assert context_labels == list(filter_samples), \
    'sample order in filter_subject_info differs from filter_exp_matrices'

In [26]:
# Build the communication tensor from the per-sample expression matrices.
# - Interactions are matched on the Ensembl ID columns of lr_pairs here, whereas
#   the bidirectionality filter above used the gene-symbol columns.
# - context_names labels the samples; it must be in the same order as
#   rnaseq_matrices.
# NOTE(review): how='inner' presumably keeps only the genes / cell types shared
# by all samples, and complex_sep='&' presumably marks multi-subunit complexes
# in the LR table - confirm both against the cell2cell documentation.
tensor = c2c.tensor.InteractionTensor(rnaseq_matrices=filter_exp_matrices,
                                      ppi_data=lr_pairs,
                                      context_names=context_labels,
                                      how='inner',
                                      complex_sep='&',
                                      interaction_columns=('ligand_ensembl', 'receptor_ensembl'),
                                      communication_score='expression_gmean')

Getting expression values for protein complexes
Building tensor for the provided context


In [27]:
# Tensor dimensions. The cells further down save the names of axis 1 as LR
# pairs, axis 2 as sender cell types and axis 3 as receiver cell types.
tensor.tensor.shape

(251, 58, 9, 9)

In [28]:
from scipy.io import savemat

# Export the raw tensor values to a MATLAB .mat file under the variable name
# 'c2ctensor'.
mat_path = data_folder + "c2ctensor_nnfraction_4d_val.mat"
mymat = dict(c2ctensor=tensor.tensor)
savemat(mat_path, mymat)

## Save LR pair names and Cell Type names

In [29]:
# Translate the LR-pair axis labels from Ensembl-ID labels to gene-symbol labels.
# Labels that are not keys of ensembl_symbol are left untouched, so running this
# cell a second time (when the labels are already symbols) no longer raises
# KeyError.
tensor.order_names[1] = [ensembl_symbol.get(lr, lr) for lr in tensor.order_names[1]]
# Still fail loudly if a label is neither a known Ensembl label nor a known symbol.
_known_symbols = set(ensembl_symbol.values())
assert all(lr in _known_symbols for lr in tensor.order_names[1]), \
    'unmapped LR pair label in tensor.order_names[1]'

In [30]:
# Save the LR-pair labels (tensor axis 1) as a one-column, tab-separated file.
# os.path.join avoids the doubled slash that data_folder + '/...' produced,
# since data_folder already ends with '/'.
pd.DataFrame(tensor.order_names[1]).to_csv(os.path.join(data_folder, 'meta_lr_val.txt'),
                      mode='w',
                      index=False,
                      header=True,
                      sep='\t')

In [31]:
# Save the sender cell-type labels (tensor axis 2) as a one-column,
# tab-separated file. os.path.join avoids the doubled slash that
# data_folder + '/...' produced, since data_folder already ends with '/'.
pd.DataFrame(tensor.order_names[2]).to_csv(os.path.join(data_folder, 'meta_sender_val.txt'),
                      mode='w',
                      index=False,
                      header=True,
                      sep='\t')

In [32]:
# Save the receiver cell-type labels (tensor axis 3) as a one-column,
# tab-separated file. os.path.join avoids the doubled slash that
# data_folder + '/...' produced, since data_folder already ends with '/'.
pd.DataFrame(tensor.order_names[3]).to_csv(os.path.join(data_folder, 'meta_receiver_val.txt'),
                      mode='w',
                      index=False,
                      header=True,
                      sep='\t')