The dataset consists of proteomics measurements from 5 bones + brain + meninges
1. skull (calvaria)
2. forelimb (humerus)
3. vertebra
4. pelvis
5. femur
6. brain
7. meninges (not in naive)

for 3 conditions,
1. naive
2. sham (surgical cut)
3. mcao (stroke model)

for 3 animals each.

total 60 samples (7 regions × 3 conditions × 3 animals = 63, minus the 3 naive meninges samples that do not exist).

The pooled samples additionally include scapula.

Sham meninges sample 3 has a repeated measurement, so it was replaced by sample 4.

In [1]:
# Load IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import random 

import numpy as np
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import anndata as ann

sc.settings.set_figure_params(dpi=100)


from utils import (
                    get_genes_per_group,                
                    filter_proteins_per_group, 
                    normalise,
#                     median_normalization,
                    impute_knn,
                    impute_knn_nan,
                    impute_min_value,
                    impute_min_value_nan,
                  )

from gprofiler import GProfiler
from gprofiler_plotting import plot_enrich
from bioinfokit import analys, visuz
from upsetplot import plot, from_contents


In [3]:
# Location of the input table. Both values are empty placeholders here and
# have to be filled in before the loading cell can find a file.
DATA_DIR = ""
FILE_NAME = ""
# Field separator of the input table (tab character).
delimiter="\t"

In [20]:
# Build the path of the input table. An empty DATA_DIR is treated as "relative
# to the current working directory"; formatting f"{DATA_DIR}/{FILE_NAME}"
# unconditionally would produce "/<FILE_NAME>", i.e. a path at the filesystem root.
data_path = f"{DATA_DIR}/{FILE_NAME}" if DATA_DIR else FILE_NAME
data = pd.read_csv(data_path, delimiter=delimiter)

In [21]:
gene_column = "Genes"

# Drop rows without a gene annotation and renumber the remaining rows from 0.
# `reset_index(drop=True)` discards the old index directly; the previous
# reset_index() + drop(columns=["index"]) sequence would have removed a genuine
# "index" column from the table (and left a stray "level_0") if one existed.
data = data[data[gene_column].notna()].reset_index(drop=True)

In [33]:
# specifying columns of interest
def _columns_containing(token):
    """Return the names of all columns of `data` that contain `token`, in table order."""
    return [col for col in data.columns if token in col]


# columns per condition
columns_naive = _columns_containing('naive')
columns_sham = _columns_containing('sham')
columns_mcao = _columns_containing('mcao')

# columns per region: skull, meninges and brain
columns_calvaria = _columns_containing('calvaria')
columns_meninges = _columns_containing('meninges')
columns_brain = _columns_containing('brain')

# columns per region: remaining bones
columns_humerus = _columns_containing('humerus')
columns_vertebra = _columns_containing('vertebra')
columns_pelvis = _columns_containing('pelvis')
columns_femur = _columns_containing('femur')

# remove scapula
# Only columns carrying one of the three condition tokens are kept.
# NOTE(review): this presumably drops the scapula columns because they only occur
# in the pooled samples, which carry no condition token — verify against the table.
#
# The lists are de-duplicated while preserving order (naive, sham, mcao; table
# order within each). Going through a plain `set` made the column order, and with
# it the row order of everything built from COLUMNS, change between kernel
# sessions because string hashing is randomised per process.
COLUMNS = list(dict.fromkeys(columns_naive + columns_sham + columns_mcao))

# calvaria + meninges + brain columns among the selected ones
_bms_columns = set(columns_calvaria + columns_meninges + columns_brain)
COLUMNS_BMS = [col for col in COLUMNS if col in _bms_columns]

# everything that is neither meninges nor brain, i.e. the bone columns
_non_bone_columns = set(columns_meninges + columns_brain)
COLUMNS_BONES = [col for col in COLUMNS if col not in _non_bone_columns]

#### Preprocessing
1. Filtering proteins per group
2. Dropping proteins detected in too few samples
3. Log transformation
4. Normalisation
5. Imputation

#### 1. Filter data

In [43]:
# Column groups handed to the per-group protein filter: one entry per bone.
# Meninges and brain columns are not passed as groups.
bone_sample_columns = {
    'calvaria': columns_calvaria,
    'humerus': columns_humerus,
    'vertebra': columns_vertebra,
    'pelvis': columns_pelvis,
    'femur': columns_femur,
}

# in_place=False: the result is bound to a new name, `data` is still used as-is below.
filtered_data = filter_proteins_per_group(
    data=data,
    samples=bone_sample_columns,
    half_values=True,
    in_place=False,
)

#### Create anndata

In [45]:
# Annotations can list several ';'-separated entries; only the first is used.
# For gene names everything from the first '_' onwards is cut off as well.
genes = [
    annotation.partition(';')[0].partition('_')[0]
    for annotation in filtered_data['Genes'].values
]
protein_ids = [
    annotation.partition(';')[0]
    for annotation in filtered_data['Protein.Ids'].values
]

In [46]:
# all bones + meninges and brain data
# Column names are split on '_': the first token is taken as the condition,
# the second as the region and the last as the sample identifier.
name_parts = [col.split('_') for col in COLUMNS]
conditions = [parts[0] for parts in name_parts]
regions = [parts[1] for parts in name_parts]
samples = [parts[-1] for parts in name_parts]

In [47]:
# whole samples

# Transposed so that each selected column of the table becomes one row of X.
X = filtered_data.loc[:, COLUMNS].T.values

# per-row annotations (one row per selected column)
df_obs = pd.DataFrame(
    {
        'condition': conditions,
        'region': regions,
        'sample': samples,
    }
)

# per-protein annotations, indexed by gene name
df_var = pd.DataFrame(
    {
        'gene': genes,
        'protein_id': protein_ids,
    },
    index=genes,
)

adata = ann.AnnData(X=X, obs=df_obs, var=df_var)
# gene names can occur more than once, so the variable names are de-duplicated
adata.var_names_make_unique()
adata

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


AnnData object with n_obs × n_vars = 60 × 4172
    obs: 'condition', 'region', 'sample'
    var: 'gene', 'protein_id'

#### 2. Dropping proteins appearing in less than half samples

In [50]:
# number of variables (proteins) before the sample-count filter is applied
num_genes_org = adata.n_vars
num_genes_org

4172

In [51]:
# Remove genes found in fewer than `min_samples` samples (in place) and report
# how many were dropped relative to the count taken before filtering.
min_samples = 6
sc.pp.filter_genes(adata, min_cells=min_samples)

n_removed = num_genes_org - adata.n_vars
print(f'filtered out {n_removed} genes that are detected in less than {min_samples} samples!')
adata

filtered out 0 genes that are detected in less than 6 samples!


AnnData object with n_obs × n_vars = 60 × 4172
    obs: 'condition', 'region', 'sample'
    var: 'gene', 'protein_id', 'n_cells'

#### 3. Log transformation -- skipped since transformation was already performed

In [52]:
# log transform adata
# NOTE(review): the section heading states that the log transformation is skipped
# because the data were already transformed, yet this cell applies log1p to adata
# in place. Confirm that the input holds untransformed intensities; otherwise the
# values end up log-transformed twice.
sc.pp.log1p(adata)

In [54]:
# Label every sample as "<region> <condition>" and store the label as a categorical.
region_labels = adata.obs['region'].astype(str)
condition_labels = adata.obs['condition'].astype(str)
adata.obs['condition_region'] = (region_labels + ' ' + condition_labels).astype('category')

In [55]:
# Label every sample as "<condition>_<region>_<sample>" and store the label as a categorical.
sample_labels = adata.obs['condition'].astype(str).str.cat(
    [adata.obs['region'].astype(str), adata.obs['sample'].astype(str)],
    sep='_',
)
adata.obs['condition_region_sample'] = sample_labels.astype('category')

#### 4. Normalisation

In [58]:
# Normalise `adata` with the project helper imported from `utils`.
# NOTE(review): the return value is not used, so the helper presumably modifies
# `adata` in place — confirm against utils.normalise.
# `obs_columns` names the obs column created above that holds one label per
# condition/region/sample combination; how the helper uses it, and what
# `na_threshold=None` switches off, is not visible here — TODO confirm.
normalise(
    adata, 
    obs_columns=['condition_region_sample'],
    na_threshold=None,
)


#### 5. Imputation (KNN)

In [60]:
# Impute on a copy so that `adata` itself is not touched by the imputation step.
adata_knn = adata.copy()

In [61]:
# Run the project's KNN imputation helper (imported from `utils`) on the copy.
# NOTE(review): the return value is discarded, so the helper presumably fills the
# missing values of `adata_knn` in place — confirm against utils.impute_knn_nan.
impute_knn_nan(adata_knn)