The dataset consists of proteomics measurements from 3 bones

1. skull (calvaria)
2. vertebra
3. pelvis

for 20 human samples

total 60 measurements.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are reloaded
# before each cell executes; edits to local helper modules then take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import random

import numpy as np
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import anndata as ann

sc.settings.set_figure_params(dpi=100)


from utils import (
                    get_genes_per_group,                
                    filter_proteins_per_group, 
                    normalise,
                    impute_knn,
                    impute_knn_nan,
                    impute_min_value,
                    impute_min_value_nan,
                  )

from gprofiler import GProfiler
from gprofiler_plotting import plot_enrich

from bioinfokit import analys, visuz

In [3]:
# Location of the input table. Both values are blank here and have to be filled in
# before the notebook can be run.
DATA_DIR = ""
FILE_NAME = ""
# Field separator handed to pd.read_csv when the table is loaded (tab-separated file).
delimiter = "\t"

In [4]:
# Load the raw table. The path is built with os.path.join so that an empty DATA_DIR
# means "relative to the working directory"; the previous f"{DATA_DIR}/{FILE_NAME}"
# turned an empty DATA_DIR into a path at the filesystem root ("/<file>").
data = pd.read_csv(os.path.join(DATA_DIR, FILE_NAME), delimiter=delimiter)

In [None]:
# Drop rows that have no gene annotation and renumber the remaining rows from 0.
# reset_index(drop=True) discards the old index directly; the earlier
# reset_index() + drop(columns=["index"]) round-trip would delete a genuine data
# column if the table ever contained one named "index".
data = data.dropna(subset=['Genes']).reset_index(drop=True)

In [7]:
# Collect the measurement columns of each bone region by substring match on the
# column name, then concatenate them in the order calvaria, pelvis, vertebra.
calvaria_cols, pelvis_cols, vertebra_cols = (
    [name for name in data.columns if tag in name]
    for tag in ('Calvaria', 'Pelvis', 'Vertebra')
)
columns = [*calvaria_cols, *pelvis_cols, *vertebra_cols]

#### Preprocessing
1. Filtering proteins 
2. Drop proteins appearing in fewer than min_samples samples
3. Log transformation 
4. Normalisation 
5. Imputation 

#### 1. Filter data

In [8]:
# Filter proteins per bone region, passing the three column groups collected from `data`.
# The result is stored under a new name, `filtered_data`.
# NOTE(review): the filtering rule itself lives in utils.filter_proteins_per_group.
# Presumably half_values=True keeps proteins quantified in at least half of a group's
# samples and in_place=False leaves `data` untouched -- confirm against the helper.
filtered_data = filter_proteins_per_group(
    data=data,
    samples={
        'calvaria': calvaria_cols, 
        'pelvis': pelvis_cols,
        'vertebra': vertebra_cols
    },
#     drop_threshold=6,  (disabled alternative: explicit threshold instead of half_values)
    half_values=True,
    in_place=False
)

#### Create anndata

In [27]:
# Reduce each annotation to a single identifier: keep the first ';'-separated entry,
# and for gene and protein names additionally cut everything from the first '_' on.
genes = [
    entry.split(';')[0].split('_')[0]
    for entry in filtered_data['Genes'].values
]
protein_ids = [
    str(entry).split(';')[0]
    for entry in filtered_data['Protein.Ids'].values
]
protein_names = [
    str(entry).split(';')[0].split('_')[0]
    for entry in filtered_data['Protein.Names'].values
]

In [28]:
# Assemble an AnnData object covering all samples:
# one observation per measurement column, one variable per protein.

# Intensity matrix, transposed so that rows are measurement columns.
X = filtered_data.loc[:, columns].T.values

# Observation annotations parsed from the column names:
# text before the first '_' is the region, text after the last '_' is the sample.
df_obs = pd.DataFrame(
    {
        'sample': [name.rsplit('_', 1)[-1] for name in columns],
        'region': [name.split('_', 1)[0] for name in columns],
    }
)

# Variable annotations, indexed by the cleaned gene symbol (may contain duplicates).
df_var = pd.DataFrame(
    {
        'gene': filtered_data['Genes'].values,
        'protein_id': protein_ids,
        'protein_names': protein_names,
    },
    index=genes,
)

adata = ann.AnnData(X=X, obs=df_obs, var=df_var)
# Duplicate gene symbols get a numeric suffix so that var_names are unique.
adata.var_names_make_unique()
adata

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


AnnData object with n_obs × n_vars = 60 × 5320
    obs: 'sample', 'region'
    var: 'gene', 'protein_id', 'protein_names'

In [29]:
# Combined "<region>_<sample>" label that identifies each individual measurement.
adata.obs["region_sample"] = adata.obs["region"] + "_"  + adata.obs["sample"]

#### 2. Dropping proteins detected in fewer than `min_samples` samples

In [32]:
# Remember how many proteins there are before filtering, to report the number removed.
num_genes_org = adata.shape[1]
num_genes_org

5320

In [33]:
# Remove proteins (scanpy calls them "genes") that are detected in fewer than
# `min_samples` of the measurements; adata is modified in place.
min_samples = 6
sc.pp.filter_genes(adata, min_cells=min_samples)
print(
    f'filtered out {num_genes_org - adata.shape[1]} genes '
    f'that are detected in less than {min_samples} samples!'
)
adata

filtered out 0 genes that are detected in less than 6 samples!


AnnData object with n_obs × n_vars = 60 × 5320
    obs: 'sample', 'region', 'region_sample'
    var: 'gene', 'protein_id', 'protein_names', 'n_cells'

#### 3. Log transformation

In [34]:
# Log-transform the intensities in place: scanpy's log1p replaces X with log(1 + X)
# (natural logarithm, since no `base` is given).
sc.pp.log1p(adata)

#### 4. Normalisation

In [36]:
# Normalise adata using the combined region/sample label as the observation column.
# NOTE(review): what `normalise` does with obs_columns and na_threshold=None is
# defined in utils -- confirm there. The alternative group_by_column argument is not used.
group_by = 'region_sample'
normalise(adata, obs_columns=[group_by], na_threshold=None)

#### 5. Imputation (KNN)

In [38]:
# Work on a copy so that the un-imputed `adata` stays available.
adata_knn = adata.copy()

In [39]:
# Fill missing values in adata_knn with the KNN imputation helper from utils.
# NOTE(review): presumably operates in place on NaN entries (the return value is
# not used) -- confirm against utils.impute_knn_nan.
impute_knn_nan(adata_knn)

#### Downstream analysis

In [41]:
# Per-donor metadata. The records are listed in entry order (sample numbers are not
# ascending) and sorted by sample number afterwards; the original row index is kept.
samples_info = (
    pd.DataFrame(
        {
            "sample num": [
                1, 2, 3, 4, 5, 6, 7, 8, 9,
                16, 15, 17, 18, 10, 11, 12, 13, 20, 19, 14,
            ],
            "PMI (h)": [
                36, 36, 18, 36, 26, 28, 17, 27, 29,
                300, 180, 217, 134, 325, 280, 255, 142, 220, 258, 466,
            ],
            "Cause of death": [
                "Asphyxiaton (Hanging/suicide)",
                "Dissection of aorta (Traffic accident)",
                "Asphyxiaton (possible anaphylactic shock)",
                "Hemorrhagic shock (Suicide)",
                "Hemorrhagic shock (complication during central venous cathederization)",
                "Polytrauma (Accident)",
                "Myocardial infarction",
                "Head shot (Suicide) possible M.Parkinson",
                "Hanging (suicide)",
                "Acute heart failure",
                "Hypothermia",
                "Acute heart failure",
                "Intoxication",
                "Pneumonia",
                "Pulmonary thromboembolism",
                "Bypass-Thrombosis",
                "Pulmonary thromboembolism",
                "Intracerebral bleeding",
                "Traumatic brain injury ",
                "Acute myocardial infarction",
            ],
            "age (years)": [
                23, 56, 56, 86, 77, 70, 69, 92, 39,
                91, 95, 72, 25, 73, 54, 85, 75, 46, 63, 71,
            ],
            "sex": [
                "male", "male", "male", "male", "female", "male", "male", "male", "female",
                "female", "female", "male", "male", "female", "male", "male", "male", "female", "male", "male",
            ],
        }
    )
    .astype({"Cause of death": str})
    .sort_values(by="sample num")
)

samples_info

Unnamed: 0,sample num,PMI (h),Cause of death,age (years),sex
0,1,36,Asphyxiaton (Hanging/suicide),23,male
1,2,36,Dissection of aorta (Traffic accident),56,male
2,3,18,Asphyxiaton (possible anaphylactic shock),56,male
3,4,36,Hemorrhagic shock (Suicide),86,male
4,5,26,Hemorrhagic shock (complication during central...,77,female
5,6,28,Polytrauma (Accident),70,male
6,7,17,Myocardial infarction,69,male
7,8,27,Head shot (Suicide) possible M.Parkinson,92,male
8,9,29,Hanging (suicide),39,female
13,10,325,Pneumonia,73,female


In [42]:
# Attach donor metadata (age, PMI, cause of death, sex) to every observation of adata_knn.
#
# The lookup is driven by adata_knn.obs itself -- the object that receives the new
# columns -- rather than by adata.obs, so the values stay aligned with their rows
# even if the two objects ever differ in row order or content.
sample_ids = adata_knn.obs["sample"].astype(int).to_numpy()

# Index the metadata by sample number. If a sample number were listed twice, the
# first row wins, which matches the previous `.values[0]` lookup.
info_by_sample = (
    samples_info
    .drop_duplicates(subset="sample num", keep="first")
    .set_index("sample num")
)

# Fail with a clear message instead of an anonymous IndexError when an observation
# refers to a sample that has no metadata row.
unknown_samples = sorted(set(sample_ids.tolist()) - set(info_by_sample.index.tolist()))
if unknown_samples:
    raise KeyError(f"samples_info has no entry for sample number(s): {unknown_samples}")

# One metadata row per observation, in observation order (replaces the np.append loop).
matched_info = info_by_sample.loc[sample_ids]

# The plain arrays are kept under their previous names for any later cells that use them.
age = matched_info["age (years)"].to_numpy()
pmi = matched_info["PMI (h)"].to_numpy()
cause_of_death = matched_info["Cause of death"].to_numpy()
sex = matched_info["sex"].to_numpy()

adata_knn.obs["age"] = age
adata_knn.obs["cause_of_death"] = cause_of_death
adata_knn.obs["pmi"] = pmi
adata_knn.obs["sex"] = sex

### Data integration
- Batch correction to account for samples coming from different runs.

In [164]:
# ComBat batch correction on the imputed data, using the sample number as the batch label.
# NOTE(review): with key="sample" every donor is its own batch (one measurement per bone
# region), so per-donor shifts are removed along with any run effect -- confirm that
# donors really correspond to acquisition runs.
sc.pp.combat(adata_knn, key="sample")

  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
