# 1. Preprocessing analysis

This notebook is used to preprocess single-cell data:
- SC1: HeLa and NIH3T3 cells experiment
- SC2: 10 cell lines from the NCI-60 cell panel (A498, BT-549, HOP-62, HS 578T, HT29, HeLa, IGR-OV1, MALME-3M, NCI-H460, and OVCAR-5) and HeLa cells

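Before starting, download the data from MetaboLights (study identifier MTBLS11236, [www.ebi.ac.uk/metabolights/MTBLS11236](https://www.ebi.ac.uk/metabolights/MTBLS11236)) and move the following files to the `data` folder (`../data` relative to this notebook):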
- SC1_raw.h5ad - Single-cell data
- SC2_raw.h5ad - Single-cell data
- SC1_custom_database.tsv - Ions from custom database
- SC2_custom_database.tsv - Ions from custom database

In [1]:
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import read_h5ad

## Set paths

In [2]:
# Input locations: raw single-cell AnnData files and the custom ion databases
data_dir = Path(r'../data')

SC1_path = data_dir.joinpath('SC1_raw.h5ad')
custom1_path = data_dir.joinpath('SC1_custom_database.tsv')

SC2_path = data_dir.joinpath('SC2_raw.h5ad')
custom2_path = data_dir.joinpath('SC2_custom_database.tsv')

# Output location: created if missing and also used as scanpy's figure directory
data_path = Path(r'../data')
data_path.mkdir(exist_ok=True, parents=True)
sc.settings.figdir = data_path

## SC1

### Load data

In [3]:
# Read the SC1 AnnData file and snapshot the unfiltered object in `.raw`
adata = sc.read_h5ad(SC1_path)
adata.raw = adata

# Read the SC1 custom database (tab-separated, first row is the header)
# and keep its 'formula' column for filtering
database = pd.read_csv(custom1_path, header=0, sep='\t')
database_filt = database['formula']

### Filtering

In [4]:
# Report the starting dimensions (cells x ions)
print('Before filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# CustomDB: keep only ions whose 'formula-0' annotation appears in the custom database
adata = adata[:, adata.var['formula-0'].isin(database_filt)]
print('Custom filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# Adducts: keep only ions annotated with the '-H' adduct.
# The slices above return AnnData views; materialise the subset with .copy() so the
# in-place scanpy filters below (which write to .obs/.var) operate on a real object
# instead of triggering the implicit "modify attribute of view" warning.
adata = adata[:, adata.var["adduct"] == '-H'].copy()
print('Adducts filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# Min cells and genes (the features here are ions): drop cells with fewer than 10
# detected ions, then drop ions detected in fewer than 50 cells
sc.pp.filter_cells(adata, min_genes=10)
sc.pp.filter_genes(adata, min_cells=50)
print('Cell/Gene filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# Save the filtered dataset
adata.write(data_path / 'SC1_filtered.h5ad')

Before filtering: Cell 78503 Ions 236
Custom filtering: Cell 78503 Ions 234
Adducts filtering: Cell 78503 Ions 135


  adata.obs["n_genes"] = number


Cell/Gene filtering: Cell 78500 Ions 135


### Normalization

In [5]:
# Raw counts: keep the unnormalized values for differential analysis
adata.layers["counts"] = adata.X.copy()

# Normalized counts: scale each cell toward a target total of 10000; ions that account
# for more than 5% of the total counts in at least one cell are left out when computing
# the per-cell scaling factor
sc.pp.normalize_total(adata, target_sum=10000, exclude_highly_expressed=True, max_fraction=0.05)
adata.layers["norm_counts"] = adata.X.copy()

# Log transformation: log(1 + x) of the normalized values
sc.pp.log1p(adata)
adata.layers["log1p"] = adata.X.copy()

# Snapshot the log-normalized state in .raw (replaces the snapshot taken at load time)
adata.raw = adata

# Save to the output directory (data_path), consistent with the filtered file
adata.write(data_path / 'SC1_normalized.h5ad')



## SC2

### Load data

In [6]:
# Read the SC2 AnnData file and snapshot the unfiltered object in `.raw`
adata = sc.read_h5ad(SC2_path)
adata.raw = adata

# Read the SC2 custom database (tab-separated, first row is the header)
# and keep its 'formula' column for filtering
database = pd.read_csv(custom2_path, header=0, sep='\t')
database_filt = database['formula']

### Filtering

In [7]:
# Report the starting dimensions (cells x ions)
print('Before filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# CustomDB: keep only ions whose 'formula-0' annotation appears in the custom database
adata = adata[:, adata.var['formula-0'].isin(database_filt)]
print('Custom filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# Adducts: keep only ions annotated with the '-H' adduct.
# The slices above return AnnData views; materialise the subset with .copy() so the
# in-place scanpy filters below (which write to .obs/.var) operate on a real object
# instead of triggering the implicit "modify attribute of view" warning.
adata = adata[:, adata.var["adduct"] == '-H'].copy()
print('Adducts filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# Min cells and genes (the features here are ions): drop cells with fewer than 20
# detected ions, then drop ions detected in fewer than 50 cells
sc.pp.filter_cells(adata, min_genes=20)
sc.pp.filter_genes(adata, min_cells=50)
print('Cell/Gene filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

# Save the filtered dataset
adata.write(data_path / 'SC2_filtered.h5ad')

Before filtering: Cell 42153 Ions 436
Custom filtering: Cell 42153 Ions 344
Adducts filtering: Cell 42153 Ions 202
Cell/Gene filtering: Cell 42153 Ions 202


  adata.obs["n_genes"] = number


### Normalization

In [8]:
# Raw counts: keep the unnormalized values for differential analysis
adata.layers["counts"] = adata.X.copy()

# Normalized counts: scale each cell toward a target total of 10000; ions that account
# for more than 5% of the total counts in at least one cell are left out when computing
# the per-cell scaling factor
sc.pp.normalize_total(adata, target_sum=10000, exclude_highly_expressed=True, max_fraction=0.05)
adata.layers["norm_counts"] = adata.X.copy()

# Log transformation: log(1 + x) of the normalized values
sc.pp.log1p(adata)
adata.layers["log1p"] = adata.X.copy()

# Snapshot the log-normalized state in .raw (replaces the snapshot taken at load time)
adata.raw = adata

# Save to the output directory (data_path), consistent with the filtered file
adata.write(data_path / 'SC2_normalized.h5ad')