# 1. Preprocessing analysis

This notebook is used to preprocess single-cell data:
- SC1: HeLa and NIH3T3 cells experiment
- SC2: 10 cell lines from NCI-60 cell panel (A498, BT-549, HOP-62, HS 578T, HT29, HeLa, IGR-OV1, MALME-3M, NCI-H460, and OVCAR-5) and HeLa cells
- SC3: HeLa cells treated with 2-Deoxy-glucose (2-DG) at 12 and 24h timepoints
- SC4: Co-culture of NIH3T3 and NCI-H460 cell lines

Before starting, download the data available at MetaboLights [www.ebi.ac.uk/metabolights/MTBLS11236] (study identifier MTBLS11236) and move the following files to the `data` folder:
- SC1_raw.h5ad - Single-cell data
- SC2_raw.h5ad - Single-cell data
- SC3_raw.h5ad - Single-cell data
- SC4_raw.h5ad - Single-cell data
- SC1_custom_database.tsv - Ions from custom database
- SC2_custom_database.tsv - Ions from custom database
- SC3_custom_database.tsv - Ions from custom database
- SC4_custom_database.tsv - Ions from custom database



In [1]:
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import read_h5ad, concat
import sys
sys.path.insert(0, '../functions/')
import pl
import utils

## Set paths

In [3]:
# --- Inputs ---------------------------------------------------------------
# Folder holding the files downloaded from MetaboLights (MTBLS11236).
data_dir = Path('../data')

# Raw single-cell AnnData files, one per experiment (SC1..SC4).
SC1_path, SC2_path, SC3_path, SC4_path = (
    data_dir / f'SC{n}_raw.h5ad' for n in range(1, 5)
)

# Custom ion databases, one per experiment (SC1..SC4).
custom1_path, custom2_path, custom3_path, custom4_path = (
    data_dir / f'SC{n}_custom_database.tsv' for n in range(1, 5)
)

# --- Outputs --------------------------------------------------------------
# Folder receiving the files written by this notebook; created if missing.
data_path = Path('../data')
data_path.mkdir(parents=True, exist_ok=True)

# scanpy saves its figures into the same output folder.
sc.settings.figdir = data_path

## SC1

### Load data

In [4]:
# Read the raw SC1 AnnData object and snapshot it, as loaded, in `.raw`.
adata = sc.read_h5ad(SC1_path)
adata.raw = adata

# Read the SC1 custom database (tab-separated, header on the first row) and
# keep its 'formula' column, which the filtering step matches ions against.
database = pd.read_table(custom1_path, header=0)
database_filt = database.loc[:, 'formula']

### Filtering

In [5]:
# SC1 filtering: restrict ions to the custom database and the '-H' adduct,
# then drop sparse cells/ions and save the result.
print('Before filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#CustomDB: keep ions whose 'formula-0' is listed in the custom database
adata = adata[:, adata.var['formula-0'].isin(database_filt)]
print('Custom filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#Adducts: keep only the '-H' adduct
# Subsetting returns a *view*; .copy() materialises it so the in-place scanpy
# filters below do not write into a view (which raised the implicit
# modification warning on `adata.obs["n_genes"]`).
adata = adata[:, (adata.var['adduct']=='-H')].copy()
print('Adducts filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#Min cells and genes ("genes" in the scanpy API are ions here):
# keep cells with >= 10 detected ions, then ions detected in >= 50 cells
sc.pp.filter_cells(adata, min_genes=10)
sc.pp.filter_genes(adata, min_cells=50)
print('Cell/Gene filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#saving
adata.write(data_path / 'SC1_filtered.h5ad')

Before filtering: Cell 78503 Ions 236
Custom filtering: Cell 78503 Ions 234
Adducts filtering: Cell 78503 Ions 135


  adata.obs["n_genes"] = number


Cell/Gene filtering: Cell 78500 Ions 135


### Normalization

In [5]:
# SC1 normalization: keep every processing stage as a layer, then save.

#raw counts (filtered, un-normalized intensities)
adata.layers['counts'] = adata.X.copy()

#normalized counts: scale every cell to a total of 10000; ions making up more
#than 5% of a cell's total are left out when computing the scaling factor
sc.pp.normalize_total(adata, target_sum=10000, exclude_highly_expressed =True, max_fraction = 0.05)
adata.layers['norm_counts'] = adata.X.copy()

#log transformation: X <- log(1 + X)
sc.pp.log1p(adata)
adata.layers['log1p'] = adata.X.copy()

# Replace the snapshot taken at load time with the log-normalized state.
adata.raw = adata

#saving
# Written to the configured *output* folder (data_path), like the filtered
# file, rather than to the input folder variable (data_dir).
adata.write(data_path / 'SC1_normalized.h5ad')



## SC2

### Load data

In [6]:
# Read the raw SC2 AnnData object and snapshot it, as loaded, in `.raw`.
adata = sc.read_h5ad(SC2_path)
adata.raw = adata

# Read the SC2 custom database (tab-separated, header on the first row) and
# keep its 'formula' column, which the filtering step matches ions against.
database = pd.read_table(custom2_path, header=0)
database_filt = database.loc[:, 'formula']

### Filtering

In [7]:
# SC2 filtering: restrict ions to the custom database and the '-H' adduct,
# then drop sparse cells/ions and save the result.
print('Before filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#CustomDB: keep ions whose 'formula-0' is listed in the custom database
adata = adata[:, adata.var['formula-0'].isin(database_filt)]
print('Custom filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#Adducts: keep only the '-H' adduct
# Subsetting returns a *view*; .copy() materialises it so the in-place scanpy
# filters below do not write into a view (which raised the implicit
# modification warning on `adata.obs["n_genes"]`).
adata = adata[:, (adata.var['adduct']=='-H')].copy()
print('Adducts filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#Min cells and genes ("genes" in the scanpy API are ions here):
# keep cells with >= 20 detected ions, then ions detected in >= 50 cells
sc.pp.filter_cells(adata, min_genes=20)
sc.pp.filter_genes(adata, min_cells=50)
print('Cell/Gene filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#saving
adata.write(data_path / 'SC2_filtered.h5ad')

Before filtering: Cell 42153 Ions 436
Custom filtering: Cell 42153 Ions 344
Adducts filtering: Cell 42153 Ions 202
Cell/Gene filtering: Cell 42153 Ions 202


  adata.obs["n_genes"] = number


### Normalization

In [8]:
# SC2 normalization: keep every processing stage as a layer, then save.

#raw counts (filtered, un-normalized intensities)
adata.layers["counts"] = adata.X.copy()

#normalized counts: scale every cell to a total of 10000; ions making up more
#than 5% of a cell's total are left out when computing the scaling factor
sc.pp.normalize_total(adata, target_sum=10000, exclude_highly_expressed =True, max_fraction = 0.05)
adata.layers['norm_counts'] = adata.X.copy()

#log transformation: X <- log(1 + X)
sc.pp.log1p(adata)
adata.layers['log1p'] = adata.X.copy()

# Replace the snapshot taken at load time with the log-normalized state.
adata.raw = adata

#saving
# Written to the configured *output* folder (data_path), like the filtered
# file, rather than to the input folder variable (data_dir).
adata.write(data_path / 'SC2_normalized.h5ad')

## SC3

### Load data

In [12]:
# Read the raw SC3 AnnData object, snapshot it in `.raw`, and de-duplicate
# the observation (cell) names.
adata = sc.read_h5ad(SC3_path)
adata.raw = adata
adata.obs_names_make_unique()

# Read the SC3 custom database (tab-separated, header on the first row) and
# keep its 'formula' column, which the filtering step matches ions against.
database = pd.read_table(custom3_path, header=0)
database_filt = database.loc[:, 'formula']

### Filtering

In [13]:
# SC3 filtering: restrict ions to the custom database and the '-H' adduct,
# drop sparse cells/ions and poorly sampled cells, then save the result.
print('Before filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#CustomDB: keep ions whose 'formula' is listed in the custom database
adata = adata[:, adata.var['formula'].isin(database_filt)]
print('Custom filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#Adducts: keep only the '-H' adduct
# Subsetting returns a *view*; .copy() materialises it so the in-place scanpy
# filters below do not write into a view (which raised the implicit
# modification warning on `adata.obs["n_genes"]`).
adata = adata[:, (adata.var['adduct']=='-H')].copy()
print('Adducts filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#Min cells and ions ("genes" in the scanpy API are ions here):
# keep cells with >= 75 detected ions, then ions detected in >= 2000 cells
sc.pp.filter_cells(adata, min_genes=75)
sc.pp.filter_genes(adata, min_cells=2000)
print('Cell/Ions filtering: Cell', adata.shape[0], 'Ions', adata.shape[1])

#Minimum ablated proportion: keep cells with cell_sampled_proportion >= 0.4
# .copy() so later cells receive a real AnnData instead of a view and can add
# layers / obs columns without triggering an implicit copy.
adata = adata[adata.obs['cell_sampled_proportion']>=0.4, :].copy()
print('Sampled proportion: Cell', adata.shape[0], 'Ions', adata.shape[1])

#saving
adata.write(data_path / 'SC3_filtered.h5ad')

Before filtering: Cell 18937 Ions 175
Custom filtering: Cell 18937 Ions 140
Adducts filtering: Cell 18937 Ions 140
Cell/Ions filtering: Cell 15794 Ions 111
Sampled proportion: Cell 15697 Ions 111


  adata.obs["n_genes"] = number


### Normalization

In [14]:
# SC3 normalization: keep every processing stage as a layer, then save.

# The preceding filtering cell may leave `adata` as a view (row subset on
# cell_sampled_proportion); materialise it before adding layers so nothing is
# written into a view.
if adata.is_view:
    adata = adata.copy()

#raw counts (filtered, un-normalized intensities)
adata.layers["counts"] = adata.X.copy()

#normalized counts: scale every cell to a total of 10000; ions making up more
#than 5% of a cell's total are left out when computing the scaling factor
sc.pp.normalize_total(adata, target_sum=10000, exclude_highly_expressed =True, max_fraction = 0.05)
adata.layers["norm_counts"] = adata.X.copy()

#log transformation: X <- log(1 + X)
sc.pp.log1p(adata)
adata.layers["log1p"] = adata.X.copy()

# Replace the snapshot taken at load time with the log-normalized state.
adata.raw = adata

#saving
# Written to the configured *output* folder (data_path), like the filtered
# file, rather than to the input folder variable (data_dir).
adata.write(data_path / 'SC3_normalized.h5ad')

  adata.layers["counts"] = adata.X.copy()


## SC4

### Load data

In [10]:
#load anndata file
adata =  sc.read_h5ad(SC4_path)
adata.raw = adata
# The SC4 file contains duplicated observation names (anndata warns about it
# on load and on every later subset). De-duplicate them here, as is already
# done for SC3, so downstream label-based operations are unambiguous.
adata.obs_names_make_unique()

#load custom database (tab-separated, header on the first row)
database = pd.read_csv(custom4_path, sep='\t', header=0) 
# SC4 is matched on 'annotation_id' (the other datasets use 'formula').
database_filt = database['annotation_id']

  utils.warn_names_duplicates("obs")


### Filtering

In [11]:
# SC4 filtering: restrict ions to the '-H' adduct and the custom database,
# drop sparse cells/ions and poorly sampled cells, then save the result.
print('Before filtering: Cells', adata.shape[0], 'Ions', adata.shape[1])

#filtering for -H adducts only
adata = adata[:, (adata.var['adduct']=='-H')]
print('Adducts filtering: Cells', adata.shape[0], 'Ions', adata.shape[1])

#CustomDB: keep ions whose 'annotation_id' is listed in the custom database
# Subsetting returns a *view*; .copy() materialises it so the in-place scanpy
# filters below do not write into a view (which raised the implicit
# modification warning on `adata.obs["n_genes"]`).
adata = adata[:, adata.var['annotation_id'].isin(database_filt)].copy()
print('Custom filtering: Cells', adata.shape[0], 'Ions', adata.shape[1])

#Min cells and ions ("genes" in the scanpy API are ions here):
# keep cells with >= 40 detected ions, then ions detected in >= 300 cells
sc.pp.filter_cells(adata, min_genes=40)
sc.pp.filter_genes(adata, min_cells=300)
print('Cell/Ion filtering: Cells', adata.shape[0], 'Ions', adata.shape[1])

#Minimum ablated proportion: keep cells with cell_sampled_proportion >= 0.4
# .copy() so later cells receive a real AnnData instead of a view and can add
# layers / obs columns without triggering an implicit copy.
adata = adata[adata.obs['cell_sampled_proportion']>=0.4, :].copy()
print('Sampled proportion: Cells', adata.shape[0], 'Ions', adata.shape[1])

#saving
adata.write(data_path / 'SC4_filtered.h5ad')

Before filtering: Cells 4446 Ions 499
Adducts filtering: Cells 4446 Ions 254
Custom filtering: Cells 4446 Ions 173
Cell/Ion filtering: Cells 4445 Ions 142
Sampled proportion: Cells 4413 Ions 142


  adata.obs["n_genes"] = number
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


### Fluorescence processing and normalization

In [8]:
# Assign a cell-line label to every SC4 cell, first from GFP intensity alone
# ('condition_GFP'), then from GFP combined with a morphology criterion
# ('condition'). Cells matching neither rule stay 'Uncertain'.

# The preceding filtering cell may leave `adata` as a view; materialise it so
# the new obs columns are not written into a view.
if adata.is_view:
    adata = adata.copy()

# Thresholds used by the labelling rules below.
GFP_THRESHOLD = 10          # on 'sampled_intensity_max-GFP'
MIN_NIH3T3_INERTIA = 200    # on 'inertia_tensor_eigvals-0'
MAX_NCIH460_AREA = 1000     # on 'cell_area'

gfp_max = adata.obs['sampled_intensity_max-GFP']
# Two explicit masks (rather than a negation) so that missing GFP values
# satisfy neither and remain 'Uncertain'.
gfp_above = gfp_max > GFP_THRESHOLD
gfp_at_or_below = gfp_max <= GFP_THRESHOLD

# --- Label from GFP only ---------------------------------------------------
adata.obs['condition_GFP'] = 'Uncertain'
adata.obs.loc[gfp_above, 'condition_GFP'] = 'NIH3T3'
adata.obs.loc[gfp_at_or_below, 'condition_GFP'] = 'NCI-H460'

counts = adata.obs['condition_GFP'].value_counts()
NIH3T3_count = counts.get('NIH3T3', 0)
NCIH460_count = counts.get('NCI-H460', 0)
Uncertain_count = counts.get('Uncertain', 0)
print(f'NIH3T3: {NIH3T3_count}, NCI-H460: {NCIH460_count}, Uncertain: {Uncertain_count}')

# --- Label from GFP + morphology -------------------------------------------
# The NCI-H460 rule uses the same `<=` GFP boundary as 'condition_GFP' above;
# it previously used `<`, so a cell at exactly the threshold was handled
# differently by the two labellings.
adata.obs['condition'] = 'Uncertain'
adata.obs.loc[gfp_above &
              (adata.obs['inertia_tensor_eigvals-0'] > MIN_NIH3T3_INERTIA), 'condition'] = 'NIH3T3'
adata.obs.loc[gfp_at_or_below &
              (adata.obs['cell_area'] < MAX_NCIH460_AREA), 'condition'] = 'NCI-H460'

counts = adata.obs['condition'].value_counts()
NIH3T3_count = counts.get('NIH3T3', 0)
NCIH460_count = counts.get('NCI-H460', 0)
Uncertain_count = counts.get('Uncertain', 0)
print(f'NIH3T3: {NIH3T3_count}, NCI-H460: {NCIH460_count}, Uncertain: {Uncertain_count}')

NIH3T3: 1235, NCI-H460: 3178, Uncertain: 0
NIH3T3: 496, NCI-H460: 1876, Uncertain: 2041


  adata.obs['condition_GFP'] = 'Uncertain'
  utils.warn_names_duplicates("obs")


### Normalization

In [9]:
# SC4 normalization: keep every processing stage as a layer, then save.

#raw counts (filtered, un-normalized intensities)
adata.layers["counts"] = adata.X.copy()

#normalized counts: scale every cell to a total of 10000; ions making up more
#than 5% of a cell's total are left out when computing the scaling factor
sc.pp.normalize_total(adata, target_sum=10000, exclude_highly_expressed =True, max_fraction = 0.05)
adata.layers["norm_counts"] = adata.X.copy()

#log transformation: X <- log(1 + X)
sc.pp.log1p(adata)
adata.layers["log1p"] = adata.X.copy()

# Replace the snapshot taken at load time with the log-normalized state.
adata.raw = adata

#saving
# Written to the configured *output* folder (data_path), like the filtered
# file, rather than to the input folder variable (data_dir).
adata.write(data_path / 'SC4_normalized.h5ad')