In [None]:
# ! pip install scrublet

https://doi.org/10.1016/j.isci.2021.102151
Wyler et al. (2021) iScience

data:  https://cellxgene.cziscience.com/collections/d0e9c47b-4ce7-4f84-b182-eddcfa0b2658
 

#### extraction of viral genes: https://github.com/BIMSBbioinfo/Ewyler_SARS-CoV/blob/master/Processing/Process_Seurat.rmd


#### methods: https://www.cell.com/cms/10.1016/j.isci.2021.102151/attachment/12909dee-01ca-466f-86f9-3e997d6461dd/mmc1

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import scanpy.external as sce
import sys
import scipy 

In [None]:
# Load the AnnData object stored in 'calu3.h5ad' (path is relative to the working directory).
adata = sc.read_h5ad('calu3.h5ad')

### Remove SARS-CoV-1 (S1) infected samples and check which experimental conditions are available

In [None]:
# Drop the cells whose strain is SARS-CoV-1. A single boolean mask is enough; the
# three back-to-back filters that used to be here all expressed the same condition.
# `.copy()` turns the resulting view into a standalone AnnData so that later
# assignments (e.g. to `adata.X`) do not have to materialise a view implicitly.
adata = adata[adata.obs["strain"] != "SARSCoV1"].copy()

# Show which strains remain after filtering.
adata.obs.strain.unique()

In [None]:
# Split every cell id on '-' and keep fields 1 and 2. Ids look like
# "Calu3-mock-12h-A_<barcode>", so these fields are presumably treatment and time
# point (verify against adata.obs.index).
conditions = [cell_id.split('-')[1:3] for cell_id in adata.obs.index]

# Display the distinct field pairs present in the data.
{tuple(pair) for pair in conditions}

### Restore raw count values

In [None]:
# adata.layers["counts"] = adata.X.copy()

In [None]:
# Overwrite X with the matrix kept in adata.raw. Presumably these are the unnormalised
# counts needed by the filtering and doublet steps below (TODO: confirm what
# adata.raw holds in this file).
adata.X = adata.raw.X

### Filter low quality cells

In [None]:
# Flag mitochondrial genes by their gene-symbol prefix.
adata.var["mito"] = adata.var.feature_name.str.startswith("MT-")

# One row of three QC histograms.
fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(15, 4))
fig.suptitle('QC')

# The x-limits are set on each Axes object directly. `plt.xlim` acts on the
# *current* axes, which after `plt.subplots` is the last panel, so calling it after
# each histogram only ever changed the third panel and left the first two unlimited.

# Panel 0: number of detected genes per cell.
sns.histplot(adata.obs['n_genes_by_counts'], bins=25, ax=axes[0])
axes[0].set_xlim(0, 6500)

# Panel 1: total counts per cell.
sns.histplot(adata.obs['nCount_RNA'], bins=50, ax=axes[1])
axes[1].set_xlim(-1000, 60000)

# Panel 2: distribution of the per-gene mitochondrial flag.
# NOTE(review): this counts genes (flagged vs. not flagged), not the per-cell
# mitochondrial fraction usually inspected for QC - confirm this is intended.
sns.histplot(adata.var['mito'], bins=10, ax=axes[2])
axes[2].set_xlim(-0.3, 1.5);

### remove doublets 

In [None]:
# filter cells and genes
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3) 

In [None]:
# Predict doublets with Scrublet. The call annotates `adata` in place; the
# `predicted_doublet` column it writes to `adata.obs` is used for filtering below.
#
# The separate `sce.pp.scrublet_simulate_doublets(adata)` call that used to follow
# was removed: it returns a new AnnData of simulated doublets, which was thrown
# away, and `scrublet` already simulates doublets itself when no `adata_sim` is given.
#
# NOTE(review): `adata` contains several samples; Scrublet is normally run per
# sample (see the `batch_key` argument) - confirm whether that is wanted here.
sce.pp.scrublet(adata, mean_center=True)
adata

In [None]:
# Keep only the cells that were not called as doublets (`predicted_doublet` is the
# boolean column written by the Scrublet step).
adata = adata[~adata.obs.predicted_doublet]

In [None]:
# Collect the feature name of every gene still present in adata.var.
genes = adata.var['feature_name'].tolist()

### matrix with genes and their expression levels in corresponding cells

In [None]:
# Export the expression matrix as a DataFrame (rows: cells, columns: adata.var_names).
# `genes` currently lists every feature name in adata.var, so the `isin` mask selects
# all genes; narrow `genes` to export only a subset.
counts = adata[:, adata.var['feature_name'].isin(genes)].to_df() 

In [None]:
# Persist the count matrix. The index (cell ids) is written as an unlabeled first column.
counts.to_csv('preprocessed_calu3.csv')

# Starting with preprocessed counts


In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import scanpy.external as sce
import sys
import scipy 

In [2]:
# Reload the saved count matrix. No index column is specified, so the unlabeled
# first CSV column (the cell ids) is read as a regular column named 'Unnamed: 0'.
counts = pd.read_csv('preprocessed_calu3.csv', sep=',')

In [3]:
# Preview the first rows: one row per cell, cell id in 'Unnamed: 0', one column per gene.
counts.head()

Unnamed: 0.1,Unnamed: 0,ENSG00000146038,ENSG00000136536,ENSG00000116679,ENSG00000160360,ENSG00000165282,ENSG00000023330,ENSG00000105371,ENSG00000171234,ENSG00000117242,...,ENSG00000137760,ENSG00000004534,ENSG00000025423,ENSG00000164543,ENSG00000073578,ENSG00000163596,ENSG00000224152,ENSG00000145388,ENSG00000038219,ENSG00000181513
0,Calu3-mock-12h-A_AAAAATCCCTAG,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Calu3-mock-12h-A_AAAACATACTTA,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Calu3-mock-12h-A_AAAACATGTCAA,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Calu3-mock-12h-A_AAAACCAGACTA,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Calu3-mock-12h-A_AAAACTCTAGCG,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Pseudobulk for the 4 h time point: sum the counts of all cells of each sample.

# Match the literal "-4h-" token of the sample name rather than a bare "4", so an id
# that merely contains that digit somewhere else (e.g. a "24h" sample) is not selected.
is_4h = counts['Unnamed: 0'].str.contains('-4h-', regex=False)

# Move the cell ids into the index. Previously `drop` was called without keeping
# its result, so the id strings were still a column when `.sum()` ran.
counts4 = counts.loc[is_4h].set_index('Unnamed: 0')

# Cell ids have the form "<sample>_<barcode>": group on the sample part, sum the
# counts, and transpose to genes x samples.
counts4 = counts4.groupby([cell_id.split('_')[0] for cell_id in counts4.index]).sum().T
counts4.to_csv('Calu3_4h_sum.csv')
counts4

  counts4 = counts4.groupby([s.split('_')[0] for s in counts4.index.values]).sum().T


Unnamed: 0,Calu3-S2-4h-A,Calu3-S2-4h-B,Calu3-mock-4h-A,Calu3-mock-4h-B
ENSG00000146038,10460.0,5557.0,7117.0,10530.0
ENSG00000136536,1581.0,834.0,961.0,1324.0
ENSG00000116679,2695.0,1646.0,1969.0,2911.0
ENSG00000160360,90.0,92.0,77.0,141.0
ENSG00000165282,1252.0,776.0,911.0,1289.0
...,...,...,...,...
ENSG00000163596,147.0,53.0,81.0,113.0
ENSG00000224152,37.0,13.0,19.0,36.0
ENSG00000145388,464.0,234.0,297.0,444.0
ENSG00000038219,4126.0,3086.0,2865.0,3428.0


In [17]:
# Pseudobulk for the 8 h time point: sum the counts of all cells of each sample.

# Match the literal "-8h-" token of the sample name rather than a bare "8", so an id
# that merely contains that digit somewhere else is not selected.
is_8h = counts['Unnamed: 0'].str.contains('-8h-', regex=False)

# Move the cell ids into the index. Previously `drop` was called without keeping
# its result, so the id strings were still a column when `.sum()` ran.
counts8 = counts.loc[is_8h].set_index('Unnamed: 0')

# Cell ids have the form "<sample>_<barcode>": group on the sample part, sum the
# counts, and transpose to genes x samples.
counts8 = counts8.groupby([cell_id.split('_')[0] for cell_id in counts8.index]).sum().T
counts8.to_csv('Calu3_8h_sum.csv')
counts8

  counts8 = counts8.groupby([s.split('_')[0] for s in counts8.index.values]).sum().T


Unnamed: 0,Calu3-S2-8h-A,Calu3-S2-8h-B
ENSG00000146038,5267.0,9900.0
ENSG00000136536,855.0,1489.0
ENSG00000116679,1507.0,2347.0
ENSG00000160360,63.0,89.0
ENSG00000165282,634.0,894.0
...,...,...
ENSG00000163596,50.0,119.0
ENSG00000224152,13.0,33.0
ENSG00000145388,254.0,427.0
ENSG00000038219,2874.0,3736.0


In [12]:
# Pseudobulk for the 12 h time point: sum the counts of all cells of each sample.

# Match the literal "-12h-" token of the sample name rather than a bare "12", so an
# id that merely contains those digits somewhere else is not selected.
is_12h = counts['Unnamed: 0'].str.contains('-12h-', regex=False)

# Move the cell ids into the index. Previously `drop` was called without keeping
# its result, so the id strings were still a column when `.sum()` ran.
counts12 = counts.loc[is_12h].set_index('Unnamed: 0')

# Cell ids have the form "<sample>_<barcode>": group on the sample part, sum the
# counts, and transpose to genes x samples.
counts12 = counts12.groupby([cell_id.split('_')[0] for cell_id in counts12.index]).sum().T
counts12.to_csv('Calu3_12h_sum.csv')
counts12

  counts12 = counts12.groupby([s.split('_')[0] for s in counts12.index.values]).sum().T


Unnamed: 0,Calu3-S2-12h-A,Calu3-S2-12h-B,Calu3-mock-12h-A,Calu3-mock-12h-B
ENSG00000146038,5845.0,10123.0,4154.0,8402.0
ENSG00000136536,808.0,1407.0,806.0,1282.0
ENSG00000116679,1240.0,2182.0,1693.0,2784.0
ENSG00000160360,28.0,68.0,71.0,80.0
ENSG00000165282,512.0,880.0,706.0,1033.0
...,...,...,...,...
ENSG00000163596,37.0,63.0,57.0,85.0
ENSG00000224152,13.0,29.0,25.0,32.0
ENSG00000145388,194.0,346.0,195.0,417.0
ENSG00000038219,2042.0,3866.0,1877.0,3355.0
