# Data Preparation for Models

In [53]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import scanpy as sc
import anndata
import pandas as pd
from matplotlib import rcParams

## Creating the Figure 4 Adata Object

In [None]:
# Build the Figure 4 AnnData object from three CSV exports: RNA counts (-> .X),
# per-cell metadata (-> .obs) and ADT/protein counts
# (-> .obsm["protein_expression"]), then drop mouse genes, filter rare genes
# and write the result to disk.
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=200, frameon=False, figsize=(3, 3), facecolor='white', color_map='magma')

fig4_dir = '/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/fig4_foltz/SingleCell_ProbabilisticModels/data/fig4_data'


def _clean_barcodes(index):
    """Return `index` with a leading 'X' removed and every '.' replaced by '-'.

    NOTE(review): presumably this undoes name mangling introduced when the
    matrices were exported - confirm against the export script.
    """
    # Anchored pattern: only a prefix 'X' is stripped, never an 'X' inside a name.
    without_prefix = index.str.replace(r'^X', '', regex=True)
    # Literal replacement, so no regex escape is needed for the dot.
    return without_prefix.str.replace('.', '-', regex=False)


def _require_same_cells(label, barcodes, meta_index):
    """Raise ValueError unless `barcodes` and `meta_index` hold the same unique cells."""
    only_in_counts = barcodes.difference(meta_index)
    only_in_meta = meta_index.difference(barcodes)
    if (len(only_in_counts) or len(only_in_meta)
            or not barcodes.is_unique or not meta_index.is_unique):
        raise ValueError(
            f'{label} barcodes do not match the metadata index: '
            f'{len(only_in_counts)} only in counts (e.g. {list(only_in_counts[:3])}), '
            f'{len(only_in_meta)} only in metadata (e.g. {list(only_in_meta[:3])})'
        )


adata = sc.read_csv(f'{fig4_dir}/csv/fig4_object_RNAcounts.csv')
adata = adata.transpose()

# inspect the barcodes
print('RNA Indexes Before')
print(adata.obs.index)
adata.obs.index = _clean_barcodes(adata.obs.index)
print('RNA Indexes After')
print(adata.obs.index)
print()

meta = pd.read_csv(f'{fig4_dir}/csv/fig4_object_meta.csv', index_col=0)
print('Meta Indexes')
print(meta.index)
print()

# add protein data
protein_adata = sc.read_csv(f'{fig4_dir}/csv/fig4_object_ADTcounts.csv')
protein_adata = protein_adata.transpose()

# inspect:
print('Protein Indexes Before')
print(protein_adata.obs.index)
protein_adata.obs.index = _clean_barcodes(protein_adata.obs.index)
print('Protein Indexes After')
print(protein_adata.obs.index)
print()

# Compare the barcodes BEFORE assigning .obs: assigning a DataFrame to .obs
# replaces the object's barcodes with the DataFrame's index, so any comparison
# made afterwards would be true by construction.
print('Confirm the indexes are equal')
print(adata.obs.index.equals(meta.index))
print(protein_adata.obs.index.equals(meta.index))
print(protein_adata.obs.index.equals(adata.obs.index))
_require_same_cells('RNA', adata.obs.index, meta.index)
_require_same_cells('Protein', protein_adata.obs.index, meta.index)

# Align metadata and protein counts to the RNA barcodes by name rather than by
# row position, so a differently ordered CSV cannot mislabel cells.
adata.obs = meta.loc[adata.obs.index]
protein_adata = protein_adata[adata.obs.index].copy()
protein_adata.obs = adata.obs.copy()
adata.obsm["protein_expression"] = protein_adata.to_df()

# remove mouse genes (names starting with 'mm10'); .copy() turns the view into
# a real object before it is filtered in place below
adata.var['mouse'] = adata.var_names.str.startswith('mm10')
adata = adata[:, ~adata.var['mouse']].copy()

sc.pl.highest_expr_genes(adata, n_top=20)
sc.pp.filter_genes(adata, min_cells=10)

adata.raw = adata

adata.write(f'{fig4_dir}/adata/fig4_Protein_folzconversion.h5ad')

## Creating Fig7 Adata Object
This dataset will be used as the query.

In [54]:
# Build the Figure 7 (query) AnnData object from three CSV exports: RNA counts
# (-> .X), per-cell metadata (-> .obs) and ADT/protein counts
# (-> .obsm["protein_expression"]), then drop mouse genes, filter rare genes
# and write the result to disk.
fig7_dir = '/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/fig4_foltz/SingleCell_ProbabilisticModels/data/fig7_data'


def _clean_barcodes(index):
    """Return `index` with a leading 'X' removed and every '.' replaced by '-'.

    NOTE(review): presumably this undoes name mangling introduced when the
    matrices were exported - confirm against the export script.
    """
    # Anchored pattern: only a prefix 'X' is stripped, never an 'X' inside a name.
    without_prefix = index.str.replace(r'^X', '', regex=True)
    # Literal replacement, so no regex escape is needed for the dot.
    return without_prefix.str.replace('.', '-', regex=False)


def _require_same_cells(label, barcodes, meta_index):
    """Raise ValueError unless `barcodes` and `meta_index` hold the same unique cells."""
    only_in_counts = barcodes.difference(meta_index)
    only_in_meta = meta_index.difference(barcodes)
    if (len(only_in_counts) or len(only_in_meta)
            or not barcodes.is_unique or not meta_index.is_unique):
        raise ValueError(
            f'{label} barcodes do not match the metadata index: '
            f'{len(only_in_counts)} only in counts (e.g. {list(only_in_counts[:3])}), '
            f'{len(only_in_meta)} only in metadata (e.g. {list(only_in_meta[:3])})'
        )


adata = sc.read_csv(f'{fig7_dir}/csv/fig7_object_RNAcounts.csv')
adata = adata.transpose()

# inspect the barcodes
print('RNA Indexes Before')
print(adata.obs.index)
adata.obs.index = _clean_barcodes(adata.obs.index)
print('RNA Indexes After')
print(adata.obs.index)
print()

meta = pd.read_csv(f'{fig7_dir}/csv/fig7_object_meta.csv', index_col=0)
print('Meta Indexes')
print(meta.index)
print()

# add protein data
protein = sc.read_csv(f'{fig7_dir}/csv/fig7_object_ADTcounts.csv')
protein = protein.transpose()

# inspect:
print('Protein Indexes Before')
print(protein.obs.index)
protein.obs.index = _clean_barcodes(protein.obs.index)
print('Protein Indexes After')
print(protein.obs.index)
print()

# Compare the barcodes BEFORE assigning .obs: assigning a DataFrame to .obs
# replaces the object's barcodes with the DataFrame's index, so any comparison
# made afterwards would be true by construction.
print('Confirm the indexes are equal')
print(adata.obs.index.equals(meta.index))
print(protein.obs.index.equals(meta.index))
print(protein.obs.index.equals(adata.obs.index))
_require_same_cells('RNA', adata.obs.index, meta.index)
_require_same_cells('Protein', protein.obs.index, meta.index)

# Align metadata and protein counts to the RNA barcodes by name rather than by
# row position, so a differently ordered CSV cannot mislabel cells.
adata.obs = meta.loc[adata.obs.index]
protein = protein[adata.obs.index].copy()
protein.obs = adata.obs.copy()
adata.obsm["protein_expression"] = protein.to_df()

# remove mouse genes (names starting with 'mm10'); .copy() turns the view into
# a real object before it is filtered in place below
adata.var['mouse'] = adata.var_names.str.startswith('mm10')
adata = adata[:, ~adata.var['mouse']].copy()

sc.pl.highest_expr_genes(adata, n_top=20, save="genes.png")
sc.pp.filter_genes(adata, min_cells=10)

adata.raw = adata

adata.write(f'{fig7_dir}/adata/fig7_Protein_folzconversion.h5ad')

FileNotFoundError: [Errno 2] No such file or directory: 'data/fig7_data/csv/fig7_object_RNAcounts.csv'

# Preparation

In [56]:

# Load the Figure 4 object written by the creation step; it is used as the
# reference (`ref_*`) in the comparisons below.
adata = sc.read('/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/fig4_foltz/SingleCell_ProbabilisticModels/data/fig4_data/adata/fig4_Protein_folzconversion.h5ad')


In [None]:
# Inspect the reference object.
adata

In [None]:
# Load the Figure 7 (query) object. It is read from fig7_data/adata/, which is
# where the Fig7 creation cell writes fig7_Protein_folzconversion.h5ad; nothing
# in this notebook writes that file under fig4_data/adata/.
query_offline = sc.read('/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/fig4_foltz/SingleCell_ProbabilisticModels/data/fig7_data/adata/fig7_Protein_folzconversion.h5ad')
# Keep a copy of the loaded matrix in a "counts" layer and snapshot the object in .raw.
query_offline.layers["counts"] = query_offline.X.copy()
query_offline.raw = query_offline

In [None]:
# Inspect the query object.
query_offline

### Make sure both datasets have the same proteins

In [None]:
# List the protein columns stored with the reference object.
adata.obsm["protein_expression"].columns

In [None]:
# Protein columns that are dropped from both the reference and the query
# protein tables further down.
protiens_to_check = [
    'IgG2aADT',
    'IgG2bADT',
    'IgG1ADT',
    'PD-1ADT',
    'CD8ADT',
]


In [None]:
# Reference protein columns before the listed proteins are dropped.
adata.obsm["protein_expression"].columns

In [None]:
# Drop the proteins named in `protiens_to_check` from the reference protein
# table. Index.difference returns the remaining names sorted, so the column
# order of the table changes as well.
reference_protein_table = adata.obsm["protein_expression"]
kept_reference_columns = reference_protein_table.columns.difference(protiens_to_check)
adata.obsm["protein_expression"] = reference_protein_table[kept_reference_columns]

In [None]:
# Reference protein columns after the drop.
adata.obsm["protein_expression"].columns

In [None]:
# List the protein columns stored with the query object.
query_offline.obsm["protein_expression"].columns

In [None]:
# Preview the query protein names with '-ADT' rewritten to 'ADT'; nothing is
# assigned here, so the object is not modified by this cell.
query_offline.obsm["protein_expression"].columns.str.replace(r'-ADT', 'ADT')

In [None]:
# Create a new DataFrame with modified column names
new_columns = query_offline.obsm["protein_expression"].columns.str.replace(r'-ADT', 'ADT')
new_protein_expression = query_offline.obsm["protein_expression"].copy()
new_protein_expression.columns = new_columns

# Update the AnnData object with the new DataFrame
query_offline.obsm["protein_expression"] = new_protein_expression

In [None]:
# Query protein columns after the rename.
query_offline.obsm["protein_expression"].columns

In [None]:
# Drop the proteins named in `protiens_to_check` from the query protein table.
# Index.difference returns the remaining names sorted, so the column order of
# the table changes as well.
query_protein_table = query_offline.obsm["protein_expression"]
kept_query_columns = query_protein_table.columns.difference(protiens_to_check)
query_offline.obsm["protein_expression"] = query_protein_table[kept_query_columns]

In [None]:
# Query protein columns after the drop.
query_offline.obsm["protein_expression"].columns

In [None]:
# Reference protein columns, shown for comparison with the query columns.
adata.obsm["protein_expression"].columns

In [None]:
# Collect the protein names of the reference (adata) and of the query
# (query_offline), and the names present in both.
# NOTE(review): in the visible notebook `proteins_in_common` is only used to
# print its length; neither protein table is subset to it - confirm that the
# two tables really end up with identical columns.
ref_proteins = adata.obsm["protein_expression"].columns
query_proteins = query_offline.obsm["protein_expression"].columns

proteins_in_common = ref_proteins.intersection(query_proteins)

In [None]:
# Print, in order, the number of reference proteins, query proteins and shared
# proteins; the three numbers match when both objects carry the same proteins.
for protein_names in (ref_proteins, query_proteins, proteins_in_common):
    print(len(protein_names))

In [None]:
# Query protein columns, shown once more after the comparison.
query_offline.obsm["protein_expression"].columns

In [None]:
# Collect the gene names of the reference and of the query, and the names
# present in both; `genes_in_common` is used below to subset both objects.
ref_genes = adata.var_names
query_genes = query_offline.var_names

genes_in_common = ref_genes.intersection(query_genes)

In [None]:
# Inspect the shared gene names.
genes_in_common

In [None]:
# Restrict both objects to the shared genes. Selecting by name (instead of with
# a boolean mask, which keeps each object's own gene order) puts the genes in
# the same order in the reference and the query. .copy() turns the resulting
# views into independent objects, so the later writes to .obs do not have to
# modify a view.
adata = adata[:, genes_in_common].copy()
query_offline = query_offline[:, genes_in_common].copy()

In [None]:
# Gene names of the reference after subsetting to the shared genes.
adata.var_names

In [None]:
# Gene names of the query after subsetting to the shared genes.
query_offline.var_names

### Add the correct cell-type identification from the original query dataset

In [None]:
# Inspect the query object before annotations are added.
query_offline

In [None]:
# Load the previously processed query object; its obs columns 'orig.ident' and
# 'leiden' are copied onto query_offline below.
full_query = sc.read('/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/fig4_foltz/SingleCell_ProbabilisticModels/data/fig7_data/adata/fig7_k562_pagaobject3223withleiden31423pagaonleidenclusters_RAW.h5ad')

In [None]:
# Use the 'orig.ident' column of the full query object as the query's 'batch'.
# NOTE(review): pandas aligns this assignment on the obs index, so any cell of
# query_offline that is missing from full_query would silently receive NaN -
# confirm both objects hold the same barcodes.
query_offline.obs['batch'] = full_query.obs['orig.ident']

In [None]:
# Inspect the source column that was copied into 'batch'.
full_query.obs["orig.ident"]

In [None]:
# Copy the Leiden cluster labels from the full query object as integers; the
# integer ids are mapped to cell-type names in a later cell.
# NOTE(review): the assignment is aligned on the obs index, so cells missing
# from full_query would receive NaN - confirm the barcodes match.
query_offline.obs['leiden'] = full_query.obs['leiden'].astype("int64")

In [None]:
# Inspect the copied Leiden cluster ids.
query_offline.obs['leiden']

### Rename the clusters with the names of the Cell Types 

In [None]:
# Replace the numeric Leiden cluster ids of the query with cell-type names and
# store the result as a categorical column. The mapping is written per cell
# type and expanded into one entry per cluster id.
query_clusters_by_celltype = {
    "CD56dim": (0, 1, 3, 4, 6, 7, 8, 11),
    "ML2": (2,),
    "ML1": (5, 12, 13),
    "CD56bright": (9, 16),
    "Proliferating": (10, 14),
    "nonNK": (15,),
    "trans": (17,),
}
leiden_to_celltype = {
    cluster_id: celltype
    for celltype, cluster_ids in query_clusters_by_celltype.items()
    for cluster_id in cluster_ids
}

# Cluster ids without an entry in the mapping are left unchanged by replace().
named_leiden = query_offline.obs['leiden'].replace(leiden_to_celltype)
query_offline.obs['leiden'] = named_leiden.astype("category")

In [None]:
# Inspect the query labels after renaming.
query_offline.obs['leiden']

In [None]:
# Inspect the reference cluster ids before renaming.
adata.obs["seurat_clusters"]

In [None]:
# Replace the numeric 'seurat_clusters' ids of the reference with cell-type
# names, store them as a categorical column, and keep a second copy of the
# labels under 'celltype.l2'.
reference_clusters_by_celltype = {
    "CD56dim": (0, 4, 7),
    "ML1": (1,),
    "ML2": (2,),
    "CD56bright": (3,),
    "NKG2Cpos": (5,),
    "Proliferating1": (6,),
    "Proliferating2": (8,),
    "nonNK": (9,),
}
seurat_cluster_to_celltype = {
    cluster_id: celltype
    for celltype, cluster_ids in reference_clusters_by_celltype.items()
    for cluster_id in cluster_ids
}

# Cluster ids without an entry in the mapping are left unchanged by replace().
named_seurat_clusters = adata.obs["seurat_clusters"].replace(seurat_cluster_to_celltype)
adata.obs["seurat_clusters"] = named_seurat_clusters.astype("category")
adata.obs["celltype.l2"] = adata.obs["seurat_clusters"].copy()

In [None]:
# Inspect the reference labels after renaming.
adata.obs["seurat_clusters"]

### Split the dataset into separate batches

In [None]:
def _subset_batch(source, orig_ident):
    """Return a copy of the cells of `source` whose 'orig.ident' equals `orig_ident`.

    The copy gets a 'batch' obs column holding `orig_ident` as a string.
    """
    in_batch = source.obs['orig.ident'].isin([orig_ident])
    batch_adata = source[in_batch].copy()
    batch_adata.obs["batch"] = str(orig_ident)
    return batch_adata


# One object per 'orig.ident' value, each tagged with its batch label.
adata_3228 = _subset_batch(adata, 3228)
adata_730 = _subset_batch(adata, 730)
adata_451 = _subset_batch(adata, 451)

# Stack the three batches back together (cells with any other 'orig.ident' are
# not included) and store the batch label as a categorical column.
adata_full = anndata.concat([adata_3228, adata_730, adata_451])
adata_full.obs['batch'] = adata_full.obs['batch'].astype("category")

In [None]:
# Inspect the batch labels of the concatenated reference object.
adata_full.obs['batch']

### Write object for SCANVI Model

In [None]:
# Save the batch-annotated Figure 4 object.
adata_full.write_h5ad("/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/fig4_foltz/SingleCell_ProbabilisticModels/data/fig4_data/adata/fig4_Protein_folzconversion_prepped.h5ad")

In [None]:
# Save the annotated Figure 7 (query) object.
query_offline.write_h5ad("/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/fig4_foltz/SingleCell_ProbabilisticModels/data/fig7_data/adata/fig7_Protein_folzconversion_prepped.h5ad")