# Part 4

- **INPUT:**
    -  ```counts_norm.mtx``` 
    -  ```anno_cells_norm.txt``` 
    -  ```anno_samples_norm.txt``` 
    -  ```anno_genes_norm.txt``` 
    -  ```cell_relabelling.csv``` file containing unified cell type annotations. Stored in additional_input_files sub-directory.
    
    
- **OUTPUT:**

    -  ```counts_norm.csv.gz``` 
    -  ```anno_cells_norm.txt``` 
    -  ```anno_samples_norm.txt``` 
    -  ```anno_genes_norm.txt``` 
    -  ```adata_afterCorrection.h5ad```

### load data

In [18]:
import numpy as np
import scanpy as sc
import pandas as pd
import os
import shutil

In [19]:
# Leftover debug marker: prints a fixed string; the result is not used anywhere.
print("here")

here


In [20]:
# Load the normalised count matrix written by the filtering step (Matrix Market file).
# NOTE(review): the next cell transposes the result, so the file is presumably
# stored genes x cells — confirm against the exporting notebook.
adata=sc.read_mtx("../2.filtering/outs/counts_norm.mtx")

In [21]:
# Swap the two axes of the loaded matrix.
# Caution: this cell is not idempotent — running it a second time flips the
# matrix back to its original orientation.
adata = adata.transpose()

In [22]:
# Cell annotation table from the filtering step (tab-separated text).
# NOTE(review): the output recorded for this cell echoes the read call, which
# looks like the tail of a pandas warning (possibly mixed column dtypes) —
# confirm and consider passing explicit dtypes.
anno_cells = pd.read_table("../2.filtering/outs/anno_cells_norm.txt")

  anno_cells = pd.read_csv("../2.filtering/outs/anno_cells_norm.txt", sep = "\t")


In [23]:
# Sample annotation table from the filtering step (tab-separated text).
anno_samples = pd.read_table("../2.filtering/outs/anno_samples_norm.txt")

In [24]:
# Gene annotation table from the filtering step (tab-separated text).
anno_genes = pd.read_table("../2.filtering/outs/anno_genes_norm.txt")

In [25]:
# Load the cell type / subtype relabelling table with its colour keys
# (semicolon-separated).
# NOTE(review): the notebook header says this file lives in an
# additional_input_files sub-directory, but the path below points two levels
# up — confirm the location.
cell_relabel = pd.read_csv(
    "../../cell_relabelling.csv",
    sep=";",
)

In [26]:
# Keep only the relabelling rows whose `source` column is "Lasry et al., 2022".
cell_relabel = cell_relabel.loc[cell_relabel["source"].eq("Lasry et al., 2022")]

In [27]:
# Label the gene axis with the gene symbols from the annotation table and
# preview positions 1-9 (the slice skips position 0).
adata.var_names=anno_genes['gene_symbol']
print("adata.var_names[1:10]")
print(adata.var_names[1:10])
# Attach the full gene annotation table as adata.var.
# NOTE(review): this assignment comes after var_names was set above and
# presumably replaces those names with anno_genes' own index (the export cell
# reads adata.var.gene_symbol explicitly) — confirm this is intended.
adata.var = anno_genes
# Cast the index labels to strings.
adata.var.index = adata.var.index.astype(str)

adata.var_names[1:10]
Index(['LINC01409', 'LINC01128', 'LINC00115', 'FAM41C', 'AL645608.2', 'NOC2L',
       'KLHL17', 'PLEKHN1', 'HES4'],
      dtype='object', name='gene_symbol')


In [28]:
# Label the cell axis with the cell IDs from the annotation table and
# preview positions 1-9 (the slice skips position 0).
adata.obs_names=anno_cells['cell_ID']
print("adata.obs_names[1:10]")
print(adata.obs_names[1:10])
# Attach the full cell annotation table as adata.obs.
# NOTE(review): this assignment comes after obs_names was set above and
# presumably replaces those names with anno_cells' own index (the export cell
# labels columns with adata.obs.cell explicitly) — confirm this is intended.
adata.obs = anno_cells
# Cast the index labels to strings.
adata.obs.index = adata.obs.index.astype(str)

adata.obs_names[1:10]
Index(['2020.09.15.AML0024.CATCAAGTCCGAGAAG',
       '2020.09.15.AML0024.CATCCACAGGGACCAT',
       '2020.09.15.AML0024.CCTCAACAGAGCAAGA',
       '2020.09.15.AML0024.CCTCAACAGTTCCATG',
       '2020.09.15.AML0024.CCTCAACGTAGAATAC',
       '2020.09.15.AML0024.CCTCAACGTTCTCCCA',
       '2020.09.15.AML0024.CCTCAACTCCGAACGC',
       '2020.09.15.AML0024.CCTCAACTCTAGTCAG',
       '2020.09.15.AML0024.CCTCACAAGACAGTCG'],
      dtype='object', name='cell_ID')


In [29]:
# define colors: map each cell_type label to its hex colour code
colors_cell_type = {
    cell_type: hex_code
    for cell_type, hex_code in zip(
        cell_relabel["cell_type"], cell_relabel["cell_type_color_hex"]
    )
}
print(colors_cell_type)

{'T': '#4F92EE', 'Mono': '#F09040', 'DC': '#854701', 'B': '#4FC384', 'NK': '#4F64EE', 'HSPC': '#D040F0', 'Gran': '#CD6600', 'PeriVasc': '#BEBEBE', 'LymP': '#556B2F', 'Ery': '#F72E3A', 'Megakaryocytes': '#C89B7A'}


In [30]:
# Sanity check: number of cells per cell_subtype label.
# NOTE(review): an earlier note here reported an error ("cell_subtype is only
# 2 values?"); the output recorded with this cell lists 13 subtypes — confirm
# the issue is resolved.
adata.obs["cell_subtype"].value_counts()

Mono      18004
CD8_T      6272
CD4_T      4989
B          4413
HSPC       3169
NK         3078
Gran       2332
Ery        1674
cDC        1092
pDC         542
gd_T        487
Plasma      352
MAIT        298
Name: cell_subtype, dtype: int64

In [31]:
# Copy the "malignant" column into a new "bares_mutation" column, cast to
# strings; the original "malignant" column is kept.
adata.obs["bares_mutation"] = adata.obs["malignant"].astype('str')

In [32]:
# define color schemes: label -> hex colour code, for subtypes and for types
colors_cell_subtype = {
    subtype: hex_code
    for subtype, hex_code in zip(
        cell_relabel["cell_subtype"], cell_relabel["cell_subtype_color_hex"]
    )
}
print(colors_cell_subtype)

# Same mapping that the earlier "define colors" cell already builds.
colors_cell_type = {
    cell_type: hex_code
    for cell_type, hex_code in zip(
        cell_relabel["cell_type"], cell_relabel["cell_type_color_hex"]
    )
}
print(colors_cell_type)

{'CD4_T': '#4FCCEE', 'CD8_T': '#4F92EE', 'Mono': '#F09040', 'cDC': '#B95D10', 'B': '#4FC384', 'NK': '#4F64EE', 'HSPC': '#D040F0', 'MAIT': '#00688B', 'pDC': '#854701', 'Gran': '#CD6600', 'gd_T': '#00BFFF', 'Plasma': '#317E54', 'PeriVasc': '#BEBEBE', 'LymP': '#556B2F', 'Ery': '#C72D37', 'Megakaryocytes': '#C89B7A'}
{'T': '#4F92EE', 'Mono': '#F09040', 'DC': '#854701', 'B': '#4FC384', 'NK': '#4F64EE', 'HSPC': '#D040F0', 'Gran': '#CD6600', 'PeriVasc': '#BEBEBE', 'LymP': '#556B2F', 'Ery': '#F72E3A', 'Megakaryocytes': '#C89B7A'}


In [33]:
# Colour lists stored in adata.uns under "<column>_colors" keys.
# NOTE(review): the per-colour labels assume the list order matches the
# category order of the corresponding obs column — confirm.
adata.uns["health_status_colors"] = [
    "#7C001F",  # bordeaux for AML
    "#7ac5cd",  # CadetBlue3 for healthy
]

adata.uns["bares_mutation_colors"] = [
    "#A6ACAF",  # grey for FALSE
    "#C0392B",  # red for TRUE
]

### visualize before batch correction

# Export

In [34]:
# Directory that receives every exported file. It is created up front so the
# to_csv calls in the export cells do not fail when it does not exist yet.
output_path = "outs/"
os.makedirs(output_path, exist_ok=True)

In [35]:
# export counts as csv.gz
# The log message is built from the path actually written, so the two cannot
# drift apart (it previously announced "counts_corr.csv.gz").
counts_file = output_path + "counts_norm.csv.gz"
print("save " + counts_file)

# Transposed table: rows are labelled with adata.var.gene_symbol, columns with
# adata.obs.cell.
counts_corr = adata.to_df().transpose()
counts_corr.index = adata.var.gene_symbol
counts_corr.columns = adata.obs.cell
counts_corr.to_csv(counts_file, index=True, compression="gzip")

save counts_corr.csv.gz


In [None]:
# export the cell annotation table (adata.obs) as tab-separated text
# The log message is built from the path actually written (it previously
# announced "anno_cells_corr.txt").
anno_cells_file = output_path + "anno_cells_norm.txt"
print("save " + anno_cells_file)
adata.obs.to_csv(anno_cells_file, sep="\t", index=True)


In [None]:
# export the sample annotation table as tab-separated text
# The log message is built from the path actually written (it previously
# announced "anno_samples_corr.txt").
anno_samples_file = output_path + "anno_samples_norm.txt"
print("save " + anno_samples_file)
anno_samples.to_csv(anno_samples_file, sep="\t", index=True)

In [None]:
# export the gene annotation table (adata.var) as tab-separated text
# The log message is built from the path actually written (it previously
# announced "anno_genes_corr.txt").
anno_genes_file = output_path + "anno_genes_norm.txt"
print("save " + anno_genes_file)
adata.var.to_csv(anno_genes_file, sep="\t", index=True)

In [None]:
# Display the export directory as this cell's output.
output_path