# 00b preprocess MPP and LT overlay with correct cell proportions

Assemble a merged object of MPPs and LTs in which the cell populations are subsampled to the correct cell proportions (taken from FACS).

Run this notebook inside a Docker container started with the following command:

docker run \
--rm \
-d \
--name demuxEM \
-p 8881:8888 \
-e JUPYTER_ENABLE_LAB=YES \
-v /Users/efast/Documents/:/home/jovyan/work \
pegasuspy_scanpy:vs1


In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [2]:
import numpy as np
import pandas as pd
import scanpy as sc

In [3]:
## with LT-HSCs
# For every treatment sample this cell
#   1. reads the './raw_data/B_<sample>' 10x matrix and attaches the
#      demultiplexing annotation CSV to its cells,
#   2. keeps only cells annotated as singlets and renames two populations,
#   3. writes the annotated object and the per-population counts to disk,
#   4. subsamples each population (and the separate './raw_data/A_<sample>'
#      LT matrix) so that their numbers relative to MPP3/4 follow the
#      proportions table,
#   5. writes the subsampled objects to disk.

sample_strings = ['ct', 'dmPGE2', 'GCSF', 'indo', 'pIC']

file_base = './raw_data/B_'
file_end = '/outs/filtered_feature_bc_matrix'

file_base_LT = './raw_data/A_'

ann_base = './write/demux_adata_hto_'
ann_end = '.csv'

write_path_base = './sc_objects/demuxannotated_'
write_path_end = '.h5ad'
write_path_end_proportions = 'prop.h5ad'
write_path_end_proportions_LT = '_LT_prop.h5ad'

write_path_end_csv = 'counts.csv'

# rows: populations, columns: samples (indexed below as proportions.loc[population, sample])
proportions = pd.read_csv('./raw_data/cell_proportions_demux.csv', index_col=0)

HSPC_renaming = {'CD48LSK': 'MPP3/4', 'ST': 'MPP1'}  # dictionary for renaming
reference_population = 'MPP3/4'  # all of its cells are kept; other populations are scaled to it
sample_HSPCS = ['MPP2', 'MPP', 'MPP1']  # populations subsampled relative to the reference
random_seed = 1  # fixed seed so the subsampling is reproducible


def _strip_barcode_suffix(barcode):
    """Remove a trailing '-1' from a barcode, leaving other barcodes untouched."""
    # str.rstrip('-1') would strip any run of '-' and '1' characters, not the suffix
    return barcode[:-2] if barcode.endswith('-1') else barcode


def _target_cell_number(n_reference, proportions, population, sample, reference):
    """Return the number of `population` cells to keep for `sample`.

    `n_reference` is scaled by the ratio of the `population` and `reference`
    entries of the `proportions` table and rounded to the nearest integer.
    """
    ratio = proportions.loc[population, sample] / proportions.loc[reference, sample]
    # int(round(...)) is valid whether round() returns a NumPy scalar or a Python int
    return int(round(n_reference * ratio))


def _draw_selection(obs, n_cells, seed):
    """Randomly pick `n_cells` rows of `obs` and return a one-column frame flagging them."""
    picked = obs.sample(n=n_cells, random_state=seed)
    return pd.DataFrame({'select_cells': 1}, index=picked.index)


def _keep_selected(annotated, selection):
    """Attach the `select_cells` flag to `annotated.obs` and return a copy with flagged cells only."""
    # right merge keeps every existing cell; cells that were not drawn get NaN in 'select_cells'
    annotated.obs = pd.merge(selection, annotated.obs, how='right', left_index=True, right_index=True)
    # .copy() yields a real object rather than a view, so later .obs edits do not trigger implicit copies
    return annotated[annotated.obs['select_cells'] == 1].copy()


for sample in sample_strings:
    data_file = file_base + sample + file_end  # assembles the name of the datafile
    ann_file = ann_base + sample + ann_end

    adata = sc.read_10x_mtx(data_file, var_names='gene_symbols', cache=True)
    ann = pd.read_csv(ann_file, index_col=0)

    # barcodes must match the annotation index, so drop the '-1' suffix
    adata.obs.index = adata.obs.index.map(_strip_barcode_suffix)
    adata.obs = pd.merge(ann, adata.obs, how='right', left_index=True, right_index=True)  # merge the ann and adata.obs
    adata = adata[adata.obs['demux_type'] == 'singlet'].copy()  # filter and keep only singlets
    adata.obs = adata.obs.replace(HSPC_renaming)  # replace with new names

    # per-population count of non-null entries in every obs column
    category_counts = adata.obs.groupby(['assignment']).count()
    category_counts.index.name = None

    # write the file to disk
    out_h5ad = write_path_base + sample + write_path_end
    out_csv = write_path_base + sample + write_path_end_csv

    adata.write(out_h5ad)
    category_counts.to_csv(out_csv)

    # NOTE(review): the reference draw is sized from the 'counts' column while the
    # scaling below uses the 'hto_type' column; both are per-population non-null
    # counts and presumably equal -- confirm.
    n_reference_draw = int(category_counts.loc[reference_population, 'counts'])
    n_reference = category_counts.loc[reference_population, 'hto_type']

    # keep every reference cell, then the scaled number of each other population
    selections = [
        _draw_selection(adata.obs[adata.obs['assignment'] == reference_population], n_reference_draw, random_seed)
    ]
    for population in sample_HSPCS:
        n_cells = _target_cell_number(n_reference, proportions, population, sample, reference_population)
        selections.append(
            _draw_selection(adata.obs[adata.obs['assignment'] == population], n_cells, random_seed)
        )

    # flag the drawn cells in adata.obs and keep only those
    adata = _keep_selected(adata, pd.concat(selections))

    # LT
    data_file_LT = file_base_LT + sample + file_end  # assembles the name of the datafile

    adata_LT = sc.read_10x_mtx(data_file_LT, var_names='gene_symbols', cache=True)

    # the LT matrix is scaled with the 'HSC' row of the proportions table
    number_LT = _target_cell_number(n_reference, proportions, 'HSC', sample, reference_population)

    adata_LT = _keep_selected(adata_LT, _draw_selection(adata_LT.obs, number_LT, random_seed))
    adata_LT.obs['assignment'] = 'LT'

    # path out
    out_h5ad_prop = write_path_base + sample + write_path_end_proportions
    out_h5ad_prop_LT = write_path_base + sample + write_path_end_proportions_LT

    adata.write(out_h5ad_prop)
    adata_LT.write(out_h5ad_prop_LT)


... storing 'hto_type' as categorical
... storing 'rna_type' as categorical
... storing 'demux_type' as categorical
... storing 'assignment' as categorical
... storing 'feature_types' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'assignment' as categorical
... storing 'feature_types' as categorical
... storing 'hto_type' as categorical
... storing 'rna_type' as categorical
... storing 'demux_type' as categorical
... storing 'assignment' as categorical
... storing 'feature_types' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'assignment' as categorical
... storing 'feature_types' as categorical
... storing 'hto_type' as categorical
... storing 'rna_type' as categorical
... storing 'demux_type' as categorical
... storing 'assignment' as categorical
... storing 'feature_types' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'assignment' as categorical
... storing 'feature_types' as categorical
... sto

In [4]:
# Print the package versions used for this run: first scanpy's version summary,
# then pandas' full environment report.
sc.logging.print_versions()
pd.show_versions()

scanpy==1.4.5.1 anndata==0.7.1 umap==0.3.10 numpy==1.17.3 scipy==1.3.0 pandas==0.25.3 scikit-learn==0.21.3 statsmodels==0.10.0 python-igraph==0.7.1 louvain==0.6.1.post1

INSTALLED VERSIONS
------------------
commit           : None
python           : 3.7.3.final.0
python-bits      : 64
OS               : Linux
OS-release       : 4.19.76-linuxkit
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : en_US.UTF-8
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 0.25.3
numpy            : 1.17.3
pytz             : 2019.3
dateutil         : 2.8.1
pip              : 19.3.1
setuptools       : 41.6.0.post20191101
Cython           : 0.29.14
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : 1.2.8
lxml.etree       : None
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 2.10.3
IPython          :