# Converting the LARRY dataset to Scanpy format

First, we download all the files from [Experiment 1 from Weinreb 2020 on the KleinLab paper data repository](https://github.com/AllonKleinLab/paper-data/tree/master/Lineage_tracing_on_transcriptional_landscapes_links_state_to_fate_during_differentiation).

Then we'll get this into the anndata format.

In [1]:
import scanpy as sc # reinstall numba if you run into issues 
import pandas as pd
import numpy as np



In [2]:
%%time

counts = sc.read_mtx('ltseq/stateFate_inVitro_normed_counts.mtx')

CPU times: user 2min 50s, sys: 8.3 s, total: 2min 59s
Wall time: 2min 53s


In [3]:
meta = pd.read_csv('ltseq/stateFate_inVitro_metadata.txt', sep='\t')
meta.head(3)

Unnamed: 0,Library,Cell barcode,Time point,Starting population,Cell type annotation,Well,SPRING-x,SPRING-y
0,d6_2_2,GCGTGCAA-AGAAGTTA,6.0,Lin-Kit+Sca1-,Undifferentiated,2,411.496,-96.19
1,d6_2_2,AAGGGACC-CTCGATGC,6.0,Lin-Kit+Sca1-,Undifferentiated,2,-587.462,-306.925
2,d6_2_2,CGTACCGA-AGCGCCTT,6.0,Lin-Kit+Sca1-,Monocyte,2,1429.805,-429.3


In [4]:
gene_names = pd.read_csv('ltseq/stateFate_inVitro_gene_names.txt', 
                         header=None, sep='\t')
gene_names.head(3)

Unnamed: 0,0
0,0610006L08Rik
1,0610007P14Rik
2,0610009B22Rik


The clones come in a matrix format (binary, 1 at the index of the clone id), so we'll convert this into an interpretable integer format clone id.

In [5]:
clones = sc.read_mtx('ltseq/stateFate_inVitro_clone_matrix.mtx')

In [6]:
%%time
clone_ids = [np.argmax(clones.X[i, :]) for i in range(len(clones))]

CPU times: user 8.63 s, sys: 4.67 ms, total: 8.63 s
Wall time: 8.62 s


In [7]:
neutrophil_pt = np.zeros(len(counts)) - 1
neutrophil_pt

array([-1., -1., -1., ..., -1., -1., -1.])

In [8]:
pt = pd.read_csv('ltseq/stateFate_inVitro_neutrophil_pseudotime.txt', 
                 index_col=0, header=0, sep='\t')
for i in pt.index:
    neutrophil_pt[i] = pt.loc[i]
    
neutrophil_pt

array([ 2.3849e+04, -1.0000e+00, -1.0000e+00, ..., -1.0000e+00,
       -1.0000e+00,  1.4090e+03])

Now we'll add all the data to our AnnData object.

In [9]:
counts.var_names = [x.upper() for x in gene_names[0].values]

In [10]:
counts.obs['clone'] = clone_ids
counts.obs['time'] = meta['Time point'].values
counts.obs['well'] = meta['Well'].values
counts.obs['type'] = meta['Cell type annotation'].values
counts.obs['SPRING1'] = meta['SPRING-x'].values
counts.obs['SPRING2'] = meta['SPRING-y'].values
counts.obs['Neutrophil PT'] = neutrophil_pt

In [11]:
with_clone = counts[counts.obs['clone'] != 0]

In [12]:
sc.pp.normalize_total(with_clone, target_sum=1e4)
sc.pp.log1p(with_clone)

sc.pp.highly_variable_genes(with_clone, n_top_genes=10**3)
rna_feats = with_clone[:, with_clone.var.highly_variable == True]

sc.pp.scale(rna_feats, max_value=10)
sc.pp.scale(with_clone, max_value=10)

  view_to_actual(adata)
  view_to_actual(adata)


In [13]:
# with_clone.write("LARRY_WT_preprocessed.h5ad")
rna_feats.write('LARRY_HVGs_preprocessed.h5ad')