# lasso file preprocessing
    This notebook demonstrates how to:
    1. merge different parts of the same slice and save them in compressed csv format (.gz);
    2. convert bin1 data to bin20, bin50 and bin100 data;
    3. construct an AnnData object suitable for downstream analysis;
    4. visualize raw coordinate images;
    5. perform registration/alignment;
    6. visualize the coordinate image after registration/alignment.

### 1. merge different parts of the same slice and save in compressed csv format (.gz)

In [18]:
import pandas as pd
import stDrosophila as sd
# Paths of the partial lasso files that are merged into one table for slice S30.
part_files = [
    "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/lasso/lasso_L3_b_bin1_raw/L3_b_S30_1.gem.gz",
    "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/lasso/lasso_L3_b_bin1_raw/L3_b_S30_2.gem.gz",
]
# Read every part, stack the rows, then order them by gene, coordinates and count.
concat_data = pd.concat(
    [sd.io.read_lasso(filename=part_file) for part_file in part_files],
    axis=0,
).sort_values(by=["geneID", "x", "y", "MIDCounts"])
# Write the merged table as a gzip-compressed, tab-separated file without the index column.
concat_data.to_csv(
    "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/lasso/lasso_L3_b_bin1/L3_b_S30.gem.gz",
    index=False,
    sep="\t",
    compression="gzip",
)
print(concat_data)

        geneID      x      y  MIDCounts
21       128up  18721  18154          1
15       128up  18729  17018          1
95       128up  18733  17205          1
122      128up  18752  16340          1
175      128up  18753  18228          1
...        ...    ...    ...        ...
6262108    zye  20890  19627          1
6262239    zye  20894  19706          1
6262210    zye  20931  19585          1
6262005    zye  20941  19623          1
6262394    zye  20952  19672          2

[7978494 rows x 4 columns]


### 2. convert bin1 data to bin20, bin50 and bin100 data (the following takes bin20 as an example)

In [23]:
import os
import stDrosophila as sd
# Input folder holding the bin1 lasso files and output folder for the bin20 results.
folder = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/lasso/lasso_L3_b_bin1"
out_folder = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/lasso/lasso_L3_b_bin20"
# Make sure the output folder exists before anything is saved into it.
# NOTE(review): bin1tobinx may create it itself -- harmless either way thanks to exist_ok.
os.makedirs(out_folder, exist_ok=True)

# Collect every file below `folder` in sorted order. Distinct names for the
# os.walk tuple avoid reusing `files` inside the comprehension that defines it.
files = sorted(
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(folder)
    for filename in filenames
)
for file in files:
    bin1_data = sd.io.read_lasso(filename=file)
    # os.path.basename is separator-agnostic, unlike splitting on "/".
    file_name = os.path.basename(file)

    print(f"----{file_name} start!")
    # Convert to bin20 and save under the same file name in the bin20 folder.
    data = sd.pp.bin1tobinx(bin1_data, binx=20,
                            save=os.path.join(out_folder, file_name), save_format="gzip")
    # Release the binned table before the next file is processed.
    del data
    print(f"----{file_name} successfully!")

----L3_b_S24.gem.gz start!
----L3_b_S24.gem.gz successfully!
----L3_b_S25.gem.gz start!
----L3_b_S25.gem.gz successfully!
----L3_b_S26.gem.gz start!
----L3_b_S26.gem.gz successfully!
----L3_b_S27.gem.gz start!
----L3_b_S27.gem.gz successfully!
----L3_b_S28.gem.gz start!
----L3_b_S28.gem.gz successfully!
----L3_b_S29.gem.gz start!
----L3_b_S29.gem.gz successfully!
----L3_b_S30.gem.gz start!
----L3_b_S30.gem.gz successfully!
----L3_b_S31.gem.gz start!
----L3_b_S31.gem.gz successfully!
----L3_b_S32.gem.gz start!
----L3_b_S32.gem.gz successfully!


### 3. construct an AnnData object suitable for downstream analysis (the following takes bin100 as an example)

In [28]:
import os
import stDrosophila as sd
# bin100
folder = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/lasso/lasso_L3_b_bin100"
out_folder = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/raw_adata/L3_b_bin100"
# Make sure the output folder exists before the .h5ad files are written.
os.makedirs(out_folder, exist_ok=True)

# Collect every file below `folder` in sorted order (distinct names avoid
# reusing `files` inside the comprehension that defines it).
files = sorted(
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(folder)
    for filename in filenames
)
# Only the first nine files are processed.
for file in files[:9]:
    data = sd.io.read_lasso(filename=file)
    # os.path.basename is separator-agnostic, unlike splitting on "/".
    file_name = os.path.basename(file)

    print(f"----{file_name} start!")
    # Characters 6-7 of names such as "L3_b_S24.gem.gz" are the slice number;
    # subtracting 24 makes slice S24 the first z layer (z = 0).
    z = int(file_name[6:8]) - 24
    # file_name[:-7] drops the 7-character ".gem.gz" suffix to obtain the slice name.
    adata = sd.io.lasso2adata(data=data, slice=file_name[:-7], binsize=100, z_gap=7, z=z)
    # .iloc[0] selects the first row by position; plain [0] on a non-integer
    # index relies on a deprecated positional fallback in pandas.
    slice_name = adata.obs["slice"].iloc[0]
    adata.write_h5ad(os.path.join(out_folder, f"{slice_name}.h5ad"))
    print(f"----{file_name} successfully!")

----L3_b_S24.gem.gz start!


... storing 'slice' as categorical


----L3_b_S24.gem.gz successfully!
----L3_b_S25.gem.gz start!


... storing 'slice' as categorical


----L3_b_S25.gem.gz successfully!
----L3_b_S26.gem.gz start!


... storing 'slice' as categorical


----L3_b_S26.gem.gz successfully!
----L3_b_S27.gem.gz start!


... storing 'slice' as categorical


----L3_b_S27.gem.gz successfully!
----L3_b_S28.gem.gz start!


... storing 'slice' as categorical


----L3_b_S28.gem.gz successfully!
----L3_b_S29.gem.gz start!


... storing 'slice' as categorical


----L3_b_S29.gem.gz successfully!
----L3_b_S30.gem.gz start!


... storing 'slice' as categorical


----L3_b_S30.gem.gz successfully!
----L3_b_S31.gem.gz start!


... storing 'slice' as categorical


----L3_b_S31.gem.gz successfully!
----L3_b_S32.gem.gz start!


... storing 'slice' as categorical


----L3_b_S32.gem.gz successfully!


### 4. visualize raw coordinate images

In [30]:
import os
import anndata as ad
import stDrosophila as sd
# Folder holding the raw (unaligned) bin100 AnnData files.
folder = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/raw_adata/L3_b_bin100"
# Collect every file below `folder` in sorted order (distinct names avoid
# reusing `files` inside the comprehension that defines it).
files = sorted(
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(folder)
    for filename in filenames
)
# read_h5ad is the explicit reader; `ad.read` is a deprecated alias of it.
adata_list = [ad.read_h5ad(file) for file in files]
# Plot the loaded slices grouped by the "slice" column and save the figure as a PNG.
sd.pl.spatial_plot(adata=adata_list, cluster_col=None, save="/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/raw_adata/L3_b_bin100_raw_coords.png",
                   slice_col="slice", spot_size=1)

<Figure size 432x288 with 0 Axes>

### 5. registration/alignment

In [None]:
# multi-slices Example (slice_alignment_bigBin):
import os
import anndata as ad
import stDrosophila as sd
import torch
## Enter raw anndata data(slices)
folder = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/raw_adata/L3_b_bin20"
# Collect every file below `folder` in sorted order (distinct names avoid
# reusing `files` inside the comprehension that defines it).
files = sorted(
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(folder)
    for filename in filenames
)
# read_h5ad is the explicit reader; `ad.read` is a deprecated alias of it.
slices = [ad.read_h5ad(file) for file in files]

## Enter raw anndata data(slices_big)
folder_big = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/raw_adata/L3_b_bin100"
files_big = sorted(
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(folder_big)
    for filename in filenames
)
slices_big = [ad.read_h5ad(file) for file in files_big]

## Alignment
# NOTE(review): both lists are sorted by path, so bin20 and bin100 slices are
# presumably expected to pair up by position -- confirm against slice_alignment_bigBin.
align_slices, align_slices_big = sd.tl.slice_alignment_bigBin(slices=slices, slices_big=slices_big, alpha=0.1, numItermax=200,
                                                              numItermaxEmd=1000000, device=torch.device("cuda:0"), verbose=True)

## Save the slices after alignment
opath = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/align_adata/L3_b_bin20"
# makedirs also creates the missing parent "align_adata"; os.mkdir would raise there.
os.makedirs(opath, exist_ok=True)
# `aligned` instead of `slice` so the builtin `slice` is not shadowed.
for aligned in align_slices:
    # .iloc[0] selects the first row by position; plain [0] on a non-integer
    # index relies on a deprecated positional fallback in pandas.
    subSave = os.path.join(opath, f"{aligned.obs['slice'].iloc[0]}.h5ad")
    aligned.write_h5ad(subSave)

## Save the slices_big after alignment
opath_big = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/align_adata/L3_b_bin100"
os.makedirs(opath_big, exist_ok=True)
for aligned_big in align_slices_big:
    subSave_big = os.path.join(opath_big, f"{aligned_big.obs['slice'].iloc[0]}.h5ad")
    aligned_big.write_h5ad(subSave_big)


Whether CUDA is currently available: True
Device: NVIDIA GeForce RTX 3060
GPU total memory: 11 GB


 Alignment :   0%|          | 0/57 [00:00<?, ?it/s]

### 6. visualize the coordinate image after registration/alignment

In [None]:
import os
import anndata as ad
import stDrosophila as sd
# Folder holding the aligned bin20 AnnData files.
folder = "/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/align_adata/L3_b_bin20"
# Collect every file below `folder` in sorted order (distinct names avoid
# reusing `files` inside the comprehension that defines it).
files = sorted(
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(folder)
    for filename in filenames
)
# read_h5ad is the explicit reader; `ad.read` is a deprecated alias of it.
adata_list = [ad.read_h5ad(file) for file in files]
# Save the figure next to the aligned data under an "align" name. The previous
# target (raw_adata/L3_b_bin20_raw_coords.png) labelled the aligned plot as raw.
sd.pl.spatial_plot(adata=adata_list, cluster_col=None, save="/media/yao/Elements SE/BGI_Paper/L3_new/L3_b/align_adata/L3_b_bin20_align_coords.png",
                   slice_col="slice", spot_size=1)