In [None]:
from platython import util as pu
import pandas as pd

# Mark the start of the notebook run.
# NOTE(review): `timestamp` comes from platython.util; presumably it reports the
# current date/time — confirm against that module.
pu.timestamp()

# SAMap preparation

SAMap requires a sequence alignment in BLAST format between the two species to be compared. The IDs in this file must correspond with the index column of the `.var` slot of the single-cell objects. We should have one isoform per gene. This means that we only need to harmonize the gene names between the alignment file and the lookup that SAMap needs.

In [None]:
%%bash --out ./mmseqs.out --err ./mmseqs.err
# Reciprocal MMseqs2 protein search between C. gigas (cgig) and P. crozieri (pcro).
# Produces one BLAST-tab (.m8) file per direction inside ${out_dir}.

# Stop at the first failing command or unset variable. Without this, a failed
# `cd` would let mmseqs write its results into the notebook's working directory,
# and a failure of the first search would be masked by the second one succeeding.
set -euo pipefail

base="/Users/npapadop/Documents/projects/acanthochitona/2023-04_chiton_wrapup"
cgig="raw/cgig/Crassostrea_gigas_longest_pep_noprotid.fa"
pcro="raw/pcro/Pcro_longest_prot_no_t.aa"
out_dir="/Users/npapadop/Documents/teaching/2023/advanced_scRNAseq/data/aln/m8/"

# Quote every expansion so paths containing spaces do not get word-split.
mkdir -p "${out_dir}"
cd "${out_dir}"

# easy-search arguments: <query fasta> <target fasta> <result file> <tmp dir>
mmseqs easy-search "${base}/${cgig}" "${base}/${pcro}" cgig_pcro.m8 tmp
mmseqs easy-search "${base}/${pcro}" "${base}/${cgig}" pcro_cgig.m8 tmp

In [None]:
# Mark the point at which the MMseqs2 searches have finished.
# NOTE(review): `timestamp` comes from platython.util; presumably it reports the
# current date/time — confirm against that module.
pu.timestamp()

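Do the forward comparison (query: pcro, *P. crozieri* – target: cgig, *Crassostrea gigas*), and remember that SAMap expects no header row in the alignment file. The reverse comparison (query: cgig – target: pcro) follows in the cell after that.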

In [None]:
# Root of the data directory for this notebook. The MMseqs2 results are read from
# `aln/m8/` below it and the cleaned alignment tables are written directly into it.
# Note: this is NOT the same directory as the `base` shell variable in the %%bash
# cell, which points at the project holding the raw protein FASTA files.
base = "/Users/npapadop/Documents/teaching/2023/advanced_scRNAseq/data/"

In [None]:
# Forward direction: P. crozieri proteins (query) against C. gigas proteins (target).
alignment = pd.read_csv(base + "aln/m8/pcro_cgig.m8", sep="\t", header=None)
# Column labels in the default MMseqs2 / BLAST-tab (m8) field order:
# query, target, identity, alignment length, mismatches, gap openings,
# query start, query end, target start, target end, e-value, bit score.
alignment.columns = ["query", "target", "seq_id", "ali_len", "no_mism", "no_go", "q_start", "q_end", "t_start", "t_end", "eval", "bit"]
# Remove low-quality hits, since we aren't keeping those anyway.
# `.copy()` gives an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
alignment = alignment[alignment["eval"] < 1e-6].copy()
# Keep only the part of each sequence ID before the first "|" and prepend the
# species tag. NOTE(review): the prefixed IDs are presumably what the
# single-cell objects use as their `.var` index — confirm.
alignment["query"] = "pcro_" + alignment["query"].str.split("|").str[0]
alignment["target"] = "cgig_" + alignment["target"].str.split("|").str[0]

# After sorting by e-value, dropping duplicates keeps the lowest-e-value hit
# for every (query, target) pair.
alignment = alignment.sort_values("eval").drop_duplicates(["query", "target"])
# Written without header or index; SAMap expects no headers to be present.
alignment.to_csv(base + "pcrozieri_to_cgigas.txt", sep="\t", index=False, header=False)

In [None]:
# Reverse direction: C. gigas proteins (query) against P. crozieri proteins (target).
alignment = pd.read_csv(base + "aln/m8/cgig_pcro.m8", sep="\t", header=None)
# Column labels in the default MMseqs2 / BLAST-tab (m8) field order:
# query, target, identity, alignment length, mismatches, gap openings,
# query start, query end, target start, target end, e-value, bit score.
alignment.columns = ["query", "target", "seq_id", "ali_len", "no_mism", "no_go", "q_start", "q_end", "t_start", "t_end", "eval", "bit"]
# Remove low-quality hits, since we aren't keeping those anyway.
# `.copy()` gives an independent frame, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
alignment = alignment[alignment["eval"] < 1e-6].copy()
# Keep only the part of each sequence ID before the first "|" and prepend the
# species tag. NOTE(review): the prefixed IDs are presumably what the
# single-cell objects use as their `.var` index — confirm.
alignment["target"] = "pcro_" + alignment["target"].str.split("|").str[0]
alignment["query"] = "cgig_" + alignment["query"].str.split("|").str[0]

# After sorting by e-value, dropping duplicates keeps the lowest-e-value hit
# for every (query, target) pair.
alignment = alignment.sort_values("eval").drop_duplicates(["query", "target"])

# Written without header or index; SAMap expects no headers to be present.
alignment.to_csv(base + "cgigas_to_pcrozieri.txt", sep="\t", index=False, header=False)

In [None]:
# Mark the point at which both alignment tables have been written.
# NOTE(review): `timestamp` comes from platython.util; presumably it reports the
# current date/time — confirm against that module.
pu.timestamp()

In [None]:
# Provenance: show the "Version" line from the output of a bare `mmseqs` call,
# so the MMseqs2 release used for the searches is recorded in the notebook.
!mmseqs | grep Version