In [50]:
# basci env
import os
from pathlib import Path
import pandas as pd
import time
import json
from tqdm import tqdm

# data process of file from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# dir
# DATASET_DIR = Path('/mnt/f/spatial_data/probe')
DATASET_DIR = Path('./probe')
RUNID = 'example_dataset'
workdir = DATASET_DIR / RUNID
os.makedirs(workdir, exist_ok=True)
organism = 'mouse'

In [51]:
# create results dir
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)
output = os.path.join(workdir, 'results', formatted_time+'_ensembl')
bds_candidate_dir = os.path.join(output, "bds_candidate")
os.makedirs(output, exist_ok=True)
os.makedirs(bds_candidate_dir, exist_ok=True)

# file name variables
bds_candidate_file_suffix = "_bds_candidate.fasta"
combined_bds_candidates_file = "total_bds_candidate.fasta"
combined_bds_candidates_blast_file = 'total_bds_candidate_blast.fasta'
bds_candidate_num_file = "bds_candidate_num.json"
blast_results_file = "blast_results.xml"

In [52]:
# gene_info = pd.read_excel(os.path.join(workdir, "gene_info.xlsx"))
gene_info = pd.read_table(os.path.join(workdir, "gene_list_JC.txt"))
gene_info['gene'] = gene_info['gene_name']
if organism == 'mouse': gene_info['gene'] = gene_info['gene'].str.capitalize()
elif organism == 'human': gene_info['gene'] = gene_info['gene'].str.upper()
gene_list = [_.strip() for _ in gene_info['gene'].unique() if _!=0]
print(len(gene_list))
gene_info.head()

3


Unnamed: 0,gene_name,gene
0,Gapdh,Gapdh
1,Actb,Actb
2,Hprt1,Hprt1


In [4]:
from lib.database_interaction import ensembl_name_to_seqs
import time
max_trial = 3
sequences_of_all = dict()
error_messages = {gene: [] for gene in gene_list}

with tqdm(total=len(gene_list), desc="Retriving_sequences", position=0) as pbar_total:
    for gene in gene_list:
        sequences_of_all[gene] = {}
        trial_success = False
        
        # Reset the trial progress bar for each gene
        for trial in range(1, max_trial+1):  # Retrying up to 3 times
            try:
                # Attempt to retrieve sequences
                sequences_of_all[gene] = ensembl_name_to_seqs(gene=gene, species=organism, seq_type='cds', tqdm_args={'position': 1,'leave': False})
                trial_success = True
                break
            except Exception as e:
                time.sleep(1)

        if not trial_success:
            error_messages[gene].append(f"Failed to retrieve sequences for {gene} after {max_trial} attempts.")

        pbar_total.update(1)  # Update the main progress bar after each gene

for gene, messages in error_messages.items():
    for message in messages:
        print(message)

with open(os.path.join(output, 'sequence_of_all.json'), 'w') as file: json.dump(sequences_of_all, file)

Retriving_sequences:   0%|          | 0/3 [00:00<?, ?it/s]
Gapdh:   0%|          | 0/20 [00:00<?, ?it/s][A
Gapdh:   5%|▌         | 1/20 [00:00<00:07,  2.58it/s][A
Gapdh:  10%|█         | 2/20 [00:00<00:06,  2.73it/s][A
Gapdh:  15%|█▌        | 3/20 [00:01<00:06,  2.67it/s][A
Gapdh:  20%|██        | 4/20 [00:01<00:06,  2.64it/s][A
Gapdh:  25%|██▌       | 5/20 [00:01<00:05,  2.57it/s][A
Gapdh:  30%|███       | 6/20 [00:03<00:08,  1.59it/s][A
Gapdh:  35%|███▌      | 7/20 [00:03<00:07,  1.76it/s][A
Gapdh:  40%|████      | 8/20 [00:03<00:06,  1.99it/s][A
Gapdh:  45%|████▌     | 9/20 [00:04<00:05,  2.08it/s][A
Gapdh:  50%|█████     | 10/20 [00:04<00:04,  2.03it/s][A
Gapdh:  55%|█████▌    | 11/20 [00:05<00:04,  2.02it/s][A
Gapdh:  60%|██████    | 12/20 [00:05<00:04,  1.99it/s][A
Gapdh:  65%|██████▌   | 13/20 [00:06<00:03,  2.04it/s][A
Gapdh:  70%|███████   | 14/20 [00:06<00:02,  2.20it/s][A
Gapdh:  75%|███████▌  | 15/20 [00:07<00:02,  2.31it/s][A
Gapdh:  80%|████████  | 16/20 [

In [5]:
with open(os.path.join(output, 'sequence_of_all.json'), 'r') as file: sequences_of_all = json.load(file)

isoforms = {}
for gene, isoforms_tmp in sequences_of_all.items():
    shortest_isoform = None
    min_length = 10**6
    max_length = 0
    for isoform in isoforms_tmp:
        try: isoform_name = isoform['external_name']
        except: continue
        if len(isoform['seq']) < min_length and gene.upper() in isoform_name.upper():
            min_length = len(isoform['seq'])
            shortest_isoform = isoform
    if shortest_isoform:
        isoforms[gene] = shortest_isoform
with open(os.path.join(output, 'shortest_isoforms.json'), 'w') as file: json.dump(isoforms, file)

In [None]:
# 查找绑定位点

In [6]:
%reload_ext autoreload
%autoreload 2

# set lib auto reload in jupyter notebook
from lib.search_binding import position_search, optimize_subsequence, seq_minus

# Initiation of array
binding_site_entry = [
    "accession", "gene_name", "mol_type", "organism",
    "pos", "plp_bds", "plp_Tm","plp_bds3'", "plp_bds5'", "plp_Tm3'", "plp_Tm5'", "mfe", "wanted"] ## 构建表头
alignment_entry = ["align_num", "align_accession", "align_descrip", "plus/minus"]
BDS_INFO = pd.DataFrame(columns=binding_site_entry+alignment_entry)

# Search binding sites on mRNA sequence
pre_binding_num = {}

# initialization of file
with open(os.path.join(output, combined_bds_candidates_file), "w") as handle: handle.write("")
with open(os.path.join(output, combined_bds_candidates_blast_file), "w") as f: f.write("")

for desc, info in tqdm(isoforms.items(), desc="Searching_binding_sites", position=0):
    seq = info['seq']
    if 'N' in seq: seq = seq.replace('N', '')
    try: gene_name= info['external_name']
    except: gene_name = desc

    id = info['id']
    mol_type = info['biotype']
    
    pos_info = position_search(
        seq, gene=gene_name,
        BDS_len=40, BDS_num=100, min_gap=0, better_gap=40, pin_gap=0.05, 
        G_min=0.25, G_max=0.7, G_consecutive=5, Tm_low=48, Tm_high=60, 
        verbose_pos=1, leave=False, warn=False) #设置筛选条件
    
    record_list = []
    for i, pre_binding_tmp in enumerate([_['plp_bds'] for _ in pos_info]):
        record_list.append(
            SeqRecord(Seq(pre_binding_tmp), id="bds_candidate" + str(i), 
                      description="|".join([id, gene_name, organism, mol_type])))

    # add information about binding sites to FOI
    add = pd.DataFrame(pos_info)
    add['accession'] = id
    add['gene_name'] = gene_name
    add['mol_type'] = mol_type
    add['organism'] = organism
    BDS_INFO = pd.concat([BDS_INFO, add], ignore_index=True)

    file_out = os.path.join(bds_candidate_dir, gene_name + bds_candidate_file_suffix)
    
    # write pre_binding to files
    with open(file_out, "w") as f:
        for new_record in record_list: SeqIO.write(new_record, f, "fasta")
    with open(os.path.join(output, combined_bds_candidates_file), "a") as handle:
        for new_record in record_list: SeqIO.write(new_record, handle, "fasta")
    with open(os.path.join(output, combined_bds_candidates_blast_file), "a") as handle:
        for new_record in record_list: 
            blast_seq = str(new_record.seq)
            blast_seq = blast_seq[len(blast_seq)//2-16:len(blast_seq)//2+16] #提取中心32 nt生成BLAST专用文件
            new_record = SeqRecord(Seq(blast_seq), id=new_record.id, description=new_record.description)
            SeqIO.write(new_record, handle, "fasta")

    # record the num of pre_binding for each gene
    pre_binding_num[f"{id}_{gene_name}"] = len(pos_info)

with open(os.path.join(output, bds_candidate_num_file), "w") as f: json.dump(pre_binding_num, f)

Searching_binding_sites:   0%|          | 0/3 [00:00<?, ?it/s]
Gapdh-207:   0%|          | 0/109 [00:00<?, ?it/s][A
  BDS_INFO = pd.concat([BDS_INFO, add], ignore_index=True)
Searching_binding_sites:  33%|███▎      | 1/3 [00:00<00:00,  3.06it/s]
Actb-203:   0%|          | 0/252 [00:00<?, ?it/s][A
                                                 [A
Hprt1-201:   0%|          | 0/553 [00:00<?, ?it/s][A
Searching_binding_sites: 100%|██████████| 3/3 [00:00<00:00,  6.83it/s]


In [None]:
##需要导出fasta文件，用blast网站处理：
##https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&BLAST_SPEC=&LINK_LOC=blasttab&LAST_PAGE=tblastn
##输出后修改文件名为blast_results.xml

In [8]:
# Extract interested information from blast_results 提取blast的结果
from Bio.Blast import NCBIXML
align_num = []
# read the id/plus-minus part/align_num
with open(os.path.join(output, blast_results_file), "r") as blast_output:
    blast_records = NCBIXML.parse(blast_output)
    loca = 0
    for blast_record in blast_records:
        align_accession = []
        align_descrip_list = []
        # get align num of each binding site
        length = len(blast_record.alignments)
        align_num.append(length)
        for i in range(length):
            descrip = blast_record.descriptions[i].title.split("|")
            # get accession and descrip of each align seq
            align_accession.append(descrip[3])
            align_descrip_list.append(descrip[-1])
        BDS_INFO.loc[loca, "align_accession"] = "|".join(str(_) for _ in align_accession)
        # add align_descrip to df
        BDS_INFO.loc[loca, "align_descrip"] = "|".join(str(_) for _ in align_descrip_list)
        # get plus/minus of each align seq
        p_m = [blast_record.alignments[_].hsps[0].frame[1] for _ in range(length)]
        # add plus/minus to df
        try: BDS_INFO.loc[loca, "plus/minus"] = ",".join([str(_) for _ in p_m])
        except: BDS_INFO.loc[loca, "plus/minus"] = pd.NA
        loca += 1
BDS_INFO["align_num"] = align_num

In [13]:
## 选择结合位点
import re

def adjust_gene_name(gene_name, gene_list):
    gene_list = [x.upper() for x in gene_list]
    match = re.search(r'(.+)-(\d+)$', gene_name)
    if match:
        base_gene_name = match.group(1)
        if base_gene_name.upper() in gene_list or gene_name.upper() in gene_list: return base_gene_name
        else: return gene_name
    else: return gene_name
## 原文输出probes_wanted.xlsx文档 因为.xlsx的文档在jupyter lab中很难打开
## 于是尝试修改输出csv文件,成功
BDS_INFO["wanted"] = [True] * len(BDS_INFO)
verbose = True
# select by specifity
gene_name_list = [_.upper() for _ in gene_list]
gene_name_list_out = [i for i in gene_name_list]
for i in range(len(BDS_INFO)):
    # check gene_name
    gene_name = adjust_gene_name(BDS_INFO.loc[i, "gene_name"], gene_name_list)
    spe_ori= BDS_INFO.loc[i, "organism"]
    if gene_name.upper() not in gene_name_list: 
        BDS_INFO.loc[i, "wanted"] = False
        if verbose: print(f"{gene_name} not in gene list.")
    else:
        try: gene_name_list_out.remove(gene_name)
        except: pass

    # check DNA or mRNA type
    if BDS_INFO.loc[i, "wanted"] == True:
        if BDS_INFO.loc[i, "mol_type"] != "protein_coding":
            # BDS_INFO.loc[i, "wanted"] = False
            if verbose: print("{} is {}.".format(gene_name, BDS_INFO.loc[i, "mol_type"]))

    # check gene_organism name
    if BDS_INFO.loc[i, "wanted"] == True:
        descrip = BDS_INFO.loc[i, "align_descrip"]
        if pd.isnull(descrip):
            BDS_INFO.loc[i, "wanted"] = False
            if verbose: print(f"{gene_name} not found in BLAST.")
        else:
            descrip = descrip.split("|")
            for des in descrip:
                if gene_name not in des and spe_ori in des:
                    BDS_INFO.loc[i, "wanted"] = False
                    if verbose: print(f"{gene_name} not specific.")
                    break

    # check plus/minus
    if BDS_INFO.loc[i, "wanted"] == True:
        pm_list = BDS_INFO.loc[i, "plus/minus"].split(",")
        if "-1" not in pm_list:
            BDS_INFO.loc[i, "wanted"] = False
            if verbose: print(f"{gene_name} not plus/minus.")

# write the whole information of interest to a excel file in tmp dir
BDS_INFO.to_csv(os.path.join(output, "probes_candidates.csv"))

out_tmp = BDS_INFO[BDS_INFO["wanted"] == True]
output_df = pd.DataFrame()
for gene in out_tmp.gene_name.unique():
    pos_wanted = list(out_tmp[out_tmp.gene_name == gene]["pos"])
    pos_best = optimize_subsequence(pos_wanted, length=8, min_gap=40, better_gap=80, gene=gene)
    pos_output = out_tmp[out_tmp.gene_name == gene]
    pos_output = pos_output[pos_output["pos"].isin(pos_best)]
    output_df = pd.concat([output_df, pos_output])

# write the output to a xlsx file
output_df.to_csv(os.path.join(output, "probes_wanted.csv"))

Gene Gapdh-207: Not enough pos for 8 binding sites.
Gene Gapdh-207: condition too harsh, loose to get better results
[27, 28, 31, 38, 39, 46, 47, 51, 55, 75]
Gene Actb-203: Not enough pos for 8 binding sites.
Gene Actb-203: condition too harsh, loose to get better results
[59, 60, 112, 120, 124, 125, 126, 127, 128, 129, 132, 236, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 254, 257, 258, 259]
Gene Hprt1-201: Not enough pos for 8 binding sites.
Gene Hprt1-201: condition too harsh, loose to get better results
[112, 113, 114, 115, 127, 129, 130, 131, 132, 133, 134, 146, 147, 148, 149, 296, 447, 450, 453, 454, 457, 458, 459, 460, 470, 473, 474, 475, 476, 477, 478, 479, 480, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 499, 500]


In [None]:
# 经过上述步骤 已经生成想要的probes 存放在probes_wanted.csv文件中
# 需要把输出的probes_wanted.csv放在example_dataset文件夹下面

In [15]:
resultsdir = os.path.join(workdir, 'results')
print(resultsdir)

probe/example_dataset/results


In [16]:
# 去掉重复的probe
resultsdir = os.path.join(workdir, 'results')
def adjust_gene_name(gene_name, gene_list):
    gene_list = [x.upper() for x in gene_list]
    match = re.search(r'(.+)-(\d+)$', gene_name)
    if match:
        base_gene_name = match.group(1)
        if base_gene_name.upper() in gene_list or gene_name.upper() in gene_list: return base_gene_name
        else: return gene_name
    else: return gene_name


result = pd.DataFrame()
for dir in os.listdir(os.path.join(resultsdir)):
    try: result = pd.concat([result, pd.read_csv(os.path.join(resultsdir, dir, "probes_wanted.csv"), index_col=0)])
    except: continue
result["gene_name"] = [adjust_gene_name(_, gene_list) for _ in result["gene_name"]]
result = result.sort_values(["gene_name", "pos"])
# result.drop_duplicates(subset=["bds"], keep="first", inplace=True)
result.drop_duplicates(subset=["plp_bds"], keep="first", inplace=True)
result.head()

Unnamed: 0,accession,gene_name,mol_type,organism,pos,plp_bds,plp_Tm,plp_bds3',plp_bds5',plp_Tm3',plp_Tm5',mfe,wanted,align_num,align_accession,align_descrip,plus/minus
10,ENSMUST00000163829,Actb,protein_coding,mouse,59,GGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAAG,72.43,GGAGGGGAATACAGCCCGGG,GAGCATCGTCGCCCGCGAAG,57.44,57.81,-8.4,True,43,AK166349.1|AK152844.1|AK078935.1|AK150866.1|NG...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1..."
11,ENSMUST00000163829,Actb,protein_coding,mouse,60,TGGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAA,72.24,TGGAGGGGAATACAGCCCGG,GGAGCATCGTCGCCCGCGAA,55.15,59.78,-8.4,True,43,AK166349.1|AK152844.1|AK078935.1|AK150866.1|NG...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1..."
12,ENSMUST00000163829,Actb,protein_coding,mouse,112,TCCTTCTGACCCATTCCCACCATCACACCCTGGTGCCTAG,75.4,TCCTTCTGACCCATTCCCAC,CATCACACCCTGGTGCCTAG,59.8,58.05,-7.9,True,96,AK166349.1|AK152844.1|AK150662.1|AK078935.1|AK...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-..."
13,ENSMUST00000163829,Actb,protein_coding,mouse,120,CATAGGAGTCCTTCTGACCCATTCCCACCATCACACCCTG,72.19,CATAGGAGTCCTTCTGACCC,ATTCCCACCATCACACCCTG,55.55,58.1,-6.7,True,55,AK166349.1|AK152844.1|AK150662.1|AK078935.1|AK...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-..."
14,ENSMUST00000163829,Actb,protein_coding,mouse,124,CCCACATAGGAGTCCTTCTGACCCATTCCCACCATCACAC,72.15,CCCACATAGGAGTCCTTCTG,ACCCATTCCCACCATCACAC,55.53,58.04,-6.7,True,47,AK166349.1|AK152844.1|AK150662.1|AK078935.1|AK...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-..."


In [20]:
## 注意查看目录，不然结果输出了也不知道放在哪里
workdir

PosixPath('probe/example_dataset')

In [17]:
## 保存结果
result.to_csv(os.path.join(workdir, 'gene_binding_site.csv'))

result["gene_name"] = result["gene_name"].str.upper()
to_search = [gene for gene in gene_list if len(result[result["gene_name"] == gene.upper()]) < 1]
with open(os.path.join(workdir, "to_search.txt"), "w") as f:
    for line in to_search: f.write(line + "\n")

In [None]:
#
# 到目前为止已经获取了gene_binding_site 后续可能需要将这个文件和barcode联合起来
# 
# 下一步就是使用随机数，获取barcode了

In [18]:
## 导入模块和作者自己设置的函数
from seqfold import dg, fold, dot_bracket
import random

from Bio.Blast import NCBIXML


def random_seq_list(length=20, num=50):
    nucleotides = ['A', 'T', 'C', 'G']
    sequence = [''.join(random.choice(nucleotides) for _ in range(length)) for j in range(num)]
    return sequence


def hum_dis(seq1, seq2):
    if len(seq1) != len(seq2):
        print("seq_not_match")
    else:
        cont = 0
        for char in range(len(seq1)):
            if seq1[char] != seq2[char]:
                cont += 1
        return cont


def dna_sec_struct(seq, temp=45):
    # Predict the minimum free energy
    mfe = dg(seq, temp=temp)
    # `fold` returns a list of `seqfold.Struct` from the minimum free energy structure
    structs = fold(seq, temp=temp)
    return mfe, structs


def thre_by_blast(file="./JXAKR9US016-Alignment.xml", thre=18):
    pos = []
    with open(file, "r") as blast_output:
        blast_records = NCBIXML.parse(blast_output)
        for blast_record in blast_records:
            save = True
            for alignment in blast_record.alignments:
                # print("Alignment title:", alignment.title)
                # print("Length of the alignment:", alignment.length)

                # # Iterate over the high-scoring pairs (HSPs) in the alignment
                for hsp in alignment.hsps:
                    # print("HSP score:", hsp.score)
                    if hsp.score >= thre:
                        pos.append(False)
                        save = False
                        break
                    # print("HSP bits:", hsp.bits)
                    # print("HSP query sequence:", hsp.query)
                    # print("HSP match sequence:", hsp.match)
                    # print("HSP subject sequence:", hsp.sbjct)
            if save:
                pos.append(True)
    return pos

In [None]:
## 使用上面的函数，生成符合条件的序列

In [None]:
## 注意下面生成随机序列 可以重复 注意文件保存

In [None]:
## 注意下面生成随机序列 可以重复 注意文件保存

In [None]:
## 注意下面生成随机序列 可以重复 注意文件保存

In [19]:
seq_list = random_seq_list()

seq_list_export = []
for seq in seq_list:
    seq = seq.upper()

    # GGGGG
    if "GGGGG" in seq:
        print(f"{seq}: \tthre_by_G")
        continue

    # dif
    dif = True
    for tmp_seq in seq_list_export:
        if hum_dis(tmp_seq, seq) < 10:
            dif = False
            print(f"{seq}: \tthre_by_dif")
    if not dif:
        continue

    # secondary structure
    mfe, structs = dna_sec_struct(seq, temp=45)
    if mfe < 0:
        print(f"{seq}: \tthre_by_stru\t", dot_bracket(seq, structs))
        continue

    seq_list_export.append(seq)

print(f'{len(seq_list_export)} seqs remained')

TCCCACATGGAGTCCCAACT: 	thre_by_stru	 (((.....))).........
GCCTCCTAGCAGGGATGCTG: 	thre_by_stru	 ((.(((......))).))..
AGGACTCGTAGACCGACATT: 	thre_by_stru	 .((.((...)).))......
ATGCCCATAGGTCAGTAAAG: 	thre_by_stru	 .((.((...)).))......
CCTGTCGGTGTTAACAGGAT: 	thre_by_stru	 (((((........)))))..
TCAATTGGGGGCGCGATGAT: 	thre_by_G
TCCAGTTGTTTTGGGACGAA: 	thre_by_dif
TCATATGTGCAAACAAGTGC: 	thre_by_dif
GCTTCAGGCAAATCGCGGCC: 	thre_by_stru	 ......(((........)))
CTCTGCGTTGCAGACGAACA: 	thre_by_stru	 .(((((...)))))......
ACATCCGATTAGCATCGTCC: 	thre_by_stru	 .....((((....))))...
TCTAACTGTCCGCAGACACC: 	thre_by_stru	 ......((((....))))..
ACTTATATGGGGATACCAGA: 	thre_by_stru	 .........((....))...
37 seqs remained


In [21]:
import os
wd=os.getcwd()
print(wd)

/home/data/t180501/1_WeChat/RNA_FISH


In [20]:
## 保存文件,注意查看文件的输出位置
with open("./random_seq_filtered.txt", "w") as f:
    for _ in range(len(seq_list_export)):
        f.write(f'>seq{_}\n' + seq_list_export[_] + "\n")

In [None]:
## 这个数据需要再次传递到ncbi 输出的xml文件有一项评分 需要筛选评分比较低的
## https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=tblastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome
## 这次就不用修改xml的文件名直接读取就好

In [28]:
## 这个步骤主要是选出那些匹配程度较低的序列
## 这样特意设计的序列估计是不不容易自己和自己形成环
## 注意阈值的设置
# pos = thre_by_blast(file='./1KDHC5BG013-Alignment.xml', thre=18)
pos = thre_by_blast(file='./1KDHC5BG013-Alignment.xml', thre=20)
sum(pos)

26

In [30]:
## 保存筛选之后的随机序列 
with open("./random_seq_filtered.txt", "w") as f:
    cont = 0
    for _ in range(len(seq_list_export)):
        if pos[_]:
            f.write(f">seq{cont}\n" + seq_list_export[_] + "\n")
            cont += 1
## 看一下随机序列
tmp=pd.read_table('./random_seq_filtered.txt')
# print(tmp)

In [31]:
## 根据随机序列生成barcode序列 
## 设置运行环境
## 设置函数
import pandas as pd

def hum_dis(seq1, seq2):
    seq1 = seq1.upper()
    seq2 = seq2.upper()
    if len(seq1) != len(seq2):
        # print("seq_not_match")
        # print(seq1, seq2)
        return -1
    else:
        cont = 0
        for char in range(len(seq1)):
            if seq1[char] != seq2[char]:
                cont += 1
        return cont
    

def create_seq_lib(
    seq_list,
    color_fraction={
        "Red": [_ / 4 for _ in range(5)],
        "Green": [_ / 2 for _ in range(3)],
        "Blue": [_ / 4 for _ in range(5)],
        "Yellow": [_ / 4 for _ in range(5)],
    },
):
    seq_lib = pd.DataFrame(columns=["seq", "color", "grade", "fraction"])
    color_list = []
    for color in color_fraction.keys():
        color_list += [color] * len(color_fraction[color])
    seq_lib["color"] = color_list

    if len(seq_lib["color"]) > len(seq_list):
        print("Seq Not Enough")
        return ValueError

    seq_lib["seq"] = seq_list[: len(seq_lib["color"])]

    fra = []
    grade = []
    for color in color_fraction.keys():
        fra += color_fraction[color]
        grade += [_ for _ in range(len(color_fraction[color]))]
    seq_lib["fraction"] = fra
    seq_lib["grade"] = grade

    return seq_lib


import itertools

def create_barcode_lib(
    seq_lib,
    color_order=["Green", "Red", "Blue", "Yellow"],
    sum_num=5,
    sum_list=["Red", "Yellow", "Blue"],
):
    color_order_seq = [_ + "seq" for _ in color_order]
    barcode_lib = pd.DataFrame(columns=["barcode"] + color_order + color_order_seq)

    grade_list = {
        color: list(seq_lib[seq_lib.color == color].grade.unique())
        for color in color_order
    }
    barcode_lib[color_order] = list(itertools.product(*grade_list.values()))

    for i in range(len(barcode_lib)):
        grade = barcode_lib.loc[i, color_order]

        barcode_sub_list = [
            list(
                seq_lib[
                    (seq_lib.color == color_order[_]) & (seq_lib.grade == grade[_])
                ].seq
            )[0]
            for _ in range(len(color_order))
        ]
        barcode_lib.loc[i, color_order_seq] = barcode_sub_list
        barcode_lib.loc[i, "barcode"] = "".join(barcode_sub_list)

    if sum_num:
        barcode_lib["sum"] = barcode_lib[sum_list].sum(axis=1)
        barcode_lib = barcode_lib[
            # (barcode_lib["sum"] >= 1) 
            # &
            (barcode_lib["sum"] == sum_num)
        ]
        barcode_lib.set_index('barcode', inplace=True)

    return barcode_lib

In [32]:
## 读取上个random_seq_filtered.txt文件
file = './random_seq_filtered.txt'
with open(file, "r") as f:
    seq_list = f.readlines()
seq_list = [seq_list[_].replace('\n','') for _ in range(1,len(seq_list),2)]
print(seq_list)

['GGGGTTGAATGGGTAGATTC', 'CACTCTATCCACGGTTAAGA', 'TCCTCCTTTTTTGACACGCA', 'ACCTTACAATTTTTTACATC', 'AACAACGGTCTGGATGTTCA', 'TACTCGTGTTCCATCCAGTT', 'GACCATGCCCCCGTGTACAT', 'TCAAGCCAGGTCGATTATCT', 'ATGACTCCAATTTGTCACCC', 'ATGCTTCCTCATAGGGGCGA', 'CGTACAATAAGTGGCTTATC', 'GAGCCAGAACAGACCCTCGG', 'TTAGGTATACACAAGAGTTC', 'TAGCACACCGCATATGGGAA', 'TGTGCCCATAGGGTTAACCG', 'TTCACATCTCCTACCTGTGT', 'CGTGATGCTCTTATCAATCA', 'AGCTGCCCTCGCGTAACCCC', 'GCTGGTACGTACGCAACTTC', 'AGCAGACCCTGATCATCAAC', 'GCCTAACAGTTCCCACCCTT', 'CTCATGCCCTACAAATTCTG', 'CAGAGGTCGCCATCCCTTCA']


In [33]:
## 根据上面的序列生成矩阵
## 在这个步骤里面 如果提供的随机的序列不够 会报错 因为这些序列分不过来
## 但如果本身并不需要那么多荧光颜色的话 这边生成多少种类颜色的代码可以修改
## 此代码可根据需要进行修改
seq_lib = create_seq_lib(
    seq_list=seq_list,
    color_fraction={
        "Green": [0, 0.5, 1],
        "Red": [_ / 5 for _ in range(6)],
        "Blue": [_ / 5 for _ in range(6)],
        "Yellow": [_ / 5 for _ in range(6)],
    },
)
seq_lib

Unnamed: 0,seq,color,grade,fraction
0,GGGGTTGAATGGGTAGATTC,Green,0,0.0
1,CACTCTATCCACGGTTAAGA,Green,1,0.5
2,TCCTCCTTTTTTGACACGCA,Green,2,1.0
3,ACCTTACAATTTTTTACATC,Red,0,0.0
4,AACAACGGTCTGGATGTTCA,Red,1,0.2
5,TACTCGTGTTCCATCCAGTT,Red,2,0.4
6,GACCATGCCCCCGTGTACAT,Red,3,0.6
7,TCAAGCCAGGTCGATTATCT,Red,4,0.8
8,ATGACTCCAATTTGTCACCC,Red,5,1.0
9,ATGCTTCCTCATAGGGGCGA,Blue,0,0.0


In [34]:
## 生成整合后的barcode
## 这个步骤有个看不懂的警告
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
barcode_lib = create_barcode_lib(
    seq_lib,
    color_order=["Green", "Red", "Blue", "Yellow"],
    sum_num=5,
    sum_list=["Red", "Yellow", "Blue"],
)
barcode_lib

Unnamed: 0_level_0,Green,Red,Blue,Yellow,Greenseq,Redseq,Blueseq,Yellowseq,sum
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCATGCTTCCTCATAGGGGCGAGCCTAACAGTTCCCACCCTT,0,0,0,5,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,ATGCTTCCTCATAGGGGCGA,GCCTAACAGTTCCCACCCTT,5
GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACAATAAGTGGCTTATCAGCAGACCCTGATCATCAAC,0,0,1,4,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,CGTACAATAAGTGGCTTATC,AGCAGACCCTGATCATCAAC,5
GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCAGAACAGACCCTCGGGCTGGTACGTACGCAACTTC,0,0,2,3,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,GAGCCAGAACAGACCCTCGG,GCTGGTACGTACGCAACTTC,5
GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGTATACACAAGAGTTCAGCTGCCCTCGCGTAACCCC,0,0,3,2,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,TTAGGTATACACAAGAGTTC,AGCTGCCCTCGCGTAACCCC,5
GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTAGCACACCGCATATGGGAACGTGATGCTCTTATCAATCA,0,0,4,1,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,TAGCACACCGCATATGGGAA,CGTGATGCTCTTATCAATCA,5
...,...,...,...,...,...,...,...,...,...
TCCTCCTTTTTTGACACGCAGACCATGCCCCCGTGTACATCGTACAATAAGTGGCTTATCCGTGATGCTCTTATCAATCA,2,3,1,1,TCCTCCTTTTTTGACACGCA,GACCATGCCCCCGTGTACAT,CGTACAATAAGTGGCTTATC,CGTGATGCTCTTATCAATCA,5
TCCTCCTTTTTTGACACGCAGACCATGCCCCCGTGTACATGAGCCAGAACAGACCCTCGGTTCACATCTCCTACCTGTGT,2,3,2,0,TCCTCCTTTTTTGACACGCA,GACCATGCCCCCGTGTACAT,GAGCCAGAACAGACCCTCGG,TTCACATCTCCTACCTGTGT,5
TCCTCCTTTTTTGACACGCATCAAGCCAGGTCGATTATCTATGCTTCCTCATAGGGGCGACGTGATGCTCTTATCAATCA,2,4,0,1,TCCTCCTTTTTTGACACGCA,TCAAGCCAGGTCGATTATCT,ATGCTTCCTCATAGGGGCGA,CGTGATGCTCTTATCAATCA,5
TCCTCCTTTTTTGACACGCATCAAGCCAGGTCGATTATCTCGTACAATAAGTGGCTTATCTTCACATCTCCTACCTGTGT,2,4,1,0,TCCTCCTTTTTTGACACGCA,TCAAGCCAGGTCGATTATCT,CGTACAATAAGTGGCTTATC,TTCACATCTCCTACCTGTGT,5


In [35]:
## 保存seq 保存生成的barcode
#tmp=os.getcwd()
seq_lib.to_csv(r"./seq_lib.csv")
barcode_lib.to_csv(r"./barcode_lib.csv")

In [None]:
## 到目前 probe 和 barcode 都有了
## 这个 probe似乎还是添加了stich的probe 
## 所以下个步骤要将这两个部分粘合 文件分别是gene_binding_site.csv 和 barcode_lib.csv
## 是什么文件和什么文件粘合 ???

In [36]:
## 读取 gene_binding_site.csv文件
binding_df = pd.read_csv(DATASET_DIR / RUNID / "gene_binding_site.csv")
if organism == 'mouse': binding_df['gene_name'] = binding_df['gene_name'].str.capitalize()
elif organism == 'human': binding_df['gene_name'] = binding_df['gene_name'].str.upper()
# binding_df = binding_df[binding_df['gene_name'].isin(gene_list)]
print(len(binding_df))
binding_df.head()

83


Unnamed: 0.1,Unnamed: 0,accession,gene_name,mol_type,organism,pos,plp_bds,plp_Tm,plp_bds3',plp_bds5',plp_Tm3',plp_Tm5',mfe,wanted,align_num,align_accession,align_descrip,plus/minus
0,10,ENSMUST00000163829,Actb,protein_coding,mouse,59,GGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAAG,72.43,GGAGGGGAATACAGCCCGGG,GAGCATCGTCGCCCGCGAAG,57.44,57.81,-8.4,True,43,AK166349.1|AK152844.1|AK078935.1|AK150866.1|NG...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1..."
1,11,ENSMUST00000163829,Actb,protein_coding,mouse,60,TGGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAA,72.24,TGGAGGGGAATACAGCCCGG,GGAGCATCGTCGCCCGCGAA,55.15,59.78,-8.4,True,43,AK166349.1|AK152844.1|AK078935.1|AK150866.1|NG...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1..."
2,12,ENSMUST00000163829,Actb,protein_coding,mouse,112,TCCTTCTGACCCATTCCCACCATCACACCCTGGTGCCTAG,75.4,TCCTTCTGACCCATTCCCAC,CATCACACCCTGGTGCCTAG,59.8,58.05,-7.9,True,96,AK166349.1|AK152844.1|AK150662.1|AK078935.1|AK...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-..."
3,13,ENSMUST00000163829,Actb,protein_coding,mouse,120,CATAGGAGTCCTTCTGACCCATTCCCACCATCACACCCTG,72.19,CATAGGAGTCCTTCTGACCC,ATTCCCACCATCACACCCTG,55.55,58.1,-6.7,True,55,AK166349.1|AK152844.1|AK150662.1|AK078935.1|AK...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-..."
4,14,ENSMUST00000163829,Actb,protein_coding,mouse,124,CCCACATAGGAGTCCTTCTGACCCATTCCCACCATCACAC,72.15,CCCACATAGGAGTCCTTCTG,ACCCATTCCCACCATCACAC,55.53,58.04,-6.7,True,47,AK166349.1|AK152844.1|AK150662.1|AK078935.1|AK...,Mus musculus mammary gland RCB-0526 Jyg-MC(A)...,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-..."


In [37]:
## 告诉计算机这次用是那篇文章涉及的分析 
## 其实可以修改这个if函数 只要最终输出的是barcode_df 这样一个矩阵
## 先尝试将barcode列选出来
PANEL = 'PRISM'
if PANEL == 'PRISM':
    probe_df = pd.DataFrame()
    barcode_df = pd.read_csv("./barcode_lib.csv")
elif PANEL == 'SPRINTseq':
    barcode_df = pd.read_excel(DATASET_DIR / "SPRINTSEQ_369_barcode.xlsx", index_col=0)[['Barcode sequence']]
    primer_l = 'TCCCTACACGACGCTCTTCCGATCT'
    primer_r = 'CATTCCTGCTGAACCGCTCTTCCGA'
    barcode_df['Barcode(70bp)'] = primer_l + barcode_df['Barcode sequence'] + primer_r + barcode_df['Barcode sequence']
barcode_df.head()

Unnamed: 0,barcode,Green,Red,Blue,Yellow,Greenseq,Redseq,Blueseq,Yellowseq,sum
0,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCATGCTT...,0,0,0,5,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,ATGCTTCCTCATAGGGGCGA,GCCTAACAGTTCCCACCCTT,5
1,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,0,0,1,4,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,CGTACAATAAGTGGCTTATC,AGCAGACCCTGATCATCAAC,5
2,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,0,0,2,3,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,GAGCCAGAACAGACCCTCGG,GCTGGTACGTACGCAACTTC,5
3,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGT...,0,0,3,2,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,TTAGGTATACACAAGAGTTC,AGCTGCCCTCGCGTAACCCC,5
4,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTAGCAC...,0,0,4,1,GGGGTTGAATGGGTAGATTC,ACCTTACAATTTTTTACATC,TAGCACACCGCATATGGGAA,CGTGATGCTCTTATCAATCA,5


In [None]:
## 接下来就是探针粘贴的过程了
## 可以直接进行粘贴 direct combine of binding site all
## 也可以选择其中其中进行粘贴 select middle one for multi binding sites
## 给每个基因选择3个备用选项 select 3 binding sites for each gene
## probe_stitch.ipynb 是以下代码的来源  
## 选择了以上的第3种方式

In [38]:
## 每个基因保存3条序列
## 设置一些空参数
probe_df = pd.DataFrame()
cont = 0
prism_pos = 0
prism_pos_list = [_+1 for _ in range(9)]
prism = prism_pos_list[prism_pos]
max_cont = 3
pre_gene_name = binding_df["gene_name"].iloc[0]
for num, gene in enumerate(binding_df["gene_name"]):
    if pre_gene_name != gene:
        pre_gene_name = gene
        cont = 0
        prism_pos += 1
        prism = prism_pos_list[prism_pos]
    elif cont == max_cont:
        continue
    # print(num, gene, prism)
    cont += 1
    binding = binding_df["plp_bds"].iloc[num]
    assert len(binding) == 40, f"binding site at pos {num} length is not 40bp: {binding}, {len(binding)} instead."

    binding_l = binding[:20].lower()
    binding_r = binding[20:].lower()
    barcode = barcode_df.loc[prism, "barcode"]
    probe = binding_r + barcode + binding_l

    if PANEL == 'PRISM':
        probe_info = pd.DataFrame({
            "PRISM": [f"PRISM_{prism}"],
            "gene":[f'{gene}_{cont}'],
            "probe_name":[f'PR_{prism}_{gene}_{cont}'],
            "probe_seq": [probe],
            "barcode_seq": [barcode],
            "binding_seq": [binding],})

    elif PANEL == 'SPRINTseq':
        probe_info = pd.DataFrame({
            "SPRINTseq": [f"SPRINTseq_{prism}"],
            "gene":[f'{gene}'],
            "probe_name":[f'Seq_{prism}_{gene}_{cont}'],
            "probe": [probe],
            "barcode": [barcode],
            "binding": [binding],})
    if len(probe_df) == 0: probe_df = probe_info
    else: probe_df = pd.concat([probe_df, probe_info])
probe_df = probe_df.reset_index(drop=True)
probe_df

Unnamed: 0,PRISM,gene,probe_name,probe_seq,barcode_seq,binding_seq
0,PRISM_1,Actb_1,PR_1_Actb_1,gagcatcgtcgcccgcgaagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,GGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAAG
1,PRISM_1,Actb_2,PR_1_Actb_2,ggagcatcgtcgcccgcgaaGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,TGGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAA
2,PRISM_1,Actb_3,PR_1_Actb_3,catcacaccctggtgcctagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,TCCTTCTGACCCATTCCCACCATCACACCCTGGTGCCTAG
3,PRISM_2,Gapdh_1,PR_2_Gapdh_1,ccgttgaatttgccgtgagtGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,CATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGTGAGT
4,PRISM_2,Gapdh_2,PR_2_Gapdh_2,gccgttgaatttgccgtgagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,CCATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGTGAG
5,PRISM_2,Gapdh_3,PR_2_Gapdh_3,tgtgccgttgaatttgccgtGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,TTCCCATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGT
6,PRISM_3,Hprt1_1,PR_3_Hprt1_1,gtccataatcagtccatgagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGT...,CGAGCAAGTCTTTCAGTCCTGTCCATAATCAGTCCATGAG
7,PRISM_3,Hprt1_2,PR_3_Hprt1_2,tgtccataatcagtccatgaGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGT...,TCGAGCAAGTCTTTCAGTCCTGTCCATAATCAGTCCATGA
8,PRISM_3,Hprt1_3,PR_3_Hprt1_3,ctgtccataatcagtccatgGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGT...,CTCGAGCAAGTCTTTCAGTCCTGTCCATAATCAGTCCATG


In [97]:
# pd.DataFrame(probe_df['gene'].unique())

Unnamed: 0,0
0,Glp1r_1
1,Glp1r_2
2,Glp1r_3
3,Prkcd_1
4,Prkcd_2
5,Prkcd_3
6,Vipr2_1
7,Vipr2_2
8,Vipr2_3


In [98]:
# import os
# os.getcwd()

'/home/data/t180501/3_zhangshiwen_RNA_FISH/FJ_New_EesyCode'

In [39]:
probe_df.to_csv( './'+f'{PANEL}_probe.csv')
print(len(probe_df))
probe_df.head()

9


Unnamed: 0,PRISM,gene,probe_name,probe_seq,barcode_seq,binding_seq
0,PRISM_1,Actb_1,PR_1_Actb_1,gagcatcgtcgcccgcgaagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,GGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAAG
1,PRISM_1,Actb_2,PR_1_Actb_2,ggagcatcgtcgcccgcgaaGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,TGGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAA
2,PRISM_1,Actb_3,PR_1_Actb_3,catcacaccctggtgcctagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,TCCTTCTGACCCATTCCCACCATCACACCCTGGTGCCTAG
3,PRISM_2,Gapdh_1,PR_2_Gapdh_1,ccgttgaatttgccgtgagtGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,CATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGTGAGT
4,PRISM_2,Gapdh_2,PR_2_Gapdh_2,gccgttgaatttgccgtgagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,CCATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGTGAG


In [100]:
# 探针overlap检查
# 探针和探针之间不要互相形成环

In [40]:
# 已经获取序列 
# 对序列进行检查
def overlap_degree(str1, str2):
    max_overlap = min(len(str1), len(str2))
    
    for i in range(max_overlap, 0, -1):
        if str1[-i:] == str2[:i]:
            return i
    
    return 0
meta = pd.read_csv("./PRISM_probe.csv",index_col=0)
meta

Unnamed: 0,PRISM,gene,probe_name,probe_seq,barcode_seq,binding_seq
0,PRISM_1,Actb_1,PR_1_Actb_1,gagcatcgtcgcccgcgaagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,GGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAAG
1,PRISM_1,Actb_2,PR_1_Actb_2,ggagcatcgtcgcccgcgaaGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,TGGAGGGGAATACAGCCCGGGGAGCATCGTCGCCCGCGAA
2,PRISM_1,Actb_3,PR_1_Actb_3,catcacaccctggtgcctagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCCGTACA...,TCCTTCTGACCCATTCCCACCATCACACCCTGGTGCCTAG
3,PRISM_2,Gapdh_1,PR_2_Gapdh_1,ccgttgaatttgccgtgagtGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,CATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGTGAGT
4,PRISM_2,Gapdh_2,PR_2_Gapdh_2,gccgttgaatttgccgtgagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,CCATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGTGAG
5,PRISM_2,Gapdh_3,PR_2_Gapdh_3,tgtgccgttgaatttgccgtGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCGAGCCA...,TTCCCATTCTCGGCCTTGACTGTGCCGTTGAATTTGCCGT
6,PRISM_3,Hprt1_1,PR_3_Hprt1_1,gtccataatcagtccatgagGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGT...,CGAGCAAGTCTTTCAGTCCTGTCCATAATCAGTCCATGAG
7,PRISM_3,Hprt1_2,PR_3_Hprt1_2,tgtccataatcagtccatgaGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGT...,TCGAGCAAGTCTTTCAGTCCTGTCCATAATCAGTCCATGA
8,PRISM_3,Hprt1_3,PR_3_Hprt1_3,ctgtccataatcagtccatgGGGGTTGAATGGGTAGATTCACCTTA...,GGGGTTGAATGGGTAGATTCACCTTACAATTTTTTACATCTTAGGT...,CTCGAGCAAGTCTTTCAGTCCTGTCCATAATCAGTCCATG


In [147]:
# 计算序列存在重合的情况 避免形成圆圈
# 输出的这些都是不能被使用的
for prism in range(3): ## 多少个基因这边输入多少
    detect_seq = list(meta[meta['PRISM'] == f'PRISM_{prism+1}']['binding_seq'])
    gene = list(meta[meta['PRISM'] == f'PRISM_{prism+1}']['probe_name'])
    gene = gene[0]
    for i in range(-1,2,1):
        degree = max(overlap_degree(detect_seq[i], detect_seq[i+1]), overlap_degree(detect_seq[i+1], detect_seq[i]))
        if degree > 5:
            print(f'{gene}, {i}, {i+1}, degree={degree}')

PR_3_Vipr2_1, -1, 0, degree=36
PR_3_Vipr2_1, 0, 1, degree=37
PR_3_Vipr2_1, 1, 2, degree=39


In [148]:
## 接下来是空间位阻的检查

In [41]:
## 又是先载入作者写的函数
import pandas as pd
def dna_sec_struct(seq, temp=45):
    # Predict the minimum free energy
    mfe = dg(seq, temp=temp)
    # `fold` returns a list of `seqfold.Struct` from the minimum free energy structure
    structs = fold(seq, temp=temp)
    return mfe, structs

def seq_minus(seq):
    translib = {"A": "T", "T": "A", "C": "G", "G": "C"}
    return "".join(list(reversed([translib[i] for i in seq])))

In [154]:
# pd.read_csv('./PRISM_probe.csv',index_col=0)

Unnamed: 0,PRISM,gene,probe_name,probe_seq,barcode_seq,binding_seq
0,PRISM_1,Glp1r_1,PR_1_Glp1r_1,acaccgtggtaccctgagggTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGAACGGAT...,CTGCACCGTCTCTGAGAGGGACACCGTGGTACCCTGAGGG
1,PRISM_1,Glp1r_2,PR_1_Glp1r_2,aaggaacctgggggcccatcTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGAACGGAT...,AGGGGCAGCTGACATTCACGAAGGAACCTGGGGGCCCATC
2,PRISM_1,Glp1r_3,PR_1_Glp1r_3,cacactccgacaggtccctcTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGAACGGAT...,CTCCCCTCGCTTAGACTCTTCACACTCCGACAGGTCCCTC
3,PRISM_2,Prkcd_1,PR_2_Prkcd_1,cacagaacagaaggtgggctTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGAGATTCC...,CCCCAGACAAACTCTTTGCACACAGAACAGAAGGTGGGCT
4,PRISM_2,Prkcd_2,PR_2_Prkcd_2,ggtcacagaaggtggggctcTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGAGATTCC...,CCAGAGCAAACTGCCACAGTGGTCACAGAAGGTGGGGCTC
5,PRISM_2,Prkcd_3,PR_2_Prkcd_3,cttccgggaagatctctgggTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGAGATTCC...,ACAGACTCTGTTGTGTCCAGCTTCCGGGAAGATCTCTGGG
6,PRISM_3,Vipr2_1,PR_3_Vipr2_1,tgtggtcgtttgtgtcccagTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGACTATTT...,AATGACCCACCAGGGGATGCTGTGGTCGTTTGTGTCCCAG
7,PRISM_3,Vipr2_2,PR_3_Vipr2_2,tgctgtggtcgtttgtgtccTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGACTATTT...,CCGAATGACCCACCAGGGGATGCTGTGGTCGTTTGTGTCC
8,PRISM_3,Vipr2_3,PR_3_Vipr2_3,atgctgtggtcgtttgtgtcTGCTATAATGAGTAGCTGAGTGACGC...,TGCTATAATGAGTAGCTGAGTGACGCCGCACGGTTCTCGACTATTT...,TCCGAATGACCCACCAGGGGATGCTGTGGTCGTTTGTGTC


In [42]:
# 注意提供的序列并不是全部的序列，需要重新琢磨以下这个代码看这个具体的判断标准究竟是什么。
# 可能要用binding seq也说不定
# padlock = pd.read_excel(r'E:\TMC\probe_designer\dataset\2024.3.16_TCR&mutation_3_Breast_cancer\binding_site_revised.xlsx', sheet_name='padlock')
padlock = pd.read_csv('./PRISM_probe.csv',index_col=0)[['probe_name','probe_seq']]
padlock

Unnamed: 0,probe_name,probe_seq
0,PR_1_Actb_1,gagcatcgtcgcccgcgaagGGGGTTGAATGGGTAGATTCACCTTA...
1,PR_1_Actb_2,ggagcatcgtcgcccgcgaaGGGGTTGAATGGGTAGATTCACCTTA...
2,PR_1_Actb_3,catcacaccctggtgcctagGGGGTTGAATGGGTAGATTCACCTTA...
3,PR_2_Gapdh_1,ccgttgaatttgccgtgagtGGGGTTGAATGGGTAGATTCACCTTA...
4,PR_2_Gapdh_2,gccgttgaatttgccgtgagGGGGTTGAATGGGTAGATTCACCTTA...
5,PR_2_Gapdh_3,tgtgccgttgaatttgccgtGGGGTTGAATGGGTAGATTCACCTTA...
6,PR_3_Hprt1_1,gtccataatcagtccatgagGGGGTTGAATGGGTAGATTCACCTTA...
7,PR_3_Hprt1_2,tgtccataatcagtccatgaGGGGTTGAATGGGTAGATTCACCTTA...
8,PR_3_Hprt1_3,ctgtccataatcagtccatgGGGGTTGAATGGGTAGATTCACCTTA...


In [43]:
## 将probe_seq列小写全部换成大写
# 将 probe_seq 列中的小写字母转换为大写
padlock["probe_seq"] = padlock["probe_seq"].str.upper()
# 验证结果
print(padlock.head())

     probe_name                                          probe_seq
0   PR_1_Actb_1  GAGCATCGTCGCCCGCGAAGGGGGTTGAATGGGTAGATTCACCTTA...
1   PR_1_Actb_2  GGAGCATCGTCGCCCGCGAAGGGGTTGAATGGGTAGATTCACCTTA...
2   PR_1_Actb_3  CATCACACCCTGGTGCCTAGGGGGTTGAATGGGTAGATTCACCTTA...
3  PR_2_Gapdh_1  CCGTTGAATTTGCCGTGAGTGGGGTTGAATGGGTAGATTCACCTTA...
4  PR_2_Gapdh_2  GCCGTTGAATTTGCCGTGAGGGGGTTGAATGGGTAGATTCACCTTA...


In [44]:
## 保存最终的文件
padlock.to_csv('./probe_seq.csv')

In [49]:
## 调用上面的函数 查看评分
for index, (name, seq) in padlock.iterrows():
    seq = seq_minus(seq)
    seq = seq.upper() * 3
    mfe, structs = dna_sec_struct(seq, temp=60)
    # if mfe < 0:
    print(f"{name}:{mfe}\n{seq}\n{dot_bracket(seq, structs)}")
    # continue

PR_1_Actb_1:-0.3
CCCGGGCTGTATTCCCCTCCGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCCTTCGCGGGCGACGATGCTCCCCGGGCTGTATTCCCCTCCGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCCTTCGCGGGCGACGATGCTCCCCGGGCTGTATTCCCCTCCGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCCTTCGCGGGCGACGATGCTC
.....................................................................................................................................................................................................................................................................................(((....))).........................................................................
PR_1_Actb_2:-0.3
CCGGGCTGTATTCCCCTCCAGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCTTCGCGGGCGACGATGCTCCCCGGGCTGTATTCCCCTCCAGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCTTCGCGGGCGACGATGCTCCCCGG

In [46]:
a='CCCGGGCTGTATTCCCCTCCGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCCTTCGCGGGCGACGATGCTCCCCGGGCTGTATTCCCCTCCGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCCTTCGCGGGCGACGATGCTCCCCGGGCTGTATTCCCCTCCGTTGATGATCAGGGTCTGCTGATAAGCCACTTATTGTACGGATGTAAAAAATTGTAAGGTGAATCTACCCATTCAACCCCCTTCGCGGGCGACGATGCTC'
print(len(a))

360
