In [1]:
import pandas as pd
from datasets import dataset_list, get_sg_groups_df
from sgrna_designer import ensembl, design
import re
from tqdm import tqdm

In [2]:
# Load the names of the datasets selected for training from the processed CSV.
train_names_df = pd.read_csv('../data/processed/train_data_names.csv')
train_data_names = train_names_df['name'].to_list()

In [3]:
# Keep only datasets that are both listed as training data and flagged endogenous.
train_data_list = [
    dataset for dataset in dataset_list
    if dataset.name in train_data_names and dataset.endogenous
]

# Build the sgRNA groups table from the selected datasets.
sg_groups_df = get_sg_groups_df(train_data_list)

Context sequences with multiple targets: 0


In [4]:
# Half-width of the window taken on each side of the cut position.
width = 25
pos_df = sg_groups_df.copy()

# Rows on the '-' strand have both window edges shifted down by one.
cut_position = pos_df['sgRNA Cut Position (1-based)']
minus_strand_shift = (pos_df['Strand of sgRNA'] == '-').astype(int)

pos_df['seq_start'] = cut_position - width - minus_strand_shift
pos_df['seq_end'] = cut_position + width - minus_strand_shift

In [5]:
# Look up the sequence region name for every distinct target gene ID.
ensembl_id_df = (pos_df[['Target Gene ID']]
                 .drop_duplicates())
seq_region_list = []
for gene_id in tqdm(ensembl_id_df['Target Gene ID']):
    try:
        id_info = ensembl.get_ensembl_id_information(gene_id)
        seq_region_list.append(id_info['seq_region_name'])
    except Exception:
        # A failed lookup (or a response without 'seq_region_name') is recorded
        # as missing instead of aborting the loop. Catching `Exception` rather
        # than using a bare `except` lets KeyboardInterrupt / SystemExit
        # propagate, so the loop can still be interrupted.
        seq_region_list.append(pd.NA)
# Report how many IDs could not be resolved.
print('Missing transcripts: ' +  str(pd.isna(seq_region_list).sum()))
ensembl_id_df['seq_region'] = seq_region_list

100%|██████████| 125/125 [02:00<00:00,  1.04it/s]

Missing transcripts: 0





In [6]:
# Attach each gene's sequence region to its sgRNA rows; the inner join keeps only
# rows whose 'Target Gene ID' is present in both frames.
# NOTE(review): pos_df is rebound to the merged result, so running this cell a
# second time in the same kernel would merge 'seq_region' in again.
gene_id_col = 'Target Gene ID'
pos_df = pos_df.merge(right=ensembl_id_df, on=gene_id_col, how='inner')

In [7]:
# Spot-check the window coordinates using the first two rows of pos_df.
# NOTE(review): presumably row 0 is a '-' strand guide and row 1 a '+' strand
# guide -- confirm against the data.
rv_row = pos_df.loc[0]
fw_row = pos_df.loc[1]

# Fetch the region, reverse-complement it, then look for the context sequence in it.
rv_region_seq = ensembl.get_region_sequence(
    rv_row['seq_start'], rv_row['seq_end'], rv_row['seq_region'])
rv_seq = design.reverse_compliment(rv_region_seq)
re.search(rv_row['sgRNA Context Sequence'], rv_seq)


<re.Match object; span=(4, 34), match='TCAGAAATAATACCAACAACTGGAGGGAGA'>

In [8]:
# Same spot-check as for rv_row, but the fetched region is searched as-is
# (no reverse complement).
fw_seq = ensembl.get_region_sequence(
    fw_row['seq_start'], fw_row['seq_end'], fw_row['seq_region'])
re.search(fw_row['sgRNA Context Sequence'], fw_seq)

<re.Match object; span=(4, 34), match='GTCAAACAGCTCACTGATCTGGGCCGGCGT'>

In [9]:
# Columns carried forward for fetching and orienting the expanded sequences.
relevant_cols = ['sgRNA Context Sequence', 'Target Gene ID', 'seq_start', 'seq_end', 'seq_region', 'Strand of sgRNA']

# Human subset: rows whose 'Target Taxon' is 9606, de-duplicated and re-indexed from 0.
is_human = pos_df['Target Taxon'] == 9606
human_pos_df = pos_df.loc[is_human, relevant_cols]
human_pos_df = human_pos_df.drop_duplicates().reset_index(drop=True)

In [10]:
# Request the sequences for all human windows in a single call.
human_starts = human_pos_df['seq_start']
human_ends = human_pos_df['seq_end']
human_regions = human_pos_df['seq_region']
human_expanded_seqs = ensembl.post_region_sequences(human_starts, human_ends, human_regions)

In [11]:
# Tabulate the returned sequences and place them next to the human position rows
# (column-wise concatenation aligned on the index).
human_expanded_seq_df = pd.DataFrame(human_expanded_seqs)
human_pos_seqs_df = pd.concat(
    [human_pos_df, human_expanded_seq_df],
    axis='columns',
)

In [12]:
# Mouse subset: rows whose 'Target Taxon' is 10090, de-duplicated and re-indexed from 0.
is_mouse = pos_df['Target Taxon'] == 10090
mouse_pos_df = pos_df.loc[is_mouse, relevant_cols]
mouse_pos_df = mouse_pos_df.drop_duplicates().reset_index(drop=True)

In [13]:
# Request the sequences for all mouse windows in a single call (species='mouse').
mouse_starts = mouse_pos_df['seq_start']
mouse_ends = mouse_pos_df['seq_end']
mouse_regions = mouse_pos_df['seq_region']
mouse_expanded_seqs = ensembl.post_region_sequences(mouse_starts, mouse_ends, mouse_regions,
                                                    species='mouse')

In [14]:
# Tabulate the returned sequences and place them next to the mouse position rows
# (column-wise concatenation aligned on the index).
mouse_expanded_seq_df = pd.DataFrame(mouse_expanded_seqs)
mouse_pos_seqs_df = pd.concat(
    [mouse_pos_df, mouse_expanded_seq_df],
    axis='columns',
)

In [15]:
def _orient_to_sgrna_strand(row):
    """Return the row's 'seq' unchanged for '+' strand guides, otherwise its reverse complement."""
    if row['Strand of sgRNA'] == '+':
        return row['seq']
    return design.reverse_compliment(row['seq'])


# Stack the human and mouse tables under a fresh 0..n-1 index.
pos_seqs_df = pd.concat([human_pos_seqs_df, mouse_pos_seqs_df],
                        axis=0, ignore_index=True)
pos_seqs_df['expanded seq'] = pos_seqs_df.apply(_orient_to_sgrna_strand, axis=1)

In [19]:
# Build the export table: context sequence as 'ID', oriented window as 'Target',
# and a constant 'PAM Index' of width + 3 on every row.
out_df = (pos_seqs_df[['sgRNA Context Sequence', 'expanded seq']]
          .rename(columns={'sgRNA Context Sequence': 'ID',
                           'expanded seq': 'Target'})
          .assign(**{'PAM Index': width + 3}))

In [22]:
# Write the export table as a tab-separated file, omitting the index column.
output_path = '../data/interim/rs_dev_all_sgrnas_extended.txt'
out_df.to_csv(output_path, sep='\t', index=False)