In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from scipy.stats import spearmanr, pearsonr
import matplotlib.pyplot as plt
import matplotlib
import scipy
import isolearn
import itertools
import collections
from Levenshtein import distance

matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.sans-serif'] = 'Helvetica'

# import dinucleotide shuffling function from deeplift
from deeplift.dinuc_shuffle import dinuc_shuffle

import re

import sys

sys.path.append('../../../analysis/paper_figures')
sys.path.append('../')
from figure_utils import *
from analyze_motifs import *

from design_utils import TEST_FOLD_IDX,DESIGNED_SEQ_LEN,MODEL_TYPE_DF,CELL_TYPES
from design_utils import load_ensemble_model, seq_to_one_hot, longest_repeat, get_paired_editdistances, load_maxmin_ensemble_model

In [2]:
### Load data ###

# load_motif_data_d2 returns three objects; only the first (per-motif table) and
# the third (per-enhancer DESeq table) are used in this notebook.
d2_final_df,_,d2_deseq_df = load_motif_data_d2(qthresh_suffix='_qthresh05')
# Rebase motif starts from 1-indexed to 0-indexed. 'stop' is deliberately left
# untouched: it is the last base included in the motif (1-indexed, inclusive),
# so after this shift [start, stop) can be used directly as a half-open Python slice.
d2_final_df['start'] -= 1
# d2_final_df['stop'] -= 1 # don't adjust stop; this is the last base that is included in the sequence

# sort d2_deseq_df by descending log2FoldChange_H2K_deseq
# d2_deseq_df = d2_deseq_df.sort_values(by='log2FoldChange_H2K_deseq', ascending=False)

# Empty frame with the output schema. NOTE: seq_df is re-created as an empty
# pd.DataFrame() right before the sequence-generation loop, so this instance is
# not the one that ends up holding the designed sequences.
seq_df = pd.DataFrame(columns=['sequence_name','design_type','cell_type','sequence'])

In [3]:
# Pick the positional row numbers of the n_top_seqs enhancers with the highest
# log2FoldChange_H2K_deseq and the n_top_seqs enhancers with the lowest.
# These are positions (for .iloc), not index labels.
n_top_seqs = 5
h2k_rank = np.argsort(d2_deseq_df['log2FoldChange_H2K_deseq'])  # ascending order
top_hepg2_inds = h2k_rank[::-1][:n_top_seqs].values  # largest values first
top_k562_inds = h2k_rank[:n_top_seqs].values         # smallest values first
for top_inds in (top_hepg2_inds, top_k562_inds):
    print(top_inds)

[433 905 633 335 667]
[ 976 1041  762  156 1099]


In [4]:
# Constants shared by the motif-masking and motif-ablation code in this cell.
motif_flank_buffer = 1 # how many bps to each side of motif to include in motif
seq_len = 145 # length of the per-enhancer motif mask (np.zeros(seq_len)); assumes every enhancer is this long — TODO confirm

def extract_nonmotif_from_seq(nonmotif_mask):
    """Return the runs of 1s in ``nonmotif_mask`` as half-open ``[start, stop]`` pairs.

    Parameters
    ----------
    nonmotif_mask : sequence of int
        Per-position mask; positions equal to 1 are non-motif (background).

    Returns
    -------
    list of [int, int]
        One ``[start, stop]`` entry per contiguous run of 1s, in left-to-right
        order, where ``stop`` is exclusive so ``seq[start:stop]`` is the run.
        An empty mask, or a mask without any 1s, yields an empty list.
    """
    nonmotif_intervals = []
    run_start = None  # start of the run currently being scanned, if any
    for pos, flag in enumerate(nonmotif_mask):
        if flag == 1:
            if run_start is None:
                run_start = pos
        elif run_start is not None:
            nonmotif_intervals.append([run_start, pos])
            run_start = None

    # A run that extends to the last position is never terminated by a 0, so
    # it has to be closed explicitly; otherwise the trailing background of the
    # sequence would be dropped and never shuffled.
    if run_start is not None:
        nonmotif_intervals.append([run_start, len(nonmotif_mask)])
    return nonmotif_intervals


def check_for_RE_sites(seq):
    """Return True if ``seq`` contains a KpnI, XbaI or SfiI recognition site.

    Only the given strand is scanned, left to right, using regular-expression
    search; ``.`` in the SfiI pattern stands for any single character.
    Returns False when none of the three patterns occurs.
    """
    site_patterns = (
        "GGTACC",         # KpnI
        "TCTAGA",         # XbaI
        "GGCC.....GGCC",  # SfiI
    )
    return any(re.search(pattern, seq) is not None for pattern in site_patterns)

def generate_shuffled_nonmotif_seqs(seq_idx, cur_cell_type, cur_seq, nonmotif_intervals, n_shuffled_nonmotif_seqs,seq_df):
    """Append variants of ``cur_seq`` whose non-motif intervals are dinucleotide-shuffled.

    Parameters
    ----------
    seq_idx : int
        Identifier of the original enhancer; used as the prefix of each new name.
    cur_cell_type : str
        Value stored in the ``cell_type`` column of every new row.
    cur_seq : str
        Original enhancer sequence.
    nonmotif_intervals : iterable of (start, stop)
        Half-open intervals of ``cur_seq`` that are shuffled independently of
        each other; everything outside them is kept verbatim.
    n_shuffled_nonmotif_seqs : int
        Number of variants to generate.
    seq_df : pandas.DataFrame
        Frame to extend. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``seq_df`` followed by one row per variant, named
        ``{seq_idx}_shuffled_nonmotif_{i}`` with design type ``shuffled_nonmotif``.
        The result carries a fresh 0..n-1 index. If no variants are requested,
        ``seq_df`` is returned unchanged.

    Notes
    -----
    Each variant is redrawn until ``check_for_RE_sites`` reports no site. If the
    motif regions themselves contain a site, no draw can succeed and the loop
    never terminates.
    """
    columns = ['sequence_name','design_type','cell_type','sequence']
    new_rows = []
    for i in range(n_shuffled_nonmotif_seqs):
        # get shuffled nonmotif sequence - w/o RE sites!
        RE_sites = True
        while RE_sites:
            shuffled_chars = np.array(list(cur_seq))
            for start, stop in nonmotif_intervals:
                # presumably dinuc_shuffle returns a string for string input — confirm against deeplift
                shuffled_chars[start:stop] = np.array(list(dinuc_shuffle(cur_seq[start:stop])))

            shuffled_nonmotif_seq = ''.join(shuffled_chars)
            RE_sites = check_for_RE_sites(shuffled_nonmotif_seq)

        seq_name = f'{seq_idx}_shuffled_nonmotif_{i}'
        new_rows.append([seq_name, 'shuffled_nonmotif', cur_cell_type, shuffled_nonmotif_seq])

    if not new_rows:
        return seq_df
    # Concatenate once instead of once per row (which is quadratic), and
    # renumber the index so rows do not all end up with the label 0.
    return pd.concat([seq_df, pd.DataFrame(new_rows, columns=columns)], ignore_index=True)

def _swap_in_shuffled_window(seq, background_source, motif_start, motif_end, motif_flank_buffer):
    """Replace one motif (plus flank) in ``seq`` with the same window cut from a dinucleotide shuffle of ``background_source``."""
    # Clamp at 0: a motif starting within `motif_flank_buffer` of the 5' end
    # would otherwise yield a negative slice bound, which Python interprets as
    # "from the end" and which would leave the motif intact and change the
    # sequence length.
    window_start = max(0, motif_start - motif_flank_buffer)
    window_end = motif_end + motif_flank_buffer
    # Shuffle the whole sequence and excerpt the window: this is a reasonable
    # background, whereas shuffling only the motif would probably result in
    # motif-like sequences too often.
    return seq[:window_start] + dinuc_shuffle(background_source)[window_start:window_end] + seq[window_end:]


def generate_ablated_motif_seqs(motif_flank_buffer, seq_idx, cur_cell_type, cur_seq, cur_seq_motifs, motif_pairs, seq_df,n_vars):
    """Append variants of ``cur_seq`` in which single motifs, then motif pairs, are ablated.

    Parameters
    ----------
    motif_flank_buffer : int
        Number of bases on each side of a motif that are replaced with it.
    seq_idx : int
        Identifier of the original enhancer; used as the prefix of each new name.
    cur_cell_type : str
        Value stored in the ``cell_type`` column of every new row.
    cur_seq : str
        Original enhancer sequence.
    cur_seq_motifs : pandas.DataFrame
        One row per motif with ``start`` (0-based) and ``stop`` (exclusive) columns.
    motif_pairs : sequence of (int, int)
        Pairs of positional row numbers into ``cur_seq_motifs``.
    seq_df : pandas.DataFrame
        Frame to extend. It is not modified in place.
    n_vars : int
        Number of independently shuffled variants per motif / motif pair.

    Returns
    -------
    pandas.DataFrame
        ``seq_df`` followed by the new rows (design type ``ablated``), with a
        fresh 0..n-1 index. Single ablations are named
        ``{seq_idx}_ablated_{motif_idx}_{var_idx}``; pair ablations use the two
        motif indices concatenated without a separator in place of ``motif_idx``.

    Notes
    -----
    * Every variant is redrawn until ``check_for_RE_sites`` reports no site.
    * Because the pair suffix has no separator, names become ambiguous once
      motif indices reach two digits (pair (1, 2) and single motif 12 both
      give ``..._ablated_12_...``).
    """
    columns = ['sequence_name','design_type','cell_type','sequence']
    new_rows = []

    # do single motif ablations
    for motif_idx in range(len(cur_seq_motifs)):
        motif_row = cur_seq_motifs.iloc[motif_idx]
        motif_start, motif_end = motif_row['start'], motif_row['stop']
        for var_idx in range(n_vars):
            RE_sites = True
            while RE_sites:
                cur_seq_copy = _swap_in_shuffled_window(cur_seq, cur_seq, motif_start, motif_end, motif_flank_buffer)
                RE_sites = check_for_RE_sites(cur_seq_copy)
            seq_name = f'{seq_idx}_ablated_{motif_idx}_{var_idx}'
            new_rows.append([seq_name, 'ablated', cur_cell_type, cur_seq_copy])

    # do motif pair ablations
    # NOTE(review): with one or two pairs nothing is generated here, so a
    # sequence with exactly two motifs (a single pair) gets no pair ablation —
    # confirm this threshold is intended.
    if len(motif_pairs) > 2 :
        for motif_set_idx in range(len(motif_pairs)):
            cur_motif_set = cur_seq_motifs.iloc[list(motif_pairs[motif_set_idx])]
            motif_windows = list(zip(cur_motif_set['start'].values, cur_motif_set['stop'].values))
            powerset_suffix = ''.join([str(i) for i in motif_pairs[motif_set_idx]])
            for var_idx in range(n_vars):
                RE_sites = True
                while RE_sites:
                    cur_seq_copy = cur_seq
                    # each motif gets its own independent shuffle of the original sequence
                    for motif_start, motif_end in motif_windows:
                        cur_seq_copy = _swap_in_shuffled_window(cur_seq_copy, cur_seq, motif_start, motif_end, motif_flank_buffer)
                    RE_sites = check_for_RE_sites(cur_seq_copy)

                seq_name = f'{seq_idx}_ablated_{powerset_suffix}_{var_idx}'
                new_rows.append([seq_name, 'ablated', cur_cell_type, cur_seq_copy])

    if not new_rows:
        return seq_df
    # Concatenate once instead of once per row (which is quadratic), and
    # renumber the index so rows do not all end up with the label 0.
    return pd.concat([seq_df, pd.DataFrame(new_rows, columns=columns)], ignore_index=True)

# Start from an empty frame; each enhancer appends its shuffled and ablated variants.
seq_df = pd.DataFrame()

# number of variants generated per original enhancer (same for every enhancer)
n_shuffled_nonmotif_seqs = 10
n_vars = 3

for seq_idx in np.concatenate([top_hepg2_inds,top_k562_inds]):
    print(seq_idx)

    # seq_idx is a positional row number into d2_deseq_df (it comes from argsort)
    cur_cell_type = d2_deseq_df.iloc[seq_idx]['cell_type']
    cur_seq = d2_deseq_df.iloc[seq_idx]['enhancer']
    # NOTE(review): the positional row number is compared against
    # 'sequence_name' here; this assumes the two coincide (rather than
    # d2_deseq_df.index[seq_idx]) — confirm.
    cur_seq_motifs = d2_final_df[d2_final_df['sequence_name'] == seq_idx]

    # create mask with motif positions (motif plus flank on each side)
    motif_mask = np.zeros(seq_len,dtype=int)
    for motif_start, motif_end in zip(cur_seq_motifs['start'].values, cur_seq_motifs['stop'].values) :
        # clamp at 0: a negative slice start would wrap around to the end of
        # the array and leave a motif at the 5' edge unmasked
        mask_start = max(0, motif_start - motif_flank_buffer)
        motif_mask[mask_start:motif_end + motif_flank_buffer] = 1

    # get nonmotif_mask - this can be used as input to masked SeqProp for optimized/adversarial perturbations!
    nonmotif_mask = (1 - motif_mask)

    n_motifs = len(cur_seq_motifs)
    n_motifs_not_cluster27 = len(cur_seq_motifs[cur_seq_motifs['jaspar_cluster'] != 'cluster_27'])
    # get all pairs of motifs (as positional row numbers into cur_seq_motifs)
    motif_pairs = list(itertools.combinations(range(n_motifs), 2))
    print(n_motifs)

    # get all intervals of nonmotif_sequence
    nonmotif_intervals = extract_nonmotif_from_seq(nonmotif_mask)

    seq_df = generate_shuffled_nonmotif_seqs(seq_idx, cur_cell_type, cur_seq, nonmotif_intervals, n_shuffled_nonmotif_seqs,seq_df)

    seq_df = generate_ablated_motif_seqs(motif_flank_buffer, seq_idx, cur_cell_type, cur_seq, cur_seq_motifs, motif_pairs, seq_df,n_vars)

# recover the original enhancer id from the '<seq_idx>_...' sequence name
seq_df['og_seq_idx'] = seq_df['sequence_name'].apply(lambda x: int(x.split('_')[0]))

433
3
905
5
633
3
335
3
667
4
976
2
1041
2
762
5
156
6
1099
7


In [5]:
def filter_RE_sites(temp_df):
    """Flag KpnI / XbaI / SfiI sites and return only the rows that have none.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Must contain a ``sequence`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The rows of ``temp_df`` whose sequence contains none of the three sites.

    Side effects
    ------------
    Three integer (0/1) columns, ``kpnI``, ``xbaI`` and ``sfiI``, are added to
    (or overwritten on) ``temp_df`` itself, so the caller's frame keeps the flags.
    """
    KPNI = "GGTACC"
    XBAI = "TCTAGA"
    SFII = "GGCC.....GGCC"

    sequences = temp_df["sequence"]
    # Build the 0/1 flags directly. Going through booleans and then testing
    # `x is True` is fragile: the identity check is False for numpy.bool_
    # values, which would silently flag nothing.
    temp_df["kpnI"] = sequences.apply(lambda seq: int(KPNI in seq))
    temp_df["xbaI"] = sequences.apply(lambda seq: int(XBAI in seq))
    temp_df["sfiI"] = sequences.apply(lambda seq: int(re.search(SFII, seq) is not None))

    # extract only rows with no restriction sites
    no_sites = (temp_df['kpnI']==0) & (temp_df['xbaI']==0) & (temp_df['sfiI']==0)
    return temp_df.loc[no_sites]

# Sanity check: every generated sequence should already be free of restriction sites.
# Note that filter_RE_sites also adds the kpnI / xbaI / sfiI flag columns to seq_df.
n_total_seqs = len(seq_df)
temp_df = filter_RE_sites(seq_df)
print(f'{temp_df.shape[0]} out of {n_total_seqs} sequences have no restriction sites')

433 out of 433 sequences have no restriction sites


In [6]:
# number of ablated variants generated per original enhancer
seq_df.loc[seq_df['design_type'] == 'ablated', 'og_seq_idx'].value_counts()

1099    84
156     63
905     45
762     45
667     30
433     18
633     18
335     18
976      6
1041     6
Name: og_seq_idx, dtype: int64

In [7]:
## don't need to run this again ###
# Scores every designed sequence with each model ensemble listed in model_types,
# stores the predictions as new seq_df columns, and writes seq_df to csv.


model_types = ['d1_finetuned','dhs64_finetuned']
design_seq_len = 145
# one-hot encode all designed sequences once; the same array is fed to every ensemble
x_tot = np.array([seq_to_one_hot(seq) for seq in seq_df['sequence'].values])

for model_type in model_types:
    # per-model settings are looked up in MODEL_TYPE_DF by model type
    model_basename = MODEL_TYPE_DF.loc[model_type,'model_basename']
    max_seq_len = MODEL_TYPE_DF.loc[model_type,'input_len']
    n_models_to_ensemble = MODEL_TYPE_DF.loc[model_type,'n_ensemble']

    model_dir = f'../../aws/for_aws/sequencing_analysis/retraining/retrained_models/cf10/test_fold_0/{model_type}'
    # 'd2_dhs' is not in model_types above, so this branch is never taken as written
    if model_type == 'd2_dhs':
        model_dir = f'../../aws/for_aws/sequencing_analysis/retraining/retrained_models/cf10/test_fold_0/{model_type}/hp_0'
    # if model_type in ['d1_finetuned']:
    #     continue

    # NOTE(review): `K` is not imported explicitly in the cells shown here;
    # presumably it is the Keras backend brought in by one of the star imports — confirm.
    K.clear_session()

    # ensemble members are numbered 1..n_models_to_ensemble
    model_ensemble = load_ensemble_model(model_dir,model_basename,range(1,1+n_models_to_ensemble),
                                        design_seq_len,max_seq_len=max_seq_len)
    

    y_pred = model_ensemble.predict(x_tot)

    # add columns for the predictions to seq_df with the model type as a suffix
    # (column 0 of y_pred is stored as HEPG2, column 1 as K562; H2K is their difference)
    seq_df[f'log2(HEPG2)_pred_{model_type}'] = y_pred[:,0]
    seq_df[f'log2(K562)_pred_{model_type}'] = y_pred[:,1]
    seq_df[f'log2(H2K)_pred_{model_type}'] = y_pred[:,0]-y_pred[:,1]

# write seq_df to csv
# NOTE: seq_df also carries the kpnI / xbaI / sfiI flag columns added by
# filter_RE_sites, so they are written out as well.
design_dir = '../D3_design_scripts/designed_seqs/'
seq_df.to_csv(f'{design_dir}/shuffled_and_ablated_top_d2_enhancers.csv',index=False)

