ok all importing of data and whatnot should work proper now

In [10]:
# @TODO Dynamically filter peptide set based on length(s) of input sequences of binders
#       i.e. 2 binders, one 11 AA long, one 13 AA long, each gets their own "subset" of the
#       full peptide lilst that can be compared to it. For any number of input sequences

import pandas as pd
import numpy as np
from scipy.stats import kendalltau
import matplotlib.pyplot as plt
from typing import Set, Tuple, Dict

class SequenceSimilarity:
    '''
    Class that takes in a path to a list of amino acid sequences as well
    as any number of peptide sequences explicitly that are known to have
    a certain set of properties. Generates metrics for similarity for each
    peptide in path and returns domains AA sequence with high similarity
    '''

    def __init__(self, binders: Dict, data_paths: Dict, peps_path: str, aa_col: str):
        
        self.binder_dict = binders
        self.data_paths_dict = data_paths
        self.__update_binders()
        self.__update_similarity_data()

        self.peps = pd.read_csv(peps_path)
        self.peps.columns = [aa_col]
        self.peps_same_len = self.peps[self.peps[aa_col].str.len() == self.binder_len]
        self.aa_col = aa_col

        self.AA = set('LINGVEPHKAYWQMSCTFRD')
        self.sseq_set: Set[Tuple[int, str]] # full set of binder subseqs
        self.top_sseq: Set[Tuple[int, str]] # set of sub sequences w/ high simil
        self.top_seq: Set[str]              # set of peptides with high simil
    
    def __update_binders(self) -> None:
        '''
        Private method to set the binders stored by
        class and check to make sure they are of the
        same length as one another
        '''
        print(self.binder_dict)
        self.binders = [binder for binder in self.binder_dict.values()]
        try:
            self.binder_len = len(self.binders[0])
        except:
            print("Need at least one input binder")
        
        for binder in self.binders:
            if len(binder) != self.binder_len:
                print("All binders must be of same length")
                # @TODO Handle multiple lengths of binders, each differently
                #       lengthed binder is compared with different parts of the
                #       full peptide set of the same length

    def __update_similarity_data(self) -> None:
        """
        Private method to store the paths of any data needed
        for similarity calcs and create Dataframes from them
        """
        self.data_paths: Dict = self.data_paths_dict
        self.data = {data:pd.read_csv(self.data_paths[data], header=None) \
                     for data in self.data_paths.keys()}


    def df_filter_subseq(self, sub_seq: str, ind: int = None):
        '''
        Takes in a subsequence of equal or lesser length to
        peptides in class peptide dataframe and returns a dataframe
        containing only those peptides containing the sequence
        '''
        if not {*sub_seq}.issubset({*self.AA}):
            raise Exception('Invalid subsequence')
        if ind is None:
            return self.peps_same_len[peps_same_len[self.aa_col].str.contains(sub_seq)]

        data_filter = self.peps_same_len[self.aa_col].apply(
            lambda s: s[ind:len(sub_seq) == sub_seq])
        return self.peps_same_len.filter(data_filter)

    def get_sim_matrix(self, seq) -> pd.DataFrame:
        return self.data.filter


    def get_binder_subseq(self) -> pd.DataFrame:
        '''
        Generates all possible subsequences for binders
        provided in class constructor
        '''
        def gen_all_subseq(seq, sub_seq, i):
            if i == len(seq):
                if len(sub_seq) != 0:
                    yield(sub_seq)
                else:
                    gen_all_subseq(seq, sub_seq, sub_seq)
                gen_all_subseq(seq, sub_seq+[seq[i]], i+1)

        sseq = dict.fromkeys(self.binders)
        for binder in self.binders:
            sseq[binder] = [sseq for sseq in list(
                gen_all_subseq(binder, '', 0))]
        return sseq

    def get_PAM30_similarity(self) -> pd.DataFrame:
        '''
        Returns the PAM30 similarity of peptides to
        specified binder sequences
        @TODO: Automatically get perfect match and lowest match
        @TODO: Generalize for all binder sequences inputted
        '''
        raise NotImplementedError

    def get_BLOSUM_similarity(self) -> pd.DataFrame:
        raise NotImplementedError
    '''
    def get_RRM_SN_ratio(self):
        get_eiip_seq = lambda pep: list(map(lambda aa: self.AA_EIIP[aa], pep))
        get_dft_from_eiip = lambda eiip: np.fft.rfft(eiip)[1:]
    '''

    def get_kendalltau_corr_map(self) -> Tuple:
        return kendalltau(self.data['AA_MAP'][['Num']], self.data['AA_MAP'][['EIIP']])

In [12]:

DATA_PATHS = {
    "BLOSUM":"../src_data/BLOSUM.csv",
    "PAM30":"../src_data/pam30.csv",
    "AA_MAP":"../src_data/aa_chart.csv",
}
SEQS = {
    'GRBP5':'IMVTESSDYSSY',
    'M6':'IMVTASSAYDDY'
}
AA_COL = 'Sequences'
PEP_PATH = '../src_data/Sequence_data.csv'
similarity = SequenceSimilarity(SEQS, DATA_PATHS, PEP_PATH, AA_COL)
similarity.peps_same_len

{'GRBP5': 'IMVTESSDYSSY', 'M6': 'IMVTASSAYDDY'}


Unnamed: 0,Sequences
13,SVPHFSDEDKDP
14,VPHFSDEDKDPE
28,SVPHFSEEEKEA
29,VPHFSEEEKEAE
43,SVPHFSDEDKDP
...,...
28873,FLRRIRPKLKWD
28874,LRRIRPKLKWDN
28875,RRIRPKLKWDNQ
28903,YGGFLRRQFKVV


In [37]:
## fixing the filter df function

AA = set('LINGVEPHKAYWQMSCTFRD')
aa_col = "Sequences"

def df_filter_subseq(peps, sub_seq: str, ind: int = None):
    
        if not {*sub_seq}.issubset({*AA}):
            raise Exception('Invalid subsequence')
        if ind is None:
            return peps[peps[aa_col].str.contains(sub_seq)]
        return peps[peps[aa_col].str.find(sub_seq) == ind]
    
df_filter_subseq(similarity.peps_same_len, 'SVP', ind=0)

Unnamed: 0,Sequences
13,SVPHFSDEDKDP
28,SVPHFSEEEKEA
43,SVPHFSDEDKDP
58,SVPHFSEEEKEP
73,SVPHFSDEDKDP
13697,SVPVEPEDDDMV
13760,SVPLRPEEDELI
27635,SVPHFSDEDKDP
28199,SVPHFSEEEKEA
28341,SVPHFSDEDKDP


In [42]:
binders = ['IMVTESSDYSSY', 'IMVTASSAYDDY']

def get_binder_subseq(binders) -> pd.DataFrame:
        '''
        Generates all possible subsequences for binders
        provided in class constructor
        '''
        def gen_all_subseq(seq, sub_seq, i):
            if i == len(seq):
                if len(sub_seq) != 0:
                    yield(sub_seq)
                else:
                    gen_all_subseq(seq, sub_seq, sub_seq)
                gen_all_subseq(seq, sub_seq+[seq[i]], i+1)

        sseq = dict.fromkeys(binders)
        for binder in binders:
            sseq[binder] = [sseq for sseq in list(
                gen_all_subseq(binders, '', 0))]
        return sseq
    
get_binder_subseq(binders)

{'IMVTESSDYSSY': [], 'IMVTASSAYDDY': []}

In [113]:
def it(word):
    set1 = list()
    for begin in range(len(word)):
        for end in range(begin,len(word)):
            set1.append(word[begin:end+1])
    return set1

print(len(it('IMVTESSDYSSY')))
# -> should return A, B, C, D, AB, BC, CD, ABC, BCD, ABCD

78


In [61]:
# get all sub seq
all_sseq = lambda s: set(s[i:j+1] for i in range(len(s)) for j in range(i, len(s)))
print(all_sseq("ABCDE"))

{'A', 'BCD', 'ABCDE', 'BCDE', 'D', 'CDE', 'E', 'ABC', 'ABCD', 'CD', 'B', 'BC', 'DE', 'AB', 'C'}


In [184]:
bs = ['IMVTESSDYSSY', 'IMVTASSAYDDY']

def get_binder_subseq(binders) -> pd.DataFrame:
    '''
    Generates all possible subsequences for binders
    provided in class constructor
    '''
        
    all_sseq = lambda s: [(s[i:j], i) for i in range(len(s)) for j in range(i+1, len(s)+1)]

    sseq = dict.fromkeys(binders)
    for binder in sseq.keys():
        sseq[binder] = all_sseq(binder)
    return sseq
    

    
result =get_binder_subseq(bs)

def countNonEmptySubstr(str): 
    n = len(str); 
    return int(n * (n + 1) / 2); 

print(countNonEmptySubstr(bs[0]))
print(countNonEmptySubstr(bs[1]))
print(len(result[bs[0]]))

print(result[bs[0]])

78
78
78
[('I', 0), ('IM', 0), ('IMV', 0), ('IMVT', 0), ('IMVTE', 0), ('IMVTES', 0), ('IMVTESS', 0), ('IMVTESSD', 0), ('IMVTESSDY', 0), ('IMVTESSDYS', 0), ('IMVTESSDYSS', 0), ('IMVTESSDYSSY', 0), ('M', 1), ('MV', 1), ('MVT', 1), ('MVTE', 1), ('MVTES', 1), ('MVTESS', 1), ('MVTESSD', 1), ('MVTESSDY', 1), ('MVTESSDYS', 1), ('MVTESSDYSS', 1), ('MVTESSDYSSY', 1), ('V', 2), ('VT', 2), ('VTE', 2), ('VTES', 2), ('VTESS', 2), ('VTESSD', 2), ('VTESSDY', 2), ('VTESSDYS', 2), ('VTESSDYSS', 2), ('VTESSDYSSY', 2), ('T', 3), ('TE', 3), ('TES', 3), ('TESS', 3), ('TESSD', 3), ('TESSDY', 3), ('TESSDYS', 3), ('TESSDYSS', 3), ('TESSDYSSY', 3), ('E', 4), ('ES', 4), ('ESS', 4), ('ESSD', 4), ('ESSDY', 4), ('ESSDYS', 4), ('ESSDYSS', 4), ('ESSDYSSY', 4), ('S', 5), ('SS', 5), ('SSD', 5), ('SSDY', 5), ('SSDYS', 5), ('SSDYSS', 5), ('SSDYSSY', 5), ('S', 6), ('SD', 6), ('SDY', 6), ('SDYS', 6), ('SDYSS', 6), ('SDYSSY', 6), ('D', 7), ('DY', 7), ('DYS', 7), ('DYSS', 7), ('DYSSY', 7), ('Y', 8), ('YS', 8), ('YSS', 8), (

In [172]:

      
# Python program to print all possible 
# substrings of a given string 
   
# Function to print all sub strings 
def subString(s): 
    # Pick starting point in outer loop 
    # and lengths of different strings for 
    # a given starting point 
    out = []
    for i in range(len(s)): 
        for j in range(i+1,len(s)+1): 
            out.append(s[i: j]); 
    return out

  
# This code is contributed by princiraj1992 


In [173]:
print(len(subString(bs[0])))

78


In [157]:
print(subString(bs[0]))

{'D', 'E', 'Y', 'M', 'V', 'S', 'T', 'I'}


In [185]:
def get_df_with_binder_subseqs(self) -> pd.DataFrame:
        '''
        Returns a filtered version of self.peps_same_len DataFrame containing only
        those rows with sequences which contain subsequences of the two binder sequences
        in the locations where they occur in the binders
        '''
        data: Dict[str, List[pd.DataFrame]] = dict.fromkeys(self.binders)
        sseq = self.get_binder_subseq()
        for binder in self.binders:
            for (ss, i) in sseq[binder]:
                filtered_data = self.df_filter_subseq(ss, i)
                print(filtered_data)
         
        