In [14]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex as rx
import seaborn as sns
from Bio import SeqIO

### Setup

In [15]:
# Local working directory; the setup cell below compares it with os.getcwd()
# to choose between the local (/home/enno) and the shared (/ebio) input paths.
lothar = '/home/enno/uni/SS24/thesis/1_seq_analysis'

In [16]:
# Select the input files depending on where the notebook is running.
# NOTE(review): clans_file and the cached df are only assigned in the local
# branch, so cells that use them cannot run from any other working directory.
if os.getcwd() != lothar:
    input_file = '/ebio/abt1_share/prediction_hendecads/0_data/npf_data/final_dataset.fasta'  # "/home/enno/uni/SS23/thesis/data/hendecads/sequences.fasta"
else:
    input_file = '/home/enno/uni/SS24/thesis/data/hendecads/sequences.fasta'
    clans_file = '/home/enno/uni/SS24/thesis/data/hendecads/new_hendecads_1E-14.clans'
    # Previously exported results of the regex scan (written by the RegEx section).
    df = pd.read_csv('/home/enno/uni/SS24/thesis/1_seq_analysis/regEx.csv')

# Read all records up front; the context manager closes the file handle,
# which a bare open(...) passed to SeqIO.parse never did.
with open(input_file) as fasta_handle:
    fasta_sequences = list(SeqIO.parse(fasta_handle, 'fasta'))
n_seq = len(fasta_sequences)

In [4]:
# Read .fasta file, extract stretches and store them in a df

# Matches the first bracketed list of lists ("[[...]]") in a text field.
pattern = r'\[\[.*?\]\]'

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing the frame with df.loc[len(df)] re-allocates it for every record.
records = []

with open(input_file) as fasta_handle:  # closed automatically after parsing
    for seq_ix, seq in enumerate(SeqIO.parse(fasta_handle, 'fasta')):

        print(f"Processing sequence {seq_ix+1}/{n_seq}", end='\r')

        s = str(seq.seq).lower()
        d = str(seq.description)

        # The stretch annotation is taken from the last '|||'-separated field
        # of the description. ast.literal_eval only accepts Python literals,
        # so text from the FASTA header cannot execute code the way eval could.
        stretches = ast.literal_eval(rx.findall(pattern, d.split('|||')[-1])[0])

        tmp_six = []
        tmp_seq = []

        for stretch in stretches:

            cc_ix = list(range(stretch[0], stretch[1] + 1))
            # NOTE(review): cc_ix includes stretch[1], but this slice stops one
            # position before it, so the index list is one element longer than
            # the extracted string. Confirm whether the end is meant inclusive.
            stretch_seq = s[min(cc_ix):max(cc_ix)]

            tmp_six.append(cc_ix)
            tmp_seq.append(stretch_seq)

        records.append([seq.id, s, tmp_six, tmp_seq])

df = pd.DataFrame(records, columns=['id', 'seq', 'stretch_ix', 'stretch_seq'])

Processing sequence 36455/36455

In [236]:
# Preview the first rows of df.
df.head()

Unnamed: 0,id,seq,stretch_ix,stretch_seq
0,MCD6041253.1,mrlvyvavaailcsfsttslagaektakragkfvektatragkfve...,"[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...",[rlvyvavaailcsfsttslagaektakragkfvektatragkfve...
1,MCD7737945.1,mqgrvffreaaalilaaalsmaglpasaaansgieaaalrteeete...,"[[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ...",[eeetepstkeavqetavetdtgekpesgedgqeesaesteeeqee...
2,MYF28459.1,merlqtdllkeihalrgemhaefasvrqemhagfasirqemhaeta...,"[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...",[erlqtdllkeihalrgemhaefasvrqemhagfasirqemhaeta...
3,WP_168920948.1,msdvfltasyadrekvktlgarwnpaekrwyvpsgrdlspfaawlp...,"[[437, 438, 439, 440, 441, 442, 443, 444, 445,...",[aqslvveikhaasqqlllarhvvparmaevtaegrqalrtakaqs...
4,WP_026306873.1,mllrriarpllsaafiaegidilqnpgpladrlspaldftrrrsqh...,"[[172, 173, 174, 175, 176, 177, 178, 179, 180,...",[slgwrgrraardakdhaealaataaaiaatarergtnlvdtarer...


In [5]:
def parse_numbers(filename):
    """Extract the cluster assignments from a .clans file.

    Every line that starts with ``numbers=`` is read as a ';'-separated list
    of integers. A trailing ';' is optional: empty fields are skipped instead
    of blindly dropping the last field, so the final entry is kept when the
    line does not end with a separator.

    Parameters
    ----------
    filename : str or path-like
        Path of the .clans file to read.

    Returns
    -------
    list of list of int
        One list per ``numbers=`` line, in file order.

    Raises
    ------
    ValueError
        If a non-empty field cannot be converted to ``int``.
    """
    clusters = []
    with open(filename, 'r') as file:
        for line in file:
            if not line.startswith('numbers='):
                continue

            # Split on the first '=' only so the value part stays intact.
            _, num_str = line.split('=', 1)
            clusters.append(
                [int(field) for field in num_str.split(';') if field.strip()]
            )

    return clusters

In [6]:
# One list of integers per `numbers=` line of the .clans file; the next cell
# uses each list as row labels of df.
# NOTE(review): clans_file is only assigned in the local-path branch of the setup cell.
numbers = parse_numbers(clans_file)

In [7]:
# assign clusters to sequences

# Start with every row unassigned (-1). Creating the column up front keeps the
# cell working when `numbers` is empty (df['c'] would not exist otherwise) and
# clears labels left over from an earlier run. -1.0 keeps the column float, as
# it was when the gaps were NaN-filled afterwards.
df['c'] = -1.0

for ix, cluster in enumerate(numbers):
    # `cluster` is a list of row labels; a row listed in several clusters
    # keeps the index of the last one.
    df.loc[cluster, 'c'] = ix

### RegEx

In [8]:
def find_match(seq, pattern, i, mm, unit_len=11):
    """Collect every window of ``seq`` that matches ``pattern`` repeated ``i`` times.

    A window of ``unit_len * i`` characters is slid over ``seq`` one position
    at a time and tested against the repeated pattern, tolerating up to ``mm``
    substituted characters.

    Parameters
    ----------
    seq : str
        Sequence to scan.
    pattern : str
        Regex for a single repeat unit; it is expected to consume exactly
        ``unit_len`` characters.
    i : int
        Number of consecutive repeat units (must be >= 1).
    mm : int
        Maximum number of substitutions allowed per window.
    unit_len : int, optional
        Number of characters matched by one repeat unit (default 11).

    Returns
    -------
    hits : list of str
        The matching windows.
    hits_ix : list of [int, int]
        ``[start, end]`` of each hit in ``seq``, with ``end`` exclusive.

    Raises
    ------
    ValueError
        If ``i`` is smaller than 1 (the window would be empty; the previous
        implementation never terminated in that case).
    """
    if i < 1:
        raise ValueError("i must be >= 1")

    window_len = unit_len * i

    # {s<=mm} restricts the fuzziness to substitutions. The generic {e<=mm}
    # also admits insertions and deletions, and comparing the match length with
    # the window length cannot rule out an insertion paired with a deletion.
    # The pattern is compiled once instead of being rebuilt for every window.
    fuzzy = rx.compile(r'(' + pattern * i + r')' + r'{s<=' + str(mm) + '}')

    hits = []
    hits_ix = []

    # Index-based windows avoid copying the remaining sequence at every step.
    for start in range(len(seq) - window_len + 1):
        window = seq[start:start + window_len]

        # With substitutions only, a match always spans the whole window.
        if fuzzy.fullmatch(window):
            hits.append(window)
            hits_ix.append([start, start + window_len])

    return hits, hits_ix

In [43]:
              # a--d---h---a--d---h---
# query_string = 'av1av11av11av2av22av22'
# The two comment lines above keep a hand-written 22-character test string and
# a ruler labelling its a/d/h positions across two 11-character repeats.
# The first stretch of row 0 is used as the query instead.
query_string = df.loc[0, 'stretch_seq'][0]
query_string

'rlvyvavaailcsfsttslagaektakragkfvektatragkfvertatkagkfvertadkaakgakkll'

In [45]:
# Scan the query for single repeats (i=1) with no errors allowed (mm=0).
# NOTE(review): `pattern` must hold the repeat regex assigned in a later cell;
# on a fresh top-to-bottom run it would still be the bracket regex from the
# FASTA-parsing cell.
find_match(query_string, pattern, 1, 0)

(['lvyvavaailc',
  'vyvavaailcs',
  'lagaektakra',
  'akragkfvekt',
  'atragkfvert',
  'atkagkfvert',
  'adkaakgakkl'],
 [[1, 12], [2, 13], [18, 29], [25, 36], [36, 47], [47, 58], [58, 69]])

In [22]:
# One repeat unit: a/v/i/l/m at positions 0, 3 and 7 of an 11-character window.
pattern = r'[avilm]..[avilm]...[avilm]...'


def process_row(row, lx, mmx):
    """Run find_match on every stretch of one df row.

    Returns a list with one (hits, hits_ix) result per entry of
    row['stretch_seq'], searching for `lx` repeats with `mmx` allowed errors.
    """
    return [find_match(stretch, pattern, lx, mmx) for stretch in row['stretch_seq']]


# One result column per (repeat count, error budget) combination.
for lx in (1, 3, 5, 10):
    print(lx)
    for mmx in (0, 1, 2):
        column_name = f'{lx}R_{mmx}MM'
        df[column_name] = df.apply(process_row, axis=1, args=(lx, mmx))

# Persist the results; list-valued cells are written as their text repr.
df.to_csv('/home/enno/uni/SS24/thesis/1_seq_analysis/regEx.csv', index=False)

1
3
5
10


In [17]:
# Result column names grouped by error budget (0, 1, 2), each covering
# 1, 3, 5 and 10 repeats.
col0, col1, col2 = (
    [f'{n_rep}R_{n_mm}MM' for n_rep in (1, 3, 5, 10)]
    for n_mm in range(3)
)

In [48]:
# The hit columns read back from regEx.csv are text (the repr of the nested
# lists), so the cell is parsed into Python objects first. ast.literal_eval
# only accepts literals and cannot execute code stored in the file, unlike eval.
# Shows the matched windows (first element of each per-stretch result) of row 74.
[stretch_result[0] for stretch_result in ast.literal_eval(df['1R_1MM'][74])]

[['mmqqeqnmspq',
  'eqnmspqmspq',
  'mspqeqmmsqq',
  'eqmmsqqmmsp',
  'mmspqeqmmqm',
  'mspqeqmmqmm',
  'qeqmmqmmspq',
  'mmqmmspqeqm',
  'mqmmspqeqmm',
  'mmspqeqmmsq',
  'mspqeqmmsqq',
  'eqmmsqqmmlp',
  'mmsqqmmlpqq',
  'mmlpqqqvasq',
  'mlpqqqvasql',
  'qqqvasqlisp',
  'qqvasqlispq',
  'asqlispqmis',
  'sqlispqmisd',
  'pqmisdpmmmp',
  'misdpmmmpqq',
  'liptptamiik',
  'iptptamiikq',
  'tamiikqasms',
  'amiikqasmsp',
  'miikqasmspr',
  'ikqasmsprpa',
  'asmsprparhm',
  'prparhmlpsp'],
 ['ltphltkalnn',
  'tphltkalnnp',
  'ltkalnnpalk',
  'tkalnnpalkg',
  'nnpalkgasyt',
  'lkgasytlpdg',
  'lpdgtiivdpr',
  'spnlmaalqnp',
  'lmaalqnpylk',
  'maalqnpylkg',
  'aalqnpylkgl',
  'lkglsytlpdg',
  'lpdgtiiidpr',
  'iidprkpvapd',
  'idprkpvapdl',
  'rkpvapdlska',
  'vapdlskalln',
  'apdlskallne',
  'lskallnenlr',
  'kallnenlrna',
  'lnenlrnasyq',
  'lrnasyqlrdg',
  'lrdgslvipgq']]

In [19]:
# Show the four columns computed with mm=2 (1, 3, 5 and 10 repeats).
df[col2]

Unnamed: 0,1R_2MM,3R_2MM,5R_2MM,10R_2MM
0,"[(['rlvyvavaail', 'lvyvavaailc', 'vyvavaailcs'...","[(['aailcsfsttslagaektakragkfvektatra', 'sttsl...",[(['sttslagaektakragkfvektatragkfvertatkagkfve...,"[([], [])]"
1,"[(['tepstkeavqe', 'epstkeavqet', 'tkeavqetave'...","[([], [])]","[([], [])]","[([], [])]"
2,"[(['erlqtdllkei', 'lqtdllkeiha', 'qtdllkeihal'...","[(['ihalrgemhaefasvrqemhagfasirqemhae', 'mhagf...","[([], [])]","[([], [])]"
3,"[(['aqslvveikha', 'qslvveikhaa', 'slvveikhaas'...","[(['aqslvveikhaasqqlllarhvvparmaevtae', 'lvvei...",[(['vveikhaasqqlllarhvvparmaevtaegrqalrtakaqsq...,"[([], [])]"
4,"[(['lgwrgrraard', 'gwrgrraarda', 'grraardakdh'...","[(['lgwrgrraardakdhaealaataaaiaatarer', 'grraa...","[([], [])]","[([], [])]"
...,...,...,...,...
36450,"[(['akksgesfasl', 'kksgesfasla', 'sgesfaslask'...","[([], [])]","[([], [])]","[([], [])]"
36451,"[(['qrsiseeyeki', 'iseeyekirdy', 'yekirdyfkdl'...","[([], [])]","[([], [])]","[([], [])]"
36452,"[(['tltkqgevvdq', 'ltkqgevvdqk', 'qgevvdqkiqe'...","[([], [])]","[([], [])]","[([], [])]"
36453,"[(['isaltndknrm', 'altndknrmvr', 'ltndknrmvrk'...","[(['nrmvrknavdslgffpwdakaivpllvallsdk', 'lgffp...",[(['lgffpwdakaivpllvallsdkdsdlaataigslgrigngas...,"[([], [])]"


In [22]:
# Check how a result cell is stored. It is a str here, consistent with df
# having been reloaded from regEx.csv rather than computed in this session.
type(df['1R_2MM'][0])

str