In [3]:
#requires Biopython library
#!pip install biopython
WINDOW=15 # range within the search for secondary structure is performed
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pandas as pd
import regex
import os
import glob
#function for searching for the presence of additional secondary structure elements and 
#export of domain sequences with these elements 
# using associated fasta file and csv file with secondary structure information in DSSP format
def dssp_check(data,fasta,dssp_data):
    records = list(SeqIO.parse(fasta, "fasta"))
    fastas={'id':[],'seq':[],'name':[]}
    for i in range(len(records)):
        fastas['id'].append(records[i].id.split(sep='|')[1])
        fastas['seq'].append(str(records[i].seq))
        fastas['name'].append(str(records[i].id.split(sep='|')[2]))
    fasta_df=pd.DataFrame(fastas)
    protein_ids=[]
    domain_seq=[] #sequences of found domains
    domain_names=[] #Uniprot identifiers
    for index,protein in data.iterrows():
        domains=protein[0].split(sep='\t')
        dssp_ss=list(dssp_data[dssp_data['short_names']==domains[0].split(sep='|')[0]]['ss'])
        protein_id=domains[0].split(sep='|')[0]
        # the search string, E means beta-strand, H - alpha-helix
        pattern_string = 'EEEEEE' #modify accordingly
        #means that no more than two insertions or two substitutions are allowed
        r = regex.compile('(%s){i<=2,s<=2}' % pattern_string) 
        sequence=list(fasta_df[fasta_df['id']==domains[0].split(sep='|')[0]]['seq'])
        name=list(fasta_df[fasta_df['id']==domains[0].split(sep='|')[0]]['name'])
        domains_list={
             'start':[],
             'stop':[],
             'name':[],
        }
        for domain in domains:
            d=domain.split(sep='|')
            if len(d) ==5:
                domains_list['start'].append(d[1])
                domains_list['stop'].append(d[2])
                domains_list['name'].append(d[4])
        domains_df=pd.DataFrame(domains_list)
        for znf in range(len(domains_df['start'])):
            if domains_df['name'][znf] == 'ZnF_C2H2': #replace domain name accordingly
                try:
                    if r.search(str(dssp_ss)[(int(domains_df['start'][znf])-WINDOW):(int(domains_df['start'][znf])-2)]):
                        # checking that no other domain is present in search range
                        if znf==0:
                            domain_seq.append(str(sequence)[(int(domains_df['start'][znf])-WINDOW):(int(domains_df['stop'][znf])+5)])
                            domain_names.append(str(name))
                        else:
                            if int(domains_df['stop'][znf-1])<(int(domains_df['start'][znf])-WINDOW):
                                domain_seq.append(str(sequence)[(int(domains_df['start'][znf])-WINDOW-5):(int(domains_df['stop'][znf])+5)])
                                domain_names.append(str(name)+"_"+str(znf))
                except:
                    None
    print(domain_names)             
    return domain_seq,domain_names
#batch processing of multiple files
for file in glob.glob("*.txt"):
        data =pd.read_csv(file, skiprows=[0,1,2,3,4,5,6,7,8,9,10,11] )
        name=os.path.basename(file)
        base_name=name.split(sep='.')[0]
        dssp_file=os.path.join(base_name+'.csv')
        fasta=os.path.join(base_name+'.fasta')
        dssp_pred=pd.read_csv(dssp_file)
        domain_sequences,domain_names = dssp_check(data,fasta,dssp_pred)
        dssp_name=os.path.join(base_name+'_dssp_beta_sequences.fasta')
        ofile = open(dssp_name, "w")
        for i in range(len(domain_sequences)):
            ofile.write(">" + domain_names[i] + "\n" +domain_sequences[i] + "\n")
        ofile.close()

["['EMF2_ARATH']", "['FIS2C_ARATH']", "['IDD13_ARATH']", "['SRRT_ARATH']_1", "['STOP1_ARATH']", "['STOP2_ARATH']", "['Q8VZP2_ARATH']", "['F4HPX2_ARATH']", "['Q9FM27_ARATH']"]
["['BTD_DROME']", "['OUIB_DROME']_1", "['SUZ12_DROME']", "['TEF_DROME']", "['TOPI_DROME']_2", "['TTKA_DROME']_2", "['Q9VE63_DROME']", "['Q1LZ24_DROME']", "['Q7K1V0_DROME']_9", "['B9A0M7_DROME']_4", "['Q9V9Q2_DROME']_4", "['Q9W3J9_DROME']", "['Q8IGP5_DROME']", "['Q8IGP5_DROME']_1", "['Q9VPQ3_DROME']", "['Q9W403_DROME']_3", "['Q5BIC3_DROME']", "['Q8MSB3_DROME']", "['Q9W3J2_DROME']", "['Q8T007_DROME']_1", "['Q8T007_DROME']_4", "['Q8T007_DROME']_7", "['Q8T007_DROME']_11", "['Q9VUS0_DROME']", "['Q9VUS0_DROME']_1", "['Q9VKC4_DROME']_1", "['A1Z9K0_DROME']", "['Q8INE2_DROME']_1", "['Q9VBX4_DROME']", "['Q9VBX4_DROME']_1", "['Q9VI24_DROME']_1", "['A1Z995_DROME']", "['Q9VGA4_DROME']_2"]
["['PRD10_DANRE']_9", "['PRD10_DANRE']_10", "['O93311_DANRE']", "['F1QVF4_DANRE']_4", "['Q68EH4_DANRE']_1", "['I3ITC3_DANRE']_5", "['X1WHR9_