In [1]:
#!/usr/bin/env python3

# standard library modules
import sys, errno, re, json, ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep

# InterPro REST API query: UniProt proteins matching InterPro entry IPR010215,
# 200 results per page, with each protein's sequence requested as an extra field.
BASE_URL = "https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/InterPro/IPR010215/?page_size=200&extra_fields=sequence"

# Delimiter placed between the fields of each FASTA header line.
HEADER_SEPARATOR = "|"
# Maximum number of sequence characters written per FASTA line.
LINE_LENGTH = 80

def _format_entry_locations(entry):
  """Return ``ACCESSION(start...end,...;...)`` for one matched entry.

  Fragments belonging to the same location are joined with ","; separate
  locations are joined with ";".
  """
  locations = ";".join(
    ",".join(
      str(fragment["start"]) + "..." + str(fragment["end"])
      for fragment in location["fragments"]
    )
    for location in entry["entry_protein_locations"]
  )
  return entry["accession"] + "(" + locations + ")"


def _fasta_header(item):
  """Build the FASTA header line (without trailing newline) for one result."""
  metadata = item["metadata"]

  # The matched entries are read from "entry_subset" when that key is present,
  # otherwise from "entries"; if neither exists the header omits that field.
  if "entry_subset" in item:
    entries = item["entry_subset"]
  elif "entries" in item:
    entries = item["entries"]
  else:
    entries = None

  fields = [metadata["accession"]]
  if entries is not None:
    fields.append("-".join(_format_entry_locations(entry) for entry in entries))
  fields.append(metadata["name"])
  return ">" + HEADER_SEPARATOR.join(fields)


def _wrap_sequence(sequence):
  """Split a sequence into consecutive chunks of at most LINE_LENGTH characters."""
  return [
    sequence[start:start + LINE_LENGTH]
    for start in range(0, len(sequence), LINE_LENGTH)
  ]


def output_list():
  """Page through the InterPro API at BASE_URL and write FASTA to stdout.

  Each result becomes one FASTA record: a header of the form
  ``>accession|entry locations|name`` (the middle field is omitted when the
  result carries no entry information) followed by the sequence wrapped at
  LINE_LENGTH characters per line.

  Paging follows the "next" URL of every response until it is empty or the
  server answers 204. A 408 response is retried after 61 seconds without
  limit; any other HTTP error is retried up to 3 times (61 seconds apart).

  Raises:
    HTTPError: when a non-408 HTTP error persists after the retries; the
      failing URL is written to stderr first.
  """
  # NOTE(security): certificate verification is deliberately disabled to avoid
  # local SSL configuration issues, so the server's identity is not checked.
  context = ssl._create_unverified_context()

  url = BASE_URL
  attempts = 0
  while url:
    try:
      req = request.Request(url, headers={"Accept": "application/json"})
      # The context manager closes the response (and its connection) on every
      # path out of the block, including the retry and the early exit below.
      with request.urlopen(req, context=context) as res:
        # If the API times out due to a long running query
        if res.status == 408:
          # wait just over a minute, then retry the same URL
          sleep(61)
          continue
        if res.status == 204:
          # no data, so leave the loop
          break
        payload = json.loads(res.read().decode())
    except HTTPError:
      if attempts >= 3:
        # Out of retries: report the failing URL, then propagate the error.
        sys.stderr.write("LAST URL: " + url + "\n")
        raise
      # The first three failures are retried after the same one-minute wait.
      attempts += 1
      sleep(61)
      continue

    url = payload["next"]
    attempts = 0

    for item in payload["results"]:
      lines = [_fasta_header(item)]
      lines.extend(_wrap_sequence(item["extra_fields"]["sequence"]))
      sys.stdout.write("\n".join(lines) + "\n")

    # Don't overload the server, give it time before asking for more
    if url:
      sleep(1)

# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
  output_list()


>A0A014PV18|IPR010215(1...162)|Transcription antitermination protein RfaH
MESWYLLYCKRGQLLRAKEHLERQAVHCLSPMIAIEKLVRGKRTQVSEPLFPNYLFIEFDPEAIHTTTISATRGVSHFVR
FGNLPATVPEDVIAALQTERTETLDDPDLPQPGDQVVITAGAFEGLKAIFTEPDGEARSMLLLKLLNKQIVRSLDNKQFQ
KI
>A0A022PHD0|IPR010215(1...162)|Transcription antitermination protein RfaH
MENWYLIYCKRGQISRAIENLERQDVACLTPTARIEKITRGKRTVNIEPLFPNYLFVQFDPEVIHTTTINSTRGVSHFVR
FGVHPAIVPETLIKEIVSATEQEYVSPDTPVTGDTVLITEGIFEGLQAIYNEPDGETRSILLLNILNKQLPKALDNKQFV
KI
>A0A023WTR5|IPR010215(6...168)|Transcription antitermination protein RfaH
MTPSNTARWYLIQTKPRQEARAEENLLRQHFECYRPIKVPPPSRGPQGAKAGEALFPGYLFIRLDCIHDNWYPIRSTRGV
SRVVSFGGQPTPVRDELIEQLRRRLAQSEAAPAATFTPGERVQVSGGSFSDIEAIFVSSDGEERSVILLNLLQREQKVRV
PTRYLQCYS
>A0A024HDA4|IPR010215(6...164)|Transcription antitermination protein RfaH
MPDATGKQWYLIQCKPRQDMRALEHLERQGYPCLLPTHQIERLQKGKLQQLSEPLFPGYLFIHLDRVDDNWMPIRSTRGV
NQIVSFGGRPTPVPEAIVTKLQSPHTNVLPALIAGDRVVLNDTSLQQIEAIFLEKNSDGRVLLLLSLLQREVVVSVPLTQ
VQKIELRTYEPVSRLQLKKDQETTKK
>A0A060NIM1|I

In [1]:
!./cdhit/cd-hit -i /workspaces/FrustraEvo/rfah/IPR010215.fasta -o /workspaces/FrustraEvo/rfah/IPR010215_clustered.fasta -c 0.9

Program: CD-HIT, V4.8.1 (+OpenMP), Apr 04 2024, 05:57:25
Command: ./cdhit/cd-hit -i
         /workspaces/FrustraEvo/rfah/IPR010215.fasta -o
         /workspaces/FrustraEvo/rfah/IPR010215_clustered.fasta
         -c 0.9

Started: Thu Apr  4 05:57:35 2024
                            Output                              
----------------------------------------------------------------
total seq: 2896
longest and shortest : 314 and 100
Total letters: 482484
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 1 X 10M = 10M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 76M

Table limit with the given memory limit:
Max number of representatives: 2725481
Max number of word counting entries: 90388557

comparing sequences from          0  to       2896
..
     2896  finished       1078  clusters

Approximated maximum memory consumption: 78M
writing new database
writing clustering information
program completed !

Tota