<br><br>
<h1 style="font-size:40px" align="center"> Load Sequences from Interpro </h1><br><br><br><br><br><br>

##### Only run this if you need to tbh, you dont want rewrite the json file

In [9]:
#!/usr/bin/env python3

# standard library modules
import sys, errno, re, json, ssl
from urllib import request
from urllib.error import HTTPError
from time import sleep
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import requests
import xmltodict
import concurrent.futures
import json
import os

In [10]:
BASE_URL = "https://www.ebi.ac.uk:443/interpro/api/protein/UniProt/entry/pfam/PF05935/?page_size=200&extra_fields=sequence&taxonomy"
output_filename = '../data/annotated_sequences.json'
installation_progress_filename = "../data/url_progress.txt"

In [11]:
with open('../data/annotated_sequences.json', 'w') as f:
    json.dump([],f)

In [12]:

def get_additional_info(seq_data):
    accession = seq_data["Accession_Interpro"]
    url = f"https://rest.uniprot.org/uniprotkb/{accession}.xml"
    response = requests.get(url)
    xml_data = xmltodict.parse(response.text)
    if(response.status_code == 200):
        seq_data["lineage"] = xml_data['uniprot']['entry']['organism']['lineage']["taxon"]
        seq_data["Accession_RefSeq"] = ""
        seq_data["Accession_AlphaFoldDB"] = ""
        for reference in xml_data['uniprot']['entry']['dbReference']:
            try:
                if reference['@type'] == 'RefSeq' or reference['@type'] == 'AlphaFoldDB':
                    id = reference['@id']
                    seq_data["Accession_"+reference["@type"]] = id
            except:
                print(f"\n Failed to get info for {accession}")
                
    else:
        print(f"FAILED RETRIEVING ADDITIONAL DATA FOR {accession}")
    return seq_data


In [14]:
def output_list(next = BASE_URL):
  #disable SSL verification to avoid config issues
  context = ssl._create_unverified_context()
  last_page = False
  total_data = []

  if(next != BASE_URL):
        print("Not starting from the beginning! Loading previous Json!")
        with open(output_filename, 'r') as f:
            # load the data from the file
            total_data = json.load(f)
        print(f"We have a total of {len(total_data)} sequences!")
        print("\nContinuing where we left off!...\n")
        
  attempts = 0
  while next:
    try:
      req = request.Request(next, headers={"Accept": "application/json"})
      res = request.urlopen(req, context=context)
      # If the API times out due a long running query
      if res.status == 408:
        # wait just over a minute
        sleep(61)
        # then continue this loop with the same URL
        continue
      elif res.status == 204:
        #no data so leave loop
        break
      payload = json.loads(res.read().decode())
      next = payload["next"]
      print(next)
      
      attempts = 0
      if not next:
        last_page = True
    except HTTPError as e:
      if e.code == 408:
        sleep(61)
        continue
      else:
        # If there is a different HTTP error, it wil re-try 3 times before failing
        if attempts < 3:
          attempts += 1
          sleep(61)
          continue
        else:
          sys.stderr.write("LAST URL: " + next)
          raise e
    data = []
    for i, item in enumerate(payload["results"]):
      entries = None
      if ("entry_subset" in item):
        entries = item["entry_subset"]
      elif ("entries" in item):
        entries = item["entries"]
      
      seq_data = {}
      if entries is not None:
        start = 0
        end = 0
        for entry in entries:
          for locations in entry['entry_protein_locations']:
            for fragment in locations['fragments']:
              start = fragment['start']
              end = fragment['end']
        
        seq_data["Accession_Interpro"] = item["metadata"]["accession"]
        seq_data["domain_boundaries"] = {"start":start, "end":end}
      seq_data["seq"] = item["extra_fields"]["sequence"]
      data.append(seq_data)
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        # submit a task to retrieve information for each accession
        tasks = [executor.submit(get_additional_info, seq_data) for seq_data in data]

        # retrieve the results of the tasks as they complete
        results = [task.result() for task in concurrent.futures.as_completed(tasks)]

    
    total_data.extend(data)
    print(f"Processed {len(data)} in the last batch, total {len(total_data)}")
    # Don't overload the server, give it time before asking for more
    with open(output_filename, 'w') as f:
        json.dump(total_data,f)
        print("...Progress Saved!")
    with open(installation_progress_filename, "a") as f:
        f.write("\n"+next)
        
  return total_data
print("Hello")
total_data = []
url = BASE_URL
if os.path.exists(installation_progress_filename):
    print("Identified pre existing save, loading... ")
    with open(installation_progress_filename, "r") as f:
        lines = f.readlines()
        if lines:
            url = lines[-1]
else:
    print("Starting to load sequences and identifiers from scratch!")
    open(installation_progress_filename, "w").close()
    
total_data = output_list(url)


Hello
Starting to load sequences and identifiers from scratch!
https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/PF05935/?cursor=source%3As%3Aa0a0g4f9w9&extra_fields=sequence&page_size=200&taxonomy=
Processed 200 in the last batch, total 200
...Progress Saved!
https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/PF05935/?cursor=source%3As%3Aa0a0r9pem7&extra_fields=sequence&page_size=200&taxonomy=
Processed 200 in the last batch, total 400
...Progress Saved!
https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/PF05935/?cursor=source%3As%3Aa0a177p0b0&extra_fields=sequence&page_size=200&taxonomy=
Processed 200 in the last batch, total 600
...Progress Saved!
https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/PF05935/?cursor=source%3As%3Aa0a1h2ghv5&extra_fields=sequence&page_size=200&taxonomy=
Processed 200 in the last batch, total 800
...Progress Saved!
https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/PF05935/?cursor=source%3As%3Aa0a1q6

TypeError: can only concatenate str (not "NoneType") to str

In [None]:
total_data = []

with open('../processed_sequences/annotated_sequences.json', 'r') as f:
    total_data = json.load(f)
#total_data

# From the downloaded json, create a trimmed sequences data file

# From the downloaded json, create a non trimmed sequences data file

# Get Sequence information, such as domain architecture from Interpro

In [1]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO import write
import requests
import json
import requests
from tqdm import tqdm
import os
temp_filename = "del_me_temp_file_loading_domain_info.json"

In [2]:
data_filename = "../../../data/ASST_raw_sequences/ASSTs_annotated_sequences.json"
output_filename = "../../../data/ASST_raw_sequences/ASSTs_domain_data.json"


In [5]:
import requests

# Define the InterPro API endpoint
interpro_api_url = "https://www.ebi.ac.uk/interpro/api/entry/all/protein/unreviewed/"
# Replace 'your_accession_number' with the actual InterPro accession number
accession_number = 'A0A009Y387'

# Construct the API request URL
request_url = f"{interpro_api_url}/{accession_number}"

# Send the request
response =  requests.get(request_url, timeout=10)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    for result in data["results"]:
        print(result["metadata"]["accession"])
    # Extract domain architecture information
    domain_architecture = data.get('entries', {}).get('interpro', {}).get('entry_protein_locations', [])

    # Print or process the domain architecture information as needed
    print(domain_architecture)
else:
    # Handle errors
    print(f"Error: {response.status_code} - {response.text}")


G3DSA:2.130.10.10
IPR010262
IPR011047
IPR015943
PF05935
PTHR35340
SSF50998
[]


In [None]:
import requests
from requests.exceptions import Timeout

def get_domain_info(accession_number, element):
    max_retries = 3

    for retry in range(max_retries):
        try:
            # Construct the API request URL
            request_url = f"https://www.ebi.ac.uk/interpro/api/entry/all/protein/unreviewed/{accession_number}?format=json"

            # Send the request with a timeout
            response = requests.get(request_url, timeout=3)

            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # Parse the JSON response
                data = response.json()

                domain_info = []

                for result in data["results"]:
                    accession = ""
                    if not result["metadata"]["accession"]:
                        continue

                    accession = result["metadata"]["accession"]
                    for key, value in result["metadata"].items():
                        if not isinstance(value, str):
                            continue
                        if value.startswith('IPR'):
                            accession = value

                    if accession not in domain_info:
                        domain_info.append(accession)

                return domain_info
            elif response.status_code == 204:
                print(f"Error: Missing data for - {accession_number} - Continuing regardless")
                return ["error"]
            else:
                print(f"Error: {response.status_code} - {response.text} - {accession_number} - {element}")
                raise Exception("Request went wrong, please restart this script")

        except Timeout:
            if retry < max_retries - 1:
                print(f"Request timed out, retrying ({retry + 1}/{max_retries})...")
            else:
                print(f"Request timed out after {max_retries} retries, raising exception.")
                raise Exception("Request timed out after multiple retries")
    


data = []
with open(data_filename, 'r') as file:
    data = json.load(file)

all_domain_info = []
index = 0
if not os.path.exists(temp_filename):
    # If it doesn't exist, create the file
    print("No previous temp file found, starting from scratch!")
    with open(temp_filename, 'w') as temp:
        json.dump({"index":index}, temp, indent=4)
    
else:
    print("Previous save identified, loading previous data")
    with open(temp_filename, 'r') as file:
        index = json.load(file)["index"]
        print(f"Starting from position {index}")
    with open(output_filename, 'r') as file:
        all_domain_info = json.load(file)

for i in tqdm(range(0,len(data)), desc="Loading Domain information"):
    if(i < index):
        continue
    element = data[i]
    accession_interpro = element["Accession_Interpro"]

    domains = get_domain_info(accession_interpro,element)
    all_domain_info.append({"Accession_Interpro":accession_interpro, "domains":domains})
    with open(output_filename, 'w') as outfile:
        json.dump(all_domain_info, outfile, indent=4)
    index += 1
    with open(temp_filename, 'w') as temp:
        json.dump({"index":index}, temp, indent=4)


Previous save identified, loading previous data
Starting from position 5791


Loading Domain information:  54%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 5834/10741 [13:07:34<59:21,  1.38it/s]

Error: Missing data for - A0A661YN05 - Continuing regardless


Loading Domain information:  57%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 6071/10741 [13:11:28<1:18:22,  1.01s/it]

Request timed out, retrying (1/3)...


Loading Domain information:  57%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 6087/10741 [13:11:48<1:23:10,  1.07s/it]

Error: Missing data for - A0A6J4GPH2 - Continuing regardless


Loading Domain information:  57%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                      | 6171/10741 [13:13:07<57:48,  1.32it/s]

Request timed out, retrying (1/3)...


Loading Domain information:  58%|███████████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 6236/10741 [13:14:14<1:05:11,  1.15it/s]

Request timed out, retrying (1/3)...


Loading Domain information:  58%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 6251/10741 [13:14:33<1:11:06,  1.05it/s]

Error: Missing data for - A0A6N8VSK0 - Continuing regardless


Loading Domain information:  58%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 6254/10741 [13:14:35<53:52,  1.39it/s]

Error: Missing data for - A0A6N8Y2X6 - Continuing regardless


Loading Domain information:  59%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 6300/10741 [13:15:24<1:13:45,  1.00it/s]

Error: Missing data for - A0A6S5K6K0 - Continuing regardless


Loading Domain information:  66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 7061/10741 [13:25:11<55:05,  1.11it/s]

Request timed out, retrying (1/3)...


Loading Domain information:  66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 7067/10741 [13:25:21<1:12:00,  1.18s/it]

Request timed out, retrying (1/3)...


Loading Domain information:  66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 7070/10741 [13:25:30<2:07:43,  2.09s/it]

Request timed out, retrying (1/3)...


Loading Domain information:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 7153/10741 [13:41:19<44:22,  1.35it/s]

Error: Missing data for - A0A7A3AAF6 - Continuing regardless


Loading Domain information:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 7163/10741 [13:41:26<45:04,  1.32it/s]

Error: Missing data for - A0A7C1TC87 - Continuing regardless


Loading Domain information:  67%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                      | 7171/10741 [13:41:35<1:00:41,  1.02s/it]

Request timed out, retrying (1/3)...


Loading Domain information:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 7248/10741 [13:43:03<1:11:00,  1.22s/it]

Error: Missing data for - A0A7C7X575 - Continuing regardless


Loading Domain information:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 7249/10741 [13:43:03<57:01,  1.02it/s]

Error: Missing data for - A0A7C7X8V5 - Continuing regardless


Loading Domain information:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 7280/10741 [13:43:38<53:26,  1.08it/s]

Error: Missing data for - A0A7D6P8Z4 - Continuing regardless


Loading Domain information:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 7294/10741 [13:43:50<48:54,  1.17it/s]

Error: Missing data for - A0A7G3F2X5 - Continuing regardless


Loading Domain information:  68%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                    | 7324/10741 [13:44:22<1:03:47,  1.12s/it]

Error: Missing data for - A0A7I7ADA8 - Continuing regardless


Loading Domain information:  71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                               | 7663/10741 [13:50:04<56:58,  1.11s/it]

Request timed out, retrying (1/3)...


Loading Domain information:  71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                              | 7677/10741 [13:50:27<1:07:06,  1.31s/it]

Error: Missing data for - A0A7V8TJM8 - Continuing regardless


Loading Domain information:  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 7747/10741 [13:51:35<48:58,  1.02it/s]

Error: Missing data for - A0A7W3EVF0 - Continuing regardless


Loading Domain information:  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                              | 7750/10741 [13:51:38<41:54,  1.19it/s]

Error: Missing data for - A0A7W3ICF2 - Continuing regardless


Loading Domain information:  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 7839/10741 [13:53:10<53:35,  1.11s/it]

Error: Missing data for - A0A7X6E910 - Continuing regardless


Loading Domain information:  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 7863/10741 [13:53:35<54:49,  1.14s/it]

Error: Missing data for - A0A7X8P6G5 - Continuing regardless


Loading Domain information:  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 7884/10741 [13:53:56<36:52,  1.29it/s]

Error: Missing data for - A0A7Y1YPQ6 - Continuing regardless


Loading Domain information:  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 7925/10741 [13:54:32<46:24,  1.01it/s]

Error: Missing data for - A0A7Y2UG65 - Continuing regardless


Loading Domain information:  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                           | 7926/10741 [13:54:32<37:50,  1.24it/s]

Error: Missing data for - A0A7Y2UGQ7 - Continuing regardless


Loading Domain information:  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 7927/10741 [13:54:33<32:47,  1.43it/s]

Error: Missing data for - A0A7Y2UKQ2 - Continuing regardless


Loading Domain information:  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                          | 7996/10741 [13:55:48<50:00,  1.09s/it]

Error: Missing data for - A0A7Y7A8L5 - Continuing regardless


Loading Domain information:  75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                          | 8025/10741 [13:56:20<59:39,  1.32s/it]

Error: Missing data for - A0A7Z2K9Y6 - Continuing regardless


Loading Domain information:  75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                         | 8065/10741 [13:57:02<39:11,  1.14it/s]

Error: Missing data for - A0A800BEB1 - Continuing regardless


Loading Domain information:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 8091/10741 [13:57:26<39:29,  1.12it/s]

Error: Missing data for - A0A822LWH9 - Continuing regardless


Loading Domain information:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 8095/10741 [13:57:30<51:39,  1.17s/it]

Error: Missing data for - A0A826QRQ1 - Continuing regardless


Loading Domain information:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 8099/10741 [13:57:34<50:26,  1.15s/it]

Error: Missing data for - A0A827KSV4 - Continuing regardless


Loading Domain information:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 8103/10741 [13:57:38<44:28,  1.01s/it]

Error: Missing data for - A0A828GJN7 - Continuing regardless


Loading Domain information:  76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 8111/10741 [13:57:46<44:30,  1.02s/it]

Error: Missing data for - A0A829IX78 - Continuing regardless


Loading Domain information:  76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 8123/10741 [13:57:57<39:08,  1.11it/s]

Error: Missing data for - A0A831JCI5 - Continuing regardless


Loading Domain information:  76%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 8150/10741 [13:58:28<46:49,  1.08s/it]

Error: Missing data for - A0A837DPC3 - Continuing regardless


Loading Domain information:  77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 8236/10741 [14:01:07<37:12,  1.12it/s]

Error: Missing data for - A0A844JQQ1 - Continuing regardless


Loading Domain information:  77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                      | 8244/10741 [14:01:14<36:16,  1.15it/s]

Error: Missing data for - A0A846FBI8 - Continuing regardless


Loading Domain information:  77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 8274/10741 [14:01:42<38:52,  1.06it/s]

Error: Missing data for - A0A848J616 - Continuing regardless


Loading Domain information:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 8304/10741 [14:02:11<46:30,  1.15s/it]

Error: Missing data for - A0A855N1C7 - Continuing regardless


Loading Domain information:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                     | 8307/10741 [14:02:13<35:22,  1.15it/s]

# Since the sequences have already been loaded, no need to run the previous script, just use the stuff below to reload the files

In [None]:
total_data = []

with open('../raw_sequences/annotated_sequences.json', 'r') as f:
    total_data = json.load(f)
#total_data

In [None]:
def transform_list(data, result={}):
    for item in data:
        lineage = item["lineage"]
        if lineage:
            lineage_dict = result
            for level in lineage:
                if level not in lineage_dict:
                    lineage_dict[level] = {}
                lineage_dict = lineage_dict[level]
            lineage_dict.update(item)
    return result

transformed_data = transform_list(total_data)


In [None]:
def get_sequences(data, family):
    return [s for s in data if family in s['lineage']]

In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt

def Draw_Pie(data, depth = 2):
    # Create a defaultdict to store the count of each lineage
    lineage_count = defaultdict(int)

    # Iterate through the list of dictionaries and count the occurrences of each lineage
    for item in data:
        try:
            lineage_count[item["lineage"][depth]] += 1
        except:
            continue
            
    # Extract the labels (lineage names) and sizes (counts) for the pie chart
    labels = list(lineage_count.keys())
    sizes = list(lineage_count.values())

    # Create the pie chart
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.axis('equal')  # Ensure the chart is a circle, not an ellipse
    plt.legend(bbox_to_anchor=(1.1, 1.05))

    plt.show()

In [None]:
Draw_Pie(total_data, 1)

In [None]:
import Bio as bio
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os

family = "Cyanobacteria"
filterByDomain = True

sequences = get_sequences(total_data,family)
seq_recs = []
for seq_data in sequences:
    seq = Seq(seq_data["seq"])
    if(filterByDomain):
        seq = seq[seq_data["YcaO_domain"]["start"]:seq_data["YcaO_domain"]["end"]]
    seq_recs.append(SeqRecord(seq, id=seq_data["Accession_Interpro"]))

dirname = f'../processed_sequences/{family}_sequences'
filename = f"{family}_{'YcaO_only' if filterByDomain else 'whole_protein'}.fa"

if not os.path.exists(dirname):
    # Create the directory
    os.makedirs(dirname)
    
SeqIO.write(seq_recs,os.path.join(dirname,filename), "fasta")

<br><br>
<h1 style="font-size:36px" align="center"> Get Sequences for Rodeo </h1><br><br><br><br><br><br>

In [2]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import json
import matplotlib.colors as mcolors
import random
def get_item_by_accession(seqs, accession):
    for item in seqs:
        if item['Accession_Interpro'] == accession.split(".")[0]:
            return item
    return None
def get_item_by_RefSeq_accession(seqs, accession):
    print(accession)
    for item in seqs:
        if('Accession_RefSeq' in item):
            if item['Accession_RefSeq'] == accession.split(".")[0]:
                return item
    return None

<h3 style="font-size:24px"> Define Parameters</h3><br>

In [3]:
#If you want to work on all the sequences within the json, leave this variable empty
input_sequences_filename = "" 

all_annotations_filename = "../../../data/ASST_raw_sequences/ASSTs_annotated_sequences.json"
output_folder = "../../../data/ASST_processed_sequences/RODEO_accessions/"

sequences_per_file = 1000   #Default is 1000, as that is the upper limit of what RODEO will accept, maybe this changes in the future
                            #and will need tweeking, but leave it be until then

In [4]:
annotation_data = []
with open(all_annotations_filename, 'r') as f:
    annotation_data = json.load(f)

sequences = []
if (not input_sequences_filename):
    print("No input sequences where supplied, hence we are preparing the whole dataset for RODEO analysis")
    sequences = [d["Accession_Interpro"] for d in annotation_data]
else:
    sequences = [seq.id for seq in SeqIO.parse(input_sequences_filename,"fasta")]
print(f"{len(sequences)} sequences have been supplied for RODEO analysis, checking how many have REFSEQ accession numbers")
    
if not os.path.exists(output_folder):
    # If it doesn't exist, create it
    print(f"RODEO output folder '{output_folder}'not found, generating it")
    os.makedirs(output_folder)


# List to hold the accession RefSeqs
refseq_list = []

# Counters for sequences with and without RefSeq
count_with_refseq = 0
count_without_refseq = 0

# Process each sequence in the array
for seq in tqdm(sequences, desc="Identifying sequences with RefSeq Accession"):
    sequence_info = get_item_by_accession(annotation_data,seq)
    # Check if 'Accession_RefSeq' is present and not empty
    if 'Accession_RefSeq' in sequence_info and sequence_info['Accession_RefSeq']:
        refseq_list.append(sequence_info['Accession_RefSeq'])
        count_with_refseq += 1
    else:
        count_without_refseq += 1

# Save the RefSeqs to a file
file_counter = 1
refseqs_saved = 0
current_file = None

# Iterate through the list of words
for refseq in tqdm(refseq_list, desc="Saving sequences to files"):
    # Check if it's time to start a new file
    if refseqs_saved % sequences_per_file == 0:
        if current_file:
            current_file.close()
        file_name = f"{output_folder}/RODEO_{file_counter}.txt"
        current_file = open(file_name, "w")
        file_counter += 1
    
    # Write the word to the current file
    current_file.write(refseq + "\n")
    refseqs_saved += 1

# Close the last open file
if current_file:
    current_file.close()

print(f"{refseqs_saved} sequence_IDs have been saved in {file_counter - 1} files.")

print(f"Sequences with RefSeq: {count_with_refseq}, Sequences without RefSeq: {count_without_refseq}")

No input sequences where supplied, hence we are preparing the whole dataset for RODEO analysis
10741 sequences have been supplied for RODEO analysis, checking how many have REFSEQ accession numbers


Identifying sequences with RefSeq Accession:   0%|          | 0/10741 [00:00<?, ?it/s]

Saving sequences to files:   0%|          | 0/1311 [00:00<?, ?it/s]

1311 sequence_IDs have been saved in 2 files.
Sequences with RefSeq: 1311, Sequences without RefSeq: 9430
