<br><br>
<h1 style="font-size:36px" align="center"> Create Filtered Dataset </h1><br><br><br><br><br><br>

In [1]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import copy
import re
import math
import json
import csv
# Output directory for derived files and the raw InterPro FASTA that is filtered below.
work_dir = "../processed_sequences/initial_dataset/"
interpro_filename = "../raw_sequences/interpro_all_YcaO_domain_sequences_trimmed.txt"

# Read every record, then reduce id, name and description to the text before the first ".".
focused_sequences = list(SeqIO.parse(interpro_filename, "fasta"))
for record in focused_sequences:
    record.id = record.name = record.description = record.id.split(".")[0]

# Create the output directory if it is not there yet.
if not os.path.exists(work_dir):
    os.makedirs(work_dir)

# The filtered records are written to this file at the end of the section.
alignment_input_filename = os.path.join(work_dir, "alignment_input_sequences.txt")

  from .autonotebook import tqdm as notebook_tqdm


<h3 style="font-size:24px">Remove sequences with undefined residues</h3><br>

In [None]:
# Partition the records on whether their sequence contains the letter "X".
non_ambiguous_sequences = [record for record in focused_sequences if "X" not in record.seq]
ambiguous_sequences = [record for record in focused_sequences if "X" in record.seq]

print(f"Filtering out sequences with undefined residues.\nOut of {len(focused_sequences)} sequences, {len(non_ambiguous_sequences)} remain.")

# Later cells continue with the records that passed this filter.
focused_sequences = non_ambiguous_sequences

<h3 style="font-size:24px">Remove sequences that are not Archaea or Bacteria</h3><br>

In [None]:
all_annotations_filename = "../raw_sequences/interpro_all_YcaO_annotated.json"

# `json` and `tqdm` come from the setup cell at the top of the notebook. `tqdm` there is the
# progress-bar callable from tqdm.auto; running `import tqdm` here would rebind the name to
# the module and make every later `tqdm(...)` call fail with "'module' object is not callable".
with open(all_annotations_filename, 'r') as f:
    YcaO_data = json.load(f)

def get_item_by_accession(seqs, accession):
    """Return the first entry of `seqs` matching `accession`, or None.

    An entry matches when its 'Accession_Interpro' value equals the part of
    `accession` that precedes the first ".".
    """
    for item in seqs:
        if item['Accession_Interpro'] == accession.split(".")[0]:
            return item
    return None

# Index into each entry's "lineage" list; index 0 is compared against "Bacteria"/"Archaea" below.
taxonomy_index = 0
taxonomy_selected_sequences = []
# Counts of the filtered-out records, keyed by their lineage[taxonomy_index] value.
branches_of_life = {}
# Iterate the list that is actually being filtered (it is the same object as
# non_ambiguous_sequences at this point, but only focused_sequences is guaranteed current).
for sequence in tqdm(focused_sequences, desc="Processing Sequence taxonomy"):
    annotation = get_item_by_accession(YcaO_data, sequence.id)
    if annotation is None:
        # Fail with the offending accession instead of an opaque "NoneType is not subscriptable".
        raise KeyError(f"No annotation entry found for accession {sequence.id}")
    branch_of_life = annotation["lineage"][taxonomy_index].replace(" ", "_")
    if branch_of_life in ("Bacteria", "Archaea"):
        taxonomy_selected_sequences.append(sequence)
    else:
        branches_of_life[branch_of_life] = branches_of_life.get(branch_of_life, 0) + 1

print(f"Filtering out sequences that do not come from Archaea or Bacteria.\nOut of {len(focused_sequences)} sequences, {len(taxonomy_selected_sequences)} remain.")
print(f"Sequences of other branches that were filtered out:")
print(json.dumps(branches_of_life, indent=4))

focused_sequences = taxonomy_selected_sequences

<h3 style="font-size:24px">Remove sequences of abnormal size</h3><br>

In [None]:
# Inclusive window of accepted sequence lengths; anything outside it is set aside.
MIN_SEQUENCE_LENGTH = 340
MAX_SEQUENCE_LENGTH = 450

abnormal_sized_seqs = []
size_filtered_sequences = []
for seq in focused_sequences:
    if MIN_SEQUENCE_LENGTH <= len(seq.seq) <= MAX_SEQUENCE_LENGTH:
        size_filtered_sequences.append(seq)
    else:
        abnormal_sized_seqs.append(seq)

# Length distributions before and after filtering, for the overlaid histogram.
pre_filter = [len(seq.seq) for seq in focused_sequences]
filtered = [len(seq.seq) for seq in size_filtered_sequences]

# Shared bin edges so both histograms line up.
bin_width = 5
bins = range(0, max(pre_filter + filtered) + bin_width, bin_width)

fig, ax = plt.subplots(figsize=(10,5))
ax.hist(pre_filter, bins, color='red', alpha=0.5, label='Pre-Filter')
ax.hist(filtered, bins, color='blue', alpha=0.5, label='Filtered')
ax.set_title("Filtering By Sequence Lengths")
ax.set_xlabel("Sequence Length")
ax.set_ylabel("Count")
# Without a legend the two `label=` arguments above are never displayed.
ax.legend()
plt.tight_layout()
plt.show()

print(f"Filtering out sequences with strange sizes.\nOut of {len(focused_sequences)} sequences, {len(size_filtered_sequences)} remain.")
focused_sequences = size_filtered_sequences

<h3 style="font-size:24px">Save Sequences to File</h3><br>

In [None]:
# Write the records that passed all filters to the alignment input file in FASTA format.
SeqIO.write(focused_sequences, alignment_input_filename, "fasta")

<br><br>
<h1 style="font-size:36px" align="center"> Create Phylogenetic Tree Data </h1><br><br><br><br><br><br>

In [2]:
# Inputs read by the cells of this section.
rodeo_dir = "../raw_sequences/RODEO_data/"
tridomain_annotations_filename = "../raw_sequences/cyclodehydratase_annotaded_sequences.json"
all_annotations_filename = "../raw_sequences/interpro_all_YcaO_annotated.json"

# Output: per-accession annotations are collected in `data` and written to this file at the end.
additional_data_filename = os.path.join(work_dir, "tree_data.json")
data = dict()

In [3]:
sequences = list(SeqIO.parse(alignment_input_filename, "fasta"))

# Now redundant but kept anyway: drop "-" characters and cut each id at the first "/".
for record in sequences:
    record.seq = Seq(str(record.seq).replace("-", ""))
    record.id = record.description = record.name = record.id.split("/")[0]

# One (initially empty) annotation dict per accession, in sequence order.
sequence_accessions = [record.id for record in sequences]
data.update({accession: {} for accession in sequence_accessions})

# Annotation entries for all sequences.
YcaO_data = []
with open(all_annotations_filename, 'r') as annotations_file:
    YcaO_data = json.load(annotations_file)

# Annotation entries from the tridomain file.
E1_data = []
with open(tridomain_annotations_filename, 'r') as tridomain_file:
    E1_data = json.load(tridomain_file)
    
def add_data(header,d):
    """Store one value per sequence in the shared `data` dict under the key `header`.

    `d` must contain exactly as many values as there are entries in `sequences`;
    value number k is written to the dict of the k-th entry of `sequence_accessions`.
    Raises Exception when the lengths differ.
    """
    expected_count = len(sequences)
    if len(d) != expected_count:
        raise Exception("The data supplied is the wrong length, it needs to be the same length as the total number of sequences!")
    for position, value in enumerate(d):
        data[sequence_accessions[position]][header] = value

In [4]:
def get_item_by_accession(seqs, accession):
    """Return the first entry of `seqs` whose 'Accession_Interpro' value equals the
    part of `accession` before the first ".", or None when no entry matches."""
    matching_entries = (
        entry for entry in seqs
        if entry['Accession_Interpro'] == accession.split(".")[0]
    )
    return next(matching_entries, None)
def get_item_by_RefSeq_accession(seqs, accession):
    """Return the first entry of `seqs` whose 'Accession_RefSeq' value equals the
    part of `accession` before the first ".", or None when no entry matches.

    Entries that have no 'Accession_RefSeq' key are skipped rather than raising KeyError.
    """
    # The wanted value does not change between entries, so compute it once.
    wanted = accession.split(".")[0]
    for item in seqs:
        if item.get('Accession_RefSeq') == wanted:
            return item
    return None

In [19]:
# Load RODEO data: collect every file below rodeo_dir whose name ends in "main_co_occur.csv".
rodeo_files = []
for subdir, dirs, files in os.walk(rodeo_dir):
    for file in files:
        if file.endswith("main_co_occur.csv"):
            rodeo_files.append(os.path.join(subdir, file))

# Concatenate the rows of all files; each row is a dict keyed by the CSV header.
rodeo_data = []
for file in rodeo_files:
    # newline="" leaves line-ending handling to the csv module, as its documentation
    # requires (otherwise newlines inside quoted fields can be misread).
    with open(file, newline="") as csvfile:
        rodeo_data.extend(csv.DictReader(csvfile))

        
# Map each "Query" value to the list of Pfam-related cell values found in its rows.
processed_rodeo_data = {}
for row in tqdm(rodeo_data, desc="Processing Rodeo Data"):
    query_hits = processed_rodeo_data.setdefault(row["Query"], [])
    row_hits = []
    for column, cell in row.items():
        if column is None:
            # Under the None key only list values are inspected; keep the strings
            # that contain "PF" followed by five digits.
            if isinstance(cell, list):
                row_hits.extend(entry for entry in cell if re.search(r'PF\d{5}', entry))
        elif "PfamID" in column and cell is not None:
            row_hits.append(cell)
    query_hits.extend(row_hits)

def get_sequence_ID_present(Pfam_ID):
    """Report, for every accession in `sequence_accessions`, whether `Pfam_ID` was seen
    in the RODEO data recorded for that sequence's RefSeq accession.

    Returns a list aligned with `sequence_accessions` holding
      "Y" - the RefSeq accession is a key of `processed_rodeo_data` and its list contains `Pfam_ID`,
      "N" - the RefSeq accession is a key of `processed_rodeo_data` but its list lacks `Pfam_ID`,
      "U" - the annotation has no 'Accession_RefSeq' or that accession is not in the RODEO data.
    """
    # Sets instead of lists: the membership tests below run once per sequence, and list
    # lookups would make the scan quadratic.
    RefSeq_ids_with_ID = set()
    RefSeq_ids_without_ID = set()
    for key, value in processed_rodeo_data.items():
        if Pfam_ID in value:
            RefSeq_ids_with_ID.add(key)
        else:
            RefSeq_ids_without_ID.add(key)

    sequence_ID_present = []

    for accession in tqdm(sequence_accessions, desc=f"Scanning sequences for {Pfam_ID}"):
        accession_data = get_item_by_accession(YcaO_data, accession)
        ID_present = "U"
        if "Accession_RefSeq" in accession_data:
            refSeq = accession_data["Accession_RefSeq"]
            if refSeq in RefSeq_ids_with_ID:
                ID_present = "Y"
            elif refSeq in RefSeq_ids_without_ID:
                ID_present = "N"
        sequence_ID_present.append(ID_present)
    return sequence_ID_present

# One scan per Pfam family of interest. These assignments must be live: the add_data calls
# below read the resulting lists, so with them commented out a fresh kernel raises NameError.
# Each call walks every sequence, so this cell is slow.
sequence_TfuA_present = get_sequence_ID_present("PF07812")
sequence_ThiF_present = get_sequence_ID_present("PF00899")#IPR02291
sequence_tricopeptide = get_sequence_ID_present("PF13432")
sequence_tricopeptide2 = get_sequence_ID_present("PF13424")
sequence_tricopeptide3 = get_sequence_ID_present("PF14559")
sequence_GNAT = get_sequence_ID_present("PF00583")
sequence_FMN = get_sequence_ID_present("PF03358")
sequence_FMN2 = get_sequence_ID_present("PF01070")


#sequence_E1_present = get_sequence_ID_present("")#TIGR03603 but I can't find that on Pfam
#Potentially get FMN dependent YcaOs
add_data("TfuA_nearby",sequence_TfuA_present)
add_data("Ocin-ThiF_nearby",sequence_ThiF_present)
add_data("tricopeptide_nearby",sequence_tricopeptide)
add_data("tricopeptide_nearby2",sequence_tricopeptide2)
add_data("tricopeptide_nearby3",sequence_tricopeptide3)
add_data("GNAT_nearby",sequence_GNAT)
add_data("FMN_nearby",sequence_FMN)
add_data("FMN_nearby2",sequence_FMN2)


Processing Rodeo Data: 100%|██████████| 60868/60868 [00:00<00:00, 271722.05it/s]


In [13]:
# Index into each annotation's "lineage" list that is used as the taxon label.
taxonomy_index = 1

# One label per accession: lineage[taxonomy_index] with spaces replaced by underscores.
sequence_taxonomy = []
for accession in sequence_accessions:
    lineage = get_item_by_accession(YcaO_data, accession)["lineage"]
    sequence_taxonomy.append(lineage[taxonomy_index].replace(" ", "_"))

add_data("taxa", sequence_taxonomy)

In [7]:
import random
import colorsys
import matplotlib

def generate_colors(sequence_taxonomy):
    """Return one hex colour string per entry of `sequence_taxonomy`.

    The distinct taxa are sorted; the first ten receive the fixed palette below in that
    order, and every further taxon receives a colour built from three `random.random()`
    draws (hue, saturation, value). Equal taxa always share a colour. The sorted taxa
    and the taxon-to-colour mapping are printed.
    """
    palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

    ordered_taxa = sorted(set(sequence_taxonomy))
    print(ordered_taxa)

    colour_by_taxon = {}
    for rank, taxon in enumerate(ordered_taxa):
        if rank < len(palette):
            colour_by_taxon[taxon] = palette[rank]
            continue
        # Palette exhausted: draw hue, then saturation in [0.5, 1), then value in [0.4, 0.6).
        hue = random.random()
        saturation = 0.5 + random.random() / 2.0
        value = 0.4 + random.random() / 5.0
        red, green, blue = colorsys.hsv_to_rgb(hue, saturation, value)
        colour_by_taxon[taxon] = '#{:02x}{:02x}{:02x}'.format(int(red * 255), int(green * 255), int(blue * 255))
    print(colour_by_taxon)

    return [colour_by_taxon[taxon] for taxon in sequence_taxonomy]


# Derive one colour per sequence from its taxon label and store it under the "color" header.
sequence_colors = generate_colors(sequence_taxonomy)
add_data("color",sequence_colors)


['Acidobacteria', 'Actinobacteria', 'Aquificae', 'Asgard_group', 'Bacteroidetes', 'Balneolaeota', 'Caldiserica/Cryosericota_group', 'Candidatus_Absconditabacteria', 'Candidatus_Adlerbacteria', 'Candidatus_Aegiribacteria', 'Candidatus_Aenigmarchaeota', 'Candidatus_Azambacteria', 'Candidatus_Bathyarchaeota', 'Candidatus_Beckwithbacteria', 'Candidatus_Blackburnbacteria', 'Candidatus_Cloacimonetes', 'Candidatus_Curtissbacteria', 'Candidatus_Daviesbacteria', 'Candidatus_Delongbacteria', 'Candidatus_Dependentiae', 'Candidatus_Eremiobacteraeota', 'Candidatus_Giovannonibacteria', 'Candidatus_Gottesmanbacteria', 'Candidatus_Gracilibacteria', 'Candidatus_Gribaldobacteria', 'Candidatus_Kaiserbacteria', 'Candidatus_Kapabacteria', 'Candidatus_Kerfeldbacteria', 'Candidatus_Komeilibacteria', 'Candidatus_Korarchaeota', 'Candidatus_Levybacteria', 'Candidatus_Liptonbacteria', 'Candidatus_Lloydbacteria', 'Candidatus_Margulisbacteria', 'Candidatus_Marinimicrobia', 'Candidatus_Melainabacteria', 'Candidatus

In [8]:
# Directory tree that is searched for reference FASTA files.
root_dir = "../raw_sequences"

# Build (fasta path, label) pairs. A file qualifies when its lower-cased name contains
# "fasta.fa"; its label is the stripped content of the "name.txt" in the same directory.
fasta_paths = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for filename in filenames:
        if "fasta.fa" not in filename.lower():
            continue
        name_path = os.path.join(dirpath, "name.txt")
        # Every FASTA must be accompanied by a name file.
        if not os.path.isfile(name_path):
            raise Exception(f"Name file not found {name_path}")
        with open(name_path, "r") as name_file:
            label = name_file.read().strip()
        fasta_paths.append((os.path.join(dirpath, filename), label))



db_dir = os.path.join(work_dir,"db_data/")
db_filename = os.path.join(db_dir,"my_blast_db")
db_sequences_filename = os.path.join(db_dir,"db_sequences.fa")

if not os.path.exists(db_dir):
    os.mkdir(db_dir)
else:
    !rm -rf $db_dir
    os.mkdir(db_dir)


!makeblastdb -in {alignment_input_filename} -dbtype prot -out {db_filename}

import subprocess

# BLAST every reference FASTA against the database and record the subject ids that were hit.
# path_accessions holds (label, [subject ids]) pairs in the order of fasta_paths.
path_accessions = []
for i in range(0,len(fasta_paths)):
    fasta_path = fasta_paths[i][0]
    print(f"{i+1}/{len(fasta_paths)} Processing {fasta_path}")
    shortened_path = fasta_paths[i][1]
    # Argument list instead of a shell string, so paths are passed verbatim even when they
    # contain spaces or shell metacharacters. check=True turns a failed blastp run into an
    # error instead of silently recording "no hits".
    output = subprocess.run(
        ["blastp", "-db", db_filename, "-query", fasta_path,
         "-outfmt", "6 sseqid", "-evalue", "1e-50"],
        capture_output=True,
        check=True,
    )
    # splitlines() gives [] for empty output; "".split("\n") would give [""], i.e. one
    # bogus empty accession.
    accession_numbers = output.stdout.decode().strip().splitlines()
    print(f"Got results:\n{accession_numbers}")
    path_accessions.append((shortened_path, accession_numbers))




Building a new DB, current time: 03/08/2023 22:41:55
New DB name:   /files/src/processed_sequences/initial_dataset/db_data/my_blast_db
New DB title:  ../processed_sequences/initial_dataset/alignment_input_sequences.txt
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 14094 sequences in 0.24846 seconds.


1/27 Processing ../raw_sequences/Amidine/Jesko_but_unknown/Fasta.fa
Got results:
['A0A377Z2T7', 'A0A2X3D8D2']
2/27 Processing ../raw_sequences/Amidine/Bottromycin/Fasta.fa
Got results:
['K4MHF8', 'A0A540Q9A9', 'A0A540P9C2', 'M3FEZ7', 'K0NZY4', 'A0A7K3EJ13', 'C9Z182', 'A0A100JQ73', 'L1L6X4', 'K4JZA2']
3/27 Processing ../raw_sequences/Unknown/EcYcaO/Fasta.fa
Got results:
['A0A376MUM2', 'A0A6N8QWD3', 'W8T509', 'V0AP47', 'U9XEU8', 'S1FBU1', 'Q3Z3L7', 'Q31YU6', 'P75838', 'K4VXU5', 'I6EPI9', 'I4SAI8', 'I2UM09', 'F4VCS9', 'F4TCD0', 'D8EAG6', 'B7LD97', 'A0A8F9I0L6', 'A0A8F9H8N8', 'A0A8E2TE79', 'A0A8E0KXK2', 'A0A862ZG40', 'A0A836NFK5', 'A0

Got results:
['C0JRZ4', 'A0A0K8Q0K9', 'A0A286HXX3', 'A0A2M8XIB2', 'A0A1H5CBQ8', 'A0A5C6WLH3', 'C0JRW9', 'A0A4Q1RYM6', 'A0A161MD05', 'A0A1H8THL2', 'A0A1C4YSW3', 'A0A318NFW1', 'A0A345FAE4', 'A0A2V4NEG2', 'C4RQU0', 'A0A7D6CFW1', 'A0A221W4A5', 'A0A3E2YY81', 'E5DUH9', 'A0A6G9YQJ4', 'A0A7H8XSI2', 'C6FX46', 'A0A1C4ZQY3', 'A0A3N1B292', 'S4MTH7', 'A0A7V9TS16', 'A0A1Q7AQ40', 'A0A3S9V231', 'A0A316ERX1', 'A0A0U2WKT2', 'A0A420GZ61', 'M5P7H2', 'A0A7Y8V037', 'A0A6I7SZE1', 'A0A2B4NHB7', 'A0A1G4KV18', 'A0A6L3BZ92', 'A0A4V6PBD1', 'A0A023CS17', 'A0A2T6C4G9', 'A0A2C4Z9V4', 'A0A0F5RPZ7', 'A0A0G8CDH9', 'C5D552', 'A0A1N6QFF4', 'A0A369WH22', 'A0A316XIF1', 'A0A7U3YHS4', 'A0A178TV90', 'F8FKA1', 'A0A1B7KUG5', 'I0BR67', 'H6NNG6', 'A0A2W4IQC1', 'A0A2W4J3Q2', 'C2NEM4', 'A0RB74', 'A0A7D4CDA7', 'A0A4V2ZTQ4', 'Q0H2Y0', 'A0A073KIP5', 'A0A2C4PP10', 'A0A8T9FGP7', 'M9LC22', 'A0A2G6Q781', 'A0A1T2PYB7', 'A0A7Z8RLA4', 'A0A7V7HSA4', 'A0A2B1DCH2', 'A0A2A7J4G4', 'A0A246PVW2', 'A0A1D3NUS0', 'A0A1C4BDG1', 'A0A1J9VTX6', 'A0A8T9ZFA

Got results:
['Q9A0K3', 'Q5XCZ7', 'Q48UE9', 'Q1JMK7', 'Q1JHQ2', 'Q1J7H7', 'J7M7A4', 'A0A4U7I6K5', 'A0A0H3BXM7', 'A0A0H2UUD9', 'A0A0E1EP97', 'U2U9D8', 'A0A3P5Y3Y4', 'A0A3L8GF98', 'B4U1J8', 'A0A2X3UZR5', 'A0A6M1KRL0', 'A0A6D2LSB5', 'A0A2X3S529', 'C0MDP8', 'C0M9K2', 'A0A0G6ZT77', 'A0A4V0BY49', 'M4YWM6', 'A0A8B6LB99', 'A0A509DF47', 'A0A4V0F4X6', 'G5K9G8', 'A0A448DMW8', 'A0A380KII4', 'C5WFP4', 'A0A4U9XI33', 'A0A5Q2RPU5', 'A0A5Q2S003', 'A0A4V0CKQ6', 'A0A4U9ZK76', 'A0A4V0H4C1', 'A0A4U9XUL5', 'Q8GP15', 'A0A7G9AZY3', 'A0A380L333', 'A0A2J9X7V9', 'A0A139QGI9', 'F9P4B1', 'A0A892RM16', 'A0A024FA74', 'A0A024F9K8', 'A0A024F8Z1', 'L7P6E5', 'I0SDF3', 'A0A024F915', 'U2Y1X1', 'G5K156', 'A0A0P6SLU4', 'A0A2Z4WC67', 'A0A1S9I6L7', 'B1KV59', 'A0A846J117', 'A0A2K9MM18', 'A0A2P1TVJ5', 'A0A7X5P8Z0', 'A0A7U9GJX5', 'A0A0B4W7U4', 'A0A1J1CX96', 'A0A1L3NLN4', 'A0A0S6U1E6', 'A0A0D1BQF2', 'J7SWK6', 'A0A0M1IXW4', 'A0A0A2HDT3', 'A0A6B4DSW4', 'A0A846I2Q6', 'A0A3F2ZTT1', 'A0A0E1KZV8', 'A0A6G4EB42', 'A0A2G7HIE4', 'A0A6B4JY1

Got results:
['A0A6L7HPD9', 'A0A6H3AJJ1', 'C3FZY3', 'A0A243CSM5', 'C3GFT8', 'B7JEU6', 'A0A2R2IAW6', 'A0A2B6CAU0', 'A0A0B5NLX1', 'A0A7X2AL62', 'A0A0J1HZ24', 'A0A1J9VTX6', 'C2NEM4', 'A0RB74', 'A0A7D4CDA7', 'Q6HLT1', 'C2VQP1', 'A0A6H9IWP6', 'A0A6A2TA41', 'A0A4Y8T6C6', 'A0A4Y7QX61', 'A0A3R9DV07', 'A0A2S9HG37', 'A0A150AYQ2', 'A0A1T3V7J1', 'Q63EB4', 'A0A6M8D2H6', 'A0A2K8ZF60', 'A0A6I2ASD8', 'A0A242VZK7', 'D8H648', 'A0A7U6BZX7', 'A0A4S4HZE3', 'A0A2S9Y8R1', 'A0A243BKP6', 'A0A1T2PYB7', 'A0A158RKX9', 'A0A0F5RPZ7', 'A0A2N1K2G0', 'A0A437SJF6', 'C2S0Q4', 'B9IU55', 'B7I0G9', 'A0A5M9GYS0', 'A0A402GGF0', 'A0A3G5U635', 'A0A1Y6A8S5', 'A0A329F4W3', 'J8D2V4', 'C2MHS9', 'A0A2N0YKY2', 'A0A5C5AKK1', 'A0A316XIF1', 'A0A150BY04', 'A0A853XHB5', 'A0A5C5A5T4', 'A0A2M9WXM0', 'A0A0E1MKR6', 'A0A7Z8S6T7', 'A0A2B4LQN5', 'A0A068N4S8', 'A0A6I6YPP1', 'A0A4P9F399', 'A0A8T9ZFA6', 'A0A6B3KPN6', 'A0A1J9SZM6', 'A0A556CIR3', 'A0A162PBM2', 'A0A6L5LL33', 'A0A7V7S836', 'A0A7V7L7L1', 'A0A150EJI6', 'A0A1T2P9E4', 'A0A4R4BH15', 'A0A2C

Got results:
['Q52QI6', 'A0A166XY79', 'A0A6H9GIQ3', 'A0A1E4Q845', 'S3JD46', 'I4HNZ7', 'A0A552CBV2', 'A0A552AVC8', 'L7EB34', 'A8Y998', 'A0A3N0X0V8', 'A8YE43', 'A0A1X9LB16', 'I4H1I7', 'A0A552LCQ7', 'B2KYG1', 'A0A367Q6E1', 'B2KYG8', 'A0A166XYF7', 'A0A7Z9C271', 'A0A350Y0Y3', 'A0YXD2', 'A0A822LBS3', 'A0A552FCL4', 'A0A328I5V0', 'A0A355DQZ2', 'A0A0M1JU92', 'A0A0D6KN79', 'A0A7C9PT41', 'K9VLB2', 'H1W859', 'A0A5Q4H6E3', 'K9RMB3', 'A0A846GSF1', 'D8G188', 'A0A845X128', 'A0A846DBR4', 'A0A845Y7Y0', 'E0UMX3', 'A0A8D6NR33', 'A0A6P0XHA0', 'Q112J9', 'A0A6P0NZB9', 'A0A6L9ZUD4', 'A0A6P0RHT8', 'A0A6M0FLX7', 'A0A6P0HTK1', 'A0A846ABY3', 'A0A1Z4QMD6', 'A0A846G8X9', 'A0A0S6UX82', 'A0A6I5RH71', 'A0A841WK81', 'A0A1Z4IW12', 'K9WBM2', 'A0A350X9X7', 'A0A5Q0TXK7', 'A0A5Q0TWV7', 'A0A352X3V7', 'A0A6M0S623', 'A0A6M0RJQ8', 'A0A845WWA1', 'A0A0D6KA13', 'A0A0D7P7Y9', 'A0A6L9ZF47', 'A0A235IR86', 'A0A6P0KVZ0', 'A0A1D9G459', 'B8HTZ0', 'A0A1D8TWC4', 'A0A6P0LJL1', 'A0A7C1VVK8', 'A0A496V9L2', 'A0A1W9X7G5', 'A0A6P0KJB4', 'A0A6P0L

Got results:
['K4MJT7', 'M3ENG2', 'K0P4N8', 'A0A7K3EIY9', 'C9Z183', 'A0A540P9G5', 'L1L6K6', 'A0A540P2Q5', 'K4JVC6']
19/27 Processing ../raw_sequences/Thioamide/Thioviridamide/Fasta.fa
Got results:
['T2HWM4', 'A0A7W9H482', 'A0A0N0HLJ3', 'A0A4D9CFI4', 'A0A2Z5WLL4', 'A0A1S2PC86', 'A0A1R1S5L9', 'A0A1J4PYC2', 'A0A1C6UB24', 'A0A229RNL7', 'A0A229RYW0', 'A0A154M4V3', 'A0A1R0KWS9', 'A0A193BSG9', 'A0A239PCE0', 'A0A2T0N7C3', 'A0A229S9K9', 'A0A8J3Q978', 'A0A1Q4XN37', 'A0A1C6TT33', 'A0A0F0GSX1', 'A0A840P1S7', 'A0A4R5EEK9', 'A0A7K1KSY3', 'A0A3A9Y6E0', 'A0A010ZR31', 'A0A1H3TLS8', 'A0A8J4E5M4', 'A0A429GDY0', 'A0A8J7KJS2', 'A0A0K1EF39', 'A0A3S5GY88', 'A0A4Q0SZI9', 'A0A1H1YV18', 'A0A2V8MVP5', 'A0A6A7LCX4', 'A0A3N5WB13', 'A0A6A8FJD2', 'A0A6G1YTD3', 'K0IEK3', 'A0A6G1YPE4', 'A0A2V8S2N8', 'A0A2V2U5N5', 'A0A8T7GJ04', 'A0A838W2R6', 'A0A143PIS8', 'A0A484ID53', 'A0A533VF70', 'A0A533UWL3', 'A0A533VBI8', 'A0A523B9K5', 'A0A832CEI4', 'A0A7C4TN06', 'A0A533U9J3', 'A0A2V2MW46', 'A0A842LU58', 'A0A533WLG8', 'A0A7J2YCK8'

Got results:
['A0A1Q4XN37', 'A0A1C6UB24', 'A0A229RNL7', 'A0A154M4V3', 'A0A193BSG9', 'A0A1R0KWS9', 'A0A229RYW0', 'A0A2T0N7C3', 'A0A010ZR31', 'A0A239PCE0', 'A0A229S9K9', 'A0A1H3TLS8', 'A0A4R5EEK9', 'A0A8J3Q978', 'A0A1C6TT33', 'A0A3A9Y6E0', 'A0A7W9H482', 'A0A0N0HLJ3', 'A0A7K1KSY3', 'A0A0F0GSX1', 'A0A1S2PC86', 'A0A1R1S5L9', 'A0A2Z5WLL4', 'A0A8J7KJS2', 'A0A8J4E5M4', 'A0A1J4PYC2', 'A0A840P1S7', 'A0A3S5GY88', 'A0A429GDY0', 'A0A4D9CFI4', 'T2HWM4', 'A0A0K1EF39', 'A0A4Q0SZI9', 'A0A1H1YV18', 'A0A3N5RCQ9', 'A0A060HQ07', 'D4N6W6', 'A0A7K4A113', 'A0A520KSE4', 'A0A533UWL3', 'A0A533VF70', 'A0A8T7GJ04', 'A0A075MW40', 'A0A7J4L8D5', 'A0A1G9ASI5', 'A0A8F5VP36', 'A0A8E7EG62', 'A0A6A7LCX4', 'A0A1V5A3B7', 'A0A1V4UU30', 'A0A838W2R6', 'A0A2I0NQ28', 'A0A2I0P8S1', 'A0A5B2Z6M8', 'A0A7J4PNY4', 'A0A832S9X1', 'A3CV63', 'A0A0H1QXZ8', 'A0A2V8MVP5', 'Q2FL72', 'A0A0Q0VBY2', 'A0A1V5T0H9', 'A0A5E4IH80', 'A0A2V2MW46', 'A0A0X3BRB0', 'A0A1M4MMA9', 'A0A8T7H033', 'A0A524QML5', 'I7LKL0', 'A0A842XIB1', 'A0A257MHR5', 'A0A2K5ASZ8'

Got results:
['A0A0N0HLJ3', 'A0A7W9H482', 'A0A2Z5WLL4', 'A0A1R1S5L9', 'A0A1S2PC86', 'A0A1J4PYC2', 'A0A0F0GSX1', 'A0A4D9CFI4', 'A0A193BSG9', 'A0A154M4V3', 'A0A229RYW0', 'A0A229RNL7', 'A0A1R0KWS9', 'A0A1C6UB24', 'A0A8J3Q978', 'A0A1C6TT33', 'T2HWM4', 'A0A3A9Y6E0', 'A0A2T0N7C3', 'A0A8J4E5M4', 'A0A7K1KSY3', 'A0A429GDY0', 'A0A840P1S7', 'A0A010ZR31', 'A0A1Q4XN37', 'A0A229S9K9', 'A0A239PCE0', 'A0A4R5EEK9', 'A0A1H3TLS8', 'A0A0K1EF39', 'A0A8J7KJS2', 'A0A3S5GY88', 'A0A4Q0SZI9', 'A0A6A7LCX4', 'A0A8T7GJ04', 'A0A3N5WB13', 'A0A838W2R6', 'A0A1H1YV18', 'A0A7J4PNY4', 'A0A060HQ07', 'A0A2V8MVP5', 'A0A843BXM9', 'A0A5B2Z6M8', 'A0A533UWL3', 'A0A101IKT6', 'A0A0Q1AX08', 'A0A8T4DEM3', 'A0A7K4A113', 'A0A533VF70', 'A0A843DT95', 'A0A3N5X0C7', 'A0A3N5RCQ9', 'A0A533WLG8', 'A0A843D4L7', 'A0A843DTJ8', 'A0A838NAR4', 'K0IEK3', 'D4N6W6', 'A0A1Q7JXM8', 'A0A533VBI8', 'A0A075MW40', 'A0A1Q7FAQ1', 'A0A520JXK9', 'A0A832S9X1', 'A0A8T9VP19', 'A0A1V4ZFV4', 'A0A1V4TWC7', 'A0A838U2P6', 'A0A1V5A3B7', 'A0A1V4UU30', 'A0A533U9J3', 'A0A

Got results:
['A0A1H8TGX8', 'A0A2V4NIH3', 'A0A318NF98', 'A0A1C4YSR5', 'A0A221W4I2', 'A0A229GYP0', 'D7BUX6', 'A0A640TM68', 'A0A1R1S9R9', 'A0A6G2X4V8', 'A0A3S9YEI0', 'A0A6G2R493', 'A0A1C6NDP0', 'A0A2U8VA38', 'A0A5N5VZL4', 'M3CDE7', 'A0A7X1IMZ1', 'A0A0M9Y9D3', 'A0A6M1L623', 'A0A1G7DKE5', 'A0A1G9IB19', 'A0A849C6Z3', 'A0A1E4I823', 'A0A1V4DCD0', 'A0A1W2LSQ0', 'A0A1V4CZ37', 'A0A372FXT4', 'A0A7W5YUQ2', 'A0A8J3CD60', 'A0A2B8AS43', 'A0A0W7X6X8', 'A0A1Q8CAF0', 'A0A7X0HKX8', 'A0A6G2XS19', 'A0A2K8R475', 'A0A2L2Q7H2', 'A0A150VL52', 'A0A1I1APM0', 'A0A1V2KQZ2', 'A0A0P7C7N5', 'A0A193BY82', 'W0FYB2', 'A0A4Q7ZL86', 'A0A1H3DU37', 'A0A124I0V6', 'A0A1Q5LDK5', 'A0A838ELM7', 'A0A2S4Y1F9', 'K4RH04', 'A0A1X1NBP1', 'A0A646KIP5', 'A0A6G9YIN9', 'A0A7K2JE43', 'A0A840INW5', 'A0A0Q6YPY1', 'A0A291RL38', 'A0A229H8X5', 'A0A370B538', 'A0A5R9DQX1', 'A0A7T4PM74', 'A0A1H6EQR9', 'A0A0M9YXN3', 'A0A561W045', 'A0A6G9Z7Y5', 'A0A0D7CJS1', 'A0A6B2RPJ1', 'A0A6H1NFM3', 'A0A1V2QL75', 'V6JMA9', 'W7VG94', 'A0A505HJJ9', 'A0A1X1UTC3', 'A

In [9]:
import itertools

def get_similar_sequence_name_if_any(accession,path_accessions):
    """Return (position, label) pairs for every entry of `path_accessions` that lists `accession`.

    Each entry is indexed as (label, accessions); `position` is the index of the first
    occurrence of `accession` in that entry's accessions. Entries that do not list the
    accession contribute nothing, so the result may be empty.
    """
    return [
        (entry[1].index(accession), entry[0])
        for entry in path_accessions
        if accession in entry[1]
    ]

# Every accession that was hit by at least one reference FASTA (duplicates included).
similar_accessions = list(itertools.chain(*[path[1] for path in path_accessions]))
# Set copy for the per-sequence membership test below; testing against the list would
# scan all hits once per sequence.
similar_accession_set = set(similar_accessions)

# "Y"/"N" flag per sequence, plus the (position, label) pairs of the references that hit it.
sequence_characterised = ["Y" if accession in similar_accession_set else "N" for accession in sequence_accessions]
sequence_related_sequence = [get_similar_sequence_name_if_any(accession,path_accessions) for accession in sequence_accessions]

add_data("related_to_known_seq",sequence_characterised)
add_data("related_seq",sequence_related_sequence)


In [10]:
# Annotate every sequence with whether its accession appears in the tridomain annotation data.
E1_sequences = [val["Accession_Interpro"] for val in E1_data]
# Set copy so the per-sequence lookup does not scan the whole list each time.
E1_accession_set = set(E1_sequences)
sequence_contains_E1 = ["Y" if accession.split(".")[0] in E1_accession_set else "N" for accession in sequence_accessions]
add_data("contains_E1",sequence_contains_E1)

In [18]:

# Persist the collected per-accession annotations as JSON.
with open(additional_data_filename, "w") as tree_data_file:
    json.dump(data, tree_data_file)


<br><br>
<h1 style="font-size:36px" align="center"> Diagrams for fun </h1><br><br><br><br><br><br>

In [None]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import json
import matplotlib.colors as mcolors
import random
# JSON annotation file; it is loaded into YcaO_data further down in this section.
all_annotations_filename = "../raw_sequences/interpro_all_YcaO_annotated.json"


<h3 style="font-size:24px"> Define Parameters</h3><br>

In [None]:
# FASTA file parsed into `sequences` below; same path the filtering section writes its result to.
input_sequences_filename = "../processed_sequences/initial_dataset/alignment_input_sequences.txt"

In [None]:
# Parse the FASTA records and keep their ids in matching order.
sequences = list(SeqIO.parse(input_sequences_filename, "fasta"))
sequence_accessions = [record.id for record in sequences]

# Annotation entries; starts as an empty list and is replaced by the parsed JSON.
YcaO_data = []
with open(all_annotations_filename, 'r') as annotations_file:
    YcaO_data = json.load(annotations_file)
           
def get_item_by_accession(seqs, accession):
    """Look up an annotation entry by accession.

    Returns the first element of `seqs` whose 'Accession_Interpro' value equals the
    text of `accession` before its first ".", or None if there is no such element.
    """
    return next(
        (candidate for candidate in seqs
         if candidate['Accession_Interpro'] == accession.split(".")[0]),
        None,
    )

In [None]:
# Nested counts: taxonomy_info[lineage[0]][lineage[1]] -> number of sequences.
taxonomy_info = {}
for accession in tqdm(sequence_accessions):
    taxonomy = get_item_by_accession(YcaO_data, accession)["lineage"]
    second_level_counts = taxonomy_info.setdefault(taxonomy[0], {})
    second_level_counts[taxonomy[1]] = second_level_counts.get(taxonomy[1], 0) + 1


In [None]:
# Draw a two-ring pie chart from taxonomy_info: the inner ring has one wedge per
# top-level key, the outer ring one wedge per second-level key whose count exceeds 30.
# The loop below has a single iteration (range(0,1)), so the chart is drawn once.
for i in range(0,1):
    top_level = []
    first_level = []
    vals = []
    for key in taxonomy_info.keys():
        top_level.append(key)
        vals.append([])
        keys2 = list(taxonomy_info[key].keys())
        # Random wedge order within each top-level group; no seed is set in this cell,
        # so the order (and the alpha values drawn below) differ between runs.
        random.shuffle(keys2)
        for key2 in keys2:
            # Second-level keys with 30 or fewer sequences are left out of the chart.
            if(taxonomy_info[key][key2] > 30):
                first_level.append(key2)
                vals[-1].append((taxonomy_info[key][key2]))
    fig, ax = plt.subplots(figsize=(15, 15))

    # Ring thickness, as a fraction of the outer radius.
    size = 0.3

    # Inner wedges sum only the counts that passed the threshold, so both rings line up.
    inner_layer = [sum(x) for x in vals]
    outer_layer = [item for sublist in vals for item in sublist]

    inner_colors = ['#1f77b4', '#ff7f0e']

    # NOTE(review): vals[0] / vals[1] require at least two top-level keys, and the variable
    # names assume the first key of taxonomy_info is Bacteria and the second Archaea;
    # that depends on insertion order - confirm.
    bacteria_colors = ["#1f77b4"]*len(vals[0])
    archaea_colors = ["#ff7f0e"]*len(vals[1])

    # Append a random two-digit hex alpha to each base colour so neighbouring wedges differ.
    bacteria_colors = [color+'{:02x}'.format(int(random.uniform(0.2, 1) * 255)) for color in bacteria_colors]
    archaea_colors = [color+'{:02x}'.format(int(random.uniform(0.3, 1) * 255)) for color in archaea_colors]
    print(bacteria_colors)
    outer_colors = bacteria_colors+archaea_colors

    # Outer ring: second-level taxa.
    ax.pie(outer_layer, radius=1, colors=outer_colors,
           labels=first_level, textprops={'fontsize': 13},
           wedgeprops=dict(width=size, edgecolor='w'))

    # Inner ring: top-level taxa, labels drawn inside the ring.
    ax.pie(inner_layer, radius=1-size, colors=inner_colors,
           labels=top_level, textprops={'fontsize': 20},labeldistance=.6,
           wedgeprops=dict(width=size, edgecolor='w'))

    ax.set(aspect="equal", title='Taxonomy of YcaO sequences')
    plt.show()

<br><br>
<h1 style="font-size:36px" align="center"> Get Sequences for Rodeo </h1><br><br><br><br><br><br>

In [None]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import json
import matplotlib.colors as mcolors
import random
# JSON annotation file; it is loaded into YcaO_data further down in this section.
all_annotations_filename = "../raw_sequences/interpro_all_YcaO_annotated.json"


<h3 style="font-size:24px"> Define Parameters</h3><br>

In [None]:
# FASTA file parsed into `sequences` below; same path the filtering section writes its result to.
input_sequences_filename = "../processed_sequences/initial_dataset/alignment_input_sequences.txt"

In [None]:
# All records of the filtered FASTA file.
sequences = list(SeqIO.parse(input_sequences_filename, "fasta"))

# Annotation entries; starts as an empty list and is replaced by the parsed JSON.
YcaO_data = []
with open(all_annotations_filename, 'r') as annotations_file:
    YcaO_data = json.load(annotations_file)
    

In [None]:
def get_item_by_accession(seqs, accession):
    """Find the annotation entry for an accession.

    The first element of `seqs` whose 'Accession_Interpro' value equals `accession`
    truncated at its first "." is returned; None is returned when nothing matches.
    """
    hits = (
        record for record in seqs
        if record['Accession_Interpro'] == accession.split(".")[0]
    )
    return next(hits, None)
def get_item_by_RefSeq_accession(seqs, accession):
    """Find the annotation entry for a RefSeq accession.

    Prints `accession`, then returns the first element of `seqs` that has an
    'Accession_RefSeq' key whose value equals `accession` truncated at its first ".".
    Elements without that key are skipped. Returns None when nothing matches.
    """
    print(accession)
    hits = (
        record for record in seqs
        if 'Accession_RefSeq' in record
        and record['Accession_RefSeq'] == accession.split(".")[0]
    )
    return next(hits, None)

In [None]:
# Sanity check: show the first annotation entry, the first sequence id, and the
# annotation entry found for that id (None if the lookup has no match).
print(YcaO_data[0])
print(sequences[0].id)
print(get_item_by_accession(YcaO_data, sequences[0].id))