<br><br>
<h1 style="font-size:36px" align="center"> Create Filtered Dataset </h1><br><br><br><br><br><br>

In [None]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import copy
import re
import math
import json
import csv

In [None]:
# Directory that receives every file written by this section.
work_dir = "../../../data/ASST_processed_sequences/23_02_24_Quick_Tree/"

# FASTA file holding the sequences to be filtered.
interpro_filename = "../../../data/ASST_raw_sequences/interpro_all_ASST_domain_sequences_trimmed.txt"
focused_sequences = list(SeqIO.parse(interpro_filename, "fasta"))

# Reduce each identifier to the part before the first "." and use that
# shortened identifier as id, name and description of the record.
for record in focused_sequences:
    record.id = record.name = record.description = record.id.split(".")[0]

# Create the output directory only when it is missing.
if not os.path.exists(work_dir):
    os.makedirs(work_dir)
alignment_input_filename = os.path.join(work_dir, "alignment_input_sequences.txt")

<h3 style="font-size:24px">Remove sequences with undefined residues</h3><br>

In [None]:
# Split the records on whether their sequence contains the placeholder "X".
non_ambiguous_sequences = [record for record in focused_sequences if "X" not in record.seq]
ambiguous_sequences = [record for record in focused_sequences if "X" in record.seq]

print(f"Filtering out sequences with undefined residues.\nOut of {len(focused_sequences)} sequences, {len(non_ambiguous_sequences)} remain.")
# From here on only the unambiguous records are carried forward.
focused_sequences = non_ambiguous_sequences

<h3 style="font-size:24px">Remove sequences that are not Archaea or Bacteria</h3><br>

In [None]:
# JSON file with the per-sequence annotation records used by the taxonomy filter.
all_annotations_filename = "../../../data/ASST_raw_sequences/ASSTs_annotated_sequences.json"

import json
# NOTE(review): this rebinds the name `tqdm` to the tqdm *module*, replacing the
# callable bound earlier by `from tqdm.auto import tqdm`; code in this cell
# therefore has to call `tqdm.tqdm(...)`.
import tqdm
# Annotation records; starts as an empty list and is replaced by the parsed JSON.
YcaO_data = []
with open(all_annotations_filename, 'r') as f:
    YcaO_data = json.load(f)

def get_item_by_accession(seqs, accession):
    """Return the first record of ``seqs`` matching ``accession``.

    A record matches when its ``'Accession_Interpro'`` value equals the part of
    ``accession`` before the first ``"."``. Returns ``None`` when nothing matches.
    """
    return next(
        (entry for entry in seqs if entry['Accession_Interpro'] == accession.split(".")[0]),
        None,
    )

# Position inside each record's "lineage" list that is compared against
# "Bacteria"/"Archaea" (presumably the domain of life -- confirm against the data).
taxonomy_index = 0
taxonomy_selected_sequences = []
# Number of rejected sequences per lineage name.
branches_of_life = {}
# Iterate over focused_sequences itself rather than indexing a different list
# (non_ambiguous_sequences) with its length, so the cell stays correct even if
# the two lists ever diverge (e.g. after re-running cells out of order).
for sequence in tqdm.tqdm(focused_sequences, desc="Processing Sequence taxonomy"):
    annotation = get_item_by_accession(YcaO_data, sequence.id)
    if annotation is None:
        # Fail with the offending accession instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise LookupError(
            f"No annotation entry found for sequence {sequence.id!r} in {all_annotations_filename}"
        )
    branch_of_life = annotation["lineage"][taxonomy_index].replace(" ", "_")
    if branch_of_life in ("Bacteria", "Archaea"):
        taxonomy_selected_sequences.append(sequence)
    else:
        branches_of_life[branch_of_life] = branches_of_life.get(branch_of_life, 0) + 1

print(f"Filtering out sequences that do not come from Archaea or Bacteria.\nOut of {len(focused_sequences)} sequences, {len(taxonomy_selected_sequences)} remain.")
print(f"Sequences of other branches that were filtered out:")
print(json.dumps(branches_of_life, indent=4))

focused_sequences = taxonomy_selected_sequences

<h3 style="font-size:24px">Remove sequences of abnormal size</h3><br>

In [None]:
# Inclusive length window; anything shorter or longer is treated as abnormal.
min_sequence_length = 200
max_sequence_length = 400

abnormal_sized_seqs = []
size_filtered_sequences = []
for seq in focused_sequences:
    if min_sequence_length <= len(seq.seq) <= max_sequence_length:
        size_filtered_sequences.append(seq)
    else:
        abnormal_sized_seqs.append(seq)

pre_filter = [len(seq.seq) for seq in focused_sequences]
filtered = [len(seq.seq) for seq in size_filtered_sequences]

if pre_filter:
    bin_width = 5
    # The kept lengths are a subset of pre_filter, so its maximum bounds both histograms.
    bins = range(0, max(pre_filter) + bin_width, bin_width)

    fig, ax = plt.subplots(figsize=(10,5))
    ax.hist(pre_filter, bins, color='red', alpha=0.5, label='Pre-Filter')
    ax.hist(filtered, bins, color='blue', alpha=0.5, label='Filtered')
    ax.set_title("Filtering By Sequence Lengths")
    ax.set_xlabel("Sequence Length")
    ax.set_ylabel("Count")
    # Without this call the `label=` arguments above are never displayed.
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    # max() of an empty list would raise; there is nothing to plot anyway.
    print("No sequences available; skipping the length histogram.")

print(f"Filtering out sequences with strange sizes.\nOut of {len(focused_sequences)} sequences, {len(size_filtered_sequences)} remain.")
focused_sequences = size_filtered_sequences

<h3 style="font-size:24px">Save Sequences to File</h3><br>

In [None]:
# Write the filtered records to alignment_input_filename in FASTA format.
SeqIO.write(focused_sequences, alignment_input_filename, "fasta")

<br><br>
<h1 style="font-size:36px" align="center"> Create Phylogenetic Tree Data </h1><br><br><br><br><br><br>

In [1]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import copy
import re
import math
import json
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# JSON file with the per-sequence annotation records.
all_annotations_filename = "../../../data/ASST_raw_sequences/ASSTs_annotated_sequences.json"
# FASTA file whose records define the set of sequences to annotate.
sequences_filename = "../../../data/ASST_processed_sequences/23_02_24_Quick_Tree/clustalo_5_per_cluster_70_ID_final_MSA.fa"
# Destination of the annotation dictionary in JSON form.
output_filename = "../../../data/ASST_processed_sequences/23_02_24_Quick_Tree/tree_annotations.json"
# Destination of the same annotations as a tab-separated table.
output_filename_tsv = "../../../data/ASST_processed_sequences/23_02_24_Quick_Tree/tree_annotations.tsv"

# Working directory; the BLAST database folder is created beneath it.
work_dir = "../../../data/ASST_processed_sequences/23_02_24_Quick_Tree/"

In [3]:
# Records of the alignment FASTA; their ids are the keys of the annotation table.
sequences = list(SeqIO.parse(sequences_filename, "fasta"))

# Annotation records for all sequences (empty list until the JSON is parsed).
general_json_data = []
with open(all_annotations_filename, 'r') as annotations_file:
    general_json_data = json.load(annotations_file)

sequence_accessions = [record.id for record in sequences]
# One (initially empty) annotation dict per accession.
data = {seq_accession: {} for seq_accession in sequence_accessions}
    
def get_item_by_accession(seqs, accession):
    """Look up an annotation record by InterPro accession.

    Scans ``seqs`` in order and returns the first record whose
    ``'Accession_Interpro'`` equals ``accession`` with everything from the
    first ``"."`` onwards removed; returns ``None`` if no record matches.
    """
    for candidate in seqs:
        if candidate['Accession_Interpro'] != accession.split(".")[0]:
            continue
        return candidate
    return None
def get_item_by_RefSeq_accession(seqs, accession):
    """Return the first record of ``seqs`` whose ``'Accession_RefSeq'`` equals
    the part of ``accession`` before the first ``"."``, or ``None`` if absent."""
    return next(
        (entry for entry in seqs if entry['Accession_RefSeq'] == accession.split(".")[0]),
        None,
    )

def add_data(header,d):
    """Attach one annotation column to the global ``data`` table.

    ``d`` holds one value per sequence, in the order of ``sequence_accessions``;
    each value is stored as ``data[accession][header]``.

    Raises:
        Exception: if ``d`` does not have as many entries as ``sequences``.
    """
    if len(d) == len(sequences):
        for position in range(len(d)):
            data[sequence_accessions[position]][header] = d[position]
        return
    raise Exception("The data supplied is the wrong length, it needs to be the same length as the total number of sequences!")

In [4]:
# Position inside each record's "lineage" list that is used as the taxon label.
taxonomy_index = 0
sequence_taxonomy = []
for accession in sequence_accessions:
    lineage = get_item_by_accession(general_json_data, accession)["lineage"]
    # Spaces are replaced by underscores in the stored label.
    sequence_taxonomy.append(lineage[taxonomy_index].replace(" ", "_"))
add_data("taxa", sequence_taxonomy)

import random
import colorsys
import matplotlib

def generate_colors(sequence_taxonomy, seed=42):
    """Map every taxon label to a hex colour and return one colour per input item.

    The distinct labels are sorted; the first ten receive fixed palette colours
    and any further label receives a pseudo-random colour.

    Args:
        sequence_taxonomy: iterable of taxon labels, one per sequence.
        seed: seed of the private random generator used for labels beyond the
            fixed palette, so that repeated runs give identical colours.

    Returns:
        List of ``'#rrggbb'`` strings in the same order as ``sequence_taxonomy``.
    """
    unique_taxa = sorted(set(sequence_taxonomy))
    print(unique_taxa)
    predefined_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
    # A private generator keeps the fallback colours reproducible and leaves
    # the module-level random state untouched.
    rng = random.Random(seed)
    taxa_colors = {}
    for color_index, taxa in enumerate(unique_taxa):
        if color_index < len(predefined_colors):
            taxa_colors[taxa] = predefined_colors[color_index]
        else:
            h, s, v = rng.random(), 0.5 + rng.random() / 2.0, 0.4 + rng.random() / 5.0
            r, g, b = colorsys.hsv_to_rgb(h, s, v)
            taxa_colors[taxa] = '#{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))
    print(taxa_colors)
    return [taxa_colors[taxa] for taxa in sequence_taxonomy]


# One colour per sequence, derived from its taxon label, stored under "color".
sequence_colors = generate_colors(sequence_taxonomy)
add_data("color",sequence_colors)


['Archaea', 'Bacteria']
{'Archaea': '#1f77b4', 'Bacteria': '#ff7f0e'}


In [5]:
#Load special sequences and find them in the tree
# define the starting directory
root_dir = "../../../data/ASST_raw_sequences/interesting_sequences/"

import os
import shutil

# Collect (fasta path, name of the containing folder) for every file below
# root_dir whose lower-cased name contains "sequence.fa".
fasta_paths = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        if  "sequence.fa" in file.lower():
            fasta_path = os.path.join(dirpath, file)
            name = os.path.basename(os.path.dirname(fasta_path))

            # append the fasta path and the name to the list as a tuple
            fasta_paths.append((fasta_path, name))


db_dir = os.path.join(work_dir,"db_data/")
db_filename = os.path.join(db_dir,"my_blast_db")
db_sequences_filename = os.path.join(db_dir,"db_sequences.fa")

# Always start from an empty database directory. The previous contents are
# removed in pure Python rather than by interpolating a path into `rm -rf`,
# which is safer with unusual paths and also works without a POSIX shell.
if os.path.exists(db_dir):
    shutil.rmtree(db_dir)
os.makedirs(db_dir)


def get_full_sequence(seq_id):
    """Build a ``SeqRecord`` from the annotation entry of ``seq_id``.

    The record's sequence is the entry's ``"seq"`` value, its id is the entry's
    ``"Accession_Interpro"`` value and its description is empty.
    """
    entry = get_item_by_accession(general_json_data, seq_id)
    return SeqRecord(Seq(entry["seq"]), id=entry["Accession_Interpro"], description="")
    
db_sequences = []
for accession in sequence_accessions:
    db_sequences.append(get_full_sequence(accession))
SeqIO.write(db_sequences, db_sequences_filename, "fasta")

!makeblastdb -in {db_sequences_filename} -dbtype prot -out {db_filename}

import subprocess

# For every reference FASTA, run blastp against the tree database and keep the
# subject ids of the reported hits, in the order blastp prints them.
path_accessions = []
for i in range(0,len(fasta_paths)):
    fasta_path = fasta_paths[i][0]
    print(f"{i+1}/{len(fasta_paths)} Processing {fasta_path}")
    shortened_path = fasta_paths[i][1]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being re-interpreted.
    output = subprocess.run(
        ["blastp", "-db", db_filename, "-query", fasta_path, "-outfmt", "6 sseqid", "-evalue", "1e-50"],
        capture_output=True,
    )
    if output.returncode != 0:
        # A failed search must not be mistaken for "no hits".
        raise RuntimeError(
            f"blastp failed for {fasta_path} (exit code {output.returncode}):\n{output.stderr.decode()}"
        )
    # Skip blank lines so that an empty result is [] rather than [''].
    accession_numbers = [line.strip() for line in output.stdout.decode().splitlines() if line.strip()]
    print(f"Got results:\n{accession_numbers}")
    path_accessions.append((shortened_path, accession_numbers))




Building a new DB, current time: 02/27/2024 15:05:07
New DB name:   /files/data/ASST_processed_sequences/23_02_24_Quick_Tree/db_data/my_blast_db
New DB title:  ../../../data/ASST_processed_sequences/23_02_24_Quick_Tree/db_data/db_sequences.fa
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 805 sequences in 0.087117 seconds.


1/24 Processing ../../../data/ASST_raw_sequences/interesting_sequences/3ELQ/sequence.fa
Got results:
['A0A444R377', 'A0A447PXQ2', 'A0A744VXH3', 'R5Q6T1', 'A0A831LQ00', 'A0A139JVE0', 'H3KH99', 'A0A381DJT3', 'A0A659PJW9', 'A0A2R4P1T2', 'A0A496LFW1', 'A0A139JS08']
2/24 Processing ../../../data/ASST_raw_sequences/interesting_sequences/ASTB/sequence.fa
Got results:
['I4A3U7', 'A0A942KLW0', 'A0A8J7RUW5', 'A0A8J7RRZ2', 'A0A1Y4RY76', 'A0A2V2EI48', 'A0A143WX81', 'A0A7X1LV92', 'A0A1Y4RZ00', 'A0A4V5NPL0', 'A0A2V3Y1W7', 'A0A1C5Y9T5', 'A0A1C6AJ16', 'A0A2L1GLL8', 'A0A1Y4CK53', 'A0A350X2F4', 'A0A5N1BVY2', 'A0A847E3G1', 'A0

In [6]:
import itertools

def get_similar_sequence_name_if_any(accession,path_accessions):
    """List every reference whose hit list contains ``accession``.

    ``path_accessions`` is a sequence of ``(name, hits)`` pairs. The result holds
    one ``(position of accession in hits, name)`` tuple per matching pair, in
    the order of ``path_accessions``.
    """
    return [
        (path[1].index(accession), path[0])
        for path in path_accessions
        if accession in path[1]
    ]

# Every accession reported as a hit for any reference sequence.
similar_accessions = list(itertools.chain.from_iterable(path[1] for path in path_accessions))

# "Y" when the tree sequence was hit by at least one reference, otherwise "N".
sequence_characterised = []
for accession in sequence_accessions:
    sequence_characterised.append("Y" if accession in similar_accessions else "N")

# For each tree sequence, the (rank, reference name) pairs of the hits it appears in.
sequence_related_sequence = []
for accession in sequence_accessions:
    sequence_related_sequence.append(get_similar_sequence_name_if_any(accession, path_accessions))


def get_closest_sequence_if_any(accession,path_accessions):
    """Return the name of the first reference whose top hit is ``accession``.

    ``path_accessions`` is a sequence of ``(name, hits)`` pairs; pairs with an
    empty hit list are skipped. Returns ``"_"`` when no reference qualifies.
    """
    for important_sequence_info in path_accessions:
        hits = important_sequence_info[1]
        if hits == []:
            continue
        if hits[0] == accession:
            return important_sequence_info[0]
    return "_"
    
# Name of the reference whose best hit is the sequence, or "_" if there is none.
important_sequences = []
for accession in sequence_accessions:
    important_sequences.append(get_closest_sequence_if_any(accession, path_accessions))


# add_data("related_to_known_seq",sequence_characterised)
# add_data("related_seq",sequence_related_sequence)
add_data("important_seq",important_sequences)

In [7]:
# Sequence size
# Length of the "seq" value of each sequence's annotation entry.
sequences_sizes = []
for accession in sequence_accessions:
    annotation = get_item_by_accession(general_json_data, accession)
    sequences_sizes.append(len(annotation["seq"]))

add_data("sequence_length",sequences_sizes)

In [30]:
# Domain Info
domain_info_filename = "../../../data/ASST_raw_sequences/ASSTs_domain_data.json"

# Domain records; an empty list until the JSON file has been parsed.
domain_info = []
with open(domain_info_filename, 'r') as domain_file:
    domain_info = json.load(domain_file)

def get_domain_info(accession):
    """Return the ``"domains"`` value recorded for ``accession`` in ``domain_info``.

    Only the first record whose ``"Accession_Interpro"`` equals ``accession`` is
    considered. An empty list is returned when that record's domains contain
    ``"error"`` or when no record matches.
    """
    for element in domain_info:
        if element["Accession_Interpro"] != accession:
            continue
        domains = element["domains"]
        return [] if "error" in domains else domains
    return []
    
# Domain list of every tree sequence, in the order of sequence_accessions.
sequence_domains = []
for accession in sequence_accessions:
    sequence_domains.append(get_domain_info(accession))

from itertools import chain
from collections import Counter
# Distinct domain identifiers seen in any sequence.
unique_domains = {domain_id for domain_list in sequence_domains for domain_id in domain_list}

def generate_colour_from_set(value,set_of_values):
    """Return a stable hex colour for ``value`` based on its position in ``set_of_values``.

    The first ten positions map to a fixed palette; later positions get a
    pseudo-random colour seeded by the position, so the same position always
    yields the same colour.

    Args:
        value: the item to colour; must be contained in ``set_of_values``.
        set_of_values: collection of all items. Ordered collections keep their
            order; sets are sorted first because their iteration order is not
            stable between interpreter runs.

    Returns:
        A ``'#rrggbb'`` string.

    Raises:
        ValueError: if ``value`` is not in ``set_of_values``.
    """
    predefined_colors = [ '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf', '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    if isinstance(set_of_values, (set, frozenset)):
        try:
            ordered_values = sorted(set_of_values)
        except TypeError:
            # Items that cannot be compared with each other: order by repr.
            ordered_values = sorted(set_of_values, key=repr)
    else:
        ordered_values = list(set_of_values)
    value_index = ordered_values.index(value)
    if value_index < len(predefined_colors):
        return predefined_colors[value_index]
    rng_preset_seed = 42
    # A private generator gives the same numbers as seeding the global one,
    # without resetting the random state of the rest of the notebook.
    rng = random.Random(rng_preset_seed + value_index)
    h, s, v = rng.random(), 0.5 + rng.random() / 2.0, 0.4 + rng.random() / 5.0
    r, g, b = colorsys.hsv_to_rgb(h, s, v)
    return '#{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))


all_domains = list(chain.from_iterable(sequence_domains))
# Count occurrences of each domain
domain_counter = Counter(all_domains)

# Get the 10 most common domains
top_10_domains = domain_counter.most_common(10)

# Extract just the domain names from the Counter results
most_common_domains = [domain for domain, count in top_10_domains]

print("Top 10 Most Common Domains:\n")
for domain in most_common_domains:
    print(domain)
    # "Y" where the sequence carries this domain, "N" otherwise (the flag has
    # to depend on `domain`, otherwise every sequence is marked "Y").
    # NOTE(review): this list is overwritten on every iteration and is not
    # stored anywhere; presumably it was meant to be passed to add_data -- confirm.
    domain_in_sequence = ["Y" if domain in domains else "N" for domains in sequence_domains]

Top 10 Most Common Domains:

IPR010262
PTHR35340
IPR011047
IPR035391
IPR038477
IPR026444
PS51257
SSF63829
IPR013783
IPR011044


In [8]:
# Serialise the complete annotation dictionary to output_filename as JSON.
with open(output_filename, "w") as json_file:
    json.dump(data, json_file)

import csv
def convert_json_to_tsv(data, output_path):
    """Write a ``{name: {field: value}}`` mapping as a tab-separated table.

    The header row is ``Name`` followed by the field names in sorted order;
    each following row holds the mapping key and its values in that same order.

    Args:
        data: mapping from row name to a dict of field values.
        output_path: path of the TSV file to create (UTF-8).

    Returns:
        A confirmation message containing ``output_path``.

    Raises:
        ValueError: if the inner dicts do not all share the same set of fields.
    """
    # Union of all field names that occur in any entry.
    fields = set()
    for item in data.values():
        fields.update(item.keys())

    # Every entry must provide exactly that set of fields.
    if any(set(item.keys()) != fields for item in data.values()):
        raise ValueError("All elements in the dictionary must have the same fields.")

    header = ["Name"] + sorted(fields)
    rows = [
        [seq_id] + [details[field] for field in sorted(details)]
        for seq_id, details in data.items()
    ]

    with open(output_path, 'wt', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)

    return f"Data successfully written to {output_path}"

# Export the annotation table as TSV; the returned message is the cell output.
convert_json_to_tsv(data,output_filename_tsv)

'Data successfully written to ../../../data/ASST_processed_sequences/23_02_24_Quick_Tree/tree_annotations.tsv'

In [None]:
#Load RODEO data
# NOTE(review): `rodeo_dir` is not assigned in any cell shown in this notebook;
# it has to be defined before this cell can run.
rodeo_files = [
    os.path.join(subdir, file)
    for subdir, dirs, files in os.walk(rodeo_dir)
    for file in files
    if file.endswith("main_co_occur.csv")
]

# All rows of all co-occurrence tables, each row as a dict keyed by column name.
rodeo_data = []
for rodeo_path in rodeo_files:
    with open(rodeo_path) as csvfile:
        rodeo_data.extend(csv.DictReader(csvfile))


# Maps each "Query" value to the list of Pfam identifiers found in its rows.
processed_rodeo_data = {}
for row in tqdm(rodeo_data, desc="Processing Rodeo Data"):
    query_hits = processed_rodeo_data.setdefault(row["Query"], [])
    contents = []
    for key, value in row.items():
        if key is None:
            # csv.DictReader stores fields beyond the header under the key None
            # as a list; keep those that look like a Pfam id.
            if isinstance(value, list):
                contents.extend(v for v in value if re.search(r'PF\d{5}', v))
        elif "PfamID" in key and value is not None:
            contents.append(value)
    query_hits.extend(contents)

def get_sequence_ID_present(Pfam_ID):
    """Flag, per tree sequence, whether ``Pfam_ID`` occurs in its RODEO data.

    Args:
        Pfam_ID: identifier looked up in the value lists of ``processed_rodeo_data``.

    Returns:
        A list aligned with ``sequence_accessions`` containing ``"Y"`` (the
        sequence's RefSeq accession is a RODEO query that lists the id),
        ``"N"`` (it is a RODEO query that does not list the id) or ``"U"``
        (no annotation entry, no ``"Accession_RefSeq"`` field, or the accession
        is not a RODEO query).
    """
    # Sets make the per-sequence membership tests constant time.
    # Assumes RefSeq accessions are hashable (strings) -- TODO confirm.
    RefSeq_ids_with_ID = set()
    RefSeq_ids_without_ID = set()
    for key, value in processed_rodeo_data.items():
        if Pfam_ID in value:
            RefSeq_ids_with_ID.add(key)
        else:
            RefSeq_ids_without_ID.add(key)

    sequence_ID_present = []

    for accession in tqdm(sequence_accessions, desc=f"Scanning sequences for {Pfam_ID}"):
        # This section loads its annotations into general_json_data; YcaO_data
        # is only defined by other sections of the notebook.
        accession_data = get_item_by_accession(general_json_data, accession)
        ID_present = "U"
        # A sequence without an annotation entry stays "unknown" instead of crashing.
        if accession_data is not None and "Accession_RefSeq" in accession_data:
            refSeq = accession_data["Accession_RefSeq"]
            if refSeq in RefSeq_ids_with_ID:
                ID_present = "Y"
            elif refSeq in RefSeq_ids_without_ID:
                ID_present = "N"
        sequence_ID_present.append(ID_present)
    return sequence_ID_present



In [None]:
#Annotate all sequences whether or not they were tridomain
# NOTE(review): `E1_data` is not assigned in any cell shown in this notebook.
E1_sequences = []
for val in E1_data:
    E1_sequences.append(val["Accession_Interpro"])

# "Y" when the accession (without its ".suffix") is listed in E1_sequences.
sequence_contains_E1 = []
for accession in sequence_accessions:
    sequence_contains_E1.append("Y" if accession.split(".")[0] in E1_sequences else "N")
add_data("contains_E1",sequence_contains_E1)

In [None]:

# Serialise the annotation dictionary to additional_data_filename as JSON.
# NOTE(review): `additional_data_filename` is not assigned in any cell shown here.
with open(additional_data_filename, "w") as json_file:
    json.dump(data, json_file)


<br><br>
<h1 style="font-size:36px" align="center"> Diagrams for fun </h1><br><br><br><br><br><br>

In [None]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import json
import matplotlib.colors as mcolors
import random
# Annotation JSON used by the diagram cells below.
# NOTE(review): this path names a YcaO data set, whereas the earlier sections
# of the notebook read ASST files -- confirm this is the intended input.
all_annotations_filename = "../raw_sequences/interpro_all_YcaO_annotated.json"


<h3 style="font-size:24px"> Define Parameters</h3><br>

In [None]:
# FASTA file whose record ids select the sequences shown in the diagram.
input_sequences_filename = "../processed_sequences/initial_dataset/alignment_input_sequences.txt"

In [None]:
# Records of the input FASTA and their identifiers.
sequences = list(SeqIO.parse(input_sequences_filename, "fasta"))
sequence_accessions = [record.id for record in sequences]

# Annotation records (empty list until the JSON file has been parsed).
YcaO_data = []
with open(all_annotations_filename, 'r') as annotations_file:
    YcaO_data = json.load(annotations_file)
           
def get_item_by_accession(seqs, accession):
    """Find the annotation record belonging to ``accession``.

    Returns the first element of ``seqs`` whose ``'Accession_Interpro'`` equals
    ``accession`` truncated at its first ``"."``, or ``None`` if there is none.
    """
    found = None
    for candidate in seqs:
        if candidate['Accession_Interpro'] == accession.split(".")[0]:
            found = candidate
            break
    return found

In [None]:
# Nested counts: taxonomy_info[first lineage entry][second lineage entry] -> number of sequences.
taxonomy_info = {}
for accession in tqdm(sequence_accessions):
    taxonomy = get_item_by_accession(YcaO_data, accession)["lineage"]
    second_level_counts = taxonomy_info.setdefault(taxonomy[0], {})
    second_level_counts[taxonomy[1]] = second_level_counts.get(taxonomy[1], 0) + 1


In [None]:
# Two-ring pie chart of the taxonomy counts: the inner ring shows the first
# lineage level, the outer ring the second level. Works for any number of
# first-level groups instead of assuming exactly two.
top_level = []
first_level = []
vals = []
for key in taxonomy_info.keys():
    top_level.append(key)
    keys2 = list(taxonomy_info[key].keys())
    # Random wedge order within each first-level group.
    random.shuffle(keys2)
    # Only second-level groups with more than 30 sequences get their own wedge.
    kept_keys = [key2 for key2 in keys2 if taxonomy_info[key][key2] > 30]
    first_level.extend(kept_keys)
    vals.append([taxonomy_info[key][key2] for key2 in kept_keys])

fig, ax = plt.subplots(figsize=(15, 15))

# Radial thickness of each ring.
size = 0.3

inner_layer = [sum(x) for x in vals]
outer_layer = [item for sublist in vals for item in sublist]

# One base colour per first-level group, cycling through the palette if needed.
base_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
inner_colors = [base_colors[group % len(base_colors)] for group in range(len(vals))]

# Outer wedges reuse their group's base colour with a random alpha suffix
# (lower bound 0.2 for the first group and 0.3 for the others).
outer_colors = []
for group, group_vals in enumerate(vals):
    min_alpha = 0.2 if group == 0 else 0.3
    for _ in group_vals:
        outer_colors.append(inner_colors[group] + '{:02x}'.format(int(random.uniform(min_alpha, 1) * 255)))

ax.pie(outer_layer, radius=1, colors=outer_colors,
       labels=first_level, textprops={'fontsize': 13},
       wedgeprops=dict(width=size, edgecolor='w'))

ax.pie(inner_layer, radius=1-size, colors=inner_colors,
       labels=top_level, textprops={'fontsize': 20},labeldistance=.6,
       wedgeprops=dict(width=size, edgecolor='w'))

ax.set(aspect="equal", title='Taxonomy of YcaO sequences')
plt.show()