## Load all sequence information and save it to a TSV for treeViewer

## Working with the 70% ID cutoff tree

In [5]:
# All file names below are built by appending to this directory prefix.
base_dir = "../processed_sequences/YcaO_domain_70/"

# JSON annotations for all sequences (loaded into YcaO_data).
all_annotations_filename = f"{base_dir}interpro_all_YcaO_annotated.json"
# JSON annotations loaded into E1_data; note the on-disk name is spelled "annotaded".
tridomain_annotations_filename = f"{base_dir}cyclodehydratase_annotaded_sequences.json"
# Sequences used for the tree; parsed as FASTA despite the .txt extension.
tree_sequences_filename = f"{base_dir}unique_filtered_70_sequences.txt"


from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import copy
import json

## Load sequences, accessions and annotations.

In [8]:
# Parse the tree FASTA once and keep every record in memory.
sequences = list(SeqIO.parse(tree_sequences_filename, "fasta"))
# The accession is the part of each record id that precedes the first "|".
sequence_accessions = [record.id.partition("|")[0] for record in sequences]

# Annotations read from all_annotations_filename.
YcaO_data = []
with open(all_annotations_filename) as annotations_file:
    YcaO_data = json.load(annotations_file)

# Annotations read from tridomain_annotations_filename.
E1_data = []
with open(tridomain_annotations_filename) as tridomain_file:
    E1_data = json.load(tridomain_file)


In [12]:
def get_item_by_accession(data, accession):
    """Return the first entry of ``data`` whose 'Accession_Interpro' equals ``accession``.

    ``data`` is scanned in order and the scan stops at the first match.
    ``None`` is returned when no entry matches.
    """
    matching_entries = (entry for entry in data if entry['Accession_Interpro'] == accession)
    return next(matching_entries, None)

In [44]:
# Position inside each annotation's "lineage" list; 0 selects its first entry.
taxonomy_index = 0


def _taxon_for_accession(accession):
    """Return the lineage entry at ``taxonomy_index`` for ``accession``, with spaces replaced by underscores.

    Raises:
        LookupError: if ``YcaO_data`` holds no annotation for ``accession``. Without this
            check the failure surfaces as an opaque "'NoneType' object is not subscriptable".
    """
    annotation = get_item_by_accession(YcaO_data, accession)
    if annotation is None:
        raise LookupError(
            f"No annotation found for accession {accession!r} in {all_annotations_filename}"
        )
    return annotation["lineage"][taxonomy_index].replace(" ", "_")


# One taxon label per tree sequence, in the same order as sequence_accessions.
sequence_taxonomy = [_taxon_for_accession(accession) for accession in sequence_accessions]

In [45]:
import random
import colorsys
import matplotlib

def generate_colors(sequence_taxonomy, seed=0):
    """Assign one hex colour per distinct taxon and return the colour of every sequence.

    Taxa receive colours in order of first appearance in ``sequence_taxonomy``: the
    first ten get the fixed palette below, any further taxon gets a pseudo-random
    colour. Iterating a ``set`` of strings and drawing from the unseeded global RNG
    made the taxon-to-colour mapping change between kernel runs; ordering by first
    appearance and using a locally seeded generator makes the output reproducible.

    Args:
        sequence_taxonomy: iterable-once-safe list of hashable taxon labels, one per sequence.
        seed: seed for the generator used for overflow colours. Pass ``None`` to
            get a different set of overflow colours on every call.

    Returns:
        A list of '#rrggbb' strings parallel to ``sequence_taxonomy``.
    """
    predefined_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
    # Local generator: does not disturb (and is not disturbed by) the global random state.
    rng = random.Random(seed)
    taxa_colors = {}
    # dict.fromkeys de-duplicates while preserving first-appearance order.
    for taxa in dict.fromkeys(sequence_taxonomy):
        if len(taxa_colors) < len(predefined_colors):
            taxa_colors[taxa] = predefined_colors[len(taxa_colors)]
        else:
            # Saturation in [0.5, 1.0) and value in [0.4, 0.6), hue unrestricted.
            h = rng.random()
            s = 0.5 + rng.random() / 2.0
            v = 0.4 + rng.random() / 5.0
            r, g, b = colorsys.hsv_to_rgb(h, s, v)
            taxa_colors[taxa] = '#{:02x}{:02x}{:02x}'.format(int(r * 255), int(g * 255), int(b * 255))
    return [taxa_colors[taxa] for taxa in sequence_taxonomy]


# One colour string per entry of sequence_taxonomy, in the same order.
sequence_colors = generate_colors(sequence_taxonomy)


In [116]:
import os

# define the starting directory
root_dir = "../raw_sequences"

# create an empty list to store the file paths
fasta_paths = []

# Walk all subdirectories in sorted order. os.walk otherwise yields entries in whatever
# order the filesystem returns them, and the order of fasta_paths decides which path name
# wins later when several query files hit the same accession.
for dirpath, dirnames, filenames in os.walk(root_dir):
    # sorting dirnames in place fixes the order in which os.walk descends (top-down walk)
    dirnames.sort()
    for file in sorted(filenames):
        # keep any file whose name contains "fasta.fa" (case-insensitive substring match,
        # so names such as "PatD_Fasta.fa" are included as well)
        if "fasta.fa" in file.lower():
            # construct the full path of the file and record it
            fasta_path = os.path.join(dirpath, file)
            fasta_paths.append(fasta_path)

db_dir = base_dir+"db_data/"
db_filename = db_dir+"my_blast_db"

if not os.path.exists(db_dir):
    os.mkdir(db_dir)
    !makeblastdb -in {tree_sequences_filename} -dbtype prot -out {db_filename}
else:
    print("db_data folder already exists.")

import subprocess

def _label_for_fasta_path(fasta_path, root_dir):
    """Build a short label from the directories between ``root_dir`` and the FASTA file.

    The label joins the first two directory levels with "_" and appends the third
    path component unless that component contains "fasta" (case-insensitive).
    Paths shallower than two directory levels use whatever levels exist instead of
    raising IndexError; a file directly inside ``root_dir`` is labelled by its name.
    """
    parts = os.path.relpath(fasta_path, root_dir).split(os.sep)
    label_parts = parts[:-1][:2]
    if len(parts) > 2 and "fasta" not in parts[2].lower():
        label_parts.append(parts[2])
    return "_".join(label_parts) if label_parts else parts[-1]


# (label, [accessions]) for every query FASTA file, in the order of fasta_paths.
path_accessions = []
for i, fasta_path in enumerate(fasta_paths):
    print(f"{i}/{len(fasta_paths)} Processing {fasta_path}")
    shortened_path = _label_for_fasta_path(fasta_path, root_dir)
    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed to blastp verbatim. A missing blastp executable now
    # raises FileNotFoundError instead of silently producing no hits.
    output = subprocess.run(
        [
            "blastp",
            "-db", db_filename,
            "-query", fasta_path,
            "-outfmt", "6 sseqid",
            "-max_target_seqs", "1",
            "-evalue", "1e-50",
        ],
        capture_output=True,
    )
    if output.returncode != 0:
        # Report the failure rather than letting it look like "no hits".
        print(f"blastp failed for {fasta_path} (exit code {output.returncode}):\n{output.stderr.decode(errors='replace')}")
    # One subject id per output line; blank lines are dropped so "no hits" is [] rather than [''].
    accession_numbers = [line.strip() for line in output.stdout.decode().splitlines() if line.strip()]
    print(f"Got results:\n{accession_numbers}")
    path_accessions.append((shortened_path, accession_numbers))


db_data folder already exists.
0/27 Processing ../raw_sequences/Amidine/Bottromycin/Fasta.fa
Got results:
['K4MHF8']
1/27 Processing ../raw_sequences/Amidine/Jesko_but_unknown/Fasta.fa
Got results:
['A0A2X3D8D2']
2/27 Processing ../raw_sequences/Amidine/Jesko_but_unknown_outgroup_POTENTIALLY_IGNORE_DUE_TO_WEIRD_ALIGNMENT/outgroup_sequence_Fasta.fa
Got results:
['A0A4Q0YMK5']
3/27 Processing ../raw_sequences/Azoline/Bottromycin/Fasta.fa
Got results:
['K4MJT7']
4/27 Processing ../raw_sequences/Azoline/Cyanobactins/Cyanobactins_Mus/Fasta.fa
Got results:
['A0A5Q0TWV7']
5/27 Processing ../raw_sequences/Azoline/Cyanobactins/Cyanobactins_PatD/PatD_Fasta.fa
Got results:
['A0A166XY79']
6/27 Processing ../raw_sequences/Azoline/Cyanobactins/Cyanobactins_TruD/TruD_Fasta.Fa
Got results:
['A0A166XY79']
7/27 Processing ../raw_sequences/Azoline/LAPs/Azolemycin_updated/Fasta.fa
Got results:
['A0A3S8WK97']
8/27 Processing ../raw_sequences/Azoline/LAPs/BalhD/Fasta.fa
Got results:
['A0A246PI99']
9/27 Proc

In [121]:
import itertools

def get_similar_sequence_name_if_any(accession,path_accessions):
    """Return the name of the first entry in ``path_accessions`` that lists ``accession``.

    Each entry is indexed as ``(name, accessions)``. When no entry contains
    ``accession`` the placeholder "_" is returned.
    """
    names_of_matches = (entry[0] for entry in path_accessions if accession in entry[1])
    return next(names_of_matches, "_")

# Flatten the per-path accession lists into a single list.
similar_accessions = list(itertools.chain.from_iterable(entry[1] for entry in path_accessions))

# "Y"/"N": does the accession appear among the flattened accessions?
sequence_characterised = [
    "Y" if accession in similar_accessions else "N"
    for accession in sequence_accessions
]
# Name of the first path that lists the accession, or "_" when none does.
sequence_related_sequence = [
    get_similar_sequence_name_if_any(accession, path_accessions)
    for accession in sequence_accessions
]

In [122]:
#Annotate all sequences whether or not they were tridomain
E1_sequences = [data["Accession_Interpro"] for data in E1_data]
# Membership is tested once per tree sequence; a set makes each test constant-time
# instead of scanning the whole E1_sequences list every time.
E1_accession_lookup = set(E1_sequences)
sequence_contains_E1 = ["Y" if accession in E1_accession_lookup else "N" for accession in sequence_accessions]

In [124]:
import csv
# Output table, written under base_dir.
additional_data_filename = f"{base_dir}additional_data.tsv"
# Column titles for the output table.
headers = [
    'accession',
    'taxa',
    "color",
    "related_to_known_seq",
    "related_seq",
    "contains_E1",
]

def save_to_tsv(data, headers, file_name):
    """Write column-oriented ``data`` to ``file_name`` as a tab-separated table.

    Args:
        data: iterable of columns, each a sized sequence of cell values. All columns
            must have the same length. May be empty, in which case only the header
            row is written.
        headers: values for the first row.
        file_name: path of the file to write; an existing file is overwritten.

    Raises:
        ValueError: if the columns differ in length. The check runs before the file
            is opened, so a bad call neither truncates rows silently nor leaves a
            half-written file behind.
    """
    columns = list(data)
    column_lengths = {len(column) for column in columns}
    if len(column_lengths) > 1:
        raise ValueError(
            f"All columns must have the same length, got lengths {[len(column) for column in columns]}"
        )
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(headers)
        # zip(*columns) transposes columns into rows; it yields nothing for empty data.
        writer.writerows(zip(*columns))
            
# Columns are listed in the same order as the entries of headers.
save_to_tsv(
    [
        sequence_accessions,
        sequence_taxonomy,
        sequence_colors,
        sequence_characterised,
        sequence_related_sequence,
        sequence_contains_E1,
    ],
    headers,
    additional_data_filename,
)