In [91]:
import os
from Bio import SeqIO
from Bio.Seq import Seq
from collections import Counter
from typing import List, Dict
import numpy as np
import numpy as np
from typing import List, Dict
from collections import Counter
from numpy.linalg import norm

START_CODON = "ATG"
STOP_CODONS = ["TAA", "TAG", "TGA"]
FASTA_DIRECTORY = "viruses/data"
file_prefixes = ["bacterial", "mamalian"]
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

In [92]:
class Virus:
    def __init__(self, fileName, name, sequence, orfs):
        self.fileName = fileName
        self.name = name
        self.sequence = sequence 
        self.orfs = orfs 
        self.protein_sequences = []  
        self.proteinsDicodones = [] 
        self.codon_frequency = [] 
        self.dicodon_frequency = []
        self.amino_acid_frequencies = {}

    def calculate_frequencies(self):
        combined_sequence = ''.join(self.protein_sequences)
        self.amino_acid_frequencies = self.calculate_amino_acid_frequencies(combined_sequence)
        self.dicodon_frequency = self.calculate_dicodon_frequencies(combined_sequence)

    @staticmethod
    def calculate_amino_acid_frequencies(sequence: str) -> Dict[str, float]:
        amino_acid_count = Counter(sequence)
        total_amino_acids = sum(amino_acid_count.values())
        return {amino_acid: count / total_amino_acids for amino_acid, count in amino_acid_count.items()}

    @staticmethod
    def calculate_dicodon_frequencies(sequence: str) -> Dict[str, float]:
        dicodons = [sequence[i:i + 2] for i in range(len(sequence) - 1)]
        dicodon_count = Counter(dicodons)
        total_dicodons = sum(dicodon_count.values())
        return {dicodon: count / total_dicodons for dicodon, count in dicodon_count.items()}

In [93]:
def validate_sequence(seq):
    
    if START_CODON not in seq:
        return False, "No start codon found"
    
    if seq[-3:] not in STOP_CODONS:
        return False, "No valid stop codon found"
    
    if len(seq) % 3 != 0:
        return False, "Sequence length is not divisible by 3"
    
    if len(seq) < 100:
        return False, "Sequence length is less than 100 bp"
    
    return True, "Sequence is valid"

In [94]:
def find_orfs(sequence):
    orfs = []
    seq_len = len(sequence)
    
    for strand, nuc_seq in [(+1, sequence), (-1, sequence.reverse_complement())]:
        for frame in range(3):
            start_pos = None
            for i in range(frame, seq_len - 2, 3):
                codon = str(nuc_seq[i:i+3])
                if codon == START_CODON:
                    start_pos = i
                elif codon in STOP_CODONS and start_pos is not None:
                    stop_pos = i + 3
                    if (stop_pos - start_pos) >= 100:
                        orfs.append(str(nuc_seq[start_pos:stop_pos]))
                    start_pos = None
    return orfs

In [95]:
def process_fasta(file_path):
    all_orfs = []
    for record in SeqIO.parse(file_path, "fasta"):
        orfs = find_orfs(record.seq)
        all_orfs.extend(orfs)
    return all_orfs, record

In [96]:
def process_files(fasta_directory, file_prefixes):
    for prefix in file_prefixes:
        file_path = f"{fasta_directory}/{prefix}.fasta"
        orfs, record = process_fasta(file_path)
        virus = Virus(fileName=f"{prefix}.fasta", name=record.name, sequence=record.seq, orfs=orfs)
    return virus

In [97]:
def translate_to_protein(virus):
    protein_sequences = []
    proteinsDicodones = []

    for sequence  in virus.orfs:
        protein = Seq(sequence).translate()
        protein_sequences.append(str(protein))

        dicodons = [str(protein[i:i+2]) for i in range(len(protein)-1)]
        proteinsDicodones.append(dicodons[:-1])
    return protein_sequences, proteinsDicodones

In [98]:
def get_unique_frequencies(viruses: List[Virus], frequency_type: str) -> Dict[str, int]:
    unique_frequencies = set()
    
    for virus in viruses:
        if frequency_type == 'amino_acid':
            unique_frequencies.update(virus.amino_acid_frequencies.keys())
        elif frequency_type == 'dicodon':
            unique_frequencies.update(virus.dicodon_frequency.keys())
        else:
            raise ValueError(f"Unknown frequency type: {frequency_type}")
    
    return {item: idx for idx, item in enumerate(sorted(unique_frequencies))}

In [99]:
def calculate_distance_matrix(viruses: List[Virus], frequency_type: str = 'amino_acid') -> np.ndarray:
    num_viruses = len(viruses)
    distance_matrix = np.zeros((num_viruses, num_viruses))

    unique_frequencies = get_unique_frequencies(viruses, frequency_type)

    for i in range(num_viruses):
        frequency_vector_i = np.zeros(len(unique_frequencies))
        
        if frequency_type == 'amino_acid':
            for amino_acid, freq in viruses[i].amino_acid_frequencies.items():
                frequency_vector_i[unique_frequencies[amino_acid]] = freq
        elif frequency_type == 'dicodon':
            for dicodon, freq in viruses[i].dicodon_frequency.items():
                frequency_vector_i[unique_frequencies[dicodon]] = freq

        for j in range(i, num_viruses):
            frequency_vector_j = np.zeros(len(unique_frequencies))
            if frequency_type == 'amino_acid':
                for amino_acid, freq in viruses[j].amino_acid_frequencies.items():
                    frequency_vector_j[unique_frequencies[amino_acid]] = freq
            elif frequency_type == 'dicodon':
                for dicodon, freq in viruses[j].dicodon_frequency.items():
                    frequency_vector_j[unique_frequencies[dicodon]] = freq

            i_j = frequency_vector_i - frequency_vector_j
                        
            distance = norm(i_j, ord=2)

            distance_matrix[i][j] = distance
            distance_matrix[j][i] = distance

    return distance_matrix

In [100]:
def write_phylip_format(viruses: List[Virus], distance_matrix: np.ndarray, file_name: str):
    num_viruses = len(viruses)

    if distance_matrix.shape[0] != num_viruses or distance_matrix.shape[1] != num_viruses:
        raise ValueError("Distance matrix dimensions do not match the number of viruses")

    phylip_lines = []
    
    phylip_lines.append(f"{num_viruses}")
    
    for i in range(num_viruses):
        virus_name = viruses[i].name
        distances = [f"{distance_matrix[i][j]:8.4f}" for j in range(num_viruses)] 
        
        phylip_lines.append(f"{virus_name:<15}{' '.join(distances)}")

    with open(file_name, 'w') as file:
        file.write("\n".join(phylip_lines))

In [101]:
# Surinkti duomenis ir start end poras
viruses = []
for i in range(1, 5):
    for prefix in file_prefixes:
        virus = process_files(FASTA_DIRECTORY, [f"{prefix}{i}"])
        viruses.append(virus)

# Konvertuoti sekas į baltymo kodonus ir dikodonus
for virus in viruses:
    virus.protein_sequences, virus.proteinsDicodones = translate_to_protein(virus)

# Apskaičiuoti baltymo ir dikodonų dažnius
for virus in viruses:
    virus.calculate_frequencies()

# Apskaičiuoti atstumų matricas
amino_acid_distance_matrix = calculate_distance_matrix(viruses, frequency_type='amino_acid')
dicodon_distance_matrix = calculate_distance_matrix(viruses, frequency_type='dicodon')

# Spausdinti matricas, jei reikia
if False:
    print("Amino Acid Distance Matrix:")
    print(amino_acid_distance_matrix)

    print("Dicodon Distance Matrix:")
    print(dicodon_distance_matrix)

# Sukurti ir išsaugoti atstumo matricas phylip formatu
file_name = "codon viruses_distance_matrix.phy"
write_phylip_format(viruses, amino_acid_distance_matrix, file_name)
file_name = "dicodon viruses_distance_matrix.phy"
write_phylip_format(viruses, dicodon_distance_matrix, file_name)

In [108]:
# Labiausiai varijuojančių kodonų ir dikodonų paieška
def find_most_variable_codons_dicodons(viruses, frequency_type='codon', top_n=5):
   
    all_frequencies = []
    if frequency_type == 'codon':
        codon_list = list(viruses[0].amino_acid_frequencies.keys()) 
        for virus in viruses:
            codon_freqs = [virus.amino_acid_frequencies.get(codon, 0) for codon in codon_list]
            all_frequencies.append(codon_freqs)
    elif frequency_type == 'dicodon':
        codon_list = list(viruses[0].dicodon_frequency.keys())
        for virus in viruses:
            dicodon_freqs = [virus.dicodon_frequency.get(dicodon, 0) for dicodon in codon_list]
            all_frequencies.append(dicodon_freqs)

    all_frequencies = np.array(all_frequencies)
    std_devs = np.std(all_frequencies, axis=0)
    sorted_indices = np.argsort(std_devs)[::-1]
    most_variable = [(codon_list[i], std_devs[i]) for i in sorted_indices[:top_n]]

    return most_variable

top_variable_codons = find_most_variable_codons_dicodons(viruses, frequency_type='codon', top_n=5)
top_variable_dicodons = find_most_variable_codons_dicodons(viruses, frequency_type='dicodon', top_n=5)

print("Top 5 labiausiai varijuojantys kodonai:")
for codon, variance in top_variable_codons:
    print(f"{codon}: {variance}")

print("\nTop 5 labiausiai varijuojantys dikodonai:")
for dicodon, variance in top_variable_dicodons:
    print(f"{dicodon}: {variance}")


Top 5 labiausiai varijuojantys kodonai:
R: 0.02376735004204152
I: 0.020765823658179663
K: 0.020472715049806457
A: 0.01492226505004192
N: 0.013723334338643492

Top 5 labiausiai varijuojantys dikodonai:
RR: 0.005642487278072178
LL: 0.004438320449881403
AA: 0.003428929951903611
II: 0.0032532247332899104
GG: 0.0030099978834778607
