In [109]:
import os
from Bio import SeqIO
from Bio.Seq import Seq
from collections import Counter

START_CODON = "ATG"
STOP_CODONS = ["TAA", "TAG", "TGA"]
FASTA_DIRECTORY = "viruses/data"
file_prefixes = ["bacterial", "mamalian"]
amino_acids = "ACDEFGHIKLMNPQRSTVWY"

In [110]:
class Virus:
    def __init__(self, fileName, name, sequence, orfs):
        self.fileName = fileName
        self.name = name
        self.sequence = sequence # Originali seka
        self.orfs = orfs # Sekos start-end 
        self.proteinsCodones = []
        self.proteinsDicodones = []
        self.codon_frequency = []
        self.dicodon_frequency = []

In [111]:
def validate_sequence(seq):
    
    if START_CODON not in seq:
        return False, "No start codon found"
    
    if seq[-3:] not in STOP_CODONS:
        return False, "No valid stop codon found"
    
    if len(seq) % 3 != 0:
        return False, "Sequence length is not divisible by 3"
    
    if len(seq) < 100:
        return False, "Sequence length is less than 100 bp"
    
    return True, "Sequence is valid"

In [112]:
def find_orfs(sequence):
    orfs = []
    seq_len = len(sequence)
    
    for strand, nuc_seq in [(+1, sequence), (-1, sequence.reverse_complement())]:
        for frame in range(3):
            start_pos = None
            for i in range(frame, seq_len - 2, 3):
                codon = str(nuc_seq[i:i+3])
                if codon == START_CODON:
                    start_pos = i
                elif codon in STOP_CODONS and start_pos is not None:
                    stop_pos = i + 3
                    if (stop_pos - start_pos) >= 100:
                        orfs.append(str(nuc_seq[start_pos:stop_pos]))
                    start_pos = None
    return orfs

In [113]:
def process_fasta(file_path):
    all_orfs = []
    for record in SeqIO.parse(file_path, "fasta"):
        orfs = find_orfs(record.seq)
        all_orfs.extend(orfs)
    return all_orfs, record

In [114]:
def process_files(fasta_directory, file_prefixes):
    for prefix in file_prefixes:
        file_path = f"{fasta_directory}/{prefix}.fasta"
        orfs, record = process_fasta(file_path)
        virus = Virus(fileName=f"{prefix}.fasta", name=record.name, sequence=record.seq, orfs=orfs)
    return virus

In [115]:
def translate_to_protein(virus):
    proteins = []
    proteinsDicodones = []

    for sequence  in virus.orfs:
        protein = Seq(sequence).translate()
        proteins.append(str(protein))

        dicodons = [str(protein[i:i+2]) for i in range(len(protein)-1)]
        proteinsDicodones.append(dicodons[:-1])
    return proteins, proteinsDicodones

In [116]:
# Surinkti duomenis ir start end poras
viruses = []
for i in range(1, 4):
    for prefix in file_prefixes:
        virus = process_files(FASTA_DIRECTORY, [f"{prefix}{i}"])
        viruses.append(virus)

In [117]:
# Konvertuoti sekas į baltymo kodonus ir dikodonus
for virus in viruses:
    virus.proteinsCodones, virus.proteinsDicodones = translate_to_protein(virus)

In [118]:
def codon_dicodon_frequency(protein_codons, protein_dicodons):
    # Patikrinti, ar kodonai ir dikodonai nėra tušti
    if not protein_codons:
        raise ValueError("Protein codons list is empty.")
    if not protein_dicodons:
        raise ValueError("Protein dicodons list is empty.")

    # Inicializuoti žodynus kodonų ir dikodonų dažniams
    codon_frequency = {}
    dicodon_frequency = {}

    # Skaičiuoti kodonų dažnį
    for codon in protein_codons:
        codon_frequency[codon] = codon_frequency.get(codon, 0) + 1

    # Skaičiuoti dikodonų dažnį
    for dicodon in protein_dicodons:
        dicodon_frequency[dicodon] = dicodon_frequency.get(dicodon, 0) + 1

    # Normalizuoti kodonų dažnį pagal bendrą kodonų skaičių
    total_codons = len(protein_codons)
    codon_frequency = {codon: freq / total_codons for codon, freq in codon_frequency.items()}

    # Normalizuoti dikodonų dažnį pagal bendrą dikodonų skaičių
    total_dicodons = len(protein_dicodons)  # Dikodonų skaičius yra tikrasis įrašytų dikodonų kiekis
    dicodon_frequency = {dicodon: freq / total_dicodons for dicodon, freq in dicodon_frequency.items()}

    return codon_frequency, dicodon_frequency


In [119]:
def get_amino_acid_frequency(protein_seq):
    amino_acid_count = Counter(protein_seq)
    amino_acid_freq = {aa: amino_acid_count.get(aa, 0) for aa in amino_acids}
    return amino_acid_freq

def get_dipeptide_frequency(protein_seq):
    dipeptides = [protein_seq[i:i+2] for i in range(len(protein_seq)-1)]
    dipeptide_count = Counter(dipeptides)
    possible_dipeptides = [a+b for a in amino_acids for b in amino_acids]
    dipeptide_freq = {dipeptide: dipeptide_count.get(dipeptide, 0) for dipeptide in possible_dipeptides}
    return dipeptide_freq

In [120]:
# Rasti kodonų ir dikodonų dažnius
for virus in viruses:
    for i in range(len(virus.orfs)):
        codon_freq = get_amino_acid_frequency(virus.proteinsCodones[i])
        dicodon_freq = get_dipeptide_frequency(virus.proteinsCodones[i])
        virus.codon_frequency.append(codon_freq)
        virus.dicodon_frequency.append(dicodon_freq)

In [121]:
import numpy as np

def calculate_distance_matrix(viruses, use_dicodons=False):

    num_viruses = len(viruses)
    dist_matrix = np.zeros((num_viruses, num_viruses))
    
    if use_dicodons:
        get_frequencies = lambda virus, all_keys: get_frequency_values(virus.dicodon_frequency, all_keys)
    else:
        get_frequencies = lambda virus, all_keys: get_frequency_values(virus.codon_frequency, all_keys)

    all_keys = get_all_keys(viruses, use_dicodons)
    
    for i in range(num_viruses):
        for j in range(i, num_viruses):
            freq_i = get_frequencies(viruses[i], all_keys)
            freq_j = get_frequencies(viruses[j], all_keys)
            distance = euclidean_distance(freq_i, freq_j)
            dist_matrix[i][j] = distance
            dist_matrix[j][i] = distance
    
    return dist_matrix

def get_all_keys(viruses, use_dicodons):
    all_keys = set()
    for virus in viruses:
        if use_dicodons:
            if isinstance(virus.dicodon_frequency, dict):
                all_keys.update(virus.dicodon_frequency.keys())
            elif isinstance(virus.dicodon_frequency, list):
          
                pass
        else:
            if isinstance(virus.codon_frequency, dict):
                all_keys.update(virus.codon_frequency.keys())
            elif isinstance(virus.codon_frequency, list):
              
                pass
    return all_keys

def get_frequency_values(freq_data, all_keys):
    if isinstance(freq_data, dict):

        return [freq_data.get(key, 0) for key in sorted(all_keys)]
    elif isinstance(freq_data, list):
       
        return freq_data
    else:
        raise TypeError("Frequency data must be either a list or a dictionary")

def euclidean_distance(freq_i, freq_j):
    return np.linalg.norm(np.array(freq_i) - np.array(freq_j))


In [122]:



distance_matrix_codons = calculate_distance_matrix(viruses)
distance_matrix_dicodons = calculate_distance_matrix(viruses, use_dicodons=True)
print(distance_matrix_codons)
print(distance_matrix_dicodons)


TypeError: unsupported operand type(s) for -: 'dict' and 'dict'

In [66]:
# Spausdinimas
for virus in viruses:
    for i in range(len(virus.orfs)):
        print(virus.name)
        print(virus.orfs[i])
        print(virus.proteinsCodones[i])
        print(virus.proteinsDicodones[i])
        print(virus.codon_frequency[i])
        print(virus.dicodon_frequency[i])

Lactococcus_phage
ATGGTTGCTAAGGCTGGAAACCCTGAACTTTATAACCCTACTGAATGGCGTAGATTGCAACAAGAAGAATCAAGCGCTAATGACCTTAAAGCTAAGATTGAAGAACTTGATGACTATAAACTAAGTAAGTACGAAACACCAAAAATTGAAGTGCCGAAAGGGTTTGAATAA
MVAKAGNPELYNPTEWRRLQQEESSANDLKAKIEELDDYKLSKYETPKIEVPKGFE*
['MV', 'VA', 'AK', 'KA', 'AG', 'GN', 'NP', 'PE', 'EL', 'LY', 'YN', 'NP', 'PT', 'TE', 'EW', 'WR', 'RR', 'RL', 'LQ', 'QQ', 'QE', 'EE', 'ES', 'SS', 'SA', 'AN', 'ND', 'DL', 'LK', 'KA', 'AK', 'KI', 'IE', 'EE', 'EL', 'LD', 'DD', 'DY', 'YK', 'KL', 'LS', 'SK', 'KY', 'YE', 'ET', 'TP', 'PK', 'KI', 'IE', 'EV', 'VP', 'PK', 'KG', 'GF', 'FE']
{'A': 4, 'C': 0, 'D': 3, 'E': 9, 'F': 1, 'G': 2, 'H': 0, 'I': 2, 'K': 7, 'L': 5, 'M': 1, 'N': 3, 'P': 4, 'Q': 2, 'R': 2, 'S': 3, 'T': 2, 'V': 2, 'W': 1, 'Y': 3}
{'AA': 0, 'AC': 0, 'AD': 0, 'AE': 0, 'AF': 0, 'AG': 1, 'AH': 0, 'AI': 0, 'AK': 2, 'AL': 0, 'AM': 0, 'AN': 1, 'AP': 0, 'AQ': 0, 'AR': 0, 'AS': 0, 'AT': 0, 'AV': 0, 'AW': 0, 'AY': 0, 'CA': 0, 'CC': 0, 'CD': 0, 'CE': 0, 'CF': 0, 'CG': 0, 'CH': 0, 'CI': 0, 'CK': 0

In [None]:
# klasterizavimo metodas Euklido atstumą