In [152]:
import os
from Bio import SeqIO
from Bio.Seq import Seq

# Codon that opens an open reading frame.
START_CODON = "ATG"
# Codons that close an open reading frame.
STOP_CODONS = ["TAA", "TAG", "TGA"]
# Directory the FASTA files are read from (relative to the working directory).
FASTA_DIRECTORY = "viruses/data"
# File-name prefixes; a running number and ".fasta" are appended further down.
# NOTE(review): "mamalian" is presumably the spelling used by the files on
# disk — confirm before correcting it to "mammalian".
file_prefixes = ["bacterial", "mamalian"]

In [153]:
class Virus:
    """Holds one input genome together with the results computed for it.

    Attributes:
        fileName: Name of the FASTA file the data was read from.
        name: Record name as supplied by the caller.
        sequence: The original nucleotide sequence.
        orfs: Open reading frames supplied by the caller.
        proteinsCodones: Starts empty; filled in after construction.
        proteinsDicodones: Starts empty; filled in after construction.
        codon_frequency: Starts empty; filled in after construction.
        dicodon_frequency: Starts empty; filled in after construction.
    """

    def __init__(self, fileName, name, sequence, orfs):
        # Values handed in by the caller.
        self.fileName, self.name = fileName, name
        self.sequence, self.orfs = sequence, orfs
        # Result slots: each one gets its own fresh, empty list.
        self.proteinsCodones, self.proteinsDicodones = [], []
        self.codon_frequency, self.dicodon_frequency = [], []

In [154]:
def validate_sequence(seq):
    """Run a series of sanity checks on a candidate coding sequence.

    The checks are evaluated in order and the first failure is reported:
    the start codon must occur somewhere in ``seq``, the last three
    characters must be a stop codon, the length must be a multiple of
    three, and the length must be at least 100.

    Args:
        seq: Nucleotide sequence to check.

    Returns:
        A ``(is_valid, message)`` tuple.
    """
    problem = None
    if START_CODON not in seq:
        problem = "No start codon found"
    elif seq[-3:] not in STOP_CODONS:
        problem = "No valid stop codon found"
    elif len(seq) % 3 != 0:
        problem = "Sequence length is not divisible by 3"
    elif len(seq) < 100:
        problem = "Sequence length is less than 100 bp"

    if problem is not None:
        return False, problem
    return True, "Sequence is valid"

In [155]:
def find_orfs(sequence, min_length=100, start_codon=None, stop_codons=None):
    """Collect open reading frames from both strands of a nucleotide sequence.

    All three reading frames of ``sequence`` and of its reverse complement
    are scanned codon by codon. An ORF runs from the first start codon seen
    in a frame up to and including the next in-frame stop codon. Further
    start codons inside an already open ORF are treated as ordinary codons,
    so the longest ORF for each stop codon is reported.

    Args:
        sequence: Nucleotide sequence; it must support ``len()``, slicing
            and ``reverse_complement()``.
        min_length: Minimum ORF length in nucleotides, stop codon included.
        start_codon: Codon that opens an ORF. Defaults to ``START_CODON``.
        stop_codons: Codons that close an ORF. Defaults to ``STOP_CODONS``.

    Returns:
        A list of ORF nucleotide strings: forward strand first, then reverse
        complement, each ordered by frame and position.
    """
    # Resolve the defaults at call time so the module constants stay the
    # single source of truth.
    if start_codon is None:
        start_codon = START_CODON
    if stop_codons is None:
        stop_codons = STOP_CODONS

    orfs = []
    seq_len = len(sequence)  # the reverse complement has the same length

    for nuc_seq in (sequence, sequence.reverse_complement()):
        for frame in range(3):
            start_pos = None
            for i in range(frame, seq_len - 2, 3):
                codon = str(nuc_seq[i:i + 3])
                if start_pos is None:
                    # Only the first start codon opens the ORF; a later ATG
                    # must not move the start and shorten the frame.
                    if codon == start_codon:
                        start_pos = i
                elif codon in stop_codons:
                    stop_pos = i + 3
                    if stop_pos - start_pos >= min_length:
                        orfs.append(str(nuc_seq[start_pos:stop_pos]))
                    start_pos = None
    return orfs

In [156]:
def process_fasta(file_path):
    """Read a FASTA file and gather the ORFs of every record in it.

    Args:
        file_path: Path of the FASTA file to parse.

    Returns:
        A tuple ``(all_orfs, record)``: the ORFs of all records combined,
        and the last record that was read from the file.

    Raises:
        ValueError: If the file contains no FASTA records.
    """
    all_orfs = []
    last_record = None
    for record in SeqIO.parse(file_path, "fasta"):
        all_orfs.extend(find_orfs(record.seq))
        last_record = record

    # Without this check an empty file would fail with an obscure
    # UnboundLocalError on the leaked loop variable.
    if last_record is None:
        raise ValueError(f"No FASTA records found in {file_path!r}")
    return all_orfs, last_record

In [157]:
def process_files(fasta_directory, file_prefixes):
    """Build a Virus object from ``<fasta_directory>/<prefix>.fasta``.

    Every prefix is processed in order, but only the Virus built for the
    LAST prefix is returned. Pass a single-element list to get exactly one
    file's result.

    Args:
        fasta_directory: Directory that holds the FASTA files.
        file_prefixes: File names without the ``.fasta`` extension.

    Returns:
        The Virus built from the last prefix.

    Raises:
        ValueError: If ``file_prefixes`` is empty.
    """
    virus = None
    for prefix in file_prefixes:
        file_path = f"{fasta_directory}/{prefix}.fasta"
        orfs, record = process_fasta(file_path)
        virus = Virus(fileName=f"{prefix}.fasta", name=record.name, sequence=record.seq, orfs=orfs)

    # An empty prefix list used to surface as an UnboundLocalError.
    if virus is None:
        raise ValueError("file_prefixes must contain at least one prefix")
    return virus

In [158]:
def translate_to_protein(virus):
    """Translate every ORF of ``virus`` and list its adjacent residue pairs.

    Args:
        virus: Object whose ``orfs`` attribute is an iterable of nucleotide
            strings.

    Returns:
        A tuple ``(proteins, pair_lists)``. ``proteins`` holds one translated
        string per ORF. ``pair_lists`` holds, per ORF, the overlapping
        two-character windows of that translation, with the very last window
        left out.
    """
    protein_strings = []
    pair_lists = []

    for orf in virus.orfs:
        translated = Seq(orf).translate()
        protein_strings.append(str(translated))

        # Stopping two short of the end skips the final adjacent pair.
        pair_lists.append(
            [str(translated[pos:pos + 2]) for pos in range(len(translated) - 2)]
        )

    return protein_strings, pair_lists

In [159]:
# Collect the data: build one Virus (with its ORFs) per input file.
viruses = []
# Files are numbered 1..3 for each prefix, e.g. "<prefix>1.fasta".
for i in range(1, 4):
    for prefix in file_prefixes:
        # process_files takes a list of prefixes; a single-element list is
        # passed so that exactly one file is read per call.
        virus = process_files(FASTA_DIRECTORY, [f"{prefix}{i}"])
        viruses.append(virus)

In [160]:
# Convert the ORF sequences into protein strings ("codons") and residue pairs
# ("dicodons"), and store both on each Virus.
for virus in viruses:
    virus.proteinsCodones, virus.proteinsDicodones = translate_to_protein(virus)

In [161]:
def codon_dicodon_frequency(protein_codons, protein_dicodons):
    """Compute relative frequencies for two collections of symbols.

    Args:
        protein_codons: Non-empty sequence of codon symbols.
        protein_dicodons: Non-empty sequence of dicodon symbols.

    Returns:
        A tuple ``(codon_frequency, dicodon_frequency)`` of dicts. Each maps
        a symbol to its count divided by the length of its input sequence.

    Raises:
        ValueError: If either input is empty.
    """
    # Reject empty inputs up front, codons first.
    if not protein_codons:
        raise ValueError("Protein codons list is empty.")
    if not protein_dicodons:
        raise ValueError("Protein dicodons list is empty.")

    def _relative(symbols):
        # Tally the occurrences, then divide by the number of symbols.
        tally = {}
        for symbol in symbols:
            if symbol in tally:
                tally[symbol] += 1
            else:
                tally[symbol] = 1
        size = len(symbols)
        return {symbol: hits / size for symbol, hits in tally.items()}

    return _relative(protein_codons), _relative(protein_dicodons)


In [165]:
# Computes the relative frequency of every codon symbol.
def count_codons(codon_sequence):
    """Return each symbol's share of ``codon_sequence``.

    Args:
        codon_sequence: Sized iterable of hashable symbols (for example a
            string).

    Returns:
        Dict mapping each distinct symbol to its count divided by
        ``len(codon_sequence)``. An empty input yields an empty dict.
    """
    occurrences = {}
    for symbol in codon_sequence:
        if symbol not in occurrences:
            occurrences[symbol] = 0
        occurrences[symbol] += 1

    # Normalise the raw counts by the sequence length.
    size = len(codon_sequence)
    return {symbol: hits / size for symbol, hits in occurrences.items()}

# Computes the relative frequency of every pair of neighbouring symbols.
def count_dicodons(codon_sequence):
    """Return the share of each adjacent symbol pair in ``codon_sequence``.

    Args:
        codon_sequence: Sequence of symbols (for example a string).

    Returns:
        Dict keyed by ``"<first>-<second>"`` for every overlapping pair of
        neighbours. Each value is the pair's count divided by
        ``len(codon_sequence) - 1``. Inputs shorter than two symbols yield
        an empty dict.
    """
    pair_total = len(codon_sequence) - 1

    tally = {}
    for first, second in zip(codon_sequence, codon_sequence[1:]):
        label = f"{first}-{second}"
        tally[label] = tally.get(label, 0) + 1

    # Normalise the raw counts by the number of pairs.
    return {label: hits / pair_total for label, hits in tally.items()}

# Entry point: takes one protein sequence and computes both frequency tables.
def protein_to_codon_dicodon_frequency(protein_sequence):
    """Compute the symbol and symbol-pair frequencies of one sequence.

    Args:
        protein_sequence: Sequence passed unchanged to ``count_codons`` and
            ``count_dicodons``.

    Returns:
        A tuple ``(codon_freq, dicodon_freq)`` with the results of those two
        calls, in that order.
    """
    return count_codons(protein_sequence), count_dicodons(protein_sequence)

In [167]:
# Compute the codon and dicodon frequencies for every ORF of every virus.
for virus in viruses:
    # Start from empty lists so that re-running this cell replaces the
    # results instead of appending a second copy of them.
    virus.codon_frequency = []
    virus.dicodon_frequency = []
    for i in range(len(virus.orfs)):
        codon_freq, dicodon_freq = protein_to_codon_dicodon_frequency(virus.proteinsCodones[i])
        virus.codon_frequency.append(codon_freq)
        virus.dicodon_frequency.append(dicodon_freq)

In [168]:
# Print the results: for every ORF of every virus, show the record name, the
# ORF itself, its translation, its residue pairs and both frequency tables.
for virus in viruses:
    for i in range(len(virus.orfs)):
        print(virus.name)
        print(virus.orfs[i])
        print(virus.proteinsCodones[i])
        print(virus.proteinsDicodones[i])
        print(virus.codon_frequency[i])
        print(virus.dicodon_frequency[i])

Lactococcus_phage
ATGGTTGCTAAGGCTGGAAACCCTGAACTTTATAACCCTACTGAATGGCGTAGATTGCAACAAGAAGAATCAAGCGCTAATGACCTTAAAGCTAAGATTGAAGAACTTGATGACTATAAACTAAGTAAGTACGAAACACCAAAAATTGAAGTGCCGAAAGGGTTTGAATAA
MVAKAGNPELYNPTEWRRLQQEESSANDLKAKIEELDDYKLSKYETPKIEVPKGFE*
['MV', 'VA', 'AK', 'KA', 'AG', 'GN', 'NP', 'PE', 'EL', 'LY', 'YN', 'NP', 'PT', 'TE', 'EW', 'WR', 'RR', 'RL', 'LQ', 'QQ', 'QE', 'EE', 'ES', 'SS', 'SA', 'AN', 'ND', 'DL', 'LK', 'KA', 'AK', 'KI', 'IE', 'EE', 'EL', 'LD', 'DD', 'DY', 'YK', 'KL', 'LS', 'SK', 'KY', 'YE', 'ET', 'TP', 'PK', 'KI', 'IE', 'EV', 'VP', 'PK', 'KG', 'GF', 'FE']
{'M': 0.017543859649122806, 'V': 0.03508771929824561, 'A': 0.07017543859649122, 'K': 0.12280701754385964, 'G': 0.03508771929824561, 'N': 0.05263157894736842, 'P': 0.07017543859649122, 'E': 0.15789473684210525, 'L': 0.08771929824561403, 'Y': 0.05263157894736842, 'T': 0.03508771929824561, 'W': 0.017543859649122806, 'R': 0.03508771929824561, 'Q': 0.03508771929824561, 'S': 0.05263157894736842, 'D': 0.05263157894736842, 'I

In [164]:
from Bio import SeqIO
from Bio.Seq import Seq
from collections import defaultdict



# Codon and dicodon occurrence analysis
def codon_dicodon_frequency(protein_sequences):
    """Count single symbols and adjacent symbol pairs across sequences.

    Args:
        protein_sequences: Iterable of sliceable sequences (for example
            strings).

    Returns:
        A tuple ``(codon_count, dicodon_count)`` of ``defaultdict(int)``
        objects. They hold raw occurrence counts (not normalised) of every
        one-symbol and every overlapping two-symbol slice.
    """
    singles = defaultdict(int)
    pairs = defaultdict(int)

    for chain in protein_sequences:
        size = len(chain)
        # Every position contributes a one-symbol slice ...
        for offset in range(size):
            singles[chain[offset:offset + 1]] += 1
        # ... and every position except the last a two-symbol slice.
        for offset in range(size - 1):
            pairs[chain[offset:offset + 2]] += 1

    return singles, pairs

# Build a distance matrix (Euclidean distance between frequency profiles)
def calculate_distance_matrix(frequencies):
    """Compute the pairwise Euclidean distances between frequency profiles.

    Args:
        frequencies: Sequence of profiles. Each profile is either an
            array-like of numbers (all of the same shape) or a mapping from
            symbol to number. Mappings are aligned on the union of their
            keys, and a key missing from a profile counts as 0.

    Returns:
        A square ``numpy.ndarray`` of shape ``(n, n)`` with zeros on the
        diagonal and the distance between profiles ``i`` and ``j`` at
        ``[i][j]``.
    """
    import numpy as np
    from collections.abc import Mapping

    count = len(frequencies)

    if count and all(isinstance(profile, Mapping) for profile in frequencies):
        # Dict-like tables cannot be subtracted directly, so turn them into
        # vectors over one shared key order. dict.fromkeys keeps first-seen
        # order and does not require the keys to be sortable.
        keys = list(dict.fromkeys(key for profile in frequencies for key in profile))
        vectors = [
            np.array([profile.get(key, 0) for key in keys], dtype=float)
            for profile in frequencies
        ]
    else:
        vectors = [np.array(profile) for profile in frequencies]

    matrix = np.zeros((count, count))
    # The distance is symmetric, so each pair is computed once and mirrored.
    for i in range(count):
        for j in range(i + 1, count):
            dist = np.linalg.norm(vectors[i] - vectors[j])
            matrix[i][j] = dist
            matrix[j][i] = dist
    return matrix

# Main driver: run the analysis functions.
# NOTE(review): read_fasta, reverse_complement, find_start_stop_pairs and
# filter_short_sequences are not defined anywhere in the visible notebook, and
# translate_to_protein is called below with two arguments while the visible
# definition takes a single `virus` argument. This cell looks like a leftover
# from an earlier version — confirm before relying on it.
fasta_file = "virus_sequences.fasta"
sequences = read_fasta(fasta_file)

# Analyse each sequence
for sequence in sequences:
    reverse_seq = reverse_complement(sequence)
    
    pairs = find_start_stop_pairs(sequence)
    reverse_pairs = find_start_stop_pairs(reverse_seq)
    
    filtered_pairs = filter_short_sequences(pairs)
    filtered_reverse_pairs = filter_short_sequences(reverse_pairs)
    
    protein_sequences = translate_to_protein(sequence, filtered_pairs)
    reverse_protein_sequences = translate_to_protein(reverse_seq, filtered_reverse_pairs)
    
    codon_freq, dicodon_freq = codon_dicodon_frequency(protein_sequences + reverse_protein_sequences)
    
    # For example, print the codon frequencies here
    print("Kodonų dažnis:", codon_freq)
    print("Dikodonų dažnis:", dicodon_freq)

# Build the distance matrix (a clustering method can then be applied to it).
# NOTE(review): only the codon_freq / dicodon_freq left over from the last
# loop iteration reach this call — confirm that this is intended.
dist_matrix = calculate_distance_matrix([codon_freq, dicodon_freq])


FileNotFoundError: [Errno 2] No such file or directory: 'virus_sequences.fasta'

bacterial1.fasta


[<__main__.Virus at 0x213fe85c9b0>,
 <__main__.Virus at 0x213fe85e0c0>,
 <__main__.Virus at 0x213fe85ce60>,
 <__main__.Virus at 0x213fe85d790>,
 <__main__.Virus at 0x213fe85cc50>,
 <__main__.Virus at 0x213fe85ce00>]