In [79]:
import os
from Bio import SeqIO
from Bio.Seq import Seq

START_CODON = "ATG"
STOP_CODONS = ["TAA", "TAG", "TGA"]
FASTA_DIRECTORY = "viruses/data"
file_prefixes = ["bacterial", "mamalian"]

In [80]:
class Virus:
    def __init__(self, fileName, name, sequence, orfs):
        self.fileName = fileName
        self.name = name
        self.sequence = sequence # Originali seka
        self.orfs = orfs # Sekos start-end 
        self.proteins = []
        self.proteinsDicodones = []

In [81]:
def validate_sequence(seq):
    
    if START_CODON not in seq:
        return False, "No start codon found"
    
    if seq[-3:] not in STOP_CODONS:
        return False, "No valid stop codon found"
    
    if len(seq) % 3 != 0:
        return False, "Sequence length is not divisible by 3"
    
    if len(seq) < 100:
        return False, "Sequence length is less than 100 bp"
    
    return True, "Sequence is valid"

In [82]:
def find_orfs(sequence):
    orfs = []
    seq_len = len(sequence)
    
    for strand, nuc_seq in [(+1, sequence), (-1, sequence.reverse_complement())]:
        for frame in range(3):
            start_pos = None
            for i in range(frame, seq_len - 2, 3):
                codon = str(nuc_seq[i:i+3])
                if codon == START_CODON:
                    start_pos = i
                elif codon in STOP_CODONS and start_pos is not None:
                    stop_pos = i + 3
                    if (stop_pos - start_pos) >= 100:
                        orfs.append(str(nuc_seq[start_pos:stop_pos]))
                    start_pos = None
    return orfs

In [83]:
def process_fasta(file_path):
    all_orfs = []
    for record in SeqIO.parse(file_path, "fasta"):
        orfs = find_orfs(record.seq)
        all_orfs.extend(orfs)
    return all_orfs, record

In [84]:
def process_files(fasta_directory, file_prefixes):
    for prefix in file_prefixes:
        file_path = f"{fasta_directory}/{prefix}.fasta"
        orfs, record = process_fasta(file_path)
        virus = Virus(fileName=f"{prefix}.fasta", name=record.name, sequence=record.seq, orfs=orfs)
    return virus

In [88]:
def translate_to_protein(virus):
    proteins = []
    proteinsDicodones = []

    for sequence  in virus.orfs:
        protein = Seq(sequence).translate()
        proteins.append(str(protein))

        dicodons = [str(protein[i:i+2]) for i in range(len(protein)-1)]
        proteinsDicodones.append(dicodons)
    return proteins, proteinsDicodones

In [85]:
# Surinkti duomenis ir start end poras
viruses = []
for i in range(1, 4):
    for prefix in file_prefixes:
        virus = process_files(FASTA_DIRECTORY, [f"{prefix}{i}"])
        viruses.append(virus)

In [89]:
# Konvertuoti sekas į baltymo kodonus ir dikodonus
for virus in viruses:
    virus.proteins, virus.proteinsDicodones = translate_to_protein(virus)

In [93]:
for virus in viruses:
    for i in range(len(virus.proteins)):
        print(virus.orfs[i])
        print(virus.proteins[i])
        print(virus.proteinsDicodones[i])

ATGGTTGCTAAGGCTGGAAACCCTGAACTTTATAACCCTACTGAATGGCGTAGATTGCAACAAGAAGAATCAAGCGCTAATGACCTTAAAGCTAAGATTGAAGAACTTGATGACTATAAACTAAGTAAGTACGAAACACCAAAAATTGAAGTGCCGAAAGGGTTTGAATAA
MVAKAGNPELYNPTEWRRLQQEESSANDLKAKIEELDDYKLSKYETPKIEVPKGFE*
ATGGCTTGGGTTGGTCTACAACTGGTAATTGGAAAACGAAATGTGTTCAGTACCTTATTAAAGGGCGAAAACGTGATAAAGTTACAGGAGAGTTTATTGACGGTTACCGTGTAG
MAWVGLQLVIGKRNVFSTLLKGENVIKLQESLLTVTV*
ATGGCAAAAAAGTTTCTGCTATTGAATACGAAATTTGGGGAGACCAAGCAAAAGACTTCGCAAACAAAATGGAAGCCGGCTTGTTCATCATGCAACCTGATACGGAACTTGCTGGCGAAGTTACATTAG
MAKKFLLLNTKFGETKQKTSQTKWKPACSSCNLIRNLLAKLH*
ATGACCCTTGGTTTCGATGTTTCTGTACCATTGAATGCTAAAGTTGGAAATAATATTTCTGTCCAGCTTAAAGGTCAAAATTCTCAAGCTCATGGAAATGTTGGAGCCAATGATTTCAACACAATTGTTGGTGAAAAATGGCATAATATTGAACAAAGCGATTTAGGTAAAACAATTCGTTTAAGCACTTCAGTGGAATTAGATCCTAAATATCATTCTTTTGATACTGCTCTAGCTGATACTGATAGTATTACTATCAGACAAGTAGAAGGTACACCAGGACTTGTGTATTCTAAATTAAAACTTGAACCAGGTTCAACCGATACACCTTGGATTCCCTCATCTAGTGAAGTAACAGCCGAAGATTATCCAAGCTATATCGGAACATATACTGATAAAAACTCCAATGAACAAAGTACAGACCCAGAAAAATATACTTGGA

In [68]:
from Bio import SeqIO
from Bio.Seq import Seq
from collections import defaultdict



# Kodonų ir dikodonų dažnio analizė
def codon_dicodon_frequency(protein_sequences):
    codon_count = defaultdict(int)
    dicodon_count = defaultdict(int)
    
    for protein in protein_sequences:
        for i in range(0, len(protein), 1):
            codon = protein[i:i+1]
            codon_count[codon] += 1
            if i + 1 < len(protein):
                dicodon = protein[i:i+2]
                dicodon_count[dicodon] += 1
    
    return codon_count, dicodon_count

# Atstumo matricos sudarymas (pvz., naudojant Euklido atstumą)
def calculate_distance_matrix(frequencies):
    import numpy as np
    matrix = np.zeros((len(frequencies), len(frequencies)))
    for i, freq1 in enumerate(frequencies):
        for j, freq2 in enumerate(frequencies):
            if i != j:
                dist = np.linalg.norm(np.array(freq1) - np.array(freq2))
                matrix[i][j] = dist
    return matrix

# Pagrindinis funkcijų vykdymas
fasta_file = "virus_sequences.fasta"
sequences = read_fasta(fasta_file)

# Analizuoti kiekvieną seką
for sequence in sequences:
    reverse_seq = reverse_complement(sequence)
    
    pairs = find_start_stop_pairs(sequence)
    reverse_pairs = find_start_stop_pairs(reverse_seq)
    
    filtered_pairs = filter_short_sequences(pairs)
    filtered_reverse_pairs = filter_short_sequences(reverse_pairs)
    
    protein_sequences = translate_to_protein(sequence, filtered_pairs)
    reverse_protein_sequences = translate_to_protein(reverse_seq, filtered_reverse_pairs)
    
    codon_freq, dicodon_freq = codon_dicodon_frequency(protein_sequences + reverse_protein_sequences)
    
    # Pavyzdžiui, čia išveskite kodonų dažnius
    print("Kodonų dažnis:", codon_freq)
    print("Dikodonų dažnis:", dicodon_freq)

# Sudaryti atstumo matricą (galite pritaikyti klasterizavimo metodą)
dist_matrix = calculate_distance_matrix([codon_freq, dicodon_freq])


FileNotFoundError: [Errno 2] No such file or directory: 'virus_sequences.fasta'

In [None]:
viruses[0].fileName

AttributeError: 'list' object has no attribute 'fileName'