### Make a brute-force engine that lists every group of 2–3 consecutive nucleotides (dinucleotides and trinucleotides) together with its count.
#### Make a filter for anything outside the 4 letters (A T C G)
##### Replace "N" with an empty string (i.e. remove it)

In [1]:
# Installs the CPU-only TensorFlow package via pip.
# NOTE(review): none of the cells visible below import tensorflow — confirm this
# install is still needed, and consider pinning the version if it is.
pip install tensorflow-cpu

Collecting tensorflow-cpu
  Downloading tensorflow_cpu-2.17.0-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Downloading tensorflow_cpu-2.17.0-cp312-cp312-win_amd64.whl (2.1 kB)
Installing collected packages: tensorflow-cpu
Successfully installed tensorflow-cpu-2.17.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import itertools
from collections import defaultdict

In [36]:
def read_and_clean_fasta(file_name):
    """Parse a FASTA file into a ``{header: cleaned_sequence}`` dictionary.

    Each record starts at a line beginning with ``>``; the header is the rest
    of that line (without the ``>``). All following lines up to the next
    header are concatenated, upper-cased, and stripped of every character
    that is not one of ``A``, ``T``, ``C`` or ``G`` (this removes ``N`` and
    any other ambiguity codes).

    Args:
        file_name: Path of the FASTA file to read.

    Returns:
        dict mapping each header string to its cleaned sequence string.

    Notes:
        * Lowercase bases are treated as regular bases (upper-cased) rather
          than being discarded.
        * Removed characters are simply dropped, so the bases on either side
          of a removed run become adjacent in the returned string.
        * If the same header appears more than once, the last record wins.
        * Sequence lines that appear before the first header are ignored.
    """
    def _clean(parts):
        # Upper-case first so lowercase a/c/g/t survive the filter below.
        joined = ''.join(parts).upper()
        return ''.join(base for base in joined if base in "ATCG")

    genomes = {}
    current_header = None
    current_seq = []

    with open(file_name, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                # Compare against None (not truthiness) so that a record whose
                # header is the empty string is still stored.
                if current_header is not None:
                    genomes[current_header] = _clean(current_seq)
                current_header = line[1:]
                current_seq = []
            else:
                current_seq.append(line)

    # Flush the final record, which has no following header to trigger it.
    if current_header is not None:
        genomes[current_header] = _clean(current_seq)

    return genomes

In [37]:
def generate_nucleotide_combinations(sequence, group_size):
    """Count every overlapping window of ``group_size`` characters in ``sequence``.

    The window slides one position at a time, so ``"ATAT"`` with
    ``group_size=2`` yields ``AT`` twice and ``TA`` once.

    Args:
        sequence: String to scan.
        group_size: Window length; must be at least 1.

    Returns:
        ``defaultdict(int)`` mapping each observed window to its number of
        occurrences. It is empty when ``sequence`` is shorter than
        ``group_size``.

    Raises:
        ValueError: If ``group_size`` is smaller than 1. Without this check a
            size of 0 would count the empty string and a negative size would
            count meaningless slices.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be a positive integer, got {group_size!r}")

    combinations = defaultdict(int)

    # The last valid window starts at len(sequence) - group_size.
    for start in range(len(sequence) - group_size + 1):
        combinations[sequence[start:start + group_size]] += 1

    return combinations

In [38]:
def brute_force_engine(file_name):
    """Print dinucleotide and trinucleotide counts for each record of a FASTA file.

    The file is loaded with ``read_and_clean_fasta``; for every record the
    header is printed, followed by one section per window size (2, then 3)
    listing each observed group and its count as produced by
    ``generate_nucleotide_combinations``.

    Args:
        file_name: Path of the FASTA file to process.

    Returns:
        None. All results are written to standard output.
    """
    # (section heading, window size) in the order they are reported.
    report_sections = (
        ("\nDinucleotide combinations + counts:", 2),
        ("\nTrinucleotide combinations + counts:", 3),
    )

    cleaned_records = read_and_clean_fasta(file_name)

    for record_name in cleaned_records:
        bases = cleaned_records[record_name]
        print(f"Processing genome: {record_name}")

        for heading, width in report_sections:
            tally = generate_nucleotide_combinations(bases, width)
            print(heading)
            for group in tally:
                print(f"{group}: {tally[group]}")

In [39]:
# Relative path: resolved against the kernel's current working directory.
file_name = "sequence.fasta"
# Prints the 2- and 3-nucleotide group counts for every record in the file.
brute_force_engine(file_name)

Processing genome: NC_000013.11 Homo sapiens chromosome 13, GRCh38.p14 Primary Assembly

Dinucleotide combinations + counts:
AG: 6615960
GC: 3759122
CA: 6952415
AT: 8253782
TT: 10357652
TC: 5716274
CT: 6605441
TG: 6998638
GA: 5752103
AA: 10252939
TA: 7090153
AC: 4924930
GT: 4945842
CC: 4438866
GG: 4476537
CG: 842470

Trinucleotide combinations + counts:
AGC: 1275626
GCA: 1349308
CAT: 1877183
ATT: 2736424
TTC: 2021369
TCT: 2198494
CTG: 1792630
TGA: 1961342
GAG: 1494016
AGA: 2182820
GAA: 2030411
AAA: 4063909
AAT: 2708652
TTA: 2275486
TAC: 1171524
ACT: 1604418
CTT: 2012785
TTT: 4116325
TTG: 1944472
TGT: 2055222
GTG: 1400672
GAT: 1348879
ATG: 1860864
TGC: 1332316
TCA: 1945061
ATC: 1332697
CAC: 1376743
ACA: 2043793
CAA: 1877457
AAG: 1982512
AGT: 1610424
GTT: 1492118
AAC: 1497866
ACC: 1036375
CCT: 1549871
TGG: 1649758
GGT: 1042254
CAG: 1821032
CTC: 1490829
GTA: 1181673
TAG: 1318399
TAA: 2281162
GGA: 1416543
ATA: 2323797
TAT: 2319068
AGG: 1547089
GGC: 960069
GCT: 1252658
CTA: 1309197
CCA: 161