In [20]:
from Bio import SeqIO
from tqdm import tqdm
import os, random
import pandas as pd

In [21]:
KMER_LENGTH = 6
MAX_KMER_INPUT_LENGTH = 512


random.seed(1000)

In [16]:
# Accessing the sequence records in all of Amanda's Dataset

sequence_records = list()

for seq_record in SeqIO.parse(f"out_s0/2023.03.07_09.24.27_sample_0/reads/read1.fq", "fastq"):
    #print(seq_record.id)
    #print(repr(seq_record.seq))
    #print(len(seq_record))

    sequence_records.append(seq_record)
    


In [19]:
print(sequence_records[0].id)
print(repr(sequence_records[0].seq))
print(len(sequence_records[0]))

S0R0/1
Seq('CAGTTCCTGCTGATTTACATAAATTGTAAGCTCCATCTCGCTGGGCACATAAAC...TAG')
150


In [15]:
# how the fine-tuning data should be formatted
pd.read_csv('sampletrain.tsv', sep='\t')

Unnamed: 0,sequence,label
0,CACAGC ACAGCC CAGCCA AGCCAG GCCAGC CCAGCC CAGC...,0
1,CTAATC TAATCT AATCTA ATCTAG TCTAGT CTAGTA TAGT...,1
2,GGAAGA GAAGAG AAGAGG AGAGGG GAGGGA AGGGAA GGGA...,1
3,CGAAAG GAAAGC AAAGCA AAGCAA AGCAAT GCAATC CAAT...,1
4,TGACTC GACTCC ACTCCC CTCCCA TCCCAA CCCAAA CCAA...,1
...,...,...
32361,AGGCTG GGCTGA GCTGAG CTGAGG TGAGGC GAGGCA AGGC...,0
32362,ACTGGG CTGGGG TGGGGA GGGGAA GGGAAC GGAACG GAAC...,1
32363,ACCCTG CCCTGA CCTGAA CTGAAG TGAAGA GAAGAA AAGA...,1
32364,TTGGCC TGGCCT GGCCTG GCCTGA CCTGAC CTGACC TGAC...,1


In [25]:
mapping = pd.read_csv('out_s0/2023.03.07_09.24.27_sample_0/reads/reads_mapping.tsv', sep='\t')

In [64]:
#print(mapping['#anonymous_read_id'][0])
#print(str(sequence_records[0].id))
#mapping.loc[mapping['#anonymous_read_id'] == str(sequence_records[1].id)].iloc[0]['genome_id']

S0R0/1
S0R0/1


'genome7'

In [104]:
def build_kmers(sequence, ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1

    for i in range(n_kmers):
        kmer = sequence[i:i + ksize]
        kmers.append(kmer)
    return kmers

In [107]:
def kmer_array_to_long_string(sequence):
    long_string = ""
    for segment in sequence:
        long_string += segment + " "
        
    long_string = long_string.rstrip()
    return long_string

In [115]:
datarows = []

# create sequences and labels 
# is slow because it needs to search mapping; speed up?

for seq_record in tqdm(sequence_records):
    
    datapoint = (kmer_array_to_long_string(build_kmers(str(seq_record.seq), 6)),
                mapping.loc[mapping['#anonymous_read_id'] == str(seq_record.id)].iloc[0]['genome_id'])
    
    datarows.append(datapoint)
    
    
    
    
    

100%|████████████████████████████████████████████████████████████████████████| 333330/333330 [3:51:16<00:00, 24.02it/s]


In [116]:
datarows[0]

('CAGTTC AGTTCC GTTCCT TTCCTG TCCTGC CCTGCT CTGCTG TGCTGA GCTGAT CTGATT TGATTT GATTTA ATTTAC TTTACA TTACAT TACATA ACATAA CATAAA ATAAAT TAAATT AAATTG AATTGT ATTGTA TTGTAA TGTAAG GTAAGC TAAGCT AAGCTC AGCTCC GCTCCA CTCCAT TCCATC CCATCT CATCTC ATCTCG TCTCGC CTCGCT TCGCTG CGCTGG GCTGGG CTGGGC TGGGCA GGGCAC GGCACA GCACAT CACATA ACATAA CATAAA ATAAAC TAAACT AAACTG AACTGA ACTGAG CTGAGG TGAGGT GAGGTC AGGTCC GGTCCT GTCCTG TCCTGG CCTGGA CTGGAC TGGACC GGACCA GACCAT ACCATC CCATCC CATCCC ATCCCT TCCCTC CCCTCG CCTCGT CTCGTC TCGTCC CGTCCG GTCCGA TCCGAA CCGAAA CGAAAA GAAAAG AAAAGC AAAGCG AAGCGA AGCGAT GCGATA CGATAA GATAAC ATAACA TAACAA AACAAA ACAAAC CAAACT AAACTA AACTAT ACTATG CTATGT TATGTT ATGTTT TGTTTG GTTTGC TTTGCA TTGCAG TGCAGC GCAGCG CAGCGG AGCGGC GCGGCA CGGCAC GGCACC GCACCT CACCTT ACCTTT CCTTTA CTTTAA TTTAAT TTAATG TAATGT AATGTC ATGTCT TGTCTG GTCTGG TCTGGT CTGGTA TGGTAC GGTACT GTACTC TACTCC ACTCCA CTCCAC TCCACA CCACAT CACATT ACATTT CATTTT ATTTTT TTTTTG TTTTGC TTTGCA TTGCAT TGCATG GCATGT CATGTT ATGT

In [117]:
dataset = pd.DataFrame(datarows, columns=['sequence', 'label'])

In [118]:
dataset.head(10)

Unnamed: 0,sequence,label
0,CAGTTC AGTTCC GTTCCT TTCCTG TCCTGC CCTGCT CTGC...,genome7
1,GTGGTT TGGTTT GGTTTG GTTTGT TTTGTA TTGTAT TGTA...,genome7
2,CGGTAC GGTACC GTACCT TACCTG ACCTGA CCTGAC CTGA...,genome7
3,AATGCA ATGCAG TGCAGT GCAGTT CAGTTA AGTTAA GTTA...,genome7
4,TGTGGC GTGGCC TGGCCA GGCCAT GCCATC CCATCC CATC...,genome7
5,TCGCCA CGCCAA GCCAAT CCAATC CAATCG AATCGC ATCG...,genome7
6,CAGCCC AGCCCG GCCCGA CCCGAT CCGATG CGATGT GATG...,genome22
7,TTTTCC TTTCCA TTCCAG TCCAGC CCAGCA CAGCAC AGCA...,genome7
8,AACTGG ACTGGA CTGGAA TGGAAC GGAACA GAACAA AACA...,genome7
9,TATAAA ATAAAA TAAAAA AAAAAC AAAACA AAACAC AACA...,genome7


In [119]:
len(dataset)

333330

In [6]:
from Bio import SeqIO
from tqdm import tqdm
import os, random
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# splitting dataset to train and test split
dataset = pd.read_csv("finetuningdataset2.tsv", sep='\t')

In [4]:
dataset.head()

Unnamed: 0,sequence,label
0,CAGTTC AGTTCC GTTCCT TTCCTG TCCTGC CCTGCT CTGC...,genome7
1,GTGGTT TGGTTT GGTTTG GTTTGT TTTGTA TTGTAT TGTA...,genome7
2,CGGTAC GGTACC GTACCT TACCTG ACCTGA CCTGAC CTGA...,genome7
3,AATGCA ATGCAG TGCAGT GCAGTT CAGTTA AGTTAA GTTA...,genome7
4,TGTGGC GTGGCC TGGCCA GGCCAT GCCATC CCATCC CATC...,genome7


In [7]:
X_train, X_test, y_train, y_test = train_test_split(dataset['sequence'], dataset['label'], test_size = 0.2, random_state = 10)

In [14]:
list(X_train)

['GATCTT ATCTTG TCTTGT CTTGTT TTGTTG TGTTGA GTTGAT TTGATC TGATCG GATCGG ATCGGG TCGGGG CGGGGG GGGGGA GGGGAA GGGAAA GGAAAA GAAAAA AAAAAT AAAATA AAATAT AATATT ATATTT TATTTA ATTTAT TTTATT TTATTC TATTCT ATTCTT TTCTTT TCTTTT CTTTTA TTTTAT TTTATT TTATTC TATTCC ATTCCG TTCCGG TCCGGC CCGGCA CGGCAG GGCAGA GCAGAT CAGATA AGATAT GATATT ATATTT TATTTG ATTTGC TTTGCG TTGCGA TGCGAA GCGAAG CGAAGT GAAGTG AAGTGA AGTGAT GTGATT TGATTA GATTAT ATTATT TTATTC TATTCA ATTCAA TTCAAC TCAACA CAACAC AACACG ACACGC CACGCC ACGCCG CGCCGG GCCGGA CCGGAG CGGAGA GGAGAG GAGAGC AGAGCA GAGCAA AGCAAA GCAAAC CAAACA AAACAT AACATC ACATCC CATCCA ATCCAC TCCACC CCACCG CACCGC ACCGCC CCGCCG CGCCGG GCCGGG CCGGGG CGGGGT GGGGTT GGGTTT GGTTTT GTTTTG TTTTGG TTTGGA TTGGAG TGGAGT GGAGTA GAGTAT AGTATC GTATCA TATCAA ATCAAG TCAAGA CAAGAT AAGATA AGATAA GATAAA ATAAAA TAAAAA AAAAAT AAAATA AAATAG AATAGT ATAGTG TAGTGA AGTGAA GTGAAC TGAACT GAACTT AACTTC ACTTCA CTTCAT TTCATG TCATGA CATGAC ATGACC TGACCT GACCTG ACCTGA CCTGAC CTGACA TGACAG GACAGA ACAGAC CAGA

In [19]:
list(y_train)

['genome7',
 'genome7',
 'genome7',
 'genome7',
 'genome14',
 'genome7',
 'genome17',
 'genome7',
 'genome7',
 'genome7',
 'genome9',
 'genome7',
 'genome7',
 'genome10',
 'genome7',
 'genome7',
 'genome7',
 'genome14',
 'genome13',
 'genome25',
 'genome14',
 'genome7',
 'genome7',
 'genome7',
 'genome7',
 'genome11',
 'genome7',
 'genome7',
 'genome10',
 'genome7',
 'genome17',
 'genome7',
 'genome7',
 'genome9',
 'genome7',
 'genome7',
 'genome7',
 'genome7',
 'genome17',
 'genome17',
 'genome10',
 'genome7',
 'genome14',
 'genome7',
 'genome7',
 'genome7',
 'genome7',
 'genome14',
 'genome7',
 'genome7',
 'genome14',
 'genome10',
 'genome7',
 'genome7',
 'genome14',
 'genome7',
 'genome7',
 'genome7',
 'genome14',
 'genome7',
 'genome7',
 'genome7',
 'genome7',
 'genome17',
 'genome9',
 'genome7',
 'genome7',
 'genome22',
 'genome7',
 'genome9',
 'genome7',
 'genome7',
 'genome7',
 'genome7',
 'genome14',
 'genome17',
 'genome7',
 'genome7',
 'genome7',
 'genome7',
 'genome10',
 'ge

In [16]:
train_dataset = pd.DataFrame(list(zip(X_train, y_train)), columns=['sequence', 'label'])

In [20]:
train_dataset.head(10)

Unnamed: 0,sequence,label
0,GATCTT ATCTTG TCTTGT CTTGTT TTGTTG TGTTGA GTTG...,genome7
1,TCCACC CCACCG CACCGG ACCGGT CCGGTG CGGTGC GGTG...,genome7
2,TTTTCC TTTCCT TTCCTT TCCTTT CCTTTT CTTTTT TTTT...,genome7
3,AGAGAG GAGAGC AGAGCG GAGCGG AGCGGC GCGGCG CGGC...,genome7
4,CAGGAT AGGATG GGATGG GATGGA ATGGAG TGGAGA GGAG...,genome14
5,GACAAT ACAATT CAATTA AATTAC ATTACA TTACAT TACA...,genome7
6,ACCACT CCACTG CACTGG ACTGGG CTGGGG TGGGGC GGGG...,genome17
7,AATAAT ATAATT TAATTT AATTTA ATTTAT TTTATT TTAT...,genome7
8,GGTGAA GTGAAA TGAAAA GAAAAA AAAAAC AAAACC AAAC...,genome7
9,AAACAC AACACG ACACGC CACGCA ACGCAA CGCAAT GCAA...,genome7


In [21]:
test_dataset = pd.DataFrame(list(zip(X_test, y_test)), columns=['sequence', 'label'])

In [23]:
test_dataset.head(10)

Unnamed: 0,sequence,label
0,CTCCGG TCCGGC CCGGCC CGGCCG GGCCGC GCCGCC CCGC...,genome17
1,GGTCAA GTCAAC TCAACG CAACGG AACGGT ACGGTG CGGT...,genome14
2,CGCCTA GCCTAA CCTAAC CTAACG TAACGC AACGCA ACGC...,genome7
3,ATGTGC TGTGCA GTGCAT TGCATC GCATCT CATCTG ATCT...,genome9
4,GCTTAA CTTAAA TTAAAG TAAAGA AAAGAA AAGAAT AGAA...,genome7
5,GGTCAT GTCATG TCATGG CATGGT ATGGTC TGGTCA GGTC...,genome7
6,TGGTTC GGTTCT GTTCTT TTCTTG TCTTGC CTTGCA TTGC...,genome7
7,GCAATA CAATAG AATAGC ATAGCG TAGCGC AGCGCC GCGC...,genome7
8,GAATCG AATCGG ATCGGT TCGGTT CGGTTC GGTTCG GTTC...,genome7
9,AGTTTA GTTTAC TTTACT TTACTT TACTTA ACTTAA CTTA...,genome7


In [None]:
train_dataset.to_csv("finetuning-traindataset.tsv", sep='\t', index=False)
test_dataset.to_csv("finetuning-testdataset.tsv", sep='\t', index=False)