In [3]:
import os

# Root folders for data and figures, resolved to absolute paths relative to
# the directory the notebook is run from.
DATA_FOLDER = os.path.abspath(os.path.join('..', 'data'))
FIGURE_FOLDER = os.path.abspath(os.path.join('..', 'figures'))

# Name of this notebook; used as the per-notebook output sub-folder name.
notebook_name = '023_make_train_valid_test_sets'

data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

# Create the output folders (including missing parents) if they do not exist.
# os.makedirs replaces the shell `mkdir -p`, which interpolated the path
# unquoted and therefore broke on paths containing spaces.
os.makedirs(data_folder, exist_ok=True)
os.makedirs(figure_folder, exist_ok=True)

# Folder this notebook reads its input sequences and target tables from.
input_folder = os.path.join(DATA_FOLDER, '022_slice_seqs_to_same_lengths')

In [13]:
ls $input_folder

Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500
Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500.tsv
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.last450
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.last450.tsv
Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic.last150
Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic.last150.tsv


In [16]:
fastas = ! ls $input_folder
fastas = [x for x in fastas if not x.endswith('tsv')]
fastas

['Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500',
 'Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.last450',
 'Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic.last150']

In [20]:
# %%time
import pandas as pd
import numpy as np

from Bio import SeqIO

# Seed NumPy's global RNG once so the splits are reproducible on a fresh run.
# The seed is NOT reset per file, so each file's split depends on the order
# in which `fastas` is iterated.
np.random.seed(0)

# Fraction of all rows drawn up front for training + validation; the
# remaining rows become the test set.
train_fraction = 0.8

for fasta in fastas:
    input_fasta = os.path.join(input_folder, fasta)
    print(input_fasta)
    # NOTE(review): this expects `<fasta>.target.tsv` next to each FASTA, while
    # the directory listing recorded earlier in this notebook shows
    # `<fasta>.tsv` -- confirm the upstream file naming.
    tsv = os.path.join(input_folder, f"{fasta}.target.tsv")
    target = pd.read_table(tsv, header=None)
    print('\ttarget.shape', target.shape)

    total = len(target.index)
    # The printed label says "eighty_percent" because train_fraction is 0.8;
    # it is kept verbatim so the recorded output stays comparable.
    n_train_valid = int(np.round(train_fraction * total))
    print(f"\teighty_percent: {n_train_valid}")

    # Draw the training + validation ids (train_fraction of the data, i.e. 80%)
    train_ids = np.random.choice(target.index, replace=False, size=n_train_valid)

    # Everything that was not drawn is the test set (the remaining 20% of data)
    test_ids = target.index[~target.index.isin(train_ids)]

    # Create validation set ids (1/8 of the drawn ids --> 10% of data)
    n_valid = int(np.round(n_train_valid / 8.))
    print(f"\tone_eighth_of_train: {n_valid}")
    valid_ids = np.random.choice(train_ids, replace=False, size=n_valid)

    # Update training set ids to not contain the validation set ids
    # --> now the training set is 70% of the data
    train_ids = train_ids[~np.isin(train_ids, valid_ids)]

    train_target = target.loc[train_ids]
    test_target = target.loc[test_ids]
    valid_target = target.loc[valid_ids]

    # Map every row label to the name of the dataset it was assigned to.
    # dtype=object is explicit because the values are strings; without it the
    # empty Series falls back to a default dtype that is not meant for strings.
    id_to_dataset = pd.Series(index=target.index, dtype=object)
    id_to_dataset.loc[train_ids] = 'train'
    id_to_dataset.loc[test_ids] = 'test'
    id_to_dataset.loc[valid_ids] = 'valid'

    print("\ttest_target.shape:", test_target.shape, f'{100*float(len(test_target.index))/total:.1f}%')
    print("\ttrain_target.shape:", train_target.shape, f'{100*float(len(train_target.index))/total:.1f}%')
    print("\tvalid_target.shape:", valid_target.shape, f'{100*float(len(valid_target.index))/total:.1f}%')

    targets = {'test': test_target, 'train': train_target, 'valid': valid_target}
    records = {'test': [], 'train': [], "valid": []}

    # The i-th FASTA record is matched to row label i of the target table, so
    # both files must contain the same number of entries in the same order.
    n_records = 0
    for i, record in enumerate(SeqIO.parse(input_fasta, "fasta")):
        if i >= total:
            raise ValueError(
                f"{input_fasta} has more records than {tsv} has rows ({total})"
            )
        dataset = id_to_dataset[i]
        records[dataset].append(record)
        n_records = i + 1

    if n_records != total:
        raise ValueError(
            f"{input_fasta} has {n_records} records but {tsv} has {total} rows"
        )

    # Write one target table and one FASTA file per dataset into data_folder.
    for name, target_vector in targets.items():
        dataset_tsv = os.path.join(data_folder, os.path.basename(tsv) + "." + name)
        print(f"\tWriting target vector {os.path.basename(dataset_tsv)} ...")
        target_vector.to_csv(dataset_tsv, index=False, header=False, sep='\t')

        dataset_fasta = os.path.join(data_folder, os.path.basename(input_fasta) + '.' + name)
        print(f"\tWriting sequences to {os.path.basename(dataset_fasta)} ...")
        SeqIO.write(records[name], dataset_fasta, 'fasta')

/src/myhome/code/sequence-localization/data/022_slice_seqs_to_same_lengths/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500
	target.shape (19338, 2)
	eighty_percent: 15470
	one_eighth_of_train: 1934
	test_target.shape: (3868, 2) 20.0%
	train_target.shape: (13536, 2) 70.0%
	valid_target.shape: (1934, 2) 10.0%
	Writing target vector Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500.target.tsv.test ...
	Writing sequences to Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500.test ...
	Writing target vector Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500.target.tsv.train ...
	Writing sequences to Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500.train ...
	Writing target vector Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500.target.tsv.valid ...
	Writing sequences to Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.last500.valid ...
/src/myhome/code/sequence-localization/data/022_slice_seqs_to_same_lengt

In [38]:
# Take the first record of one dataset for inspection.
# NOTE(review): `records` and `name` are not defined in this cell; they are
# left over from the split loop above, so they refer to the last FASTA file
# processed and the last dataset name iterated. This cell therefore only
# works after that loop has run.
seq = records[name][0]

In [39]:
# Display the record sliced to (at most) its last 1000 positions; the
# recorded output shows the result is still a SeqRecord.
seq[-1000:]

SeqRecord(seq=Seq('MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLGTTVRD...SCA', SingleLetterAlphabet()), id='ENSP00000478837.1', name='ENSP00000478837.1', description='ENSP00000478837.1 pep chromosome:GRCh38:3:49357180:49358358:-1 gene:ENSG00000233276.4 transcript:ENST00000620890.1 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]', dbxrefs=[])

In [36]:
ls $data_folder

Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.test
Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.train
Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.valid
Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.test
Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.train
Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.valid
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.target.tsv.test
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.target.tsv.train
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.target.tsv.valid
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.test
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.train
Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.valid
Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic.target.tsv.test
Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic.target.tsv.train
Homo_sapiens.GRCh38.pep.all.fa.nuclear_or_cytoplasmic.tar

## What's the difference between CDS and cDNA datasets?

In [23]:
# Print the first lines of every file in data_folder whose name ends in
# `nuclear_or_cytoplasmic`.
# NOTE(review): the recorded output below comes from a
# `021_get_nucleus_cytoplasm_sequences` folder, so `data_folder` presumably
# pointed somewhere else when this cell was last run -- confirm before re-running.
! head $data_folder/*nuclear_or_cytoplasmic

==> /home/ubuntu/code/sequence-localization/data/021_get_nucleus_cytoplasm_sequences/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic <==
>ENST00000419783.2 cdna chromosome:GRCh38:3:49357171:49358600:-1 gene:ENSG00000233276.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
GAGCCCTCGAGGGCCCCAGCCCTTGGAAGGGTAACCTGGACCGCTGCCGCCTGGTTGCCT
GGGCCAGACCAGACATGCCTGCTGCTCCTTCCGGCTTAGGAGGAGCACGCGTCCCGCTCG
GGCGCACTCTCCAGCCTTTTCCTGGCTGAGGAGGGGCCGAGCCCTCCGGGTAGGGCGGGG
GCCGGATGAGGCGGGACCCTCAGGCCCGGAAAACTGCCTGTGCCACGTGACCCGCCGCCG
GCCAGTTAAAAGGAGGCGCCTGCTGGCCTCCCCTTACAGTGCTTGTTCGGGGCGCTCCGC
TGGCTTCTTGGACAATTGCGCCATGTGTGCTGCTCGGCTAGCGGCGGCGGCGGCGGCGGC
CCAGTCGGTGTATGCCTTCTCGGCGCGCCCGCTGGCCGGCGGGGAGCCTGTGAGCCTGGG
CTCCCTGCGGGGCAAGGTACTACTTATCGAGAATGTGGCGTCCCTCTGAGGCACCACGGT
CCGGGACTACACCCAGATGAACGAGCTGCAGCGGCGCCTCGGACCCCGGGGCCTGGTGGT

==> /home/ubuntu/code/sequence-localization/dat


Search for `ATGTGTGCTGCT`, the very beginning of the CDS, in the cDNA record above: it only appears partway into the cDNA sequence. This shows that the CDS is the pure coding sequence (coding exons only), while the cDNA is the whole transcript, including the UTRs as well as the exons. We want the UTRs too.

This means:

- CDS = coding exons only
- cDNA = entire transcript, including UTRs and exons