In [1]:
import os

# Project-level data and figure roots, resolved relative to the current working directory.
DATA_FOLDER = os.path.abspath(os.path.join('..', 'data'))
FIGURE_FOLDER = os.path.abspath(os.path.join('..', 'figures'))

notebook_name = '022_make_train_valid_test_sets'

# Per-notebook output locations, named after this notebook.
data_folder = os.path.join(DATA_FOLDER, notebook_name)
figure_folder = os.path.join(FIGURE_FOLDER, notebook_name)

# Same effect as `mkdir -p`, but without going through the shell, so paths
# containing spaces or shell metacharacters are handled correctly.
os.makedirs(data_folder, exist_ok=True)
os.makedirs(figure_folder, exist_ok=True)

# Folder holding the sequences and target tables produced by the previous notebook.
input_folder = os.path.join(DATA_FOLDER, '021_get_nucleus_cytoplasm_sequences')

In [2]:
# Base names of the three GRCh38 FASTA files to split; they differ only in the
# sequence-type tag (cdna, cds, pep).
fastas = [f'Homo_sapiens.GRCh38.{seq_type}.all.fa' for seq_type in ('cdna', 'cds', 'pep')]

In [34]:
# %%time
import pandas as pd
import numpy as np

from Bio import SeqIO

# Split every "<fasta>.nuclear_or_cytoplasmic" file and its target table into
# train / valid / test subsets. Row i of the target table is taken to describe
# record i of the FASTA file. Outputs are written next to the inputs, i.e. into
# `input_folder`, as "<input>.train", "<input>.valid" and "<input>.test".

# Seed the legacy global NumPy RNG so the random draws below are repeatable.
np.random.seed(0)

# Fraction of rows drawn for train + validation; everything else is the test set.
train_fraction = 0.8

for fasta in fastas:
    input_fasta = os.path.join(input_folder, f"{fasta}.nuclear_or_cytoplasmic")
    print(input_fasta)
    tsv = os.path.join(input_folder, f"{fasta}.nuclear_or_cytoplasmic.target.tsv")
    target = pd.read_table(tsv, header=None)
    print('\ttarget.shape', target.shape)

    total = len(target.index)
    eighty_percent = int(np.round(train_fraction*total))
    print(f"\teighty_percent: {eighty_percent}")

    # Draw the train + validation ids (train_fraction of the data).
    train_ids = np.random.choice(target.index, replace=False, size=eighty_percent)

    # Everything that was not drawn is the test set (remaining 20% of the data).
    test_ids = target.index[~target.index.isin(train_ids)]

    # Carve the validation set out of the draw (1/8 of it --> 10% of the data).
    one_eighth_of_train = int(np.round(eighty_percent/8.))
    print(f"\tone_eighth_of_train: {one_eighth_of_train}")
    valid_ids = np.random.choice(train_ids, replace=False, size=one_eighth_of_train)

    # Remove the validation ids from the training ids
    # --> now the training set is 70% of the data.
    train_ids = train_ids[~np.isin(train_ids, valid_ids)]

    # np.random.choice returns the ids in random order, but the FASTA records
    # below are collected in file order. Sort the ids (after all draws, so the
    # membership of each split is unchanged) so that row k of every written
    # target table matches record k of the corresponding FASTA file.
    train_ids = np.sort(train_ids)
    valid_ids = np.sort(valid_ids)

    train_target = target.loc[train_ids]
    test_target = target.loc[test_ids]
    valid_target = target.loc[valid_ids]

    # Row label -> split name. dtype=object because the values are strings.
    id_to_dataset = pd.Series(index=target.index, dtype=object)
    id_to_dataset.loc[train_ids] = 'train'
    id_to_dataset.loc[test_ids] = 'test'
    id_to_dataset.loc[valid_ids] = 'valid'

    print("\ttest_target.shape:", test_target.shape, f'{100*float(len(test_target.index))/total:.1f}%')
    print("\ttrain_target.shape:", train_target.shape, f'{100*float(len(train_target.index))/total:.1f}%')
    print("\tvalid_target.shape:", valid_target.shape, f'{100*float(len(valid_target.index))/total:.1f}%')

    targets = {'test': test_target, 'train': train_target, 'valid': valid_target}
    records = {'test': [], 'train': [], "valid": []}

    # Route each sequence to its split by position: the i-th record belongs to
    # the split assigned to the i-th row of the target table.
    for i, record in enumerate(SeqIO.parse(input_fasta, "fasta")):
        records[id_to_dataset.iloc[i]].append(record)

    # Fewer records than target rows would silently produce misaligned files.
    n_records = sum(len(split_records) for split_records in records.values())
    if n_records != total:
        raise ValueError(
            f"{input_fasta} has {n_records} records but {tsv} has {total} rows"
        )

    for name, target_vector in targets.items():
        dataset_tsv = tsv + "." + name
        print(f"\tWriting target vector {os.path.basename(dataset_tsv)} ...")
        target_vector.to_csv(dataset_tsv, index=False, header=False, sep='\t')

        dataset_fasta = input_fasta + '.' + name
        print(f"\tWriting sequences to {os.path.basename(dataset_fasta)} ...")
        SeqIO.write(records[name], dataset_fasta, 'fasta')

/src/myhome/code/sequence-localization/data/021_get_nucleus_cytoplasm_sequences/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic
	target.shape (21588, 2)
	eighty_percent: 17270
	one_eighth_of_train: 2159
	test_target.shape: (4318, 2) 20.0%
	train_target.shape: (15111, 2) 70.0%
	valid_target.shape: (2159, 2) 10.0%
	Writing target vector Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.test ...
	Writing sequences to Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.test ...
	Writing target vector Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.train ...
	Writing sequences to Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.train ...
	Writing target vector Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.valid ...
	Writing sequences to Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.valid ...
/src/myhome/code/sequence-localization/data/021_get_nucleus_cytoplasm_sequences/Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_c

In [28]:
# Scratch check: (rows, columns) of whichever `target` table is currently in the kernel.
target.shape

(21588, 2)

In [16]:
    # Scratch copy of the split-size computation, run against whichever `target`
    # is currently in the kernel. Note that it draws from the global NumPy RNG and
    # rebinds `train_ids` / `valid_ids`.
    total = len(target.index)
    eighty_percent = int(np.round(0.8*total))
    print(f"eighty_percent: {eighty_percent}")
    
    # Random draw of 80% of the row labels, without replacement.
    train_ids = np.random.choice(target.index, replace=False, size=eighty_percent)
    
    # One eighth of that draw is then drawn again as the validation ids.
    one_eighth_of_train = int(np.round(eighty_percent/8.))
    print(f"one_eighth_of_train: {one_eighth_of_train}")
    valid_ids = np.random.choice(train_ids, replace=False, size=one_eighth_of_train)
    

eighty_percent: 10795
one_eighth_of_train: 1349


In [29]:
# Scratch check: display the `target` table currently held in the kernel.
target

Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,0,1
7,1,0
8,0,1
9,0,1


In [24]:
# Scratch check: number of entries in `train_ids` that are not also in `valid_ids`.
(~np.isin(train_ids, valid_ids)).sum()

9446

In [22]:
# Scratch check: this sums the id values themselves (it is not a count) for the
# entries of `train_ids` that are not in `valid_ids`.
train_ids[~np.isin(train_ids, valid_ids)].sum()

63805493

In [20]:
# Scratch check: first rows of the `target` table currently held in the kernel.
target.head()

Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [21]:
# Scratch check: last rows of whichever `test_target` is currently in the kernel.
test_target.tail()

Unnamed: 0,0,1
13458,0,1
13467,1,0
13468,1,0
13484,1,0
13487,1,0


In [22]:
ls -lha $data_folder

total 137M
drwxrwxr-x 2 ubuntu ubuntu 4.0K May 16 23:10 [0m[01;34m.[0m/
drwxrwxr-x 8 ubuntu ubuntu 4.0K May 16 21:31 [01;34m..[0m/
-rw-rw-r-- 1 ubuntu ubuntu  42M May 18 21:40 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic
-rw-rw-r-- 1 ubuntu ubuntu  85K May 18 21:40 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv
-rw-rw-r-- 1 ubuntu ubuntu  17K May 18 23:20 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.test
-rw-rw-r-- 1 ubuntu ubuntu  68K May 18 23:20 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.target.tsv.train
-rw-rw-r-- 1 ubuntu ubuntu 8.4M May 18 23:20 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.test
-rw-rw-r-- 1 ubuntu ubuntu  34M May 18 23:20 Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic.train
-rw-rw-r-- 1 ubuntu ubuntu  18M May 18 21:41 Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic
-rw-rw-r-- 1 ubuntu ubuntu  53K May 18 21:41 Homo_sapiens.GRCh38.cds.all.fa.nuclear_or_cytoplasmic.t

In [23]:
! head $data_folder/*nuclear_or_cytoplasmic

==> /home/ubuntu/code/sequence-localization/data/021_get_nucleus_cytoplasm_sequences/Homo_sapiens.GRCh38.cdna.all.fa.nuclear_or_cytoplasmic <==
>ENST00000419783.2 cdna chromosome:GRCh38:3:49357171:49358600:-1 gene:ENSG00000233276.4 gene_biotype:polymorphic_pseudogene transcript_biotype:protein_coding gene_symbol:GPX1 description:glutathione peroxidase 1 [Source:HGNC Symbol;Acc:HGNC:4553]
GAGCCCTCGAGGGCCCCAGCCCTTGGAAGGGTAACCTGGACCGCTGCCGCCTGGTTGCCT
GGGCCAGACCAGACATGCCTGCTGCTCCTTCCGGCTTAGGAGGAGCACGCGTCCCGCTCG
GGCGCACTCTCCAGCCTTTTCCTGGCTGAGGAGGGGCCGAGCCCTCCGGGTAGGGCGGGG
GCCGGATGAGGCGGGACCCTCAGGCCCGGAAAACTGCCTGTGCCACGTGACCCGCCGCCG
GCCAGTTAAAAGGAGGCGCCTGCTGGCCTCCCCTTACAGTGCTTGTTCGGGGCGCTCCGC
TGGCTTCTTGGACAATTGCGCCATGTGTGCTGCTCGGCTAGCGGCGGCGGCGGCGGCGGC
CCAGTCGGTGTATGCCTTCTCGGCGCGCCCGCTGGCCGGCGGGGAGCCTGTGAGCCTGGG
CTCCCTGCGGGGCAAGGTACTACTTATCGAGAATGTGGCGTCCCTCTGAGGCACCACGGT
CCGGGACTACACCCAGATGAACGAGCTGCAGCGGCGCCTCGGACCCCGGGGCCTGGTGGT

==> /home/ubuntu/code/sequence-localization/dat

Search for `ATGTGTGCTGCT`, the very beginning of the CDS, inside the cDNA sequence above. It shows that the CDS is the pure coding sequence (start codon to stop codon only), while the cDNA is the whole spliced transcript, including the 5′ and 3′ UTRs (introns are not part of either sequence). We want the UTRs too.