# Code

In [1]:
# Flag consulted by the import cell: when True, sys.argv is reset to [''] there.
its_jupyter_notebook = True

In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
import sys
sys.path.insert(0, '..')
from util.encoding_sequences import build_kmers
from tqdm.notebook import tqdm

if its_jupyter_notebook:
    # Replace the process arguments with a single empty string.
    # NOTE(review): presumably this keeps argument-parsing code in imported
    # project modules from seeing the kernel's own arguments — confirm.
    sys.argv = [''] #Remove this if it's not a jupyter notebook!

<class 'transformers.tokenization_dna.DNATokenizer'>


In [3]:
# Project layout. ROOT_DIR is the parent of the current working directory,
# so every path below depends on where the notebook is launched from.
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
dnabert_dir = os.path.join(ROOT_DIR, 'dataset', 'pre_trained_DNABERT')
processed_files_dir = os.path.join(ROOT_DIR, 'dataset', 'processed_files')
rnabert_data_dir = os.path.join(dnabert_dir, 'rna_data')

# makedirs creates any missing parent directory and, with exist_ok=True, does
# nothing when the directory is already there. The previous exists()/mkdir()
# pair raised FileNotFoundError when dnabert_dir was missing and could race
# with another process creating the directory in between the two calls.
os.makedirs(rnabert_data_dir, exist_ok=True)

In [4]:
# Load the processed cDNA table. Later cells read its `species`, `gene_id`
# and `cdna` columns.
df = pd.read_csv(os.path.join(processed_files_dir,"df_cdna.csv"))
# Disabled: the mmseq2 clusters file is not loaded in this notebook.
#mmseq = pd.read_csv(os.path.join(processed_files_dir,"mmseq2_clusters.csv"))

In [5]:
def split_stratified_into_train_val_test(df_input, stratify_colname='y',
                                         frac_train=0.6, frac_val=0.15, frac_test=0.25,
                                         random_state=None):
    #credits: https://stackoverflow.com/questions/50781562/stratified-splitting-of-pandas-dataframe-into-training-validation-and-test-set
    '''
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice: first train vs. the rest,
    then the rest into val vs. test.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (checked with a small tolerance for rounding error).
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls.

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits. Together they contain every
        row of df_input exactly once.

    Raises
    ------
    ValueError
        If the fractions do not sum to 1.0, or if stratify_colname is not a
        column of df_input.
    '''

    # Compare with a tolerance rather than `!= 1.0`: binary floating point
    # makes valid inputs such as 0.7 + 0.2 + 0.1 evaluate to
    # 0.9999999999999999, which an exact comparison would wrongly reject.
    if abs((frac_train + frac_val + frac_test) - 1.0) > 1e-9:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' % \
                         (frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError('%s is not a column in the dataframe' % (stratify_colname))

    X = df_input # Contains all columns.
    y = df_input[[stratify_colname]] # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, _, y_temp = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=(1.0 - frac_train),
                                                    random_state=random_state)

    # Split the temp dataframe into val and test dataframes. The test share is
    # re-expressed relative to the temp dataframe (val + test rows only).
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, _, _ = train_test_split(df_temp,
                                             y_temp,
                                             stratify=y_temp,
                                             test_size=relative_frac_test,
                                             random_state=random_state)

    # No row may be lost or duplicated by the two-step split.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test


In [6]:
# 70/15/15 split, stratified on the `species` column, with a fixed seed.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    df,
    stratify_colname='species',
    frac_train=0.7,
    frac_val=0.15,
    frac_test=0.15,
    random_state=123,
)

In [7]:
# No gene_id may appear in more than one split.
assert set(df_train.gene_id).isdisjoint(set(df_val.gene_id))
assert set(df_val.gene_id).isdisjoint(set(df_test.gene_id))
assert set(df_train.gene_id).isdisjoint(set(df_test.gene_id))

In [8]:
# Sub-sequence sampling scheme, from the DNABERT supplementary materials:
# the length of each sub-sequence lies in the range of 5 to 510.
# With 50% probability, the length of a sub-sequence is set to 510.
# With the other 50% probability, it is a random integer between 5 and 510.

# Output files, one per split, all written under rnabert_data_dir.
train_file = os.path.join(rnabert_data_dir, 'train_6mers.txt')
val_file = os.path.join(rnabert_data_dir, 'val_6mers.txt')
test_file = os.path.join(rnabert_data_dir, 'test_6mers.txt')

def xrange(x):
    """Return a one-shot iterator over the integers 0, 1, ..., x - 1."""
    indices = range(x)
    return iter(indices)

def window(a, win_size):
    """Return every contiguous slice of ``a`` of length ``win_size``, in order.

    Consecutive slices start one position apart. When ``a`` is shorter than
    ``win_size`` the result is an empty list.
    """
    n_windows = len(a) - (win_size - 1)
    slices = []
    for start in xrange(n_windows):
        slices.append(a[start:start + win_size])
    return slices

def write_sequences(split, path, frequency = 510):
    """Write the ``cdna`` sequences of ``split`` to ``path``, one k-mer line each.

    Sequences shorter than 510 characters are written whole. Longer sequences
    are cut into 510-character windows starting at offsets 0, ``frequency``,
    2 * ``frequency``, ...; a start whose window would run past the end of the
    sequence is skipped, so a trailing remainder may not be written. Each
    window is, with probability 0.5, truncated to a random length before
    being written.

    Every written line is ``build_kmers(sub_sequence, k=6)`` followed by a
    newline.

    Parameters
    ----------
    split : Pandas dataframe
        Must have a ``cdna`` column holding the sequences.
    path : str
        Output text file; it is overwritten.
    frequency : int
        Distance between the starts of consecutive windows. Must be >= 1.

    Raises
    ------
    ValueError
        If ``frequency`` is smaller than 1.

    Notes
    -----
    The truncation draws from NumPy's global random state, so the output is
    only reproducible if ``np.random.seed`` is set before the call.
    """
    max_len = 510   # window length, and threshold below which a sequence is kept whole
    min_len = 5     # lower bound of the random truncation length
    kmer_size = 6

    # Fail early and clearly; the previous modulo test raised
    # ZeroDivisionError only once a long sequence was reached.
    if frequency < 1:
        raise ValueError('frequency must be a positive integer, got %r' % (frequency,))

    # The context manager closes the file even when a row fails half-way.
    with open(path, "w") as txtfile:
        # Only the cdna column is needed, so iterate it directly instead of
        # building a full row object per record.
        for sequence in tqdm(split["cdna"], total=split.shape[0]):
            if len(sequence) < max_len:
                txtfile.write(build_kmers(sequence, k = kmer_size) + "\n")
                continue

            # Jump straight to every `frequency`-th window start rather than
            # materialising all overlapping windows and discarding most of
            # them; the selected windows are the same.
            for start in range(0, len(sequence) - (max_len - 1), frequency):
                seq = sequence[start:start + max_len]
                if np.random.rand() > 0.5:
                    # randint excludes the upper bound: lengths are 5..509.
                    # NOTE(review): a sub-sequence shorter than k=6 presumably
                    # yields no k-mers (an empty line) — verify build_kmers.
                    seq = seq[0:np.random.randint(min_len, max_len)]
                txtfile.write(build_kmers(seq, k = kmer_size) + "\n")

In [9]:
# Write the validation split to val_file.
write_sequences(df_val, val_file)

  0%|          | 0/6196 [00:00<?, ?it/s]

In [10]:
# Write the test split to test_file.
write_sequences(df_test, test_file)

  0%|          | 0/6196 [00:00<?, ?it/s]

In [11]:
# Write the training split to train_file.
write_sequences(df_train, train_file)

  0%|          | 0/28914 [00:00<?, ?it/s]

## Create a small fake set (the first 500 lines of the validation file)

In [12]:
# val_file repeats the path already assigned next to train_file/test_file.
val_file = os.path.join(rnabert_data_dir, 'val_6mers.txt')
fake_file = os.path.join(rnabert_data_dir, 'fake_6mers.txt')

# Both handles stay open across the following cells and are closed in the
# last cell. Opening fake_file with "w" truncates it immediately.
val = open(val_file, "r")
fake = open(fake_file, "w")

In [13]:
# Read every line of the validation file into a list (line endings included).
content = val.readlines()

In [14]:
# Display whether the first two lines of the validation file are identical.
content[0] == content[1]

False

In [15]:
# Copy at most the first 500 lines of the validation file into the fake file.
# Slicing (instead of indexing 0..499) does not raise IndexError when fewer
# than 500 lines are available, and join avoids repeated string concatenation.
to_write = ''.join(content[:500])
try:
    fake.write(to_write)
finally:
    # Close both handles even if the write fails.
    val.close()
    fake.close()