In [7]:
!date
!hostname

Tue Dec 10 15:18:21 PST 2024
c4-dev1


In [2]:
%env CONDA_PREFIX

'/c4/home/derek/miniconda3/envs/deepripe'

In [3]:
%load_ext autoreload
%autoreload 2

# Generation of DeepRipe ExplaiNN input data from scSLR

In [4]:
import os
import sys
import glob

import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import json

from sklearn.model_selection import GroupKFold, ShuffleSplit, LeavePOut, RepeatedKFold

import pysam 

In [5]:
import tensorflow as tf

In [6]:
#import modified basenji modules 
sys.path.append("/nowakowskilab/data1/derek/data_scSLR/prenatal_brain/deep_splicing")
from basenji_.basenji.dna_io import dna_1hot

def region_to_mat(region):
    region_len = len(region)
    region= region.replace('i','0')
    region= region.replace('c','1')
    region= region.replace('3','2')
    region= region.replace('5','3')
    region= region.replace('N','4')
    region_code = np.zeros((4,region_len), dtype='float16')
    for i in range(region_len):
        if int(region[i]) != 4:
            region_code[int(region[i]),i] = 1
        else:
            region_code[0:4,i] = np.tile(0,4)
    return np.transpose(region_code).astype('int8')

In [None]:
pwd

In [None]:
in_dir = '/nowakowskilab/data1/derek/data_scSLR/prenatal_brain/deep_splicing/transfer_learning/'

In [None]:
EN = pd.read_csv(in_dir+'sig_dif_EN_table_0.1_.csv', index_col=0)
RG = pd.read_csv(in_dir+'sig_dif_RG_table_0.1_.csv', index_col=0)

In [None]:
# EN_cas = pd.read_csv(in_dir+'sig_dif_EN_table_0.1_.csv', index_col=0)
# RG_cas = pd.read_csv(in_dir+'sig_dif_RG_table_0.1_.csv', index_col=0)

# EN_mxe = pd.read_csv(in_dir+'sig_dif_EN_table_MXE_0.1_.csv', index_col=0)
# RG_mxe = pd.read_csv(in_dir+'sig_dif_RG_table_MXE_0.1_.csv', index_col=0)

# EN_tand = pd.read_csv(in_dir+'sig_dif_EN_table_tand_0.1_.csv', index_col=0)
# RG_tand = pd.read_csv(in_dir+'sig_dif_RG_table_tand_0.1_.csv', index_col=0)

# EN_AFE = pd.read_csv(in_dir+'sig_dif_EN_table_AFE_0.1_.csv', index_col=0)
# RG_AFE = pd.read_csv(in_dir+'sig_dif_RG_table_AFE_0.1_.csv', index_col=0)

# EN = pd.concat([EN_cas,EN_mxe,EN_tand, EN_AFE]).drop_duplicates(subset='model_input')

# RG = pd.concat([RG_cas,RG_mxe,RG_tand, RG_AFE]).drop_duplicates(subset='model_input')

In [None]:
EN = EN[['model_input','delta_psi']].set_index('model_input')

# EN['delta_psi'] = EN['delta_psi']*-1

EN.columns = [0]
EN.index.name = None

In [None]:
RG = RG[['model_input','delta_psi']].set_index('model_input')

# RG['delta_psi'] = RG['delta_psi']*-1

RG.columns = [0]
RG.index.name = None

In [None]:
df = pd.concat([EN,RG],axis=1)
df.columns = [0,1]

In [None]:
exon_info = pd.DataFrame(df.index)[0].str.split('_', expand=True)
exon_info = exon_info.rename(columns={0: 'chr', 1: 'start', 2: 'end',
                                      3: 'ENSG', 4: 'strand'})

In [None]:
df = df.iloc[exon_info.index]

In [None]:
intron_flanking_length = 200

exon_info['start'] = exon_info['start'].astype(int) - intron_flanking_length
exon_info['end'] = exon_info['end'].astype(int) + intron_flanking_length

In [None]:
exon_info_set = exon_info.copy()

num_targets = 2
max_length = 800
region_length = 1000

extra = int((region_length - max_length)/8)


In [None]:
exon_info

In [None]:
break

In [None]:
intron_flanking_length = 200
exon_flanking_length = 200


def create_tfrecords_deepripe(PSI, exon_info, train_idxs, val_idxs, 
                     test_idxs, fold, general_out_dir, peaks,
                     max_length):
    
    fold_dir = general_out_dir + '/fold' + str(fold)
    tfr_dir = fold_dir + '/tfrecords' 
    Path(fold_dir).mkdir(parents=True, exist_ok=True)
    Path(tfr_dir).mkdir(parents=True, exist_ok=True)

    fold_indexes = [train_idxs, val_idxs, test_idxs]
    
    # Write the file genes.csv
    split = np.zeros((len(exon_info),1), dtype='<U5')
    split[train_idxs] = 'train'
    split[val_idxs] = 'valid'
    split[test_idxs] = 'test'
    target = PSI.values
    genes = pd.DataFrame(np.hstack((split,target)), 
                         index=PSI.index, columns=np.hstack((['split'], PSI.columns)))
    
    genes = genes[genes['split'] != '']
    genes.to_csv(fold_dir + '/genes.csv')
    
    # Options TFR writer
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')
    seqs_per_tfr = 256
    fold_labels = ['train', 'valid', 'test']

    
    # open FASTA
    fasta_open = pysam.Fastafile(file_fasta)
    
    def rc(seq):
        return seq.translate(str.maketrans("ATCGatcg","TAGCtagc"))[::-1]
    
    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
    
    # Write the TFRecords
    num_folds = 3 #train-val-test
    
    for fi in range(num_folds):
        exon_ID_set = PSI.index[fold_indexes[fi]]
        exon_info_set = exon_info.iloc[fold_indexes[fi]]
        PSI_val_set = PSI.iloc[fold_indexes[fi]]
        
        num_set = exon_ID_set.shape[0]
        
        num_set_tfrs = int(np.ceil(num_set / seqs_per_tfr)) 
        # print(num_set_tfrs)
        
        # gene sequence index
        si = 0
        
        for tfr_i in range(num_set_tfrs):
            # Create the file e.g 'tfr_records/test-0.tfr'
            tfr_file = '%s/%s-%d.tfr' % (tfr_dir, fold_labels[fi], tfr_i)
            # print(tfr_file)
            with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
                # TFR index
                ti = 0
                
                # This is to make sure that the max. genes per file stays below 256
                # And that for the last batch we stop on time (si < num_set)
                while ti < seqs_per_tfr and si < num_set:
                    # Get the genes that should be in this set
                    
                    seq_chrm = exon_info_set['chr'].iloc[si]
                    seq_start = int(exon_info_set['start'].iloc[si])
                    seq_end = int(exon_info_set['end'].iloc[si])+1 #Because fasta.fetch doesn't include end bp
                    seq_strand = exon_info_set['strand'].iloc[si]
                    gene = exon_info_set['ENSG'].iloc[si]
                    
                    #start
                    seq_start_ = seq_start
                    #end
                    seq_end_ = seq_end
                    
                    
                    #get exon's range
                    seq_DNA = fasta_open.fetch(seq_chrm, seq_start_, seq_end_)
                    
                    #and its length
                    len_DNA = len(seq_DNA)
                    
                    # get positions of splice sites
                    splicing_ind = np.array([intron_flanking_length, max_length - intron_flanking_length], dtype=np.int64)

                    
                    # orient
                    if seq_strand == '-':
                        seq_DNA = rc(seq_DNA)
                        
                        
                    # define ranges for all regions of interest and pad if shorter than intron_flanking_length
                    
                    
                    # upstream_intron 
                    
                    upstream_intron = seq_DNA[0:intron_flanking_length]
                    
                    upstream_intron_region = ('i'*extra)+('i'*intron_flanking_length)+('c'*extra)
                    
                    #exon_start
                    
                    exon_coord = len(seq_DNA) // 2
                    
                    if exon_coord <= (exon_flanking_length * 2):
                    
                        exon_start = seq_DNA[intron_flanking_length:exon_coord]
                    
                        padding_start = exon_flanking_length - len(exon_start)

                        exon_start = exon_start + padding_start*'N'

                        
                    else: 
                        
                        exon_start =  seq_DNA[intron_flanking_length:(intron_flanking_length + exon_flanking_length)]


                    
                    
                    exon_start_region = ('i'*extra)+''.join(['c' if x != 'N' else 'N' for x in exon_start])+('N'*extra)
                    
                    # downstream_intron
                    
                    downstream_intron = seq_DNA[(len_DNA-intron_flanking_length):len_DNA]
                    
                    downstream_intron_region = ('c'*extra)+('i'*intron_flanking_length)+('i'*extra)

                    # exon_end
                    
                    exon_coord = len(seq_DNA) // 2 
                    
                    if exon_coord <= (exon_flanking_length * 2): 
                    
                        exon_end = seq_DNA[exon_coord:(len_DNA-intron_flanking_length)]
                        
                        padding_end = exon_flanking_length - len(exon_end)

                        exon_end = padding_end*'N' + exon_end

        
                        
                    else:
                        
                        exon_end = seq_DNA[(len_DNA-(intron_flanking_length + exon_flanking_length)):(len_DNA-intron_flanking_length)]
                        
                    exon_end_region = ('N'*extra)+''.join(['c' if x != 'N' else 'N' for x in exon_end])+('i'*extra)
                        
                        
                    seq_DNA = (upstream_intron + exon_start + exon_end + downstream_intron)
                      
                    new_LEN = len(seq_DNA)

                    
                    assert(new_LEN == max_length)
                    
                    seq_region = upstream_intron_region + exon_start_region + exon_end_region + downstream_intron_region
                    assert(len(seq_region) == region_length)
                
                  
                    
                    # one hot code
                    seq_1hot = dna_1hot(seq_DNA)
                    seq_len = np.array(len(seq_DNA), dtype=np.int64)
                    
                    # splicing
                    splicing = np.zeros((new_LEN ,1), dtype=np.int8)
                    splicing[splicing_ind] = 1
                    
                    # region 
                    region_1hot = region_to_mat(seq_region)
           
                    # get targets
                    targets = PSI_val_set.iloc[si].values
                    targets = targets.reshape((1,-1)).astype('float64')
                    
            ### generate example ###
                    example = tf.train.Example(features=tf.train.Features(feature={
                           
                            'length': _bytes_feature(seq_len.flatten().tobytes()),
                            'sequence': _bytes_feature(seq_1hot.flatten().tobytes()),
                            'target': _bytes_feature(targets.flatten().tobytes()),
                            'splicing': _bytes_feature(splicing.flatten().tobytes()), 
                            'region': _bytes_feature(region_1hot.flatten().tobytes())
                        }))
    
                    # write
                    writer.write(example.SerializeToString())
    
                    # advance indexes
                    ti += 1
                    si += 1
    
    
    fasta_open.close()
    
    
    # Write statistics.json
    
    stats_dict = {}
    stats_dict['num_targets'] = num_targets
    stats_dict['seq_length'] = max_length
    stats_dict['target_length'] = 4
    
    for fi in range(num_folds):
        stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_indexes[fi])
    
    with open('%s/statistics.json' % fold_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
    
    
    # Copy the params.json
    train_dict = {}
    train_dict['batch_size'] = 64
    train_dict['optimizer'] = 'adam'
    train_dict['loss'] = 'bce' #'mse'
    train_dict['learning_rate'] = 0.0001
    train_dict['adam_beta1'] = 0.90
    train_dict['adam_beta2'] = 0.998
    train_dict['global_clipnorm'] = 0.5
    train_dict['train_epochs_min'] = 100
    train_dict['train_epochs_max'] = 1000
    train_dict['patience'] = 100
    
    model_dict = {}
    model_dict['activation'] = 'relu'
    model_dict['spline'] = False
    model_dict['rnn_type'] = 'gru'
    model_dict['final_activation'] = 'relu'
    model_dict['residual'] = False
    model_dict['seq_length'] = max_length

    model_dict['seq_depth'] = 4 
    model_dict['augment_shift'] = 0
    model_dict['num_targets'] = num_targets
    model_dict['heads'] = 1
    model_dict['filters'] = 186
    model_dict['kernel_size'] = 6 #5
    model_dict['dropout'] = 0.3 
    model_dict['l2_scale'] = 0.001
    model_dict['ln_epsilon'] = 0.007
    model_dict['bn_momentum'] = 0.90
    
    params_dict = {}
    params_dict['train'] = train_dict
    params_dict['model'] = model_dict
    
    with open('%s/params.json' % fold_dir, 'w') as params_json_out:
        json.dump(params_dict, params_json_out, indent=4)

In [None]:
def create_tfrecords_deepripe(PSI, exon_info, train_idxs, val_idxs, 
                     test_idxs, fold, general_out_dir, peaks,
                     max_length):
    
    fold_dir = general_out_dir + '/fold' + str(fold)
    tfr_dir = fold_dir + '/tfrecords' 
    Path(fold_dir).mkdir(parents=True, exist_ok=True)
    Path(tfr_dir).mkdir(parents=True, exist_ok=True)

    fold_indexes = [train_idxs, val_idxs, test_idxs]
    
    # Write the file genes.csv
    split = np.zeros((len(exon_info),1), dtype='<U5')
    split[train_idxs] = 'train'
    split[val_idxs] = 'valid'
    split[test_idxs] = 'test'
    target = PSI.values
    genes = pd.DataFrame(np.hstack((split,target)), 
                         index=PSI.index, columns=np.hstack((['split'], PSI.columns)))
    
    genes = genes[genes['split'] != '']
    genes.to_csv(fold_dir + '/genes.csv')
    
    # Options TFR writer
    tf_opts = tf.io.TFRecordOptions(compression_type='ZLIB')
    seqs_per_tfr = 256
    fold_labels = ['train', 'valid', 'test']

    
    # open FASTA
    fasta_open = pysam.Fastafile(file_fasta)
    
    def rc(seq):
        return seq.translate(str.maketrans("ATCGatcg","TAGCtagc"))[::-1]
    
    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
    
    # Write the TFRecords
    num_folds = 3 #train-val-test
    
    for fi in range(num_folds):
        exon_ID_set = PSI.index[fold_indexes[fi]]
        exon_info_set = exon_info.iloc[fold_indexes[fi]]
        PSI_val_set = PSI.iloc[fold_indexes[fi]]
        
        num_set = exon_ID_set.shape[0]
        
        num_set_tfrs = int(np.ceil(num_set / seqs_per_tfr)) 
        # print(num_set_tfrs)
        
        # gene sequence index
        si = 0
        
        for tfr_i in range(num_set_tfrs):
            # Create the file e.g 'tfr_records/test-0.tfr'
            tfr_file = '%s/%s-%d.tfr' % (tfr_dir, fold_labels[fi], tfr_i)
            # print(tfr_file)
            with tf.io.TFRecordWriter(tfr_file, tf_opts) as writer:
                # TFR index
                ti = 0
                
                # This is to make sure that the max. genes per file stays below 256
                # And that for the last batch we stop on time (si < num_set)
                while ti < seqs_per_tfr and si < num_set:
                    # Get the genes that should be in this set
                    
                    seq_chrm = exon_info_set['chr'].iloc[si]
                    seq_start = int(exon_info_set['start'].iloc[si])
                    seq_end = int(exon_info_set['end'].iloc[si])+1 #Because fasta.fetch doesn't include end bp
                    seq_strand = exon_info_set['strand'].iloc[si]
                    gene = exon_info_set['ENSG'].iloc[si]
                    
                    #start
                    seq_start_ = seq_start
                    #end
                    seq_end_ = seq_end
                    
                    
                    #get exon's range
                    seq_DNA = fasta_open.fetch(seq_chrm, seq_start_, seq_end_)
                    
                    #and its length
                    len_DNA = len(seq_DNA)
                    
                    # get positions of splice sites
                    splicing_ind = np.array([intron_flanking_length, max_length - intron_flanking_length], dtype=np.int64)

                    
                    # orient
                    if seq_strand == '-':
                        seq_DNA = rc(seq_DNA)
                        
                        
                    # define ranges for all regions of interest and pad if shorter than intron_flanking_length
                    
                    
                    # upstream_intron 
                    
                    upstream_intron = seq_DNA[0:intron_flanking_length]
                    
                    upstream_intron_region = ('i'*extra)+('i'*intron_flanking_length)+('c'*extra)
                    
                    #exon_start
                    
                    exon_coord = len(seq_DNA) // 2
                    
                    if exon_coord <= (intron_flanking_length * 2):
                    
                        exon_start = seq_DNA[intron_flanking_length:exon_coord]
                    
                        padding_start = intron_flanking_length - len(exon_start)

                        exon_start = exon_start + padding_start*'N'

                        
                    else: 
                        
                        exon_start =  seq_DNA[intron_flanking_length:(intron_flanking_length * 2)]

                    
                    
                    exon_start_region = ('i'*extra)+''.join(['c' if x != 'N' else 'N' for x in exon_start])+('N'*extra)
                    
                    # downstream_intron
                    
                    downstream_intron = seq_DNA[(len_DNA-intron_flanking_length):len_DNA]
                    
                    downstream_intron_region = ('c'*extra)+('i'*intron_flanking_length)+('i'*extra)

                    # exon_end
                    
                    exon_coord = len(seq_DNA) // 2 
                
                    if exon_coord <= (intron_flanking_length * 2):
                    
                        exon_end = seq_DNA[exon_coord:(len_DNA-intron_flanking_length)]
                        
                        padding_end = intron_flanking_length - len(exon_end)

                        exon_end = padding_end*'N' + exon_end

        
                        
                    else:
                        
                        exon_end = seq_DNA[(len_DNA-(intron_flanking_length * 2)):(len_DNA-intron_flanking_length)]

                    exon_end_region = ('N'*extra)+''.join(['c' if x != 'N' else 'N' for x in exon_end])+('i'*extra)

                    
                        
                    seq_DNA = (upstream_intron + exon_start + exon_end + downstream_intron)
                      
                    new_LEN = len(seq_DNA)
                    assert(new_LEN == max_length)
                    
                    seq_region = upstream_intron_region + exon_start_region + exon_end_region + downstream_intron_region
                    assert(len(seq_region) == region_length)
                
                  
                    
                    # one hot code
                    seq_1hot = dna_1hot(seq_DNA)
                    seq_len = np.array(len(seq_DNA), dtype=np.int64)
                    
                    # splicing
                    splicing = np.zeros((new_LEN ,1), dtype=np.int8)
                    splicing[splicing_ind] = 1
                    
                    # region 
                    region_1hot = region_to_mat(seq_region)
           
                    # get targets
                    targets = PSI_val_set.iloc[si].values
                    targets = targets.reshape((1,-1)).astype('float64')
                    
            ### generate example ###
                    example = tf.train.Example(features=tf.train.Features(feature={
                           
                            'length': _bytes_feature(seq_len.flatten().tobytes()),
                            'sequence': _bytes_feature(seq_1hot.flatten().tobytes()),
                            'target': _bytes_feature(targets.flatten().tobytes()),
                            'splicing': _bytes_feature(splicing.flatten().tobytes()), 
                            'region': _bytes_feature(region_1hot.flatten().tobytes())
                        }))
    
                    # write
                    writer.write(example.SerializeToString())
    
                    # advance indexes
                    ti += 1
                    si += 1
    
    
    fasta_open.close()
    
    
    # Write statistics.json
    
    stats_dict = {}
    stats_dict['num_targets'] = num_targets
    stats_dict['seq_length'] = max_length
    stats_dict['target_length'] = 4
    
    for fi in range(num_folds):
        stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_indexes[fi])
    
    with open('%s/statistics.json' % fold_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
    
    
    # Copy the params.json
    train_dict = {}
    train_dict['batch_size'] = 64
    train_dict['optimizer'] = 'adam'
    train_dict['loss'] = 'bce' #'mse'
    train_dict['learning_rate'] = 0.0001
    train_dict['adam_beta1'] = 0.90
    train_dict['adam_beta2'] = 0.998
    train_dict['global_clipnorm'] = 0.5
    train_dict['train_epochs_min'] = 100
    train_dict['train_epochs_max'] = 1000
    train_dict['patience'] = 100
    
    model_dict = {}
    model_dict['activation'] = 'relu'
    model_dict['spline'] = False
    model_dict['rnn_type'] = 'gru'
    model_dict['final_activation'] = 'relu'
    model_dict['residual'] = False
    model_dict['seq_length'] = max_length

    model_dict['seq_depth'] = 4 
    model_dict['augment_shift'] = 0
    model_dict['num_targets'] = num_targets
    model_dict['heads'] = 1
    model_dict['filters'] = 186
    model_dict['kernel_size'] = 6 #5
    model_dict['dropout'] = 0.3 
    model_dict['l2_scale'] = 0.001
    model_dict['ln_epsilon'] = 0.007
    model_dict['bn_momentum'] = 0.90
    
    params_dict = {}
    params_dict['train'] = train_dict
    params_dict['model'] = model_dict
    
    with open('%s/params.json' % fold_dir, 'w') as params_json_out:
        json.dump(params_dict, params_json_out, indent=4)

In [None]:
genes = exon_info['ENSG']
num_fold = 10
file_folds = None
peaks=0


file_fasta = "/c4/home/derek/data1/derek/reference/human_hg38_reference/refdata-gex-GRCh38-2020-A/fasta/genome.fa"

## Kfold Cross validation data setup

In [23]:
# out_dir = 'data_out/EN_RG_multitask_DeltaPSI_sig_0.1_train_valid_deepripe_PARCLIP/'

# out_dir= 'data_out/EN_RG_multitask_DeltaPSI_sig_0.1_train_valid_deepripe_sho/'

### train, valid, test

In [24]:
# %%time
# # split into train valid and test
# cv = GroupKFold(n_splits = num_fold)

# for i in range(num_fold):

        
#     train_test_indices = list(cv.split(df, df, genes))
#     train_val_idxs, test_idxs = train_test_indices[i]
    
#     cv2 = GroupKFold(n_splits = 5)
    
#     train_val_indices = list(cv2.split(df.iloc[train_val_idxs],
#                                        df.iloc[train_val_idxs],
#                                        genes[train_val_idxs]))
    
#     train_idxs, val_idxs = train_val_indices[0]
#     train_idxs = train_val_idxs[train_idxs]
#     val_idxs = train_val_idxs[val_idxs]
    

#     create_tfrecords_deepripe(df, exon_info, train_idxs,
#                      val_idxs, test_idxs, i, 
#                      out_dir, peaks,
#                      max_length)


#     num_fold += 1

### train, valid

In [22]:
%%time
# split into train and test only
num_fold = 5

cv = GroupKFold(n_splits = num_fold)

## create data for single fold for testing
for i in range(num_fold):

    train_test_indices = list(cv.split(df, df, genes))
    train_idxs, val_idxs = train_test_indices[i]
    test_idxs= np.array([],dtype=int)
    
    val_idxs_ =np.concatenate([test_idxs,val_idxs]) #mixed val and test data

    # create_tfrecords_deepripe(df, exon_info, train_idxs,
    #                              val_idxs, test_idxs, i, 
    #                              out_dir, peaks,
    #                              max_length)
        

CPU times: user 8.1 s, sys: 69.6 ms, total: 8.17 s
Wall time: 8.69 s


## Repeated KFold Cross validation data setup

In [62]:
# out_dir = 'data_out/EN_RG_multitask_DeltaPSI_sig_0.1_train_valid_test_deepripe_repeatKfold/'

out_dir = 'data_out/EN_RG_multitask_DeltaPSI_sig_0.1_train_valid_deepripe_/'


'/nowakowskilab/data1/derek/data_scSLR/prenatal_brain/deep_splicing/ExplaiNN'

In [64]:
# %%time

# rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=2652124)

# repeat = 0

# out_dir_ = out_dir+f'/repeat_{repeat}/'


# for i, (train_val_idxx, test_idxs) in enumerate(rkf.split(df)):
        
    
#     cv2 = GroupKFold(n_splits = 5)
    
#     train_val_indices = list(cv2.split(df.iloc[train_val_idxx],
#                                        df.iloc[train_val_idxx],
#                                        genes[train_val_idxx]))
    
#     train_idxs, val_idxs = train_val_indices[0] #- take just first fold split 
#     train_idxs = train_val_idxs[train_idxs]
#     val_idxs = train_val_idxs[val_idxs]
    
    
    
#     create_tfrecords_deepripe(df, exon_info, train_idxs,
#                      val_idxs, test_idxs, i, 
#                      out_dir_, peaks,
#                      max_length)
    
    
#     if (i+1) % 10 == 0:
        
#         repeat +=1
        
#         os.mkdir(out_dir+f'/repeat_{repeat}/')
        
#         out_dir_ = out_dir+f'/repeat_{repeat}/'


In [66]:
%%time

rkf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=2652124)

repeat = 0

out_dir_ = out_dir+f'/repeat_{repeat}/'


for i, (train_idxx, val_idxs) in enumerate(rkf.split(df)):
                
    create_tfrecords_deepripe(df, exon_info, train_idxs,
                     val_idxs, test_idxs, i, 
                     out_dir_, peaks,
                     max_length)
    
    
    if (i+1) % 10 == 0:
        
        repeat +=1
        
        os.mkdir(out_dir+f'/repeat_{repeat}/')
        
        out_dir_ = out_dir+f'/repeat_{repeat}/'


CPU times: user 2min 6s, sys: 513 ms, total: 2min 6s
Wall time: 2min 7s


In [None]:
# %%time
# # split into train and test only
# num_fold = 5

# cv = GroupKFold(n_splits = num_fold)

# ## create data for single fold for testing
# for i in range(num_fold):

#     train_test_indices = list(cv.split(df, df, genes))
#     train_idxs, val_idxs = train_test_indices[i]
#     test_idxs= np.array([],dtype=int)
    
#     val_idxs_ =np.concatenate([test_idxs,val_idxs]) #mixed val and test data

#     create_tfrecords_deepripe(df, exon_info, train_idxs,
#                                  val_idxs, test_idxs, i, 
#                                  out_dir, peaks,
#                                  max_length)
    

### check tfrecord loading

In [153]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import shutil

from basenji_.basenji import dataset_

from basenji_.basenji.metrics import PearsonR

In [25]:
out_dir

'data_out/EN_RG_multitask_DeltaPSI_sig_0.1_train_valid_deepripe_region/'

In [26]:
data_dir = [glob.glob(out_dir+'/fold*/')[0]] 
params_file = data_dir[0]+'params.json'

In [27]:
data_dir

['data_out/EN_RG_multitask_DeltaPSI_sig_0.1_train_valid_deepripe_region/fold0/']

In [28]:
params_file

'data_out/EN_RG_multitask_DeltaPSI_sig_0.1_train_valid_deepripe_region/fold0/params.json'

In [159]:
with open(params_file) as params_open:
          params = json.load(params_open)
params_model = params['model']
params_train = params['train']
os.makedirs(out_dir, exist_ok=True)
if params_file != '%s/params.json' % out_dir:
    shutil.copy(params_file, '%s/params.json' % out_dir)

In [160]:


for data_dir_ in data_dir:
    train_data.append(dataset_.ExonDataset(data_dir_,
        split_label='train',
        batch_size=params_train['batch_size'],
        shuffle_buffer=params_train.get('shuffle_buffer', params_model['seq_length']),
        mode='train',
        splice_track=False,
        annotation_tracks=True
                            ))
    