# Preprocessing sequence data



## Libraries

In [None]:
import pandas as pd
import os
import csv
from tqdm import tqdm
import numpy as np
import h5py

In [None]:
import tensorflow.keras as keras
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.python.client import device_lib
from tensorflow.keras import Input
from tensorflow.keras.layers import  Dense, Conv2D, MaxPooling2D, UpSampling2D, Flatten , Conv1D, Concatenate , Permute
from tensorflow.keras.layers import Bidirectional,LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization, Add , LeakyReLU ,Reshape , Activation , MaxPooling1D , Lambda , Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.backend import conv1d
from tensorflow.python.keras.utils import conv_utils
from tensorflow.keras import backend as K

## Sequence processing

In [None]:
#Function for one-hot encoding sequences
def seq2feature(data):  # copied from https://github.com/1edv/evolution/blob/master/manuscript_code/model/tpu_model
    """One-hot encode a collection of DNA sequences.

    Each base is mapped to a 4-element boolean vector in A, C, G, T order;
    'N' is mapped to the all-False vector.

    Args:
        data: Iterable of strings over the alphabet A/C/G/T/N.

    Returns:
        The result of ``np.asarray`` on the nested list of one-hot vectors.
        When all sequences share one length this is a boolean array of shape
        (number of sequences, sequence length, 4).

    Raises:
        KeyError: If a sequence contains a character other than A/C/G/T/N.
    """
    # np.bool_ is used instead of the np.bool alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    mapper = {
        'A': np.array([1, 0, 0, 0], dtype=np.bool_),
        'C': np.array([0, 1, 0, 0], dtype=np.bool_),
        'G': np.array([0, 0, 1, 0], dtype=np.bool_),
        'T': np.array([0, 0, 0, 1], dtype=np.bool_),
        'N': np.array([0, 0, 0, 0], dtype=np.bool_),
    }

    return np.asarray([[mapper[base] for base in seq] for seq in data])

In [None]:
# Parse the tab-separated sequence file: column 0 holds the sequence,
# column 1 the expression value.
with open('train_sequences.txt') as handle:
    d = []
    for row in csv.reader(handle, delimiter="\t"):
        # Rows whose expression value is a whole number are kept only when a
        # random True/False draw comes up True; all other rows are always kept.
        has_whole_expression = float(row[1]) % 1 == 0.
        if not has_whole_expression or np.random.choice([True, False], 1)[0]:
            d.append(row)
sequences = [row[0] for row in d]


# Force every sequence to exactly SEQ_LEN bp: longer sequences keep only their
# last SEQ_LEN bases, shorter ones are left-padded with 'N'.
SEQ_LEN = 110
sequences = [seq[-SEQ_LEN:].rjust(SEQ_LEN, 'N') for seq in tqdm(sequences)]
            


100%|██████████| 2861270/2861270 [00:01<00:00, 1719771.92it/s]


In [None]:
# Apply the one-hot encoding: seq2feature maps each base of every sequence to
# its 4-element boolean vector and collects the result into a NumPy array.
seqdata_transformed = seq2feature(sequences)

In [None]:
# Write the one-hot encoded sequences to an HDF5 file
h5_path = 'train_onehot_sequences_bool_half.h5'
with h5py.File(h5_path, 'w') as h5_out:
    h5_out.create_dataset("onehot_sequences_bool", data=seqdata_transformed)

# Sanity check: show the type of a single encoded element
first_element = seqdata_transformed[0][0][0]
print(type(first_element))

## Expression processing

In [None]:
# Extract the expression column (field 1) of the retained rows as a float array
expdata = np.asarray([row[1] for row in d]).astype('float')
expressions = expdata

In [None]:
#Make sure no expression values are NA before scaling
#Copied from https://github.com/1edv/evolution/blob/master/manuscript_code/model/tpu_model/data_processing.ipynb
def clean_exp(Y):
    """Return the values of Y as floats, skipping entries equal to 'NA'.

    Args:
        Y: Iterable of expression values (numbers or numeric strings); entries
            equal to the string 'NA' are treated as missing.

    Returns:
        A list of Python floats, in input order, with 'NA' entries removed.
        An empty input yields an empty list.

    Raises:
        ValueError: If a non-'NA' entry cannot be converted to float.
    """
    # Filter and convert in one pass. The earlier mask-based version failed on
    # empty input: the empty mask array defaulted to float dtype, which cannot
    # be inverted with `~`.
    return [float(value) for value in Y if value != 'NA']
    
# Drop 'NA' entries and reshape to a single-column 2-D array (one row per value).
clean_trY = np.array(clean_exp(expressions)).reshape(-1, 1)

In [None]:
# Apply and dump scaler
from sklearn.preprocessing import StandardScaler
from joblib import dump, load
# Fit the scaler on the NA-free column, then standardise all expression values
scaler = StandardScaler().fit(clean_trY)
expression_column = np.array(expressions).reshape(-1, 1)
expressions = scaler.transform(expression_column)
# Save the fitted scaler to disk
dump(scaler, 'scaler_half.save')

In [None]:
# Write the scaled expression values to an HDF5 file
with h5py.File('train_expression_half.h5', 'w') as h5_out:
    h5_out.create_dataset("expression", data=expressions)