In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from common.bio.amino_acid import *
from common.preprocessing.dataframe import *
# Plain imports stay below the star imports so the names cannot be shadowed by them.
import json
import os
import multiprocessing as mp
import threading
import time
import psutil
import swifter

In [2]:
# Report the CPU count seen by multiprocessing and a snapshot of system memory.
num_cores = mp.cpu_count()
print(
    "This kernel has ",
    num_cores,
    "cores and you can find the information regarding the memory usage:",
    psutil.virtual_memory(),
)

This kernel has  4 cores and you can find the information regarding the memory usage: svmem(total=8438435840, available=2001723392, percent=76.3, used=6436712448, free=2001723392)


In [3]:
# Base directory (relative path) under which all inputs and outputs of this notebook live.
ROOT = "../../data/protein/embedding/"
# Input file read by the loading cell; going by its extension it is a gzip-compressed TSV.
DATA_SOURCE = os.path.join(ROOT, "data_sources/all_sequences.tsv.gz")

## Loading data

In [4]:
# Because `chunksize` is set, read_csv hands back a lazy chunk reader instead of a
# DataFrame: the file is streamed 500000 rows at a time and only the column at
# position 1 is loaded.
original_data = pd.read_csv(
    DATA_SOURCE,
    sep="\t",
    chunksize=500000,
    skipinitialspace=True,
    usecols=[1],
)

## Extra preprocessing steps

In [5]:
# Directory that receives the converted records. ROOT already ends with a separator,
# so the parts are joined with os.path.join rather than "{}/{}/" formatting, which
# produced a doubled slash ("embedding//all/"). The trailing "" keeps the final separator.
PATH = os.path.join(ROOT, "all", "")

In [6]:
# Create the output directory (and any missing parents); exist_ok=True makes
# re-running this cell harmless when the directory is already there.
os.makedirs(PATH, exist_ok=True)

### Sequences

In [14]:
def save_as_tfrecords(filename, data, options, extension="tfrecords.gz"):
    """Serialize integer sequences into a single TFRecord file.

    Every row of ``data`` becomes one ``tf.train.Example`` holding two int64
    features: ``length`` (the number of items in the row) and ``seq`` (the
    items themselves).

    Errors are reported on stdout instead of being raised, so a failing write
    does not propagate to the caller (this function is used as a thread target
    in this notebook). A partially written file may be left behind on failure.

    Args:
        filename: Output path without the extension.
        data: Iterable of rows; each row must support ``len()`` and contain
            values accepted by ``tf.train.Int64List``.
        options: Options object forwarded to ``tf.python_io.TFRecordWriter``.
        extension: Suffix appended to ``filename`` after a dot. The default
            suggests gzip output, but compression is decided by ``options``,
            not by this name.

    Returns:
        None.
    """
    target = "{}.{}".format(filename, extension)
    try:
        with tf.python_io.TFRecordWriter(target, options) as writer:
            for row in data:
                features = tf.train.Features(
                    feature={
                        'length': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(row)])),
                        'seq': tf.train.Feature(int64_list=tf.train.Int64List(value=row))
                    }
                )
                example = tf.train.Example(features=features)
                writer.write(example.SerializeToString())
        print("Data was stored in {}".format(target))
    except Exception as e:
        # Deliberately best-effort: report which file failed and why, then return.
        print("Something went wrong when writing to the tfrecords file {}".format(target))
        print(e)

In [15]:
def save_as_tfrecords_multithreaded(path, original_data, max_chunks=1):
    """Encode sequence chunks and write each one to its own TFRecord file on a thread.

    For every chunk, the rows are passed through
    ``filter_non_standard_amino_acids`` on the "Sequence" column (presumably
    dropping sequences with non-standard residues; see that helper), each
    remaining sequence is mapped character by character through
    ``AMINO_ACID_TO_ID``, and the encoded chunk is handed to
    ``save_as_tfrecords`` on a freshly started thread. Output files are named
    after the chunk index inside ``path`` and are written with GZIP options.

    Args:
        path: Output directory; created if it does not exist.
        original_data: Iterable of DataFrame chunks that have a "Sequence"
            column. It is consumed while iterating.
        max_chunks: Maximum number of chunks to convert. The default of 1
            matches the previous hard-coded behaviour of stopping after the
            first chunk. Pass ``None`` to convert every chunk; note that one
            thread is started per chunk and each keeps its encoded chunk in
            memory until it has been written.

    Raises:
        ValueError: If ``max_chunks`` is neither ``None`` nor at least 1.
    """
    # Validate before touching the filesystem or TensorFlow.
    if max_chunks is not None and max_chunks < 1:
        raise ValueError(
            "max_chunks must be None or a positive integer, got {!r}".format(max_chunks)
        )

    os.makedirs(path, exist_ok=True)
    threading_start = time.time()
    coord = tf.train.Coordinator()
    threads = []
    options = tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP)
    for i, chunk in enumerate(original_data):
        filename = os.path.join(path, str(i))
        chunk = filter_non_standard_amino_acids(chunk, "Sequence")
        encoded = chunk["Sequence"].apply(lambda seq: [AMINO_ACID_TO_ID[c] for c in seq])
        # Elapsed time is measured from the start of the whole run, not per chunk.
        print("Prepared chunk {} after {} seconds".format(i, time.time() - threading_start))
        t = threading.Thread(target=save_as_tfrecords, args=(filename, encoded, options))
        t.start()
        threads.append(t)
        # Stop without pulling another chunk from the reader once the limit is reached.
        if max_chunks is not None and i + 1 >= max_chunks:
            break
    # Block until every writer thread has finished.
    coord.join(threads)
    print("Completed all threads in {} seconds".format(time.time() - threading_start))

In [16]:
# Run the conversion into PATH. `original_data` is a chunk reader that is consumed as it
# is iterated, so re-run the read_csv cell before executing this cell a second time.
save_as_tfrecords_multithreaded(PATH, original_data)

Completed all threads in 19.44636106491089 seconds
Data was stored in ../../data/protein/embedding//all/0.tfrecords
Completed all threads in 87.57551789283752 seconds


To read the stored records back (sketch, not executed):

In [17]:
#tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP', buffer_size=buffer_size)
# features = tf.parse_single_example(
#     serialized=example.SerializeToString(),
#     features={
#         'length': tf.FixedLenFeature([1], tf.int64),
#         'seq': tf.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
#     }
# )

# End of preprocessing