In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from common.bio.amino_acid import *
from common.preprocessing.dataframe import *
import json
import os
import multiprocessing as mp
import psutil
import swifter
import sys
# Imported explicitly (and after the star imports) because the cells below call
# time.time(); previously this only worked if a star import happened to leak `time`.
import time

  from ._conv import register_converters as _register_converters


In [2]:
# Report how many CPU cores and how much memory this kernel has available.
num_cores = mp.cpu_count()
print(
    "This kernel has ",
    num_cores,
    "cores and you can find the information regarding the memory usage:",
    psutil.virtual_memory(),
)

This kernel has  8 cores and you can find the information regarding the memory usage: svmem(total=31616577536, available=30576070656, percent=3.3, used=637997056, free=30139428864, active=683048960, inactive=441868288, buffers=64339968, cached=774811648, shared=9695232, slab=141684736)


In [3]:
# Base directory of the embedding data, the gzipped TSV holding all sequences,
# and the number of rows read (and written out) per chunk.
ROOT = "../../data/protein/embedding/"
DATA_SOURCE = "{}{}".format(ROOT, "data_sources/all_sequences.tsv.gz")
CHUNK_SIZE = 500 * 1000

## Loading data

In [4]:
# Open the source file as a chunked reader: because `chunksize` is given, this is
# an iterator yielding CHUNK_SIZE rows at a time instead of one big DataFrame.
# Only the column at position 1 is read (presumably "Sequence", which the
# conversion step below indexes by name — TODO confirm against the file header).
original_data = pd.read_csv(
    DATA_SOURCE,
    sep='\t',
    usecols=[1],
    skipinitialspace=True,
    chunksize=CHUNK_SIZE,
)

In [5]:
# Expected number of chunks (and therefore of output files): ceil(rows / CHUNK_SIZE).
# Integer ceiling division is used instead of int(a / b) + 1, which over-counts by
# one when the row count is an exact multiple of the chunk size, and the chunk
# size is taken from CHUNK_SIZE so it cannot drift from the value used for reading.
TOTAL_ROWS = 69229940  # presumably the number of rows in DATA_SOURCE — TODO confirm
-(-TOTAL_ROWS // CHUNK_SIZE)

139

## Extra preprocessing steps

In [6]:
# Directory (with trailing slash) that receives the TFRecord files for the "all" dataset.
PATH = "".join((ROOT, "all", "/"))

In [7]:
# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(name=PATH, exist_ok=True)

### Sequences

In [8]:
def save_as_tfrecords(data, options=tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP), 
                      extension="tfrecords.gz"):
    """Encode one chunk of sequences as integer ids and write it to one TFRecord file.

    The chunk is passed through ``filter_non_standard_amino_acids``, every
    remaining sequence is mapped character by character through
    ``AMINO_ACID_TO_ID``, and one ``tf.train.Example`` per sequence is written
    with two int64 features: ``length`` (number of ids) and ``seq`` (the ids).
    The file is named ``PATH + <index[0] // CHUNK_SIZE> + "." + extension``.

    Errors are reported on stdout instead of being raised, so that one bad
    chunk does not abort the remaining ones. A failed write may leave an
    incomplete file behind.

    Args:
        data: Chunk with a "Sequence" column. Its index is assumed to hold the
            global row numbers (as a chunked ``pd.read_csv`` produces), so that
            ``index[0] // CHUNK_SIZE`` identifies the chunk — TODO confirm that
            the filter helper preserves the index.
        options: ``TFRecordOptions`` passed to the writer (GZIP by default).
        extension: File extension appended after the chunk number.

    Returns:
        The path of the written file, or ``None`` when the chunk was empty
        after filtering or when an error occurred.
    """
    filename = None
    try:
        threading_start = time.time()
        data = filter_non_standard_amino_acids(data, "Sequence")
        if len(data) == 0:
            # Without this guard data.index[0] raises IndexError and the empty
            # chunk would be reported as a write failure.
            print("Skipping a chunk: no sequences left after filtering non-standard amino acids")
            sys.stdout.flush()
            return None
        encoded = data["Sequence"].apply(lambda seq: [AMINO_ACID_TO_ID[c] for c in seq])
        filename = "{}{}.{}".format(PATH, str(encoded.index[0] // CHUNK_SIZE), extension)
        with tf.python_io.TFRecordWriter(filename, options) as writer:
            for ids in encoded:
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'length': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(ids)])),
                        'seq': tf.train.Feature(int64_list=tf.train.Int64List(value=ids))
                    }
                ))
                writer.write(example.SerializeToString())
        print("Data was stored in {} (Took: {}s)".format(filename, time.time() - threading_start))
        sys.stdout.flush()
        return filename
    except Exception as e:
        # Best effort: report and carry on. The exception type is printed as
        # well because str(e) alone is often not informative (e.g. KeyError).
        target = filename if filename is not None else "<file name not determined yet>"
        print("Something went wrong when writing to the tfrecords file {}".format(target))
        print("{}: {}".format(type(e).__name__, e))
        sys.stdout.flush()
        return None

In [9]:
def save_as_tfrecords_multithreaded(path, original_data):
    """Convert every chunk yielded by ``original_data`` to TFRecord files in parallel.

    Chunks are pulled from the iterator in batches of ``mp.cpu_count()``. Each
    batch is handed to a fresh pool of worker processes running
    ``save_as_tfrecords``, and the next batch is only read once every task of
    the current batch has finished, so at most one batch of chunks is in
    flight at a time.

    Despite the name, the work is done by processes (``multiprocessing.Pool``),
    not threads.

    Args:
        path: Directory that is created if it does not exist.
            NOTE(review): the workers write to the module-level ``PATH``, not
            to this argument, so callers should pass the same directory.
        original_data: Iterator yielding the chunks to convert (e.g. the reader
            returned by ``pd.read_csv(..., chunksize=...)``). It is consumed.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    os.makedirs(path, exist_ok=True)
    threading_start = time.time()
    num_workers = mp.cpu_count()
    end_of_data = object()  # sentinel: distinguishes "iterator exhausted" from real errors

    exhausted = False
    while not exhausted:
        pool = mp.Pool(num_workers)
        pending = []
        try:
            try:
                for _ in range(num_workers):
                    chunk = next(original_data, end_of_data)
                    if chunk is end_of_data:
                        exhausted = True
                        break
                    pending.append(pool.apply_async(save_as_tfrecords, [chunk]))
            except Exception as e:
                # Reading failed (not a normal end of input): say so explicitly
                # and stop after the already submitted chunks are done.
                print("Stopping early, reading the next chunk failed: {}: {}".format(type(e).__name__, e))
                sys.stdout.flush()
                exhausted = True
            # Wait for every submitted task, even after a read error, so no
            # worker is killed in the middle of writing a file.
            for task in pending:
                try:
                    task.get()
                except Exception as e:
                    print("A worker task failed: {}: {}".format(type(e).__name__, e))
                    sys.stdout.flush()
        finally:
            # Always release the worker processes, whatever happened above.
            pool.terminate()
            pool.join()
    print("Completed all threads in {} seconds".format(time.time() - threading_start))

In [10]:
# Run the conversion: every chunk of the source file becomes one TFRecord file under PATH.
save_as_tfrecords_multithreaded(path=PATH, original_data=original_data)

Data was stored in ../../data/protein/embedding/all/0.tfrecords.gz (Took: 96.35030341148376s)
Data was stored in ../../data/protein/embedding/all/1.tfrecords.gz (Took: 96.39233803749084s)
Data was stored in ../../data/protein/embedding/all/2.tfrecords.gz (Took: 102.22424125671387s)
Data was stored in ../../data/protein/embedding/all/5.tfrecords.gz (Took: 95.67152047157288s)
Data was stored in ../../data/protein/embedding/all/3.tfrecords.gz (Took: 105.38564085960388s)
Data was stored in ../../data/protein/embedding/all/4.tfrecords.gz (Took: 102.35522365570068s)
Data was stored in ../../data/protein/embedding/all/7.tfrecords.gz (Took: 94.01703810691833s)
Data was stored in ../../data/protein/embedding/all/6.tfrecords.gz (Took: 102.21989393234253s)
Data was stored in ../../data/protein/embedding/all/11.tfrecords.gz (Took: 85.23472023010254s)
Data was stored in ../../data/protein/embedding/all/10.tfrecords.gz (Took: 89.29180073738098s)
Data was stored in ../../data/protein/embedding/all/8.

Data was stored in ../../data/protein/embedding/all/87.tfrecords.gz (Took: 98.2398054599762s)
Data was stored in ../../data/protein/embedding/all/88.tfrecords.gz (Took: 94.05527591705322s)
Data was stored in ../../data/protein/embedding/all/89.tfrecords.gz (Took: 99.08749866485596s)
Data was stored in ../../data/protein/embedding/all/90.tfrecords.gz (Took: 97.65073776245117s)
Data was stored in ../../data/protein/embedding/all/92.tfrecords.gz (Took: 99.23654770851135s)
Data was stored in ../../data/protein/embedding/all/91.tfrecords.gz (Took: 105.05251121520996s)
Data was stored in ../../data/protein/embedding/all/94.tfrecords.gz (Took: 93.42280006408691s)
Data was stored in ../../data/protein/embedding/all/95.tfrecords.gz (Took: 91.97671103477478s)
Data was stored in ../../data/protein/embedding/all/93.tfrecords.gz (Took: 102.6677405834198s)
Data was stored in ../../data/protein/embedding/all/96.tfrecords.gz (Took: 87.55560088157654s)
Data was stored in ../../data/protein/embedding/al

To read the generated TFRecord files back:

In [17]:
#tf.data.TFRecordDataset(filenames=filenames, compression_type='GZIP', buffer_size=buffer_size)
# features = tf.parse_single_example(
#     serialized=example.SerializeToString(),
#     features={
#         'length': tf.FixedLenFeature([1], tf.int64),
#         'seq': tf.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
#     }
# )

# End of preprocessing