In [1]:
import numpy as np
import pandas as pd
import glob
import os
import pandas as pd
from common.bio.constants import ID_TO_AMINO_ACID
from tqdm import tqdm
import multiprocessing as mp
import time
import tensorflow as tf

## Reading results from file

In [2]:
ROOT="../../../bert/weights/eval_full/"
list_of_files = glob.glob(ROOT+ "*.npy") # * means all if need specific format then *.csv

## Preprocessing

In [3]:
def preprocess_score(scores):
    scores = np.reshape(scores, [-1,20])
#     scores = np.mean(scores, axis =1)
    return scores

In [4]:
from datetime import datetime, timedelta

def get_files_per_hour(list_of_files):
    latest_file = max(list_of_files, key=os.path.getctime)
    oldest_file = min(list_of_files, key=os.path.getctime)
    start = datetime.strptime((os.path.basename(oldest_file).split(".")[0]), '%Y%m%d_%H%M%S_%f')- timedelta(hours=1)
    finish = datetime.strptime((os.path.basename(latest_file).split(".")[0]), '%Y%m%d_%H%M%S_%f')

    while start <= finish:
        start = start + timedelta(hours=1)
        yield start.strftime("%Y%m%d_%H*")
    

In [5]:
def get_data_from_files(pattern):
    sequences, scores = None, None
    list_of_files = glob.glob(ROOT+ "/" + pattern+"*.npy") 
    for file in list_of_files:
        if sequences is None:
            scores,_,sequences = np.load(file)
            scores = preprocess_score(scores)
        else:
            s,_,seqs = np.load(file)
            sequences = np.append(sequences,seqs, axis=0)
            scores = np.append(scores, preprocess_score(s), axis=0)
    return sequences, scores

## Storing

In [6]:
def save_as_tfrecords(data, file_id, options=tf.python_io.TFRecordOptions(tf.python_io.TFRecordCompressionType.GZIP), 
                      extension="tfrecords.gz"):
    try:
        threading_start = time.time()
        filename = "{}{}.{}".format(PATH,str(file_id), extension)
        with tf.python_io.TFRecordWriter(filename,options) as writer:
            for row in data:
                row = row[np.nonzero(row)]
                example = tf.train.Example(features = tf.train.Features(
                    feature={
                        'length': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(row)])),
                        'seq': tf.train.Feature(int64_list=tf.train.Int64List(value=row))
                    }
                ))
                writer.write(example.SerializeToString())
        print("Data was stored in {} (Took: {}s)".format(filename, time.time() - threading_start))
        sys.stdout.flush()
    except Exception as e:
        print("Something went wrong went writting in to tfrecords file")
        print(e)
        sys.stdout.flush()

In [13]:
def save_as_multithreaded(path, store_func):
    os.makedirs(path, exist_ok=True)
    threading_start = time.time()
    list_of_files = glob.glob(ROOT+ "*.npy") 
    files_patterns = get_files_per_hour(list_of_files)
    has_next = True
    while has_next:
            pool = mp.Pool(mp.cpu_count())
            try:
                results = []
                for i in range(mp.cpu_count()):
                    pattern = next(files_patterns)
                    sequences, scores = get_data_from_files(pattern)
#                     data = sequences[np.where(scores > 2.5)]
                    data = sequences, scores
                    results.append(pool.apply_async(store_func, [data, pattern]))
                output = [p.get() for p in results]
                pool.terminate()
                pool.join()
            except Exception as e:
                print(e)
                print("Waiting for threads to finish")
                output = [p.get() for p in results]
                pool.terminate()
                pool.join()
                print("Exiting loop")
                break
    print("Completed all threads in {} seconds".format(time.time() - threading_start))

In [10]:
def save_as_npz(data, pattern):
    try:
        np.savez_compressed(os.path.join(ROOT, pattern), seq=data[0], scores=data[1])
        threading_start = time.time()
        print("Data was stored in {}({} rows) (Took: {}s)".format(filename, len(data[1]),time.time() - threading_start))
        sys.stdout.flush()
    except Exception as e:
        print("Something went wrong went writting in to tfrecords file")
        print(e)
        sys.stdout.flush()


In [None]:
save_as_multithreaded("../../data/protein/embedding/hard", save_as_npz)

'>' not supported between instances of 'NoneType' and 'float'
Waiting for threads to finish


## Analysis

In [22]:
"".join([ID_TO_AMINO_ACID[x] for x in  sequences[0] if x != 0])

'MTVEARVDRRAVPVHDDLDAGHRVSGAADSHFGCVVRSFSTMFPARRFGGGALAVYLDGRPVVDVWTGWSDRGGDRPWSADSAPMVFSATKGMAATVIHRLADRGLVDYEAPVAEYWPEFGANGKANLTVRDVMRHAAGLSGLRGARSEDLLDHVVMEERLAAAAPGRLLGKSAYHALTFGWLMSGLARGVTGKGMRALIREELAEPLGTDGLYLGRPPAGAPTRVAEIIAPQNLVRNPLLSCVTRKVANELSGGFRSMYFPGMVAAVQGDTPLLDAEIPAANGVATARGLARMYGAIANGGEVDGIRFLSREMVAGLTGRRSLRPDRNLFMPLAFHLGYHSLPIGNVMPGFGHVGLGGSLGWTDPPSGLAFSLVHNRLLTPFVMTDHAAFVALYALIRNAAEKARKRGFEPVTEFGAPYFQPGAVAG'

In [21]:
scores.shape

(5120,)

In [100]:
scores[[210,  46,   3, 162,  86, 184, 135, 189,  80,  53]]
# (array([210,  46,   3, 162,  86], dtype=int64),
#  array([123, 136, 138, 118,  22], dtype=int64))

array([0.85874426, 0.8723316 , 1.7435211 , 0.9066566 , 1.7519985 ,
       1.2749206 , 0.9214524 , 1.9551178 , 0.63511056, 2.0029273 ],
      dtype=float32)

In [101]:
scores[[142,  24, 117, 137, 128, 123, 136, 138, 118,  22]]

array([2.5441122, 2.888978 , 2.7500596, 2.771639 , 2.7904115, 2.992539 ,
       2.569421 , 2.8541427, 2.8504214, 2.9498117], dtype=float32)

In [103]:
scores.argsort(axis=0)[:10], scores.argsort(axis=0)[-10:]

(array([336,  80, 466, 210, 302,  46, 418, 162, 391, 135], dtype=int64),
 array([506, 250, 147, 403,  63, 319, 204, 460, 171, 427], dtype=int64))

In [108]:
a_scores = [5.763989448547363,16.34835720062256,16.912442684173584,41.02971601486206,5.302796363830566,5.70671272277832,25.228277683258057,7.146716117858887,5.096601486206055,3.9381494522094727,12.996936798095703,9.719063758850098,3.5146589279174805,3.3181838989257812,3.999410629272461,15.865331172943115,5.400596618652344,8.180281162261963,13.174888134002686,11.942434310913086,22.709521293640137,15.048619270324707,4.226090431213379,11.418336391448975,6.203150749206543,8.175767421722412,43.127867221832275,9.926605224609375,44.87994194030762,18.72093391418457,9.535984516143799,3.790769577026367,9.873002529144287,28.970005989074707,9.007229328155518,4.817741394042969,12.549857139587402,3.652944564819336,16.239166259765625,22.187833309173584,4.388591766357422,18.096747875213623,12.497518062591553,7.754159927368164,5.7018232345581055,5.385113716125488,31.97294521331787,6.330876350402832,25.328668117523193,6.5586442947387695,8.033462524414062,17.22890615463257,6.064653396606445,25.55495595932007,22.334675788879395,7.816396713256836,3.3266677856445312,4.149695873260498,15.471694946289062,6.731616020202637,9.297334671020508,25.512556552886963,5.05133056640625,4.916384696960449,6.1556243896484375,27.4039626121521,18.870399475097656,3.659233570098877,22.16541051864624,6.014122009277344,29.155909538269043,24.865150928497314,6.148896217346191,4.456441879272461,36.230844020843506,21.893184661865234,4.679718017578125,8.474009037017822,17.973877906799316,10.448594093322754,32.776719093322754,10.42057991027832,6.495640754699707,5.384268283843994,9.843405723571777,2.1211185455322266,35.047926902770996,9.187013149261475,3.6614198684692383,6.333958625793457,7.550034999847412,13.512449264526367,20.336669921875,4.178987979888916,3.365877628326416,21.484695434570312,7.349902153015137,9.73277997970581,25.623199939727783,9.450332164764404,15.263635158538818,23.229173183441162,20.681382656097412,4.582851409912109,20.545052528381348,2.867743492126465,11.519810676574707,20.407901763916016,3.23898983001709,18.20640468597412,5.586087703704834,10.088312149047852,19.812275409698486,4.547212600708008,26.56522512435913,30.005449295043945,37.34163856506348,8.872427940368652,5.690378189086914,45.78540325164795,9.842347145080566,38.25021409988403,7.921853542327881,3.9880475997924805,7.606886386871338,11.440752029418945,24.917625427246094,10.747987747192383,3.1897478103637695,3.3536224365234375,7.165192604064941,4.481685638427734,26.078386306762695,10.525011539459229,3.2045955657958984,34.19850778579712,5.21912956237793,6.198361396789551,5.190456390380859,5.720827579498291,17.15915822982788,13.962836265563965,6.191702365875244,6.513998031616211,26.759228229522705,23.074337482452393,4.451974868774414,5.091446876525879,14.548539161682129,16.64423418045044,5.378070831298828,21.36000156402588,3.6596765518188477,7.9518232345581055,4.218654632568359,6.371386528015137,9.05506944656372,7.523493766784668,14.402921199798584,5.650209426879883,7.506370544433594,16.435704231262207,29.36879062652588,6.972742080688477,6.135900497436523,7.812517166137695,30.80285358428955,19.008928298950195,22.563151836395264,9.925041675567627,8.549678802490234,4.409868240356445,7.124042987823486,10.693761348724365,10.838569164276123,7.467924118041992,3.3315248489379883,10.90412425994873,8.984777450561523,9.5506272315979,11.115557670593262,4.262716293334961,8.036658763885498,18.585110664367676,27.57838726043701,4.624139785766602,18.324599266052246,8.86291217803955,20.439035415649414,35.062543869018555,9.112467765808105,24.089383125305176,23.790390968322754,16.31876850128174,3.7423229217529297,9.917150020599365,17.03303861618042,12.932655334472656,17.178348541259766,10.088364601135254,4.725707054138184,4.925791263580322,6.185546875,6.976569175720215,3.702641487121582,9.513192176818848,10.202939510345459,4.839154243469238,6.771781921386719,9.581177234649658,30.46738576889038,24.702009677886963,17.939661502838135,7.759855270385742,12.423149108886719,7.261037826538086,4.9377241134643555,4.556290626525879,5.789980411529541,10.63567066192627,9.840155124664307,8.37946605682373,3.245781898498535,14.573859691619873,12.270698070526123,7.986940860748291,7.5322041511535645,4.5632524490356445,12.521952152252197,5.322998046875,5.59165096282959,3.4178380966186523,7.896445274353027,8.388476371765137,17.809837818145752,10.974207878112793,10.277129650115967,7.909133434295654,20.99509334564209,5.896093368530273,14.895615577697754,7.073772430419922,12.100501537322998,13.862523555755615,7.526375770568848,5.502490997314453,10.848763465881348,31.82371997833252,6.382550239562988,15.99025821685791,4.569219589233398,9.46271276473999,4.057765007019043,5.025197982788086,21.002982139587402,12.877090454101562]

In [109]:
o_scores = scores[:256]

In [113]:
df = pd.DataFrame([o_scores, a_scores]).transpose()

In [122]:
df[(df[0] > 2.5)]

Unnamed: 0,0,1
4,2.979533,5.302796
5,2.579096,5.706713
7,2.550770,7.146716
8,2.544019,5.096601
9,2.887515,3.938149
12,2.765860,3.514659
13,2.876865,3.318184
14,2.845802,3.999411
16,2.547663,5.400597
18,2.582014,13.174888
