In [1]:
import pandas as pd
import os

In [2]:
# S3 locations used by this notebook.
# s3_prefix: prefix whose objects are downloaded as prediction files; it must
# end with "/" because list_files asserts that.
# NOTE(review): "pubmed_asbtract" looks like a misspelling of "abstract";
# presumably it matches the real S3 key, so it is left as-is — confirm.
s3_prefix = "s3://aegovan-data/pubmed_asbtract/predictions_multi_00/"
# s3_data: JSON file that is later downloaded and loaded into data_training_full_df.
s3_data ="s3://aegovan-data/human_output/human_interactions_ppi_v2.json"

In [3]:
# Local scratch layout: every working directory lives under `local_temp`.
local_temp = "temp"
# Destination directory for the downloaded prediction files.
local_temp_pred_dir = os.path.join( local_temp, "pred_results")
# Destination directory for the downloaded s3_data file.
local_temp_wk_dir = os.path.join( local_temp, "wk")

In [4]:
# Start from a clean scratch area: remove any existing `local_temp` tree,
# then (re)create both working sub-directories.
!rm -rf $local_temp
!mkdir -p $local_temp_pred_dir
!mkdir -p $local_temp_wk_dir

In [5]:
#!aws s3 cp s3://aegovan-data/pubmed_asbtract/predictions_multi_95/pubmed19n0538.json.txt.json.prediction.json .

In [6]:
import boto3
import glob
from multiprocessing.dummy import Pool as ThreadPool
import argparse
import datetime 
import os


def uploadfile(localpath, s3path):
    """
    Upload a single local file to S3.

    :param localpath: Path of the local file to upload
    :param s3path: Destination in the format s3://mybucket/mydir/mysample.txt.
        When the key part ends with "/", the base name of ``localpath`` is
        appended to it to form the object key.
    """
    bucket_name, object_key = get_bucketname_key(s3path)

    # A key ending in "/" is treated as a prefix: keep the local file name under it.
    if object_key.endswith("/"):
        object_key = object_key + os.path.basename(localpath)

    client = boto3.client('s3')
    client.upload_file(localpath, bucket_name, object_key)

def get_bucketname_key(uripath):
    """
    Split an S3 URI into its bucket name and object key.

    :param uripath: URI in the format s3://mybucket/some/key
    :return: Tuple ``(bucket_name, key)``. The key is everything after the
        first "/" that follows the bucket name; when there is no such "/",
        the key is reported as "/".
    :raises AssertionError: if ``uripath`` does not start with "s3://"
    """
    assert uripath.startswith("s3://")

    remainder = uripath[len("s3://"):]
    bucket_name, separator, key = remainder.partition("/")

    # No "/" after the bucket name: the key defaults to "/".
    if not separator:
        key = "/"

    return bucket_name, key


def download_file(s3path, local_dir):
    """
    Download one S3 object into a local directory.

    :param s3path: Source object in the format s3://mybucket/mydir/mysample.txt
    :param local_dir: Directory to write to; the local file is named after the
        last "/"-separated segment of ``s3path``
    """
    bucket_name, object_key = get_bucketname_key(s3path)
    client = boto3.client('s3')

    file_name = s3path.rsplit("/", 1)[-1]
    target_path = os.path.join(local_dir, file_name)

    client.download_file(bucket_name, object_key, target_path)
    
def download_object(s3path):
    """
    Read an S3 object fully into memory and report how many bytes were read.

    :param s3path: Object location in the format s3://mybucket/mydir/mysample.txt
    :return: Length of the object's body in bytes
    """
    bucket_name, object_key = get_bucketname_key(s3path)

    response = boto3.client('s3').get_object(Bucket=bucket_name, Key=object_key)
    payload = response['Body'].read()

    return len(payload)



def list_files(s3path_prefix):
    """
    List the objects stored under an S3 prefix.

    :param s3path_prefix: Prefix in the format s3://mybucket/mydir/ ; it must
        start with "s3://" and end with "/"
    :return: Generator of ``(bucket_name, key)`` tuples, one per object whose
        key starts with the prefix
    :raises AssertionError: if the prefix does not have the required form
    """
    assert s3path_prefix.startswith("s3://")
    assert s3path_prefix.endswith("/")

    bucket_name, key_prefix = get_bucketname_key(s3path_prefix)

    s3_bucket = boto3.resource('s3').Bucket(name=bucket_name)
    matching_objects = s3_bucket.objects.filter(Prefix=key_prefix)

    return ((summary.bucket_name, summary.key) for summary in matching_objects)





def upload_files(local_dir, s3_prefix, num_threads=20):
    """
    Upload every entry matched by ``<local_dir>/*`` to S3 using a thread pool.

    :param local_dir: Local directory whose direct children are uploaded
    :param s3_prefix: Destination passed unchanged to ``uploadfile`` for each file
    :param num_threads: Number of worker threads in the pool
    """
    local_paths = glob.glob("{}/*".format(local_dir))
    upload_jobs = ((local_path, s3_prefix) for local_path in local_paths)

    with ThreadPool(num_threads) as pool:
        pool.starmap(uploadfile, upload_jobs)
    


def download_files(s3_prefix, local_dir, num_threads=20):
    """
    Download every object under an S3 prefix into a local directory.

    Keys ending in "/" (zero-byte "folder" placeholder objects, which a
    prefix listing also returns) are skipped: they have no file name, so
    ``download_file`` would target ``local_dir`` itself and fail.

    :param s3_prefix: Prefix in the format s3://mybucket/mydir/ (must end with "/")
    :param local_dir: Existing local directory that receives the files; each
        file is named after the last path segment of its key
    :param num_threads: Number of worker threads in the pool
    """
    input_tuples = [
        ("s3://{}/{}".format(s3_bucket, s3_key), local_dir)
        for s3_bucket, s3_key in list_files(s3_prefix)
        if not s3_key.endswith("/")
    ]

    with ThreadPool(num_threads) as pool:
        pool.starmap(download_file, input_tuples)
        
        

def download_objects(s3_prefix, num_threads=20):
    """
    Read every object under an S3 prefix into memory using a thread pool.

    :param s3_prefix: Prefix in the format s3://mybucket/mydir/ (must end with "/")
    :param num_threads: Number of worker threads in the pool
    :return: Total number of bytes read across all objects, divided by 1024
    """
    s3_uris = (
        "s3://{}/{}".format(bucket_name, object_key)
        for bucket_name, object_key in list_files(s3_prefix)
    )

    with ThreadPool(num_threads) as pool:
        byte_counts = pool.map(download_object, s3_uris)

    total_bytes = sum(byte_counts)
    return total_bytes/1024
        

def get_directory_size(start_path):
    """
    Sum the sizes of all files found by walking a directory tree.

    :param start_path: Root directory of the walk
    :return: Total size in bytes of every file under ``start_path``;
        symbolic links are not counted
    """
    file_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(start_path)
        for filename in filenames
    )

    # Symbolic links are skipped.
    return sum(os.path.getsize(path) for path in file_paths if not os.path.islink(path))

def get_s3file_size(bucket, key):
    """
    Look up the size of an S3 object without downloading it.

    :param bucket: Name of the bucket holding the object
    :param key: Key of the object
    :return: The ``ContentLength`` value from the object's HEAD response
    """
    head_response = boto3.client('s3').head_object(Bucket=bucket, Key=key)
    return head_response['ContentLength']
    
def download_files_min_files(s3_prefix, local_dir, min_file_size=310, num_threads=20):
    """
    Download the objects under an S3 prefix whose size exceeds a minimum.

    The size of each listed object is looked up with ``get_s3file_size`` as
    the work list is built; only objects strictly larger than
    ``min_file_size`` are downloaded.

    :param s3_prefix: Prefix in the format s3://mybucket/mydir/ (must end with "/")
    :param local_dir: Local directory that receives the files
    :param min_file_size: Size threshold; objects of this size or smaller are skipped
    :param num_threads: Number of worker threads in the pool
    """
    def exceeds_minimum(bucket_name, object_key):
        return get_s3file_size(bucket_name, object_key) > min_file_size

    download_jobs = (
        ("s3://{}/{}".format(bucket_name, object_key), local_dir)
        for bucket_name, object_key in list_files(s3_prefix)
        if exceeds_minimum(bucket_name, object_key)
    )

    with ThreadPool(num_threads) as pool:
        pool.starmap(download_file, download_jobs)
        


In [None]:
%%time

# Download every object under s3_prefix into the local prediction directory;
# the %%time cell magic reports how long the download takes.
download_files(s3_prefix, local_temp_pred_dir)

In [None]:
# Number of downloaded prediction files. The previous shell command referenced
# `local_temp_dir`, which is never defined in this notebook, so it listed the
# current directory instead (and `ls -l` adds a "total" line to the count).
# os.listdir is the same listing the loading cell iterates over.
len(os.listdir(local_temp_pred_dir))

In [None]:
# Load every downloaded prediction file (JSON lines) and derive:
#   value_dict - number of rows per `predicted` label, summed over all files
#   total      - total number of prediction rows
#   largest_df - frame of the file with the most rows (first one on ties; None if no files)
#   full_df    - all prediction rows in one frame (None if no files)
frames = []
value_dict = {}
for f in os.listdir(local_temp_pred_dir):
    df = pd.read_json(os.path.join(local_temp_pred_dir, f), orient="records", lines=True)
    frames.append(df)

    prediction_counts = df["predicted"].value_counts().to_dict()
    for k, v in prediction_counts.items():
        value_dict[k] = v + value_dict.get(k, 0)

# total records
total = sum(frame.shape[0] for frame in frames)

largest_df = max(frames, key=lambda frame: frame.shape[0], default=None)

# Concatenate once instead of inside the loop: re-concatenating per file copies
# every row read so far on each iteration. The list is reversed to keep the
# original row order, in which each newly read file was placed first.
full_df = pd.concat(frames[::-1]) if frames else None

In [None]:
# Per-label confidence thresholds used to select "high quality" predictions:
# a row is kept when its `predicted` label matches the key and its
# `predicted_confidence` is strictly greater than the value.
# A threshold of 0.0 therefore keeps every row of that label with confidence above zero.
threshold_config = {
    "acetylation" : 0.83,
    "deubiquitination" :0.35,
    "methylation" :.82,
    "phosphorylation" : .98,
    "demethylation" :0.0,
    "dephosphorylation" :0.0,
    "ubiquitination":0.1
}

In [None]:
# Display the per-label prediction counts accumulated over all prediction files.
value_dict

In [None]:
# Per predicted label, the number of rows in full_df with a non-null predicted_confidence.
full_df.groupby(["predicted"])["predicted_confidence"].count()

In [None]:
# Display the total number of prediction rows read from all files.
total

In [None]:
import copy

# For each label, keep the rows of full_df predicted as that label with a
# confidence strictly above the label's threshold, then stack the per-label
# selections into a single frame.
high_quality_frames = [
    full_df.query("predicted == '{}' and predicted_confidence > {}".format(k, t))
    for k, t in threshold_config.items()
]

high_quality_df = pd.concat(high_quality_frames)

In [None]:
# Per predicted label, the number of rows that passed the confidence threshold.
high_quality_df.groupby(["predicted"])["predicted_confidence"].count()

In [None]:
# Fetch the s3_data JSON file into the local working directory.
download_file(s3_data, local_temp_wk_dir)



In [None]:
# Local path of the file just downloaded: download_file names it after the
# last "/"-separated segment of the S3 path, so the same rule is applied here.
data_file = os.path.join(local_temp_wk_dir, s3_data.split("/")[-1])
# Load the downloaded JSON into a DataFrame.
data_training_full_df = pd.read_json(data_file)

In [None]:
# (rows, columns) of the frame loaded from s3_data.
data_training_full_df.shape

In [None]:
# (rows, columns) of the combined predictions frame.
full_df.shape

In [None]:
# Preview the first two rows of the frame loaded from s3_data.
data_training_full_df.head(n=2)

In [None]:
# Preview the first two rows of the combined predictions frame.
full_df.head(n=2)

In [None]:
# Shape of the prediction rows whose pubmedId does not occur in data_training_full_df.
full_df[~full_df.pubmedId.isin(data_training_full_df.pubmedId)].shape

In [None]:
# Add a boolean column: True when the row's pubmedId also occurs in data_training_full_df.
# This mutates full_df in place; high_quality_df was built earlier and does not receive the column here.
full_df["PubmedInTrainingData"] = full_df.pubmedId.isin(data_training_full_df.pubmedId)

In [None]:
# Shape of the high-quality prediction rows whose pubmedId does not occur in data_training_full_df.
high_quality_df[~high_quality_df.pubmedId.isin(data_training_full_df.pubmedId)].shape

In [None]:
# Per predicted label, the number of high-quality predictions whose pubmedId
# does not occur in data_training_full_df.
# The membership mask is computed directly rather than read from a
# `PubmedInTrainingData` column: high_quality_df was built before that column
# was added to full_df and only gets its own copy in a later cell, so querying
# the column here fails on a fresh top-to-bottom run.
high_quality_df[~high_quality_df.pubmedId.isin(data_training_full_df.pubmedId)].groupby(["predicted"])["predicted"].count()

In [None]:
# Add a boolean column to high_quality_df: True when the row's pubmedId also occurs in data_training_full_df.
high_quality_df["PubmedInTrainingData"] = high_quality_df.pubmedId.isin(data_training_full_df.pubmedId)

In [None]:

# Notebook display settings: show every column, allow very long cell text,
# and cap the number of displayed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
# Default font size for matplotlib figures.
plt.rcParams.update({'font.size': 12})

In [None]:
# Write the combined predictions frame to predictions.csv in the current working directory
# (with a header row, without the index column).
full_df.to_csv("predictions.csv", index=False, header=True)

In [None]:
# Display 20 randomly chosen predictions with the listed columns for manual inspection.
# NOTE(review): no random_state is passed, so the sample differs on every run, and
# sample(n=20) raises if full_df has fewer than 20 rows — confirm this is acceptable.
full_df[["abstract", "normalised_abstract", "participant1Id","participant2Id", "pubmedId", "predicted" ,"predicted_confidence" ]].sample(n=20)