In [1]:
import pandas as pd
import os

In [2]:
# Names of the ground-truth jobs whose output manifests are loaded further down.
ground_truth_jobs = [
    "ppi-20210507230827-acetylation",
    "ppi-20210507230827-dephosphorylation",
    "ppi-20210507230827-deubiquitination",
    "ppi-20210507230827-methylation",
    "ppi-20210507230827-phosphorylation",
    "ppi-20210507230827-ubiquitination",
]

# S3 prefix that holds one folder per job; it ends with a trailing "/".
s3_ground_truth_output_prefix = "s3://aegovan-data/processed_dataset_trainingdata_groundtruth/"


In [3]:
# Scratch root, plus one sub-directory for downloaded results and one for work files.
local_temp = "temp"
local_temp_pred_dir, local_temp_wk_dir = (
    os.path.join(local_temp, subdir) for subdir in ("pred_results", "wk")
)

In [4]:
# Start from a clean scratch area: drop whatever a previous run left behind,
# then recreate both working directories. Pure Python is used instead of
# `!rm` / `!mkdir` so the cell does not depend on a POSIX shell and paths
# containing spaces or shell metacharacters are handled safely.
import shutil

shutil.rmtree(local_temp, ignore_errors=True)  # like `rm -rf`: a missing directory is fine
os.makedirs(local_temp_pred_dir, exist_ok=True)
os.makedirs(local_temp_wk_dir, exist_ok=True)

In [5]:
import boto3
import glob
from multiprocessing.dummy import Pool as ThreadPool
import argparse
import datetime 
import os


def upload_file(localpath, s3path):
    """
    Upload a single local file to S3.

    :param localpath: Path of the local file to upload
    :param s3path: Destination in the format s3://mybucket/mydir/mysample.txt;
                   when the key part ends with "/", the base name of
                   localpath is appended to it
    """
    s3_bucket, s3_key = get_bucketname_key(s3path)

    # A key ending in "/" is treated as a folder-like prefix: keep the local file name.
    if s3_key.endswith("/"):
        s3_key = s3_key + os.path.basename(localpath)

    boto3.client('s3').upload_file(localpath, s3_bucket, s3_key)

def get_bucketname_key(uripath):
    """
    Split an S3 URI into its bucket name and object key.

    :param uripath: URI in the format s3://mybucket/mydir/mysample.txt
    :return: Tuple (bucket_name, key). The key is "/" when nothing but the
             bucket name follows the scheme (no "/" after the bucket).
    """
    assert uripath.startswith("s3://")

    # Strip the 5-character "s3://" scheme, then cut at the first "/".
    bucket_name, separator, remainder = uripath[5:].partition("/")
    key = remainder if separator else "/"

    return bucket_name, key


def download_file(s3path, local_dir):
    """
    Download one S3 object into a local directory.

    The local file is named after the last "/"-separated segment of s3path.

    :param s3path: Source object in the format s3://mybucket/mydir/mysample.txt
    :param local_dir: Local directory the file is written into
    """
    s3_bucket, s3_key = get_bucketname_key(s3path)

    file_name = s3path.rsplit("/", 1)[-1]
    destination = os.path.join(local_dir, file_name)

    boto3.client('s3').download_file(s3_bucket, s3_key, destination)
    
def download_object(s3path):
    """
    Read an S3 object fully into memory and report how much was read.

    :param s3path: Object location in the format s3://mybucket/mydir/mysample.txt
    :return: Length of the content read from the object's body
    """
    s3_bucket, s3_key = get_bucketname_key(s3path)

    response = boto3.client('s3').get_object(Bucket=s3_bucket, Key=s3_key)

    # The content itself is discarded; only its length is returned.
    return len(response['Body'].read())



def list_files(s3path_prefix):
    """
    Enumerate the objects stored under an S3 prefix.

    :param s3path_prefix: Prefix in the format s3://mybucket/mydir/ ; it must
                          start with "s3://" and end with "/"
    :return: Generator of (bucket_name, key) tuples, one per matching object
    """
    assert s3path_prefix.startswith("s3://")
    assert s3path_prefix.endswith("/")

    bucket_name, key_prefix = get_bucketname_key(s3path_prefix)

    s3_bucket = boto3.resource('s3').Bucket(name=bucket_name)
    matching_objects = s3_bucket.objects.filter(Prefix=key_prefix)

    return ((obj.bucket_name, obj.key) for obj in matching_objects)





def upload_files(local_dir, s3_prefix, num_threads=20):
    """
    Upload every entry directly inside a local directory to S3 using a thread pool.

    :param local_dir: Local directory; everything matched by "<local_dir>/*" is uploaded
    :param s3_prefix: Destination handed unchanged to upload_file for each entry,
                      e.g. s3://mybucket/mydir/
    :param num_threads: Number of worker threads in the pool
    """
    input_tuples = ((f, s3_prefix) for f in glob.glob("{}/*".format(local_dir)))

    with ThreadPool(num_threads) as pool:
        # Fixed: the worker is `upload_file`. The previous spelling `uploadfile`
        # does not match the helper defined in this file and raised NameError.
        pool.starmap(upload_file, input_tuples)
    


def download_files(s3_prefix, local_dir, num_threads=20):
    """
    Download every object listed under an S3 prefix into one local directory.

    :param s3_prefix: Prefix in the format s3://mybucket/mydir/
    :param local_dir: Local directory passed to download_file for every object
    :param num_threads: Number of worker threads in the pool
    """
    s3_uris = (
        "s3://{}/{}".format(bucket_name, object_key)
        for bucket_name, object_key in list_files(s3_prefix)
    )
    download_args = ((uri, local_dir) for uri in s3_uris)

    with ThreadPool(num_threads) as pool:
        pool.starmap(download_file, download_args)
        
        

def download_objects(s3_prefix, num_threads=20):
    """
    Read every object under an S3 prefix via download_object and total the results.

    :param s3_prefix: Prefix in the format s3://mybucket/mydir/
    :param num_threads: Number of worker threads in the pool
    :return: Sum of the values returned by download_object, divided by 1024
    """
    s3_uris = (
        "s3://{}/{}".format(bucket_name, object_key)
        for bucket_name, object_key in list_files(s3_prefix)
    )

    with ThreadPool(num_threads) as pool:
        object_sizes = pool.map(download_object, s3_uris)

    total_size = sum(object_sizes)
    return total_size / 1024
        

def get_directory_size(start_path):
    """
    Add up the sizes of all files found below a directory.

    :param start_path: Directory to walk recursively
    :return: Sum of os.path.getsize over every file that is not a symbolic link
    """
    file_paths = (
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(start_path)
        for filename in filenames
    )

    # Symbolic links are left out of the total.
    return sum(os.path.getsize(path) for path in file_paths if not os.path.islink(path))

def get_s3file_size(bucket, key):
    """
    Look up the size of an S3 object without downloading it.

    :param bucket: Bucket name
    :param key: Object key inside the bucket
    :return: The 'ContentLength' value of the object's head_object response
    """
    head = boto3.client('s3').head_object(Bucket=bucket, Key=key)
    return head['ContentLength']
    
def download_files_min_files(s3_prefix, local_dir, min_file_size=310, num_threads=20):
    """
    Download only those objects under an S3 prefix that exceed a minimum size.

    :param s3_prefix: Prefix in the format s3://mybucket/mydir/
    :param local_dir: Local directory passed to download_file for every kept object
    :param min_file_size: Objects are kept only when get_s3file_size reports a
                          value strictly greater than this
    :param num_threads: Number of worker threads in the pool
    """
    def _large_enough(location):
        # location is a (bucket, key) pair as produced by list_files
        return get_s3file_size(*location) > min_file_size

    selected = filter(_large_enough, list_files(s3_prefix))
    download_args = (
        ("s3://{}/{}".format(bucket_name, object_key), local_dir)
        for bucket_name, object_key in selected
    )

    with ThreadPool(num_threads) as pool:
        pool.starmap(download_file, download_args)
        


In [6]:
import pandas, json,   ast


def load_manifest_file(manifest_file):
    """
    Parse a JSON-lines manifest file into a list of record dicts.

    Every non-blank line must be a JSON object containing:
      * "source": a JSON-encoded string holding the record's own fields;
      * a key ending in "-metadata": its name minus "-metadata" is the key
        whose value carries the human answer under "result".

    The human answer is copied into each record as "human_result".

    :param manifest_file: Path of the local manifest file
    :return: List of dicts, one per non-blank line, in file order
    :raises IndexError: if a line has no key ending in "-metadata"
    """
    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(manifest_file, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a stray trailing newline) carry no record.
            if not line.strip():
                continue

            record = json.loads(line)
            result = json.loads(record["source"])

            meta_key = [k for k in record if k.endswith("-metadata")][0]
            label_key = meta_key.replace("-metadata", "")
            result["human_result"] = record[label_key]["result"]

            records.append(result)
    return records
    
def load_manifests(manifest_files):
    """
    Load one or more local manifest files into a single DataFrame.

    :param manifest_files: A single path (str) or an iterable of paths
    :return: DataFrame with one row per record returned by load_manifest_file,
             in the order the files were given
    """
    paths = [manifest_files] if isinstance(manifest_files, str) else manifest_files

    rows = [row for path in paths for row in load_manifest_file(path)]

    return pd.DataFrame(rows)
    
def load_manifests_s3(s3_manifest_files, local_dir):
    """
    Download one or more manifest files from S3 and load them into a single DataFrame.

    :param s3_manifest_files: A single S3 URI (str) or an iterable of S3 URIs,
                              each in the format s3://mybucket/mydir/output.manifest
    :param local_dir: Local directory the manifests are downloaded into
    :return: DataFrame with one row per record returned by load_manifest_file,
             in the order the URIs were given
    """
    # Fixed: wrap a lone URI in a list under the name that is actually iterated.
    # Previously the wrapped value was bound to an unused name, so a string
    # argument was walked character by character.
    if isinstance(s3_manifest_files, str):
        s3_manifest_files = [s3_manifest_files]

    rows = []
    for s3_file in s3_manifest_files:
        manifest_file = os.path.join(local_dir, s3_file.split("/")[-1])
        # Each file is parsed right after its download: URIs that share a final
        # path segment map to the same local path, so parsing cannot be deferred.
        download_file(s3_file, local_dir)
        rows.extend(load_manifest_file(manifest_file))

    return pd.DataFrame(rows)
    

In [7]:
# Build one output-manifest URI per ground-truth job and load them all into one frame.
df = load_manifests_s3(
    [
        "{}/{}/manifests/output/output.manifest".format(
            s3_ground_truth_output_prefix.rstrip("/"), job_name
        )
        for job_name in ground_truth_jobs
    ],
    local_temp_pred_dir,
)

In [8]:
# Record counts per "class", with one column per "human_result" value
# (combinations that never occur are shown as 0).
(
    df.groupby(["class", "human_result"])["human_result"]
    .count()
    .unstack()
    .fillna(0)
)

human_result,Correct,Not - sure
class,Unnamed: 1_level_1,Unnamed: 2_level_1
acetylation,4,1
dephosphorylation,6,4
deubiquitination,1,1
methylation,4,6
phosphorylation,6,4
ubiquitination,2,3


In [10]:
# Same class x human_result count table, extended with totals and printed as LaTeX.
temp_df = (
    df.groupby(["class", "human_result"])["human_result"]
    .count()
    .unstack()
    .fillna(0)
)
# Row totals are added first so the "Summary" row also totals the "Total" column.
temp_df["Total"] = temp_df.apply(lambda row: sum(row), axis=1)
temp_df.loc["Summary"] = temp_df.apply(lambda column: sum(column), axis=0)
print(temp_df.to_latex())

\begin{tabular}{lrrr}
\toprule
human\_result &  Correct &  Not - sure &  Total \\
class             &          &             &        \\
\midrule
acetylation       &        4 &           1 &      5 \\
dephosphorylation &        6 &           4 &     10 \\
deubiquitination  &        1 &           1 &      2 \\
methylation       &        4 &           6 &     10 \\
phosphorylation   &        6 &           4 &     10 \\
ubiquitination    &        2 &           3 &      5 \\
Summary           &       23 &          19 &     42 \\
\bottomrule
\end{tabular}



In [11]:
# Show one randomly chosen row as a spot check of the loaded columns.
df.sample(n=1)

Unnamed: 0,pubmedId,abstract,annotations,num_unique_gene_normalised_id,num_gene_normalised_id,normalised_abstract,normalised_abstract_annotations,participant1Id,participant2Id,gene_to_uniprot_map,class,missing_uniprot,human_result
24,19262565,Proper regulation of NF-kappaB activity is cri...,"[{'start': '21', 'end': '30', 'name': 'NF-kapp...",4,21,Proper regulation of P19838 activity is critic...,"[{'charOffset': 21, 'len': 6, 'text': 'P19838'...",Q8WTS6,Q04206,"{'7124': ['P01375', 'Q5STB3'], '80854': ['Q8WT...",methylation,False,Correct
