In [1]:
import pandas as pd
import os

In [2]:
# S3 prefix that holds the large-scale prediction files summarised by this notebook.
s3_prefix = "s3://aegovan-data/pubmed_asbtract/predictions_largescale_ppimulticlass-bert-f1-2021-05-10-10_2021-07-01/"
# Output prefix: the prediction prefix without its trailing "/" plus "_summary/".
s3_output_prefix = "{}_summary/".format(s3_prefix.rstrip("/"))
# JSON file loaded further down as the training data (used to flag PubMed ids already seen).
s3_data ="s3://aegovan-data/human_output/human_interactions_ppi_v2.json"

In [3]:
# Local scratch root; the shell cell below deletes and recreates it.
local_temp = "../temp"
# Prediction files downloaded from s3_prefix are stored here.
local_temp_pred_dir = os.path.join( local_temp, "pred_results")
# Working directory for other downloads (the training data file is saved here).
local_temp_wk_dir = os.path.join( local_temp, "wk")

In [None]:
# Start from a clean scratch area: remove any previous run, then recreate both sub-directories.
!rm -rf $local_temp
!mkdir -p $local_temp_pred_dir
!mkdir -p $local_temp_wk_dir

In [4]:
import boto3
import glob
from multiprocessing.dummy import Pool as ThreadPool
import argparse
import datetime 
import os


def upload_file(localpath, s3path):
    """
    Copy one local file to S3.

    :param localpath: Path of the local file to upload
    :param s3path: Destination in the form s3://mybucket/mydir/mysample.txt. When the
                   key part ends with "/", the local file's base name is appended to it.
    """
    bucket_name, object_key = get_bucketname_key(s3path)

    # A key ending in "/" names a "directory": keep the local file name under it.
    if object_key.endswith("/"):
        object_key = object_key + os.path.basename(localpath)

    boto3.client('s3').upload_file(localpath, bucket_name, object_key)

def get_bucketname_key(uripath):
    """
    Split an s3:// URI into its bucket name and object key.

    :param uripath: URI of the form s3://bucket/key
    :return: Tuple (bucket_name, key). The key is everything after the first "/" that
             follows the bucket name; when there is no such "/", the key is "/".
    """
    assert uripath.startswith("s3://")

    # Drop the 5-character "s3://" scheme, then cut at the first "/".
    bucket_name, separator, remainder = uripath[5:].partition("/")

    key = remainder if separator else "/"
    return bucket_name, key


def download_file(s3path, local_dir):
    """
    Download one S3 object into a local directory, keeping its file name.

    :param s3path: Source object in the form s3://mybucket/mydir/mysample.txt
    :param local_dir: Local directory the object is written to
    """
    bucket_name, object_key = get_bucketname_key(s3path)

    # The local file is named after the last "/"-separated segment of the S3 path.
    file_name = s3path.split("/")[-1]
    destination = os.path.join(local_dir, file_name)

    boto3.client('s3').download_file(bucket_name, object_key, destination)
    
def download_object(s3path):
    """
    Read one S3 object fully into memory and report how much was read.

    :param s3path: Object location in the form s3://mybucket/mydir/mysample.txt
    :return: Length of the content read from the object body
    """
    bucket_name, object_key = get_bucketname_key(s3path)

    response = boto3.client('s3').get_object(Bucket=bucket_name, Key=object_key)
    payload = response['Body'].read()

    return len(payload)



def list_files(s3path_prefix):
    """
    Lazily list the objects stored under an S3 prefix.

    :param s3path_prefix: Prefix in the form s3://mybucket/mydir/ (must end with "/")
    :return: Generator of (bucket_name, key) tuples, one per object under the prefix
    """
    assert s3path_prefix.startswith("s3://")
    assert s3path_prefix.endswith("/")

    bucket_name, key_prefix = get_bucketname_key(s3path_prefix)

    matching_objects = boto3.resource('s3').Bucket(name=bucket_name).objects.filter(Prefix=key_prefix)

    return ((obj.bucket_name, obj.key) for obj in matching_objects)





def upload_files(local_dir, s3_prefix, num_threads=20):
    """
    Upload every entry directly inside a local directory to S3 using a thread pool.

    :param local_dir: Local directory whose entries (glob "<local_dir>/*") are uploaded
    :param s3_prefix: Destination handed to upload_file for every entry,
                      e.g. s3://mybucket/mydir/
    :param num_threads: Number of worker threads in the pool
    """
    input_tuples = ((f, s3_prefix) for f in glob.glob("{}/*".format(local_dir)))

    with ThreadPool(num_threads) as pool:
        # Fix: the worker used to be the undefined name `uploadfile`, so every call
        # to this function raised NameError.
        pool.starmap(upload_file, input_tuples)
    


def download_files(s3_prefix, local_dir, num_threads=20):
    """
    Download every object under an S3 prefix into one local directory, in parallel.

    :param s3_prefix: Source prefix in the form s3://mybucket/mydir/
    :param local_dir: Local directory that receives the files
    :param num_threads: Number of worker threads in the pool
    """
    download_jobs = (
        ("s3://{}/{}".format(bucket_name, object_key), local_dir)
        for bucket_name, object_key in list_files(s3_prefix)
    )

    with ThreadPool(num_threads) as pool:
        pool.starmap(download_file, download_jobs)
        
        

def download_objects(s3_prefix, num_threads=20):
    """
    Read every object under an S3 prefix into memory, in parallel, and total their sizes.

    :param s3_prefix: Source prefix in the form s3://mybucket/mydir/
    :param num_threads: Number of worker threads in the pool
    :return: Sum of the lengths reported by download_object, divided by 1024
    """
    object_uris = (
        "s3://{}/{}".format(bucket_name, object_key)
        for bucket_name, object_key in list_files(s3_prefix)
    )

    with ThreadPool(num_threads) as pool:
        sizes = pool.map(download_object, object_uris)

    total = sum(sizes)
    return total / 1024
        

def get_directory_size(start_path):
    """
    Add up the sizes of all files found under a directory tree.

    :param start_path: Root directory to walk
    :return: Total of os.path.getsize over every file that is not a symbolic link
    """
    file_paths = (
        os.path.join(dirpath, name)
        for dirpath, _dirnames, filenames in os.walk(start_path)
        for name in filenames
    )
    # Symbolic links are left out of the total.
    return sum(os.path.getsize(path) for path in file_paths if not os.path.islink(path))

def get_s3file_size(bucket, key):
    """
    Look up the size of an S3 object without downloading it.

    :param bucket: Bucket name
    :param key: Object key
    :return: The 'ContentLength' value of the object's head_object response
    """
    head = boto3.client('s3').head_object(Bucket=bucket, Key=key)
    return head['ContentLength']
    
def download_files_min_files(s3_prefix, local_dir, min_file_size=310, num_threads=20):
    """
    Download, in parallel, only the objects under an S3 prefix that exceed a size limit.

    :param s3_prefix: Source prefix in the form s3://mybucket/mydir/
    :param local_dir: Local directory that receives the files
    :param min_file_size: Objects are downloaded only when get_s3file_size reports
                          strictly more than this value
    :param num_threads: Number of worker threads in the pool
    """
    listing = list_files(s3_prefix)

    def _jobs_above_limit():
        # One size lookup per listed object; objects at or below the limit are skipped.
        for bucket_name, object_key in listing:
            if get_s3file_size(bucket_name, object_key) > min_file_size:
                yield "s3://{}/{}".format(bucket_name, object_key), local_dir

    with ThreadPool(num_threads) as pool:
        pool.starmap(download_file, _jobs_above_limit())
        


In [None]:
%%time

# Fetch every prediction file under s3_prefix into the local prediction directory.
download_files(s3_prefix, local_temp_pred_dir)

In [5]:
!ls -l $local_temp_dir | wc -l

      20


In [6]:
# Per-interaction-type statistics of `confidence` and `confidence_std`, keyed first by
# prediction label and then by (column, statistic) tuples.
# get_summary_df reads only the '50%' entries and uses them as per-label cut-offs.
# NOTE(review): the statistic names match pandas `describe()` output; presumably these
# were exported from an earlier evaluation run — confirm the source before editing values.
threshold_config = {'acetylation': {('confidence_std', 'count'): 5.0,
  ('confidence_std', 'mean'): 0.20221626758575403,
  ('confidence_std', 'std'): 0.010931891264366925,
  ('confidence_std', 'min'): 0.186287313699722,
  ('confidence_std', '25%'): 0.19900196790695102,
  ('confidence_std', '50%'): 0.20083150267601002,
  ('confidence_std', '75%'): 0.21166041493415802,
  ('confidence_std', 'max'): 0.21330013871192902,
  ('confidence', 'count'): 5.0,
  ('confidence', 'mean'): 0.5777932286262508,
  ('confidence', 'std'): 0.017217069729986746,
  ('confidence', 'min'): 0.555882334709167,
  ('confidence', '25%'): 0.572159707546234,
  ('confidence', '50%'): 0.57360166311264,
  ('confidence', '75%'): 0.5849224925041191,
  ('confidence', 'max'): 0.602399945259094},
 'dephosphorylation': {('confidence_std', 'count'): 29.0,
  ('confidence_std', 'mean'): 0.21133935400124204,
  ('confidence_std', 'std'): 0.07727310272430898,
  ('confidence_std', 'min'): 0.11373741179704601,
  ('confidence_std', '25%'): 0.141093701124191,
  ('confidence_std', '50%'): 0.18553803861141202,
  ('confidence_std', '75%'): 0.255215793848037,
  ('confidence_std', 'max'): 0.41554290056228604,
  ('confidence', 'count'): 29.0,
  ('confidence', 'mean'): 0.8063360175182077,
  ('confidence', 'std'): 0.13017405501056736,
  ('confidence', 'min'): 0.32787588238716103,
  ('confidence', '25%'): 0.7709274291992181,
  ('confidence', '50%'): 0.8457427024841301,
  ('confidence', '75%'): 0.8899683952331541,
  ('confidence', 'max'): 0.914224922657012},
 'deubiquitination': {('confidence_std', 'count'): 2.0,
  ('confidence_std', 'mean'): 0.1863851696252815,
  ('confidence_std', 'std'): 0.002637275825296506,
  ('confidence_std', 'min'): 0.184520334005355,
  ('confidence_std', '25%'): 0.18545275181531826,
  ('confidence_std', '50%'): 0.1863851696252815,
  ('confidence_std', '75%'): 0.18731758743524476,
  ('confidence_std', 'max'): 0.18825000524520802,
  ('confidence', 'count'): 2.0,
  ('confidence', 'mean'): 0.4542059451341625,
  ('confidence', 'std'): 0.010661192844799884,
  ('confidence', 'min'): 0.446667343378067,
  ('confidence', '25%'): 0.45043664425611474,
  ('confidence', '50%'): 0.4542059451341625,
  ('confidence', '75%'): 0.4579752460122103,
  ('confidence', 'max'): 0.461744546890258},
 'methylation': {('confidence_std', 'count'): 9.0,
  ('confidence_std', 'mean'): 0.20187029076947058,
  ('confidence_std', 'std'): 0.011129410572280824,
  ('confidence_std', 'min'): 0.185736715793609,
  ('confidence_std', '25%'): 0.19583970308303802,
  ('confidence_std', '50%'): 0.19923907518386802,
  ('confidence_std', '75%'): 0.210324048995971,
  ('confidence_std', 'max'): 0.21908366680145203,
  ('confidence', 'count'): 9.0,
  ('confidence', 'mean'): 0.7966000636418656,
  ('confidence', 'std'): 0.01647822043812186,
  ('confidence', 'min'): 0.7693868875503541,
  ('confidence', '25%'): 0.780769705772399,
  ('confidence', '50%'): 0.798503041267395,
  ('confidence', '75%'): 0.809625148773193,
  ('confidence', 'max'): 0.8169981241226191},
 'other': {('confidence_std', 'count'): 1116.0,
  ('confidence_std', 'mean'): 0.015799599189941234,
  ('confidence_std', 'std'): 0.0407698558038574,
  ('confidence_std', 'min'): 0.0008510624757030001,
  ('confidence_std', '25%'): 0.00117629769374575,
  ('confidence_std', '50%'): 0.0021780409151680004,
  ('confidence_std', '75%'): 0.007007123087532501,
  ('confidence_std', 'max'): 0.303181886672973,
  ('confidence', 'count'): 1116.0,
  ('confidence', 'mean'): 0.9903799230800303,
  ('confidence', 'std'): 0.026711376001797505,
  ('confidence', 'min'): 0.5133088827133171,
  ('confidence', '25%'): 0.9944566637277598,
  ('confidence', '50%'): 0.9974353015422815,
  ('confidence', '75%'): 0.998221039772033,
  ('confidence', 'max'): 0.9985265731811521},
 'phosphorylation': {('confidence_std', 'count'): 139.0,
  ('confidence_std', 'mean'): 0.09348429794416534,
  ('confidence_std', 'std'): 0.11442879568269237,
  ('confidence_std', 'min'): 0.006378921680152001,
  ('confidence_std', '25%'): 0.013172945939004001,
  ('confidence_std', '50%'): 0.034169171005487005,
  ('confidence_std', '75%'): 0.13673919439315751,
  ('confidence_std', 'max'): 0.469867438077926,
  ('confidence', 'count'): 139.0,
  ('confidence', 'mean'): 0.9306538756802781,
  ('confidence', 'std'): 0.09291076266425286,
  ('confidence', 'min'): 0.548133730888366,
  ('confidence', '25%'): 0.9297615289688106,
  ('confidence', '50%'): 0.9758448600769041,
  ('confidence', '75%'): 0.98560282588005,
  ('confidence', 'max'): 0.990909934043884},
 'ubiquitination': {('confidence_std', 'count'): 5.0,
  ('confidence_std', 'mean'): 0.1845212131738658,
  ('confidence_std', 'std'): 0.010037806334405529,
  ('confidence_std', 'min'): 0.174075484275817,
  ('confidence_std', '25%'): 0.177953422069549,
  ('confidence_std', '50%'): 0.18058878183364802,
  ('confidence_std', '75%'): 0.19217012822628002,
  ('confidence_std', 'max'): 0.197818249464035,
  ('confidence', 'count'): 5.0,
  ('confidence', 'mean'): 0.5571501374244686,
  ('confidence', 'std'): 0.07332355556921501,
  ('confidence', 'min'): 0.42922157049179005,
  ('confidence', '25%'): 0.5765218138694761,
  ('confidence', '50%'): 0.579930007457733,
  ('confidence', '75%'): 0.58320677280426,
  ('confidence', 'max'): 0.616870522499084}}

In [7]:
def get_summary_df(local_temp_pred_dir):
    """
    Load every prediction file in a directory and split out the high-quality rows.

    A row counts as high quality when, for its predicted label, ``confidence`` is at
    least the label's '50%' confidence and ``confidence_std`` is at most the label's
    '50%' confidence_std, both read from the module-level ``threshold_config``.

    :param local_temp_pred_dir: Directory of JSON prediction files (records orientation)
    :return: Tuple (high-quality rows of all files,
             prediction / confidence / confidence_std columns of every row of all files)
    """
    summary_columns = ["prediction", "confidence", "confidence_std"]
    row_filter = "prediction == '{}' and confidence >= {} and confidence_std <= {}"

    high_quality_parts = []
    summary_parts = []
    for file_name in os.listdir(local_temp_pred_dir):
        predictions = pd.read_json(os.path.join(local_temp_pred_dir, file_name), orient="records")
        summary_parts.append(predictions[summary_columns])

        # One filtered frame per label, using that label's '50%' statistics as cut-offs.
        per_label_frames = [
            predictions.query(row_filter.format(
                label, stats[('confidence', '50%')], stats[('confidence_std', '50%')]))
            for label, stats in threshold_config.items()
        ]
        high_quality_parts.append(pd.concat(per_label_frames))

    return pd.concat(high_quality_parts), pd.concat(summary_parts)



In [8]:
%%time 


# df_high_quality: rows that pass the per-label thresholds; df_summary: confidence columns of all rows.
df_high_quality, df_summary = get_summary_df (local_temp_pred_dir)

CPU times: user 1min 58s, sys: 5.55 s, total: 2min 4s
Wall time: 2min 5s


In [9]:
# Per-label distribution of confidence and confidence_std over all predictions.
df_summary.groupby("prediction").describe().T

Unnamed: 0,prediction,acetylation,dephosphorylation,deubiquitination,methylation,phosphorylation,ubiquitination
confidence,count,7814.0,85996.0,512.0,52622.0,1301106.0,152053.0
confidence,mean,0.312708,0.377087,0.266125,0.396856,0.5697892,0.325775
confidence,std,0.063875,0.10496,0.035357,0.127154,0.1720569,0.059669
confidence,min,0.167926,0.172143,0.19255,0.171925,0.1720937,0.163623
confidence,25%,0.264918,0.299028,0.241865,0.295094,0.4383482,0.282141
confidence,50%,0.304162,0.362114,0.26209,0.368534,0.5513105,0.3204
confidence,75%,0.353461,0.433888,0.284913,0.480175,0.6917892,0.36315
confidence,max,0.60065,0.917424,0.461744,0.826972,0.9907943,0.614968
confidence_std,count,7814.0,85996.0,512.0,52622.0,1301106.0,152053.0
confidence_std,mean,0.21051,0.336032,0.145046,0.294212,0.3583199,0.213993


In [10]:
# The same per-label statistics, restricted to the rows that passed the threshold filter.
df_high_quality.groupby(["prediction"])[["prediction", "confidence", "confidence_std"]].describe().T

Unnamed: 0,prediction,acetylation,dephosphorylation,methylation,phosphorylation,ubiquitination
confidence,count,3.0,42.0,23.0,5721.0,5.0
confidence,mean,0.577767,0.881514,0.809059,0.982277,0.584419
confidence,std,0.003032,0.019359,0.006814,0.003611,0.005255
confidence,min,0.574266,0.849926,0.799061,0.97585,0.580424
confidence,25%,0.576875,0.868206,0.805046,0.979242,0.580954
confidence,50%,0.579485,0.881211,0.807967,0.982206,0.583207
confidence,75%,0.579518,0.892138,0.811937,0.985243,0.584096
confidence,max,0.579551,0.917424,0.826972,0.990794,0.593414
confidence_std,count,3.0,42.0,23.0,5721.0,5.0
confidence_std,mean,0.180173,0.14287,0.183372,0.01755,0.169572


In [11]:
# Fetch the training data JSON into the local working directory.
download_file(s3_data, local_temp_wk_dir)

In [12]:
# download_file names the local copy after the last path segment of the S3 URI.
data_file = os.path.join(local_temp_wk_dir, s3_data.split("/")[-1])
data_training_full_df = pd.read_json(data_file)

In [13]:
# (rows, columns) of the training data.
data_training_full_df.shape

(3381, 7)

In [14]:
# Peek at the first two training records.
data_training_full_df.head(n=2)

Unnamed: 0,interactionId,interactionType,isNegative,participants,pubmedId,pubmedTitle,pubmedabstract
0,1585513,phosphorylation,False,"[{'uniprotid': 'Q10728', 'alias': [['mypt1_rat...",17126281,,Zipper-interacting protein kinase (ZIP kinase)...
1,1585516,phosphorylation,False,"[{'uniprotid': 'O43293-1', 'alias': [['o43293-...",17126281,,Zipper-interacting protein kinase (ZIP kinase)...


In [15]:
# Show the participants structure of one randomly drawn record (no seed, so it differs between runs).
data_training_full_df["participants"].sample(n=2).iloc[0]

[{'uniprotid': 'P49841',
  'alias': [['gsk3b_human'],
   ['Glycogen synthase kinase-3 beta'],
   ['GSK3B'],
   ['Serine/threonine-protein kinase GSK3B']],
  'alternative_uniprots': ['Q9UL47', 'D3DN89', 'Q9BWH3']},
 {'uniprotid': 'P67809',
  'alias': [['ybox1_human'],
   ['Nuclease-sensitive element-binding protein 1'],
   ['YBX1'],
   ['YB1'],
   ['NSEP1'],
   ['Y-box-binding protein 1'],
   ['Y-box transcription factor'],
   ['CCAAT-binding transcription factor I subunit A'],
   ['Enhancer factor I subunit A'],
   ['DNA-binding protein B']],
  'alternative_uniprots': ['Q5FVF0', 'Q14972', 'Q15325', 'P16990', 'P16991']}]

In [16]:
def get_partipants_key_raw(participants):
    """
    Build the participants key for one training record.

    Each participant's 'uniprotid' is converted to text and the list is handed to
    get_partipants_key. A participant whose 'uniprotid' is None stays None so that
    get_partipants_key can drop it.

    Example input
    [{'uniprotid': 'P19388',
      'alias': [['rpab1_human'],
       ['DNA-directed RNA polymerases I, II, and III subunit RPABC1'],
       ['POLR2E'],
       ['DNA-directed RNA polymerase II subunit E'],
       ['RPB5 homolog'],
       ['DNA-directed RNA polymerase II 23 kDa polypeptide'],
       ['XAP4']],
      'alternative_uniprots': ['Q6PIH5', 'Q9BT06', 'O43380', 'B2R6L4', 'D6W5Y1']},
     {'uniprotid': 'Q96SB4',
      'alias': [['srpk1_human'],
       ['SRSF protein kinase 1'],
       ['Serine/arginine-rich protein-specific kinase 1'],
       ['SFRS protein kinase 1'],
       ['SRPK1']],
      'alternative_uniprots': ['Q5R365', 'Q5R364', 'B4DS61', 'Q8IY12', 'Q12890']}]

    :param participants: List of participant dicts, each with a 'uniprotid' entry
    :return: The key produced by get_partipants_key for the participants' ids
    """
    # Fix: str(None) is the text "None", which used to slip past the None filter in
    # get_partipants_key and end up inside the key. A missing id is now kept as None.
    participant_uniprot = [
        None if p["uniprotid"] is None else str(p["uniprotid"])
        for p in participants
    ]

    return get_partipants_key(participant_uniprot)

def get_partipants_key(list_uniprot):
    """
    Build an order-independent key from a list of UniProt ids.

    :param list_uniprot: Iterable of id strings; None entries are ignored
    :return: The remaining ids in sorted order, joined with "#"
    """
    present_ids = [uniprot for uniprot in list_uniprot if uniprot is not None]
    present_ids.sort()

    return "#".join(present_ids)


def is_in_training(df, training_df):
    """
    Flag the rows of df whose participant pair also occurs in the training data.

    :param df: Frame with participant1Id and participant2Id columns
    :param training_df: Training frame whose "participants" column holds lists of
                        dicts, each with a 'uniprotid' entry
    :return: Boolean Series aligned with df.index; True where the row's participants
             key equals the key of some training record
    """
    training_participants = training_df["participants"].apply(get_partipants_key_raw)

    # Fix: the key used to be built from participant2Id twice, so participant1Id never
    # took part in the comparison. Building the Series from a list also keeps the
    # result a Series when df has no rows.
    df_participants = pd.Series(
        [get_partipants_key([p1, p2]) for p1, p2 in zip(df["participant1Id"], df["participant2Id"])],
        index=df.index,
        dtype=object,
    )

    return df_participants.isin(training_participants)

def is_in_training_pubmed(df, training_df):
    """
    Flag the rows of df whose PubMed id also occurs in the training data.

    :param df: Frame with a pubmedId column
    :param training_df: Training frame with a pubmedId column
    :return: Boolean Series aligned with df.index
    """
    known_pubmed_ids = training_df["pubmedId"]
    return df["pubmedId"].isin(known_pubmed_ids)

In [17]:
# Mark high-quality predictions whose PubMed id already occurs in the training data.
# Note: this adds a column to df_high_quality in place.
df_high_quality["PubmedInTrainingData"] = is_in_training_pubmed( df_high_quality, data_training_full_df)

In [18]:
# Per-label count of every prediction other than 'other'.
c_df = (
    df_summary.query("prediction != 'other'")
    .groupby(["prediction"])
    .size()
    .to_frame("all_count")
)

# Per-label count of high-quality predictions whose PubMed id is not in the training
# data, right-joined onto the full counts so that labels without any such row still
# appear (with NaN as their filter_count).
tmp_df = (
    df_high_quality[~df_high_quality.PubmedInTrainingData]
    .groupby('prediction')
    .size()
    .to_frame("filter_count")
    .merge(c_df, left_index=True, right_index=True, how="right")[["all_count", "filter_count"]]
)

print(tmp_df.to_latex(index=True))

tmp_df

\begin{tabular}{lrr}
\toprule
{} &  all\_count &  filter\_count \\
prediction        &            &               \\
\midrule
acetylation       &       7814 &           1.0 \\
dephosphorylation &      85996 &          29.0 \\
deubiquitination  &        512 &           NaN \\
methylation       &      52622 &          19.0 \\
phosphorylation   &    1301106 &        5623.0 \\
ubiquitination    &     152053 &           4.0 \\
\bottomrule
\end{tabular}



Unnamed: 0_level_0,all_count,filter_count
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
acetylation,7814,1.0
dephosphorylation,85996,29.0
deubiquitination,512,
methylation,52622,19.0
phosphorylation,1301106,5623.0
ubiquitination,152053,4.0


In [19]:

# Show every column and up to 100 rows when frames are displayed below.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
# Default font size for matplotlib figures.
plt.rcParams.update({'font.size': 12})

In [20]:
# Eyeball ten randomly drawn high-quality predictions (no seed, so the rows differ between runs).
df_high_quality.sample(n=10)

Unnamed: 0,pubmedId,participant1Id,participant1Name,participant2Id,participant2Name,abstract,normalised_abstract,annotations,gene_to_uniprot_map,normalised_abstract_annotations,other,phosphorylation,dephosphorylation,methylation,ubiquitination,acetylation,deubiquitination,prediction,confidence,confidence_std,raw_confidence,PubmedInTrainingData
986,18762583,P28482,extracellular signal-regulated kinase,Q9BQQ3,GRASP65,Directed cell migration requires the orientati...,Directed cell migration requires the orientati...,"[{'start': '228', 'end': '265', 'name': 'extra...","{'5594': 'P28482', '64689': 'Q9BQQ3'}","[{'charOffset': 228, 'len': 6, 'text': 'P28482...",0.003823,0.979349,0.004437,0.003474,0.003418,0.00266,0.002838,phosphorylation,0.979349,0.015756,"[0.962265491485595, 0.9840977787971491, 0.9954...",False
428,14572648,P49840,GSK3alpha,Q92837,Frequently rearranged in advanced T-cell lymph...,"Recently, LiCl has been shown to inhibit amylo...","Recently, LiCl has been shown to inhibit amylo...","[{'start': '134', 'end': '142', 'name': 'GSK3b...","{'2932': 'P49841', '10023': 'Q92837', '2931': ...","[{'charOffset': 134, 'len': 6, 'text': 'P49841...",0.003381,0.985824,0.001976,0.003138,0.002016,0.001771,0.001895,phosphorylation,0.985824,0.01536,"[0.9924962520599361, 0.955715119838714, 0.9951...",False
1590,21865166,P29597,TYK2,Q13563,PKD2,Type 1 interferons (including IFN/) activate t...,Type 1 interferons (including IFN/) activate t...,"[{'start': '51', 'end': '72', 'name': 'cell su...","{'57126': 'Q8N6Q3', '3454': 'P17181', '7297': ...","[{'charOffset': 51, 'len': 6, 'text': 'Q8N6Q3'...",0.005458,0.982941,0.003592,0.002618,0.001736,0.001755,0.0019,phosphorylation,0.982941,0.014906,"[0.94586956501007, 0.970860302448272, 0.987115...",False
81,19740742,P07910,heterogeneous nuclear ribonucleoprotein C,P60484,PTEN,PTEN (phosphatase and tensin homolog deleted o...,P60484 (phosphatase and tensin homolog deleted...,"[{'start': '0', 'end': '4', 'name': 'PTEN', 't...","{'5728': 'P60484', '3183': 'P07910'}","[{'charOffset': 0, 'len': 6, 'text': 'P60484'}...",0.004929,0.977165,0.003143,0.00478,0.003966,0.00216,0.003857,phosphorylation,0.977165,0.022844,"[0.9928929209709161, 0.993029773235321, 0.9493...",False
2284,16402022,Q15835,G-protein-coupled receptor kinase 1,Q8WTQ7,GRK7,Visual pigment is phosphorylated and inactivat...,Visual pigment is phosphorylated and inactivat...,"[{'start': '126', 'end': '161', 'name': 'G-pro...","{'6011': 'Q15835', '131890': 'Q8WTQ7'}","[{'charOffset': 126, 'len': 6, 'text': 'Q15835...",0.004069,0.986416,0.001603,0.002351,0.001969,0.001575,0.002017,phosphorylation,0.986416,0.010638,"[0.9950130581855771, 0.994061768054962, 0.9947...",False
164,12242661,O96017,Chk2,Q13315,ataxia telangiectasia-mutated,The Polo-like kinases (Plks) are a conserved f...,The Polo-like kinases (Plks) are a conserved f...,"[{'start': '219', 'end': '223', 'name': 'Plk3'...","{'1263': 'Q9H4B4', '472': 'Q13315', '11200': '...","[{'charOffset': 219, 'len': 6, 'text': 'Q9H4B4...",0.005519,0.984055,0.001693,0.002104,0.002536,0.002366,0.001727,phosphorylation,0.984055,0.020458,"[0.991664588451385, 0.9699286222457881, 0.9923...",False
268,24563481,P17302,Cx43,P31749,AKT,Connexin (Cx) 43 hemichannels in osteocytes ar...,Connexin (Cx) 43 hemichannels in osteocytes ar...,"[{'start': '341', 'end': '345', 'name': 'Cx43'...","{'2697': 'P17302', '5290': 'P42336', '207': 'P...","[{'charOffset': 341, 'len': 6, 'text': 'P17302...",0.005362,0.984539,0.002413,0.00271,0.001704,0.001532,0.001739,phosphorylation,0.984539,0.01058,"[0.974123060703277, 0.994778156280517, 0.99434...",False
1959,1849074,P06493,p34cdc2,Q03252,lamin B2,While the p34cdc2 kinase is considered to be a...,While the P06493 kinase is considered to be a ...,"[{'start': '10', 'end': '17', 'name': 'p34cdc2...","{'983': 'P06493', '84823': 'Q03252'}","[{'charOffset': 10, 'len': 6, 'text': 'P06493'...",0.004129,0.984059,0.003423,0.002735,0.002018,0.001739,0.001897,phosphorylation,0.984059,0.014873,"[0.9876155257225031, 0.947423100471496, 0.9893...",False
1233,17255101,P42345,mammalian target of rapamycin,Q13153,p21-activated kinase 1,Cellular mechanisms that regulate the replicat...,Cellular mechanisms that regulate the replicat...,"[{'start': '53', 'end': '75', 'name': 'hepatit...","{'5058': 'Q13153', '3661': 'Q14653', '2475': '...","[{'charOffset': 104, 'len': 6, 'text': 'Q13153...",0.003391,0.987296,0.00225,0.001969,0.002009,0.001528,0.001557,phosphorylation,0.987296,0.013447,"[0.9534298777580261, 0.9742354154586791, 0.995...",False
2102,1850414,P21796,porin,Q16635,Taz1,The Tar-EnvZ hybrid molecule (Taz1) is an inne...,The Tar-EnvZ hybrid molecule (Q16635) is an in...,"[{'start': '30', 'end': '34', 'name': 'Taz1', ...","{'6901': 'Q16635', '7416': 'P21796'}","[{'charOffset': 30, 'len': 6, 'text': 'Q16635'...",0.008981,0.981444,0.002203,0.00168,0.002459,0.001686,0.001549,phosphorylation,0.981444,0.026255,"[0.9943168759346, 0.9089921116828911, 0.989969...",False


In [21]:
# Number of entries in each row's gene_to_uniprot_map.
df_high_quality["unique_gene_count"] = df_high_quality["gene_to_uniprot_map"].map(len)

In [22]:
# Per-label distribution of confidence and of the gene_to_uniprot_map size.
df_high_quality.groupby("prediction")[["confidence","unique_gene_count"]].describe()

Unnamed: 0_level_0,confidence,confidence,confidence,confidence,confidence,confidence,confidence,confidence,unique_gene_count,unique_gene_count,unique_gene_count,unique_gene_count,unique_gene_count,unique_gene_count,unique_gene_count,unique_gene_count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
prediction,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
acetylation,3.0,0.577767,0.003032,0.574266,0.576875,0.579485,0.579518,0.579551,3.0,4.333333,1.527525,3.0,3.5,4.0,5.0,6.0
dephosphorylation,42.0,0.881514,0.019359,0.849926,0.868206,0.881211,0.892138,0.917424,42.0,3.380952,1.342593,2.0,3.0,3.0,4.0,7.0
methylation,23.0,0.809059,0.006814,0.799061,0.805046,0.807967,0.811937,0.826972,23.0,2.304348,0.470472,2.0,2.0,2.0,3.0,3.0
phosphorylation,5721.0,0.982277,0.003611,0.97585,0.979242,0.982206,0.985243,0.990794,5721.0,3.081804,1.30757,2.0,2.0,3.0,4.0,11.0
ubiquitination,5.0,0.584419,0.005255,0.580424,0.580954,0.583207,0.584096,0.593414,5.0,2.6,0.547723,2.0,2.0,3.0,3.0,3.0


In [23]:
# Persist all high-quality predictions as JSON records in the current working directory.
predictions_above_threshold_file = "predictions_above_threshold.json"
df_high_quality.to_json(predictions_above_threshold_file, orient='records')

In [24]:
# The destination ends with "/", so upload_file stores the file under that prefix with its own name.
upload_file(predictions_above_threshold_file, "{}/".format(s3_output_prefix.rstrip("/")))

In [25]:
# Upper bound on the number of rows drawn per predicted label.
samples_per_interaction = 20

# From the predictions whose PubMed id is not in the training data, draw up to
# samples_per_interaction rows per label; a label with fewer rows contributes all of them.
# The fixed random_state keeps the draw repeatable.
samples_subset = df_high_quality.query("PubmedInTrainingData == False")\
                .groupby('prediction', group_keys=False)\
                .apply(lambda x: x.sample(min(len(x), samples_per_interaction),random_state=45))

# Rows drawn per label.
samples_subset.groupby(["prediction"])["prediction"].count()

prediction
acetylation           1
dephosphorylation    20
methylation          19
phosphorylation      20
ubiquitination        4
Name: prediction, dtype: int64

In [26]:
# Columns that end up in every manifest entry.
samples_subset.columns

Index(['pubmedId', 'participant1Id', 'participant1Name', 'participant2Id',
       'participant2Name', 'abstract', 'normalised_abstract', 'annotations',
       'gene_to_uniprot_map', 'normalised_abstract_annotations', 'other',
       'phosphorylation', 'dephosphorylation', 'methylation', 'ubiquitination',
       'acetylation', 'deubiquitination', 'prediction', 'confidence',
       'confidence_std', 'raw_confidence', 'PubmedInTrainingData',
       'unique_gene_count'],
      dtype='object')

### Create SageMaker Ground Truth labelling jobs

In [27]:
import json
import json
def create_manifest_file(df, outfile):
    """
    Write a data frame as a manifest file with one JSON object per line.

    Every line is a JSON object with a single "source" key whose value is the row,
    itself serialised as a JSON string.

    :param df: Data frame whose rows become the manifest entries
    :param outfile: Path of the manifest file to write (overwritten if present)
    """
    records = df.to_dict(orient='records')

    with open(outfile, "w") as manifest:
        for record in records:
            entry = {"source": json.dumps(record)}
            # Each entry must occupy exactly one line of the manifest.
            manifest.write(json.dumps(entry).replace("\n", "\t"))
            manifest.write("\n")

In [48]:
def create_manifest_per_interaction(samples_subset_df, s3_output_prefix):
    """
    Write the sampled predictions locally and upload them to S3 in three forms:
    a JSON records file, one manifest with every row, and one manifest per
    distinct value of the "prediction" column.

    The manifest file names are spelled ".mainfest"; they are reproduced unchanged
    because they determine the S3 keys.

    :param samples_subset_df: Sampled predictions with a "prediction" column
    :param s3_output_prefix: S3 prefix that receives all files
    :return: List of S3 URIs of the per-interaction manifests
    """
    s3_output_dir = "{}/".format(s3_output_prefix.rstrip("/"))

    # Whole sample as plain JSON records.
    samples_subset_file = "predictions_sample_subset.json"
    samples_subset_df.to_json(samples_subset_file, orient='records')
    upload_file(samples_subset_file, s3_output_dir)

    # Whole sample as a single manifest.
    combined_manifest = "predictions_sample_subset.mainfest"
    create_manifest_file(samples_subset_df, combined_manifest)
    upload_file(combined_manifest, s3_output_dir)

    # One manifest per interaction type.
    uploaded_manifests = []
    for interaction in samples_subset_df["prediction"].unique():
        local_manifest = "predictions_sample_subset_{}.mainfest".format(interaction)
        s3_manifest_file = s3_output_dir + local_manifest

        interaction_rows = samples_subset_df.query("prediction == '{}'".format(interaction))
        create_manifest_file(interaction_rows, local_manifest)
        upload_file(local_manifest, s3_manifest_file)

        uploaded_manifests.append(s3_manifest_file)

    return uploaded_manifests


In [49]:
# S3 URIs of the per-interaction manifests; each one feeds one labelling job below.
s3_manifests = create_manifest_per_interaction(samples_subset, s3_output_prefix)

In [37]:
import boto3
import sagemaker

In [44]:
from datetime import datetime

def create_groundtruth_labelling_job(s3_manifest, s3_gt_output, s3_template, pre_lambda, post_lambda, role, workforce_name, job_name, label_attribute_name="prediction", workforce_type= "private-crowd" ):
    """
    Create one SageMaker labelling job for a manifest file.

    All ARNs are built from the caller's account id and the region of the default
    boto3 session.

    :param s3_manifest: S3 URI of the input manifest
    :param s3_gt_output: S3 path that receives the job output
    :param s3_template: S3 URI of the worker UI template
    :param pre_lambda: Name of the pre-human-task Lambda function
    :param post_lambda: Name of the annotation consolidation Lambda function
    :param role: IAM role name (the part after "role/") used by the job
    :param workforce_name: Name of the work team
    :param job_name: Name of the labelling job
    :param label_attribute_name: Label attribute name of the job
    :param workforce_type: Work team type segment of the work team ARN
    :return: The response of the create_labeling_job call
    """
    sagemaker_client = boto3.client('sagemaker')

    # Kept from the original flow; the session object itself is not used below.
    sagemaker_session = sagemaker.Session()
    account_id = boto3.client('sts').get_caller_identity().get('Account')
    region = boto3.session.Session().region_name

    def _lambda_arn(function_name):
        # ARN of a Lambda function in the caller's account and region.
        return "arn:aws:lambda:{}:{}:function:{}".format(region, account_id, function_name)

    input_config = {"DataSource": {'S3DataSource': {'ManifestS3Uri': s3_manifest}}}
    output_config = {'S3OutputPath': s3_gt_output}
    human_task_config = {
        'WorkteamArn': "arn:aws:sagemaker:{}:{}:workteam/{}/{}".format(
            region, account_id, workforce_type, workforce_name),
        'UiConfig': {'UiTemplateS3Uri': s3_template},
        'PreHumanTaskLambdaArn': _lambda_arn(pre_lambda),
        'TaskKeywords': ['PPI'],
        # The title carries the manifest file name (last segment of its S3 URI).
        'TaskTitle': 'Verify PPI extraction for protein {}'.format(s3_manifest.split("/")[-1]),
        'TaskDescription': 'Verifies PPi extraction',
        'NumberOfHumanWorkersPerDataObject': 1,
        'TaskTimeLimitInSeconds': 60 * 60 * 5,  # 5 hours
        'TaskAvailabilityLifetimeInSeconds': 60 * 60 * 24 * 10,  # 10 days
        'MaxConcurrentTaskCount': 10,
        'AnnotationConsolidationConfig': {
            'AnnotationConsolidationLambdaArn': _lambda_arn(post_lambda)
        },
    }

    return sagemaker_client.create_labeling_job(
        LabelingJobName=job_name,
        LabelAttributeName=label_attribute_name,
        InputConfig=input_config,
        OutputConfig=output_config,
        RoleArn="arn:aws:iam::{}:role/{}".format(account_id, role),
        HumanTaskConfig=human_task_config,
    )
    
    

def create_groundtruth_labelling_multiple_jobs(lst_s3_manifests, s3_gt_output, s3_template, pre_lambda, post_lambda, role, workforce_name, job_prefix ="ppi", label_attribute_name="class"):
    """
    Create one labelling job per manifest.

    Job names have the form "<job_prefix>-<YYYYmmddHHMMSS>-<suffix>", where the
    timestamp is taken once for the whole batch and the suffix is the part of the
    manifest file name after its last "_" and before its first following ".".

    :param lst_s3_manifests: S3 URIs of the manifests
    :param s3_gt_output: S3 path that receives the job output
    :param s3_template: S3 URI of the worker UI template
    :param pre_lambda: Name of the pre-human-task Lambda function
    :param post_lambda: Name of the annotation consolidation Lambda function
    :param role: IAM role name used by the jobs
    :param workforce_name: Name of the work team
    :param job_prefix: Leading part of every job name
    :param label_attribute_name: NOTE(review): accepted but not forwarded, so every
        job uses create_groundtruth_labelling_job's own default — confirm which
        label attribute name is intended before wiring this through.
    """
    batch_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    stamped_prefix = "{}-{}".format(job_prefix, batch_stamp)

    for s3_manifest in lst_s3_manifests:
        manifest_name = s3_manifest.split("/")[-1]
        job_suffix = manifest_name.split("_")[-1].split(".")[0]
        job_name = "{}-{}".format(stamped_prefix, job_suffix)

        print(f"Creating job {job_name}")
        create_groundtruth_labelling_job(s3_manifest, s3_gt_output, s3_template, pre_lambda, post_lambda, role, workforce_name, job_name)

In [45]:
import urllib.request

def download_template(template_url):
    """
    Fetch a UI template and save it as "template.html" in the current working directory.

    :param template_url: URL of the template; its content is decoded as UTF-8
    """
    with urllib.request.urlopen(template_url) as f:
        html = f.read().decode('utf-8')

    # Fix: write the text back as UTF-8 explicitly. Without an encoding, open() uses
    # the platform default, which can fail on (or corrupt) non-ASCII template text.
    with open("template.html", "w", encoding="utf-8") as f:
        f.write(html)
    
# Fix: fetch over https rather than plain http — the downloaded HTML is uploaded and
# later rendered for labelling workers, so it should not travel unencrypted.
download_template('https://raw.githubusercontent.com/elangovana/ppi-sagemaker-groundtruth-verification/main/src/template/template.html')

In [50]:
# Names of the AWS resources the jobs refer to; nothing in this notebook creates them,
# so they are expected to exist already in the current account and region.
role_name = "service-role/AmazonSageMaker-ExecutionRole-20210104T161547"
pre_lambda="Sagemaker-ppipreprocessing"
post_lambda="sagemaker-ppipostprocessing"
# NOTE(review): the job output and the template are placed under s3_prefix (the
# prediction prefix), not under s3_output_prefix — confirm this is intended.
s3_gt_output = "{}/gt_output/".format(s3_prefix.rstrip("/"))
workforce_name = "ppi-team"
s3_template_file = "{}/template.html".format(s3_prefix.rstrip("/"))

# Publish the downloaded template, then start one labelling job per manifest.
upload_file("template.html", s3_template_file )
create_groundtruth_labelling_multiple_jobs (s3_manifests,
                                            s3_gt_output, 
                                            s3_template_file,
                                            pre_lambda, 
                                            post_lambda, 
                                            role_name,
                                            workforce_name)

Creating job ppi-20210619144548-acetylation ppi-20210619144548-acetylation
Creating job ppi-20210619144548-dephosphorylation ppi-20210619144548-dephosphorylation
Creating job ppi-20210619144548-methylation ppi-20210619144548-methylation
Creating job ppi-20210619144548-phosphorylation ppi-20210619144548-phosphorylation
Creating job ppi-20210619144548-ubiquitination ppi-20210619144548-ubiquitination
