In [1]:
# S3 location of the raw interactions JSON that this notebook downloads.
s3_file = "s3://aegovan-data/human_output/interactions_except_phys_associ.json"

# S3 prefix that the processed split files are uploaded to.
s3_dest = "s3://aegovan-data/processed_dataset/"

# Interaction types kept as their own class; the normalisation step maps
# every other type to "other".
# NOTE(review): "methylation" is absent although "demethylation" is listed —
# confirm this is intentional.
interesting_interactions = [
    "dephosphorylation",
    "ubiquitination",
    "phosphorylation",
    "acetylation",
    "deubiquitination",
    "demethylation",
    "deacetylation",
]

In [2]:
import boto3

def download_single_file(bucket_name_path, local_path):
    """Download one S3 object to a local file.

    Args:
        bucket_name_path: Object location, either ``s3://bucket/key`` or
            ``bucket/key``. Anything up to and including the first ``://``
            is discarded.
        local_path: Path on the local filesystem to write the object to.

    The text before the first ``/`` is used as the bucket name and everything
    after it as the object key. If there is no ``/`` at all, the whole string
    is taken as the bucket name and the key falls back to ``"/"``.
    """
    scheme_sep = "://"
    scheme_pos = bucket_name_path.find(scheme_sep)

    # Strip the scheme prefix (e.g. "s3://") when one is present.
    location = bucket_name_path
    if scheme_pos != -1:
        location = bucket_name_path[scheme_pos + len(scheme_sep):]

    # Split on the first "/" only: bucket on the left, key on the right.
    bucket_name, slash, remainder = location.partition("/")
    object_key = remainder if slash else "/"

    s3 = boto3.resource('s3')
    s3.Bucket(bucket_name).download_file(object_key, local_path)



In [3]:
# Local file the raw S3 JSON is downloaded to and later read from.
data_file="classification_raw_data.json"


In [4]:
%%time

# Fetch the raw interactions JSON from S3 to the local path `data_file`.
download_single_file(s3_file, data_file)

CPU times: user 869 ms, sys: 998 ms, total: 1.87 s
Wall time: 24.4 s


In [5]:
import pandas as pd

# Parse the downloaded JSON file into a DataFrame.
df=pd.read_json(data_file)

In [6]:
# Show the first three rows to check which columns were loaded.
df.head(n=3)

Unnamed: 0,interactionId,interactionType,isNegative,participants,pubmedId,pubmedTitle,pubmedabstract
0,1459695,colocalization,True,"[{'uniprotid': 'P29590-8', 'alias': [['p29590-...",19567472,,Maintaining proper telomere length requires th...
1,1459700,colocalization,True,"[{'uniprotid': 'P29590-1', 'alias': [['p29590-...",19567472,,Maintaining proper telomere length requires th...
10,1197912,direct interaction,False,"[{'uniprotid': 'P51617', 'alias': [['irak1_hum...",21743479,,Toll-like receptors (TLRs) shape innate and ad...


In [7]:
# (rows, columns) of the raw frame.
df.shape

(23151, 7)

In [8]:
# Number of distinct pubmedId values, displayed as a 1-tuple.
df.pubmedId.unique().shape

(5766,)

In [9]:
# Row count per raw interaction type (value_counts sorts by count, descending).
df.interactionType.value_counts()

direct interaction                 17799
colocalization                      3790
dephosphorylation                    437
ubiquitination                       226
enzymatic reaction                   141
methylation                          134
adp ribosylation                      84
gtpase reaction                       75
acetylation                           70
deacetylation                         45
demethylation                         37
disulfide bond                        33
redox reaction                        33
deubiquitination                      31
hydroxylation                         30
atpase reaction                       29
physical interaction                  27
glycosylation                         22
putative self interaction             14
sumoylation                           13
rna cleavage                          11
genetic interaction                    9
self interaction                       9
lipid cleavage                         7
phosphotransfer 

In [10]:
def normalise_interaction_type(interaction_type):
    """Map an interaction type to itself or to the catch-all "other".

    Args:
        interaction_type: The raw interaction type value.

    Returns:
        ``interaction_type`` unchanged when it is a member of the
        module-level ``interesting_interactions`` list, otherwise the
        string ``"other"``.
    """
    return interaction_type if interaction_type in interesting_interactions else "other"

### Clean up interactions

#### Replace all except the key interactions with "other"

In [11]:
# Collapse every interaction type that is not in `interesting_interactions`
# into "other". Only one column is read, so Series.map is used instead of a
# row-wise DataFrame.apply(axis=1); it gives the same values without building
# a Series per row and also works when the frame is empty.
# Note: this overwrites the raw column in place, so value counts displayed
# before this cell no longer reflect `df` once it has run.
df["interactionType"] = df["interactionType"].map(normalise_interaction_type)

In [12]:
# Row count per interaction type in the current `interactionType` column.
df["interactionType"].value_counts()

other                22305
dephosphorylation      437
ubiquitination         226
acetylation             70
deacetylation           45
demethylation           37
deubiquitination        31
Name: interactionType, dtype: int64

### Keep one row per PubMed ID

In [13]:
# Keep a single row per pubmedId; drop_duplicates keeps the first occurrence
# by default.
# NOTE(review): a paper with several rows of differing interactionType keeps
# whichever row comes first in `df` — confirm that is the intended label.
df_clean = df.drop_duplicates(subset=['pubmedId'])
# Displayed result: (rows, columns) after de-duplication.
df_clean.shape

(5766, 7)

### Remove rows whose abstract is missing

In [14]:
# Drop rows that have no abstract. `notna()` treats both None and NaN as
# missing, whereas the previous row-wise `is not None` check let NaN through;
# it is also vectorised instead of calling a lambda once per row.
df_clean = df_clean[df_clean["pubmedabstract"].notna()]

### Split Train / Test / Val 

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation, stratified by interaction type so
# each class keeps roughly the same share in both parts.
train, val = train_test_split(
    df_clean,
    test_size=.1,
    random_state=777,
    stratify=df_clean["interactionType"],
)

# Split the remaining 90% again: 20% of it becomes the test set, so overall
# the data ends up about 72% train / 18% test / 10% val.
train, test = train_test_split(
    train,
    test_size=.2,
    random_state=777,
    stratify=train["interactionType"],
)

In [16]:
import matplotlib.pyplot as plt

# One bar chart of interaction-type counts per split, side by side, on a log
# y-axis so the small classes stay visible next to the large ones.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# Same chart settings for every split; only the title and the data differ.
for ax, (title, split) in zip(axs, [("Train", train), ("Val", val), ("Test", test)]):
    ax.set_title(title)
    ax.set_yscale('log')
    ax.xaxis.set_major_locator(plt.MaxNLocator(10, prune='lower'))
    split.interactionType.value_counts().plot.bar(ax=ax)

plt.show()

<Figure size 1500x500 with 3 Axes>

In [17]:
# Show the first two rows of the training split.
train.head(n=2)

Unnamed: 0,interactionId,interactionType,isNegative,participants,pubmedId,pubmedTitle,pubmedabstract
9554,709246,other,False,"[{'uniprotid': 'P03243', 'alias': [['e1b55_ade...",14557665,,The adenovirus E1B 55-kDa protein impairs the ...
10338,6612,other,False,"[{'uniprotid': 'P60953', 'alias': [['cdc42_hum...",8625410,,The Rho family of GTPases control diverse biol...


In [18]:
# Local file names for the three multi-class splits.
train_file = "train_classification.json"
test_file = "test_classification.json"
val_file = "val_classification.json"

# Write each split to its JSON file (train, then test, then val).
for split_frame, split_path in ((train, train_file), (test, test_file), (val, val_file)):
    split_frame.to_json(split_path)

In [19]:
# Write a 50-row random sample of the training split to a separate file.
# The fixed seed (same value as the train/val/test splits) makes the sample
# reproducible across re-runs of the notebook.
train.sample(n=50, random_state=777).to_json("sample_classification.json")

In [20]:
# Copy the train / val / test split files to the S3 prefix in `s3_dest`
# using the AWS CLI ($name is expanded from the notebook's Python variables).
!aws s3 cp $train_file $s3_dest
!aws s3 cp $val_file $s3_dest
!aws s3 cp $test_file $s3_dest

upload: ./train_classification.json to s3://aegovan-data/processed_dataset/train_classification.json
upload: ./val_classification.json to s3://aegovan-data/processed_dataset/val_classification.json
upload: ./test_classification.json to s3://aegovan-data/processed_dataset/test_classification.json


## Create binary classification dataset

In [21]:
def binaraise_interaction_type(interaction_type):
    """Return whether an interaction type is one of the interesting ones.

    Args:
        interaction_type: The interaction type value to test.

    Returns:
        ``True`` when ``interaction_type`` is a member of the module-level
        ``interesting_interactions`` list, ``False`` otherwise.
    """
    # The membership test already yields a bool; no if/else needed.
    return interaction_type in interesting_interactions

In [22]:
import copy


# Work on an independent copy so that adding the `label` column does not
# touch `df_clean`. DataFrame.copy() is deep by default and is the pandas
# way to do this; no `copy` module call is needed.
df_binary_clean = df_clean.copy()

In [23]:
# Binary target: True when the row's interactionType is one of
# `interesting_interactions`, False otherwise. Only one column is read, so
# Series.map replaces the row-wise DataFrame.apply(axis=1).
df_binary_clean["label"] = df_binary_clean["interactionType"].map(binaraise_interaction_type)

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation.
# NOTE(review): stratification uses the multi-class `interactionType` column
# rather than the binary `label` — confirm that is intended.
train_binary, val_binary = train_test_split(
    df_binary_clean,
    test_size=.1,
    random_state=777,
    stratify=df_binary_clean["interactionType"],
)

# Split the remaining 90% again: 20% of it becomes the test set, so overall
# the data ends up about 72% train / 18% test / 10% val.
train_binary, test_binary = train_test_split(
    train_binary,
    test_size=.2,
    random_state=777,
    stratify=train_binary["interactionType"],
)

In [25]:
# Show two random rows of the binary training split.
train_binary.sample(n=2)

Unnamed: 0,interactionId,interactionType,isNegative,participants,pubmedId,pubmedTitle,pubmedabstract,label
227,2681532,other,False,"[{'uniprotid': 'Q05193', 'alias': [['dyn1_huma...",21927000,Crystal structure of nucleotide-free dynamin.,Dynamin is a mechanochemical GTPase that oligo...,False
1818,172515,other,False,"[{'uniprotid': 'Q9BYE9', 'alias': [['cdhr2_hum...",12117771,,Protocadherins are a major subfamily of the ca...,False


In [26]:
# Write each binary split to its own JSON file.
train_binary_file = "train_binary_classification.json"
train_binary.to_json(train_binary_file)

test_binary_file = "test_binary_classification.json"
test_binary.to_json(test_binary_file)

val_binary_file = "val_binary_classification.json"
val_binary.to_json(val_binary_file)

# The sample must be drawn from `train_binary`, not `train`: only the binary
# frame carries the `label` column. The fixed seed keeps it reproducible.
train_binary.sample(n=50, random_state=777).to_json("sample_binary_classification.json")

In [27]:
# Copy the binary train / val / test split files to the S3 prefix in `s3_dest`
# using the AWS CLI ($name is expanded from the notebook's Python variables).
!aws s3 cp $train_binary_file $s3_dest
!aws s3 cp $val_binary_file $s3_dest
!aws s3 cp $test_binary_file $s3_dest

upload: ./train_binary_classification.json to s3://aegovan-data/processed_dataset/train_binary_classification.json
upload: ./val_binary_classification.json to s3://aegovan-data/processed_dataset/val_binary_classification.json
upload: ./test_binary_classification.json to s3://aegovan-data/processed_dataset/test_binary_classification.json
