In [2]:
s3_file="s3://aegovan-data/human_output/interactions_except_phys_associ.json"

s3_dest="s3://aegovan-data/processed_dataset/"
interesting_interactions = ["dephosphorylation","ubiquitination", "phosphorylation", "acetylation", "deubiquitination", "demethylation","deacetylation"]

In [2]:
import boto3

def download_single_file(bucket_name_path, local_path):
    index = bucket_name_path.find("://")

    # remove the s3:// if present
    if index > -1:
        bucket_name_path = bucket_name_path[index + 3:]

    key_start_index = bucket_name_path.find("/")
    bucket_name = bucket_name_path
    key = "/"
    if key_start_index > -1:
        bucket_name = bucket_name_path[0:key_start_index]
        key = bucket_name_path[key_start_index + 1:]
        
    client = boto3.resource('s3')
    client.Bucket(bucket_name).download_file(key, local_path)



In [4]:
data_file="classification_raw_data.json"


In [4]:
%%time

download_single_file(s3_file, data_file)

In [5]:
import pandas as pd

df=pd.read_json(data_file)

In [6]:
df.head(n=3)

Unnamed: 0,interactionId,interactionType,isNegative,participants,pubmedId,pubmedTitle,pubmedabstract
0,1459695,colocalization,True,"[{'uniprotid': 'P29590-8', 'alias': [['p29590-...",19567472,,Maintaining proper telomere length requires th...
1,1459700,colocalization,True,"[{'uniprotid': 'P29590-1', 'alias': [['p29590-...",19567472,,Maintaining proper telomere length requires th...
10,1197912,direct interaction,False,"[{'uniprotid': 'P51617', 'alias': [['irak1_hum...",21743479,,Toll-like receptors (TLRs) shape innate and ad...


In [7]:
df.shape

(23151, 7)

In [8]:
df.pubmedId.unique().shape

(5766,)

In [9]:
df.interactionType.value_counts()

direct interaction                 17799
colocalization                      3790
dephosphorylation                    437
ubiquitination                       226
enzymatic reaction                   141
methylation                          134
adp ribosylation                      84
gtpase reaction                       75
acetylation                           70
deacetylation                         45
demethylation                         37
redox reaction                        33
disulfide bond                        33
deubiquitination                      31
hydroxylation                         30
atpase reaction                       29
physical interaction                  27
glycosylation                         22
putative self interaction             14
sumoylation                           13
rna cleavage                          11
genetic interaction                    9
self interaction                       9
lipid cleavage                         7
phosphotransfer 

In [10]:
def normalise_interaction_type(interaction_type):
    if interaction_type in interesting_interactions:
        return interaction_type
    else:
        return "other"

### Clean up interactions

#### Replace all except the key interactions with "other"

In [11]:
df["interactionType"] = df.apply( lambda r: normalise_interaction_type(r["interactionType"] ) , axis = 1)

In [12]:
df["interactionType"].value_counts()

other                22305
dephosphorylation      437
ubiquitination         226
acetylation             70
deacetylation           45
demethylation           37
deubiquitination        31
Name: interactionType, dtype: int64

### Make Unique as per pubmedids

In [13]:
df_clean = df.drop_duplicates(subset=['pubmedId'])
df_clean.shape

(5766, 7)

### Remove if abstract cannot be found

In [21]:
df_clean = df_clean[ df_clean.apply(lambda x: x['pubmedabstract'] is not None, axis=1) ]

### Split Train / Test / Val 

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

train, val  = train_test_split(df_clean, test_size=.1, random_state=777, stratify=df_clean["interactionType"])


train, test = train_test_split(df_clean,  test_size=.2,
                                                        random_state=777, stratify=df_clean["interactionType"])

In [24]:
import matplotlib.pyplot as plt
fix, axs = plt.subplots(1,3, figsize=(15,5))


ax=axs[0]
ax.set_title("Train")
ax.set_yscale('log')
ax.xaxis.set_major_locator(plt.MaxNLocator(10, prune='lower'))
train.interactionType.value_counts().plot.bar(ax=ax)



ax=axs[1]
ax.set_title("Val")
ax.set_yscale('log')
ax.xaxis.set_major_locator(plt.MaxNLocator(10, prune='lower'))
val.interactionType.value_counts().plot.bar(ax=ax)


ax=axs[2]
ax.set_title("Test")
ax.set_yscale('log')
ax.xaxis.set_major_locator(plt.MaxNLocator(10, prune='lower'))
test.interactionType.value_counts().plot.bar(ax=ax)



plt.show()

<Figure size 1500x500 with 3 Axes>

In [25]:
train.head(n=2)

Unnamed: 0,interactionId,interactionType,isNegative,participants,pubmedId,pubmedTitle,pubmedabstract
18300,541700,other,False,"[{'uniprotid': 'P46937', 'alias': [['yap1_huma...",11278422,,To understand the role of the Yes-associated p...
10806,255999,other,False,"[{'uniprotid': 'P03122', 'alias': [['ve2_bpv1'...",14966293,,"Using a yeast two-hybrid screen, we identified..."


In [26]:
train_file = "train_classification.json"
train.to_json(train_file)

test_file = "test_classification.json"
test.to_json(test_file)

val_file = "val_classification.json"
val.to_json(val_file)

In [27]:
train.sample(n=50).to_json("sample_classification.json")

In [None]:
!aws s3 cp $train_file $s3_dest
!aws s3 cp $val_file $s3_dest
!aws s3 cp $test_file $s3_dest