### Create self supervised dataset

In [1]:
import pandas as pd
import numpy as np
import json
import sagemaker,  io
import re

In [2]:
train_data_file = "../chemprot_abstract_train.json"
test_data_file = "../chemprot_abstract_test.json"
val_data_file = "../chemprot_abstract_val.json"

In [3]:
keywords1 = ["activation", "trigger", "interact", "inhibit", "regulat", "supress"]

keywords2 = ["gene", "protein", "chemical"]



    
def load_unique_abstract(datafile):
    with open(datafile) as f:
        data = json.load(f)
    
    abstracts = set()
    results = []
    
    for r in data:
        if r["abstract_id"] not in abstracts:
            abstracts.add(r["abstract_id"])
            results.append({
                "abstract_id" : r["abstract_id"],
                 "abstract" : r["abstract"]
            })
    return pd.DataFrame(results)


def self_label(df):
   

    df["self_label"] = df["abstract"].apply(lambda x: any([ k.lower() in x.lower() for k in keywords1]) 
                                        and sum([  k.lower() in x.lower() for k in keywords2])>=2
                                     
                                       )
    
    
    return df


def randomly_replace_keywords(x):
    np.random.seed(2020)
    
    
    key_i = np.random.choice([0,1])
    if key_i == 0:
        for k in keywords1:
            insensitive = re.compile(re.escape(k), re.IGNORECASE)
            x = insensitive.sub("", x)
    else:
        for k in keywords2:
            insensitive = re.compile(re.escape(k), re.IGNORECASE)
            x = insensitive.sub("", x)
    return x

def create_negative_samples(df):
    
    remove_n = len(df.query("self_label == True"))
    df_negative = df.query("self_label == False")
    drop_indices =np.random.choice(df_negative.index, remove_n, replace=False)
    df_subset_negative = df_negative[ ~df_negative.index.isin (drop_indices)].copy(deep=True)
    df_subset_negative["is_fake"] = False
    
    df_fake = df.query("self_label == True").copy(deep=True)
    df_fake["abstract"] = df_fake["abstract"].apply(randomly_replace_keywords)
    df_fake["abstract_id"] = df_fake["abstract_id"].apply(lambda x: f"{x}_1")
    df_fake = df_fake.pipe(self_label)
    df_fake["is_fake"] = True
    
    df_true = df.query("self_label == True").copy(deep=True)
    df_true["is_fake"]= False
    
    df = pd.concat([df_subset_negative, 
                    df_true,
                    df_fake
                    
                   ])
    df = df.reset_index()
    return df

In [4]:
df_train_unique = load_unique_abstract(train_data_file).pipe(self_label)
df_test_unique = load_unique_abstract(test_data_file).pipe(self_label)
df_val_unique = load_unique_abstract(val_data_file).pipe(self_label)

In [5]:
df_train_fake = df_train_unique.pipe(create_negative_samples)
df_test_fake = df_test_unique.pipe(create_negative_samples)
df_val_fake = df_val_unique.pipe(create_negative_samples)

In [6]:
df_train_unique.head(n=3)

Unnamed: 0,abstract_id,abstract,self_label
0,10047461,Cyclin E-cdk2 activation is associated with ce...,False
1,10095983,New aspects in the management of obesity: oper...,False
2,10200320,Cyclopentenone prostaglandins suppress activat...,True


In [7]:
df_train_unique["self_label"].value_counts()

False    628
True     139
Name: self_label, dtype: int64

In [8]:
df_test_unique["self_label"].value_counts()

False    513
True     107
Name: self_label, dtype: int64

In [9]:
df_val_unique["self_label"].value_counts()

False    371
True      72
Name: self_label, dtype: int64

In [10]:
df_train_fake["self_label"].value_counts()

False    628
True     139
Name: self_label, dtype: int64

In [11]:
df_train_fake.groupby(["is_fake","self_label"]).size()

is_fake  self_label
False    False         489
         True          139
True     False         139
dtype: int64

In [12]:
df_train_fake.query("is_fake==True").head(n=2)

Unnamed: 0,index,abstract_id,abstract,self_label,is_fake
628,2,10200320_1,Cyclopentenone prostaglandins suppress of mic...,False,True
629,3,10207608_1,Phospholipase A2 ors p-bromophenacyl bromide a...,False,True


In [13]:
df_val_fake["self_label"].value_counts()

False    371
True      72
Name: self_label, dtype: int64

In [14]:
df_test_fake["self_label"].value_counts()

False    513
True     107
Name: self_label, dtype: int64

In [15]:
def upload_to_s3(df, s3_dest):
    b = io.StringIO()
    df.to_json(b)
    
    sagemaker.s3.S3Uploader.upload_string_as_file_body(b.getvalue(), s3_dest)

In [16]:
def create_sample(df):
    b = io.StringIO()
    df.sample(n=10).to_json("self-supervised-sample.json")
    


In [17]:
create_sample(df_train_unique)

In [18]:
upload_to_s3(df_train_unique, "s3://aegovan-data/self-supervised/train.json" )
upload_to_s3(df_val_unique, "s3://aegovan-data/self-supervised/val.json" )
upload_to_s3(df_test_unique, "s3://aegovan-data/self-supervised/test.json" )

In [19]:
upload_to_s3(df_train_fake, "s3://aegovan-data/self-supervised-fake/train.json" )
upload_to_s3(df_val_fake, "s3://aegovan-data/self-supervised-fake/val.json" )
upload_to_s3(df_test_fake, "s3://aegovan-data/self-supervised-fake/test.json" )