### Create self supervised dataset

In [1]:
import pandas as pd
import json
import sagemaker,  io

In [2]:
train_data_file = "../chemprot_abstract_train.json"
test_data_file = "../chemprot_abstract_test.json"
val_data_file = "../chemprot_abstract_val.json"

In [3]:
def load_unique_abstract(datafile):
    with open(datafile) as f:
        data = json.load(f)
    
    abstracts = set()
    results = []
    
    for r in data:
        if r["abstract_id"] not in abstracts:
            abstracts.add(r["abstract_id"])
            results.append({
                "abstract_id" : r["abstract_id"],
                 "abstract" : r["abstract"]
            })
    return pd.DataFrame(results)


def self_label(df):
    keywords1 = ["activation", "trigger", "interact", "inhibit", "regulat", "supress"]

    keywords2 = ["gene", "protein", "chemical"]


    df["self_label"] = df["abstract"].apply(lambda x: any([ k.lower() in x.lower() for k in keywords1]) 
                                        and sum([  k.lower() in x.lower() for k in keywords2])>=2
                                     
                                       )
    
    
    return df

In [4]:
df_train_unique = load_unique_abstract(train_data_file).pipe(self_label)
df_test_unique = load_unique_abstract(test_data_file).pipe(self_label)
df_val_unique = load_unique_abstract(val_data_file).pipe(self_label)

In [5]:
df_train_unique.head(n=3)

Unnamed: 0,abstract_id,abstract,self_label
0,10047461,Cyclin E-cdk2 activation is associated with ce...,False
1,10095983,New aspects in the management of obesity: oper...,False
2,10200320,Cyclopentenone prostaglandins suppress activat...,True


In [6]:
df_train_unique["self_label"].value_counts()

False    628
True     139
Name: self_label, dtype: int64

In [7]:
df_test_unique["self_label"].value_counts()

False    513
True     107
Name: self_label, dtype: int64

In [8]:
df_val_unique["self_label"].value_counts()

False    371
True      72
Name: self_label, dtype: int64

In [9]:
def upload_to_s3(df, s3_dest):
    b = io.StringIO()
    df.to_json(b)
    
    sagemaker.s3.S3Uploader.upload_string_as_file_body(b.getvalue(), s3_dest)

In [10]:
def create_sample(df):
    b = io.StringIO()
    df.sample(n=10).to_json("self-supervised-sample.json")
    


In [11]:
create_sample(df_train_unique)

In [12]:
upload_to_s3(df_train_unique, "s3://aegovan-data/self-supervised/train.json" )
upload_to_s3(df_val_unique, "s3://aegovan-data/self-supervised/val.json" )
upload_to_s3(df_test_unique, "s3://aegovan-data/self-supervised/test.json" )