### Create self supervised dataset

In [1]:
import pandas as pd
import numpy as np
import json
import sagemaker,  io
import re

In [2]:
train_data_file = "../chemprot_abstract_train.json"
test_data_file = "../chemprot_abstract_test.json"
val_data_file = "../chemprot_abstract_val.json"


s3_external_sources = ["s3://aegovan-data/pubmed-json/pubmed19n0908.json", 
                      "s3://aegovan-data/pubmed-json/pubmed19n0907.json",
                      "s3://aegovan-data/pubmed-json/pubmed19n0906.json"
                      ]

In [3]:
keywords1 = ["activation", "trigger", "interact", "inhibit", "regulat", "supress"]

keywords2 = ["gene", "protein", "chemical"]



    
def load_unique_abstract(datafile):
    with open(datafile) as f:
        data = json.load(f)
    
    abstracts = set()
    results = []
    
    for r in data:
        if r["abstract_id"] not in abstracts:
            abstracts.add(r["abstract_id"])
            results.append({
                "abstract_id" : r["abstract_id"],
                 "abstract" : r["abstract"]
            })
    return pd.DataFrame(results)


def self_label(df):
   

    df["self_label"] = df["abstract"].apply(lambda x: any([ k.lower() in x.lower() for k in keywords1]) 
                                        and sum([  k.lower() in x.lower() for k in keywords2])>=2
                                     
                                       )
    
    
    return df


def randomly_substitute_keywords(x):
    words = x.split(" ")
    
    key_i = np.random.choice([0,1])
    if key_i == 0:
        for k in keywords1:
            insensitive = re.compile(re.escape(k), re.IGNORECASE)
            w =  np.random.randint(0,len(words)-1)
            x = insensitive.sub(words[w], x)
    else:
        for k in keywords2:
            insensitive = re.compile(re.escape(k), re.IGNORECASE)
            w =  np.random.randint(0,len(words)-1)
            x = insensitive.sub(words[w], x)
    return x

def randomly_add_keywords(x):
    
    key_i1 = np.random.randint(0,len(keywords1)-1)
    key_i2 = np.random.randint(0,len(keywords2)-1)
    
    key_1 = keywords1[key_i1]
    keys_2 = keywords2[:key_i2] + keywords2[key_i2+1:]
        
    words = x.split(" ")
    l1 =  np.random.randint(0,len(words)-1)
    l2 =  np.random.randint(0,len(words)-1)
    l3 =  np.random.randint(0,len(words)-1)
    
    words.insert(l1, key_1)
    words.insert(l2, keys_2[0])
    words.insert(l3, keys_2[1])

    return " ".join(words)



In [4]:
df_train_unique = load_unique_abstract(train_data_file).pipe(self_label)
df_test_unique = load_unique_abstract(test_data_file).pipe(self_label)
df_val_unique = load_unique_abstract(val_data_file).pipe(self_label)

In [5]:
df_train_unique.head(n=3)

Unnamed: 0,abstract_id,abstract,self_label
0,10047461,Cyclin E-cdk2 activation is associated with ce...,False
1,10095983,New aspects in the management of obesity: oper...,False
2,10200320,Cyclopentenone prostaglandins suppress activat...,True


In [6]:
df_train_unique["self_label"].value_counts()

False    628
True     139
Name: self_label, dtype: int64

In [7]:
df_test_unique["self_label"].value_counts()

False    513
True     107
Name: self_label, dtype: int64

In [8]:
df_val_unique["self_label"].value_counts()

False    371
True      72
Name: self_label, dtype: int64

In [9]:
def upload_to_s3(df, s3_dest):
    b = io.StringIO()
    df.to_json(b)
    
    sagemaker.s3.S3Uploader.upload_string_as_file_body(b.getvalue(), s3_dest)
    
    
def s3_json_to_df(s3_src):
   
    
    json_str = sagemaker.s3.S3Downloader.read_file( s3_src)
    b = io.StringIO(json_str) 
    df = pd.read_json(b)
    return df



In [10]:
def create_sample(df):
    b = io.StringIO()
    df.sample(n=10).to_json("self-supervised-sample.json")
    


In [11]:
df_pubmed_extra = pd.concat([s3_json_to_df(s).rename(columns = {
    "article_abstract" : "abstract",
    "pubmed_id" : "abstract_id"
}) for s in s3_external_sources])
df_pubmed_extra = df_pubmed_extra.pipe(self_label)
df_pubmed_extra.head()

Unnamed: 0,abstract_id,article_title,abstract,pub_date,self_label
0,28552592,Concurrent anticipation of two object dimensio...,The anticipation of more than one object dimen...,"{'year': '2017', 'month': '08', 'day': None}",False
1,28552593,Tyrphostin A9 improves blastocyst development ...,Mitochondrial dynamics are associated with the...,"{'year': '2017', 'month': '07', 'day': None}",False
2,28552595,Both non-covalent and covalent interactions we...,Persimmon tannin (PT) has been shown to inhibi...,"{'year': '2017', 'month': 'Jul', 'day': None}",False
3,28552594,Plasma concentration and cardiovascular effect...,We investigated the plasma concentrations and ...,"{'year': '2017', 'month': 'May', 'day': None}",False
4,28552596,Gram-scale purification of aconitine and ident...,Aconitum karacolicum from northern Kyrgyzstan ...,"{'year': '2017', 'month': 'Jul', 'day': None}",False


In [12]:
df_pubmed_extra.self_label.value_counts()

False    72082
True      2578
Name: self_label, dtype: int64

In [13]:
create_sample(df_train_unique)

In [14]:
upload_to_s3(df_train_unique, "s3://aegovan-data/self-supervised/train.json" )
upload_to_s3(df_val_unique, "s3://aegovan-data/self-supervised/val.json" )
upload_to_s3(df_test_unique, "s3://aegovan-data/self-supervised/test.json" )

In [15]:
def synthetize_abstract(label, abstract):
    if label == True:
        return randomly_substitute_keywords(abstract)
    else:
        return randomly_add_keywords(abstract)

def create_n_synthetics(df,  synthetic_size ):
    synth_i = 0
    
    result = [df]
    
    while  synth_i < synthetic_size:
        sample_size = min(len(df),  synthetic_size-synth_i)
        
        synthetic_indices = np.random.choice(df.index, sample_size, replace=False)
        
   
        df_substitute = df[ df.index.isin (synthetic_indices)].copy(deep=True)

        df_substitute["abstract"] = df_substitute.apply(lambda r:synthetize_abstract(r["self_label"],r["abstract"]),  axis=1)
        df_substitute["abstract_id"] = df_substitute["abstract_id"].apply(lambda x: f"{x}_1")
        df_substitute["is_synthetic"] = True
        result.append(df_substitute)
        
        synth_i += sample_size

    
    df_result = pd.concat(result).reset_index()
    
    df_result = df_result.pipe(self_label)
    
    
    assert len(df_result) == len(df) + synthetic_size, f"Length do not match {len(df_result)} =={ len(df)+synthetic_size}"
    
    return df_result


def create_n_from_existing(df, df_source, synthetic_size ):
    
    result = [df]
 
    df_extra = df_source.sample(n=synthetic_size)[["abstract_id", "abstract"]].copy(deep=True)

    df_extra["is_extra"] = True
    
    result.append(df_extra)
        
    
    df_result = pd.concat(result).reset_index()
    
    df_result = df_result.pipe(self_label)
    
    
    assert len(df_result) == len(df) + synthetic_size, f"Length do not match {len(df_result)} =={ len(df)+synthetic_size}"
    
    return df_result


def create_synthetic_samples(df, duplicate_ratio = .1):
    
    df_orig_pos = df.query("self_label == True")
    df_orig_neg = df.query("self_label == False")

    synthetic_size= int(len(df_orig_neg) * duplicate_ratio)
    df_pos = create_n_synthetics(df_orig_pos, synthetic_size)
    
    synthetic_size= int(len(df_orig_pos) * duplicate_ratio)
    df_neg = create_n_synthetics(df_orig_neg,  synthetic_size)
    df = pd.concat([df_pos, df_neg]).reset_index()
    
    return df
    
    
def create_existing_samples(df, df_source, ratio=0.1):
    
    df_orig_pos = df.query("self_label == True")
    df_orig_neg = df.query("self_label == False")

    synthetic_size= int(len(df_orig_pos) * ratio)
    df_pos = create_n_from_existing(df_orig_pos, df_source.query("self_label == True"), synthetic_size)
    
    synthetic_size= int(len(df_orig_neg) * ratio)
    df_neg = create_n_from_existing(df_orig_neg, df_source.query("self_label == False"), synthetic_size)
    df = pd.concat([df_pos, df_neg]).reset_index()
    
    return df    

In [16]:
df_train_unique.query("self_label == True").shape

(139, 3)

In [17]:
def create_fake():
    ratios = [.5, 1,2,3,4,5,6,7, 8, 9, 10 ]
    for r in ratios:
        df_train_fake = df_train_unique.pipe(create_synthetic_samples, r).drop_duplicates(subset=['abstract'])
        df_test_fake = df_test_unique.pipe(create_synthetic_samples, r).drop_duplicates(subset=['abstract'])
        df_val_fake = df_val_unique.pipe(create_synthetic_samples, r).drop_duplicates(subset=['abstract'])

        true_label = df_train_fake.query("self_label == True").shape[0]
        unique = df_train_fake["abstract_id"].apply(lambda x: x.split("_")[0]).nunique()
        total = df_train_fake.shape[0]
        

        suffix=f"{total}_{unique}_{true_label}"
        
        print(suffix, true_label/total)
        
        upload_to_s3(df_train_fake, f"s3://aegovan-data/self-supervised-fake/{suffix}/train.json" )
        upload_to_s3(df_val_fake, f"s3://aegovan-data/self-supervised-fake/{suffix}/val.json" )
        upload_to_s3(df_test_fake, f"s3://aegovan-data/self-supervised-fake/{suffix}/test.json" )
        



def create_real_random():
    ratios = [.5, 1,2,3,4,5,6,7, 8, 9, 10 ]
    for r in ratios:
        df_train_fake = df_train_unique.pipe(create_existing_samples, df_pubmed_extra, r).drop_duplicates(subset=['abstract'])
        df_test_fake = df_test_unique.pipe(create_existing_samples, df_pubmed_extra, r).drop_duplicates(subset=['abstract'])
        df_val_fake = df_val_unique.pipe(create_existing_samples, df_pubmed_extra, r).drop_duplicates(subset=['abstract'])

        true_label = df_train_fake.query("self_label == True").shape[0]
        unique = df_train_fake["abstract_id"].nunique()
        total = df_train_fake.shape[0]
        

        suffix=f"{total}_{unique}_{true_label}"
        
        print(suffix, true_label/total)
        
        upload_to_s3(df_train_fake, f"s3://aegovan-data/self-supervised-real/{suffix}/train.json" )
        upload_to_s3(df_val_fake, f"s3://aegovan-data/self-supervised-real/{suffix}/val.json" )
        upload_to_s3(df_test_fake, f"s3://aegovan-data/self-supervised-real/{suffix}/test.json" )
 

In [18]:
create_real_random()

1150_1150_208 0.1808695652173913
1533_1533_278 0.18134377038486627
2301_2301_417 0.18122555410691005
3068_3068_556 0.18122555410691005
3832_3832_695 0.18136743215031315
4602_4602_834 0.18122555410691005
5368_5368_973 0.18125931445603577
6133_6133_1112 0.18131420185879668
6901_6901_1250 0.18113316910592667
7665_7665_1390 0.18134377038486627
8437_8437_1529 0.18122555410691005


In [19]:
# create_fake()

```
1149_767_213 0.185378590078329
1531_767_286 0.1868060091443501
2293_767_445 0.19406890536415178
3048_767_573 0.18799212598425197
3805_767_723 0.1900131406044678
4558_767_882 0.193505923650724
5271_767_1026 0.19464997154240182
6021_767_1164 0.1933233682112606
6754_767_1302 0.19277465205803967
7482_767_1465 0.19580326116011762
8217_767_1593 0.19386637458926614

```