# Specter Embeddings

Delvin So

This notebook

1) performs minimal pre-processing to get the datasets into a SPECTER inference ready format

2) uses the provided bash script in the SPECTER repo to perform inference, embedding the abstracts

3) cleans the `jsonl` file from SPECTER into a pickle for downstream modeling


NOTE: This is likely much easier to implement now that there is Hugging Face (HF) support for SPECTER!




Follow instructions for setting up specter per: https://github.com/allenai/specter

Then, download datasets_complete and unzip into `specter`

In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os
import json
import csv

In [2]:

# Anchor all relative paths used below to the notebook's working directory.
root_dir = os.curdir
os.chdir(root_dir)

The following prepares the datasets for input into SPECTER: for each dataset it creates a list of paper ids (`*.ids`) and a JSON file containing the paper id, title and abstract of each paper.

In [3]:
# Glob pattern for the cleaned input datasets (tab-separated files).
data_dir = '../../cleaned_data/*tsv'
# Directory that receives the SPECTER-ready `.json`, `.ids` and label files.
out_dir = '../specter/datasets_json'

# exist_ok avoids the separate "check, then create" step.
os.makedirs(out_dir, exist_ok=True)

In [4]:
# glob() returns paths in arbitrary order; sort so datasets are always
# processed (and logged) in the same order across runs.
f = sorted(glob(data_dir))

In [5]:
# Preview the dataset name derived from each input file: the file stem up to
# the first underscore, with any spaces turned into underscores.
for tsv_path in f:
    stem, _ = os.path.splitext(os.path.basename(tsv_path))
    dataset_name = stem.split('_')[0].replace(' ', '_')

    print(dataset_name)


sample


In [5]:


# Expected number of rows per dataset, keyed by the dataset name derived from
# the input file name. Used below to catch rows that were dropped or duplicated.
length_d = {
    'ADIPP': 47491,
    'BBD': 43569,
    'Hydronephrosis': 4339,
    'NCDs': 17533,
    'Rehab': 12042,
    'Scaling': 10679,
    'VitaminD': 1397,
    'WASH': 5986,
    'mucosal': 3987,
    'pibd' : 7560,
    'ocd' : 347,
    'sample': 200
}


for fn in f:
    # ----- construct new filename for json and ids ----
    print(fn)
    basen = os.path.basename(fn)
    txt = os.path.splitext(basen)[0]
    txt = txt.split('_')[0].replace(' ', '_')

    out_json = os.path.join(out_dir, '{}.json'.format(txt))
    out_ids = os.path.join(out_dir, '{}.ids'.format(txt))
    out_labels_ids = os.path.join(out_dir, '{}_labels_ids.tsv'.format(txt))
    print(out_json)

    # ------ read in each file and ultimately dump to json per specter formatting -----
    if fn.endswith('tsv'):
        dat = pd.read_csv(fn, encoding = 'ISO-8859-1', sep = '\t')
    elif fn.endswith('xlsx'):
        dat = pd.read_excel(fn)
    else:
        # Without this branch an unsupported file would either raise a NameError
        # or silently reuse the previous iteration's dataframe.
        raise ValueError('Unsupported input file type: {}'.format(fn))

    # ----- processing identically to our datasets -----
    # TODO: if we have time before manuscript, fix this

    print('\tBefore ' + str(dat.shape[0]))

    # Removes the filler ' ' from the first notebook. Could go immediately from ' ' to None but useful as a sanity check.
    dat['Title'] = dat['Title'].replace(r'^\s*$', np.nan, regex=True)
    print('NULl TITlES:' + str(dat['Title'].isnull().sum()))
    dat['Title'] = dat['Title'].replace(np.nan, 'None')

    print('\tAfter ' + str(dat.shape[0]))

    # ----- processing for specter -----

    print('Dataset: {} Size: {}'.format(txt, dat.shape[0]))
    assert(np.array(dat.index).shape[0] == length_d[txt]), 'Dataset sizes do not line up!!'

    dat.columns = list(map(str.lower, dat.columns)) # lowercase for compatability w/ specter
    dat = dat.rename(columns = {'unq_id':'paper_id'}) # paper_id is the identifier for specter inference

    # ----- just json things -----
    dat.index = dat.paper_id # copy unique id over to the index column so it matches up with the json row #, for inference
    new_json = dat[['title', 'abstract', 'paper_id']].to_json(orient = 'index')
    parsed_json = json.loads(new_json)

    # NOTE: the handle must not be called `f` -- that name holds the list of
    # input files, and rebinding it breaks any re-run of this cell.
    with open(out_json, 'w', encoding = 'utf-8') as json_fh:
        json.dump(parsed_json, json_fh, ensure_ascii=False, indent=4)

    # ---- create the '*.ids' file as needed for inference ----

    dat.paper_id.to_csv(out_ids, header=False, index = False)

    # ---- create the labels and ids to map the embeddings back to -----
    # quoting accounts for delimiters such as \r found in 'mucosal'
    dat.to_csv(out_labels_ids, index = False, sep = '\t', quoting=csv.QUOTE_NONNUMERIC)

    print('\n')


print('Done!')

../../cleaned_data/sample_oct.tsv
../specter/datasets_json/sample.json
	Before 200
NULl TITlES:0
	After 200
Dataset: sample Size: 200


Done!


In [6]:
# sanity check to see "None" exists in the title and nothing else was replaced by accident
# Look in `out_dir` (where the label files were written) rather than relying on
# the current working directory happening to contain `datasets_json/`.
for labels_path in sorted(glob(os.path.join(out_dir, '*labels_ids*'))):
    # NOTE(review): recent pandas versions appear to include the literal "None"
    # among read_csv's default NA strings, which would turn the placeholder
    # titles back into NaN and hide them from this check -- confirm for the
    # pinned pandas version. keep_default_na=False reads them back verbatim.
    test = pd.read_csv(labels_path, sep = '\t', keep_default_na=False)
    print(labels_path + '\n\t')
    # astype(str) keeps the .str accessor usable even if a title parses as a number.
    print(test[test.title.astype(str).str.contains(r'\bNone\b', regex=True, case=True)])

Quick and dirty bash script that loops through each dataset and embeds it, saving the embeddings to `out_dir`.

In [None]:
%%bash
# Embed every dataset with SPECTER: one `scripts/embed.py` run per `<name>.ids`
# file found in ${data_dir}/datasets_json, written to ${out_dir}/<name>.jsonl.
#
# make sure you conda activate specter, are in the specter dir, and
# the ids and output are absolute as specter doesn't like relative paths
# Both locations can be overridden from the environment; the defaults are the
# original hard-coded paths.
data_dir=${SPECTER_DATA_DIR:-/home/delvinso/sra_sample/data_preprocessing/specter}
out_dir=${SPECTER_OUT_DIR:-/home/delvinso/sra_sample/data_preprocessing/specter/embed_json}
mkdir -p "${out_dir}"

# Iterate over the glob directly instead of parsing `ls` output, and quote every
# expansion so paths containing whitespace survive.
for ids_path in "${data_dir}"/datasets_json/*ids
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "${ids_path}" ] || continue

    f=$(basename "${ids_path}" | sed 's/\.ids//g')

    # Datasets excluded from this run. The filter is applied to the dataset name
    # only, so a directory name can never trigger it. To run ONLY a subset
    # instead (e.g. ADIPP|Scaling|Rehab|mucosal|ocd), list those patterns here
    # and swap `continue` for a no-op with a `*) continue ;;` fallback.
    case "${f}" in
        *ADIPP*|*Scaling*|*Rehab*|*mucosal*) continue ;;
    esac

    echo "${f}"

    python3 scripts/embed.py \
        --ids "${data_dir}/datasets_json/${f}.ids" --metadata "${data_dir}/datasets_json/${f}.json" \
        --model ./model.tar.gz \
        --output-file "${out_dir}/${f}.jsonl" \
        --vocab-dir data/vocab/ \
        --batch-size 58 \
        --cuda-device 0

done
echo "Done!"

# Specter Embedding Cleaning

The following code creates, for each dataset, a dictionary of abstract IDs, embeddings, and targets (abstract inclusion) for input into our models, and saves it as a pickle.

In [8]:
import pickle
from glob import glob
import os
import pandas as pd
import numpy as np
import json

In [9]:
# Collect every SPECTER output file (one `.jsonl` per embedded dataset).
embed_pattern = os.path.join('..', 'specter', 'embed_json', '*jsonl')
embed_jsons = glob(embed_pattern)

print(embed_jsons)

['../specter/embed_json/sample.jsonl']


In [10]:
# https://medium.com/@galea/how-to-love-jsonl-using-json-line-format-in-your-workflow-b6884f65175b
def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.

    Parameters
    ----------
    input_path : path of a UTF-8 encoded file holding one JSON document per line.

    Returns
    -------
    list
        The decoded objects, in file order. Blank lines are skipped.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as fh:
        for line in fh:
            # str.rstrip takes a *set of characters*, not a pattern, so the old
            # rstrip('\n|\r') also stripped a literal '|'. Plain strip() removes
            # surrounding whitespace only.
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [12]:

# Directory that receives one pickle per dataset.
out_pickle = os.path.join('..', '..', 'pickles', 'specter')

# dictionary for comparing dataset sizes
length_d = {
    'ADIPP': 47491,
    'BBD': 43569,
    'Hydronephrosis': 4339,
    'NCDs': 17533,
    'Rehab': 12042,
    'Scaling': 10679,
    'VitaminD': 1397,
    'WASH': 5986,
    'mucosal': 3987,
    'pibd' : 7560,
    'ocd': 347,
    'sample': 200
}

os.makedirs(out_pickle, exist_ok=True)

for embed_path in embed_jsons:

    # dataset name = file name up to the first '.'
    bn = os.path.basename(embed_path).split('.')[0]
    out_fn = os.path.join(out_pickle, bn + '.p')
    print(bn)

    # ---- load in the specter embeddings (json) and convert to dataframe -----
    jsonl = load_jsonl(embed_path)
    jsonl_df = pd.DataFrame(jsonl)

    # ----- read in the data used to created the specter embeddings ------
    og_dat = pd.read_csv(os.path.join('..', 'specter', 'datasets_json', bn + '_labels_ids.tsv'), sep = '\t')

    print(og_dat.columns)

    # DataFrame.rename silently ignores missing labels, so test for the column
    # explicitly (the old `except e:` branch was unreachable and referenced an
    # undefined name).
    if 'covidence..' in og_dat.columns:
        og_dat = og_dat.rename(columns = {'covidence..': 'covidence #'})
    else:
        print('Covidence.. not found in columns, skipped')

    # ----- get labels and join to embeddings ------
    # 'fulltext_inclusion' is optional: only some datasets carry it.
    has_fulltext = 'fulltext_inclusion' in og_dat.columns
    label_cols = ['paper_id', 'inclusion']
    if has_fulltext:
        label_cols.append('fulltext_inclusion')
    label_cols.append('covidence #')

    # NOTE(review): the join keys on paper_id from two sources (SPECTER's jsonl
    # and the re-read tsv); it presumably relies on both having the same dtype --
    # confirm for datasets whose ids look numeric.
    jsonl_df2 = jsonl_df.set_index('paper_id')\
        .join(og_dat[label_cols].set_index('paper_id'), how = 'left')

    jsonl_df2 = jsonl_df2.reset_index()  # turns index, or paper id back into a column

    # ----- validate -----
    print('Dataset: {} Size: {}'.format(bn, np.array(jsonl_df2.paper_id).shape[0]))
    assert(np.array(jsonl_df2.paper_id).shape[0] == length_d[bn]), 'Dataset size does not match known ones!!'

    # ----- create the dictionary of embeddings -----
    # np.concatenate(np.vstack(col)) flattens a column of scalars into a 1-D array;
    # kept as-is so the pickled arrays keep their existing dtypes.
    d = {}
    d['ids'] = np.array(jsonl_df2.paper_id)
    d['embeddings'] = np.vstack(jsonl_df2.embedding)
    d['labels'] = np.concatenate(np.vstack(jsonl_df2.inclusion))
    if has_fulltext:
        d['final_labels'] = np.concatenate(np.vstack(jsonl_df2.fulltext_inclusion))
    else:
        d['final_labels'] = None
    d['title'] = np.concatenate(np.vstack(jsonl_df2.title))
    d['covidence #'] = np.concatenate(np.vstack(jsonl_df2['covidence #']))

    # Context manager so the pickle is flushed and closed before the next dataset.
    with open(out_fn, 'wb') as pickle_fh:
        pickle.dump(d, pickle_fh)

print('Done!')




sample
Loaded 200 records from ../specter/embed_json/sample.jsonl
Index(['paper_id', 'all_text_clean', 'metadata_clean', 'inclusion',
       'fulltext_inclusion', 'covidence..', 'title', 'abstract', 'all_text',
       'metadata'],
      dtype='object')
Dataset: sample Size: 200
Done!
