In [3]:
import csv
import json
import os
import argparse
import logging
import random
import numpy as np
import re
from sklearn.model_selection import KFold

In [4]:
random.seed(5)

In [5]:
medsts_path = '/home/dc925/project/data/seq_pair/MEDSTS'

In [203]:
def preprocess_text(sent):
    sent = sent.lower()
    sent = re.sub(r'[\d]*', '', sent)
    sent = re.sub(r'[^\w\s]', ' ', sent)


    return sent

In [205]:
k = 6


In [210]:
original = os.path.join(medsts_path, 'data.jsonl')
with open(original, 'r') as f:
    lines = []
    for line in f:
        example = json.loads(line)
        example['sentence1'] = preprocess_text(example['sentence1'])
        example['sentence2'] = preprocess_text(example['sentence2'])
        lines.append(example)
    size = len(lines)
    
    
final_fold_path = os.path.join(medsts_path, 'k_fold_{}'.format(k))
os.makedirs(final_fold_path, exist_ok=True)
train_out = os.path.join(final_fold_path, 'train.jsonl')
dev_out = os.path.join(final_fold_path, 'dev.jsonl')
random.shuffle(lines)
train = lines[:1496]
dev = lines[1496:]


with open(train_out, 'w') as f:
    for line in train:
        json.dump(line, fp=f)
        f.write('\n')
with open(dev_out, 'w') as f:
    for line in dev:
        json.dump(line, fp=f)
        f.write('\n')

In [29]:
#so that's k_fold_5, finished running data pipeline

In [6]:
import pandas as pd

In [7]:
logits_dir = '/home/dc925/project/medsts/logits'

In [8]:
folds = [0, 1, 2, 3, 4]

In [9]:
logits = [pd.read_csv(os.path.join(logits_dir, 'fold_{}_logits.csv'.format(k)), index_col=0) for k in folds]

In [10]:
logits.__len__()

5

In [11]:
tables = []

In [20]:
k = 4

In [21]:
best_val = 0
best_n = 0
for n in range(3, 288 * 3 // 5, 2):
    ensemble_table = logits[k]
    top_cols = ensemble_table.corr().sort_values(by='label', axis=1, ascending=False).columns[1:n+1]
    ensemble_table['ensemble'] = ensemble_table[top_cols].mean(axis=1)
    val = ensemble_table.corr().iloc[0,-1:].item()
    if val > best_val:
        best_val = val
        best_n = n
        print('{} \t {}'.format(best_n, best_val))
print(best_val)
print(best_n)
top_cols = ensemble_table.corr().sort_values(by='label', axis=1, ascending=False).columns[1:best_n+1]
ensemble_table['ensemble'] = ensemble_table[top_cols].mean(axis=1)
tables.append(ensemble_table)

3 	 0.9723686026516355
5 	 0.9733891813705723
7 	 0.974915495046161
9 	 0.9752763831899556
11 	 0.9765627270055668
13 	 0.9766621111788025
15 	 0.977025612242817
17 	 0.9770821602887577
19 	 0.9772765435990991
21 	 0.9773606067189637
0.9773606067189637
21


In [63]:
#now, we want to extract pid:ensemble prediction for each fold and aggregate them across folds


In [22]:
soft_labels = {}

In [23]:
for k in folds:
    for pid, pred in tables[k]['ensemble'].items():
        pred = round(pred, 4)
        if pid in soft_labels:
            soft_labels[pid].append(pred)
        else:
            soft_labels[pid] = [pred]


In [24]:
soft_labels_ave = {k:round(np.mean(v), 4) for k, v in soft_labels.items()}

In [154]:
#read in train.jsonl from k_fold_5 and write in averaged soft label for each example

In [25]:
soft_labels_ave.__len__()

1641

In [28]:
train_path = os.path.join(medsts_path, 'k_fold_9/train.jsonl')
with open(train_path, 'r') as f:
    train = []
    for line in f:
        example = json.loads(line)
        train.append(example)
    
    

In [27]:
soft_labels_ave[1275] = 3.85

In [29]:
len(train)

1642

In [30]:
len(soft_labels_ave)

1642

In [31]:
t = 0
for example in train:
    for pid, soft_label in soft_labels_ave.items():
        if example['pid'] == pid:
            t +=1
            example['label_soft'] = soft_label
    if 'label_soft' not in example:
        example['label_soft'] = example['label']

In [32]:
t

1642

In [180]:
# for ex in train:
#     print('{} \t {} \t {}'.format(ex['pid'], ex['label'], ex['label_soft']))

In [2]:
soft_labels_ave[871]

NameError: name 'soft_labels_ave' is not defined

In [33]:
with open(train_path, 'w') as f:
    for line in train:
        json.dump(line, fp=f)
        f.write('\n')

In [34]:
def extract_strings(mappings, sentence):
    strings = []
    for phrase in mappings:
        if phrase['mapping']:
            for m in phrase['mapping']:
                text = m['preferred'].lower().split()
                strings += text
                
                if 'Clinical Drug' in m['semtypes']:
                    strings += ['clinical drug']
                if 'Pharmacologic Substance' in m['semtypes']:
                    strings += ['pharmacologic substance']
    out = " ".join(strings)
    out = re.sub(r'[^\w\s]', ' ', out)
    out = out.split()
    out = [t for t in out if t not in banned_words and t not in sentence]
    out = list(set(out))
    out = " ".join(out)
    return out

def augment_sentences(mm_output, data):

    assert len(mm_output) == len(data)*2
    i = 0
    j = 0
    augment_strings = []
    while i < len(mm_output)-1:
        aug = extract_strings(mm_output[i], data[j]['sentence1'])
        data[j]['augment1'] = aug
        i += 1
        aug = extract_strings(mm_output[i], data[j]['sentence2'])
        data[j]['augment2'] = aug
        i += 1
        j += 1


In [35]:
data_dir = '/home/dc925/project/data/seq_pair/MEDSTS/k_fold_9'

In [39]:
import pickle

In [48]:
test_mapping_path = 'filtered_test.p'
train_mapping_path = 'filtered_train.p'

In [49]:
with open(os.path.join(data_dir, test_mapping_path), 'rb') as f:
    test_mapping = pickle.load(f)
with open(os.path.join(data_dir, train_mapping_path), 'rb') as f:
    train_mapping = pickle.load(f)

In [50]:
test_path = os.path.join(data_dir, 'test.jsonl')
with open(test_path, 'r') as f:
    test = []
    for line in f:
        example = json.loads(line)
        test.append(example)

In [51]:
banned_words = ['contextual', 'qualifier', 'value', 'action', '-', 'patients', 'of', 'person', 'feels', 'time', 'to', \
               'contents', 'escalation', 'cavity', 'region', 'medical', 'discussion', 'procedure', 'unit', 'dose', 'appearance', \
               'feelings', 'has', 'does', 'finding', 'function', 'in', 'qualitative', 'changing', 'publication', 'educational', \
                'by', 'for', 'with', 'from', 'continuous', 'transducers', 'process', 'needs', 'individual', 'reporting', 'chief', \
               'relationships', '6', '10', 'syncytial', 'human', 'masks', 'muscle', 'training', 'virus',]

augment_sentences(test_mapping, test)
augment_sentences(train_mapping, train)

In [44]:
len(train_mapping)

3284

In [45]:
len(train)

1642

In [52]:
len(test_mapping)

824

In [47]:
len(test)

412

In [53]:
with open(test_path, 'w') as f:
    for line in test:
        json.dump(line, fp=f)
        f.write('\n')
with open(train_path, 'w') as f:
    for line in train:
        json.dump(line, fp=f)
        f.write('\n')