In [1]:
!pip install transformers

import json
from collections import Counter
from nltk.corpus import stopwords
from itertools import filterfalse as ifilterfalse
from sklearn.model_selection import train_test_split
import re
from time import time
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam


Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.9 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 46.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [2]:
label2id = {
    'NEITHER': 0,
    'EVIDENCE': 1,
    'CLAIM': 2,
    'NONE': 0
}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

In [4]:
data1 = load_corpus('dataset_aueb_argument_v3.json' , label_mapping=label2id)
print(f'Dataset 1 length: {len(data1)} abstracts')

data2 = load_corpus('dataset.json' , label_mapping=label2id)
print(f'Dataset 2 length: {len(data2)} abstracts')

data = data1.append(data2)
print(f'Dataset length: {len(data)} abstracts')
data.head(3)

Dataset 1 length: 1017 abstracts
Dataset 2 length: 1669 abstracts
Dataset length: 2686 abstracts


Unnamed: 0,document,sentences,labels
0,doi: 10.1001/jamaneurol.2017.2814,[Concordance Between Different Amyloid Immunoa...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
1,doi: 10.1001/jamaneurol.2017.4913,[Association of Changes in Plasma Neurofilamen...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]"
2,doi: 10.1002/2015gl067056,[Dynamically triggered slip leading to sustain...,"[0, 0, 0, 1, 1, 2]"


* Create a datafrme with the 'doc_id' & 'sentences' and a dataframe with the 'doc_id' & 'labels' for each unique sentence in our dataset.

In [5]:
#@title Split to sentences
sentences = data['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})
sentences.sentence = sentences.sentence.astype("string")
sentences.sentence = sentences.sentence.str.strip()

#@title and the corresponding labels
labels = pd.DataFrame(data['labels'].explode()).reset_index(drop = True).rename(
    columns={'labels': 'label'})

* Create a dataframe with the splitted sentences and labels

In [6]:
merged_data = pd.concat([sentences,labels['label']], axis = 1)
print(merged_data.shape)
merged_data.sample(5)

(32004, 3)


Unnamed: 0,doc_id,sentence,label
18710,604,This study combines panel data techniques with...,0
7100,737,"T1 values, T2 values, and ECV were assessed by...",0
27383,1327,The recommended regimens were 17 Gy in two fra...,0
29315,1471,Adverse events (AEs) at both dose levels were ...,1
12783,166,Adult VAW is associated with self-reported his...,2


* We observe that we have some sentences that are one word, so let's remove them.

In [7]:
for i in range(len(merged_data)):
    if (len(merged_data['sentence'][i].split()) < 2):
        merged_data = merged_data.drop(i)
        
merged_data.reset_index(inplace = True, drop = True)
merged_data.shape

(31093, 3)

* remove duplicates.

In [8]:
print('Shape before removing duplicates: ', merged_data.shape)
a = merged_data.shape[0]
merged_data.drop_duplicates(subset=['sentence'], inplace=True)
print('Shape after removing duplicates: ', merged_data.shape)
print('Rows Removed: ', a - merged_data.shape[0])

merged_data.reset_index(drop=True, inplace = True)

Shape before removing duplicates:  (31093, 3)
Shape after removing duplicates:  (30862, 3)
Rows Removed:  231


In [9]:
X = merged_data[['doc_id','sentence']]
y = merged_data['label']

In [10]:
import re

def clean_text(text):
    """
    Pre process and convert texts to a list of words
    :param text:
    :return:
    """

    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text

In [11]:
X = pd.DataFrame(X)
X['sentence']=X['sentence'].apply(clean_text)

In [12]:
text = X.groupby('doc_id').agg(sentences = ('sentence',','.join)).reset_index()
text.head(5)

Unnamed: 0,doc_id,sentences
0,0,concordance between different amyloid immunoas...
1,1,association of changes in plasma neurofilament...
2,2,dynamically triggered slip leading to sustaine...
3,3,impacts of parameterized orographic drag on th...
4,4,climate model biases in jet streams blocking a...


In [20]:
!pip install sentence_transformers

from sentence_transformers import SentenceTransformer, util



* Load the pretrained transformer model 'allenai-specter'

In [14]:
model = SentenceTransformer('allenai-specter')

abstracts = list(text['sentences'])

embeddings = model.encode(abstracts, convert_to_tensor=True)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/622 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/462k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/331 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
#We define a function, given title & abstract, searches our corpus for relevant (similar) papers
def search_papers(abstract):
    query_embedding = model.encode(abstract, convert_to_tensor=True)

    search_hits = util.semantic_search(query_embedding, embeddings)
    search_hits = search_hits[0]  #Get the hits for the first query
    search_hits = search_hits[1:] #The first abstract is always the input so we delete it
 
  #print("Paper:", title)
    print("Most similar papers:")
    print("{}\t{}\t{}".format('Score', 'Doc_id', 'Abstract'))
    for hit in search_hits:
        related_paper = text['sentences'][hit['corpus_id']]
        print("{:.2f}\t{}\t{}".format(hit['score'], hit['corpus_id'], related_paper))

* Let's find the most similar absstracts to our 2nd abstract

In [21]:
abstract=abstracts[1]
search_papers(abstract)

Most similar papers:
Score	Doc_id	Abstract
0.86	211	comparison of variables associated with cerebrospinal fluid neurofilament total tau and neurogranin,abstract introduction three cerebrospinal fluid csf markers of neurodegeneration n neurofilament light nfl total tau t tau and neurogranin ng have been proposed under the at n scheme of the national institute on aging alzheimer association research framework ,methods we examined in a community based population n 777 aged 50 95 1 what variables were associated with each of the csf n markers and 2 whether the variables associated with each marker differed by increased brain amyloid ,csf t tau was measured with an automated electrochemiluminescence elecsys immunoassay; nfl and ng were measured with in house enzyme linked immunosorbent assays ,results multiple variables were differentially associated with csf nfl and t tau levels but not ng ,most associations were attenuated after adjustment for age and sex ,t tau had the strongest associat