# Import Packages

In [None]:
from google.colab import drive 
from google.colab import files
import pandas as pd
import numpy as np
import pickle
import re
import unicodedata
import string

In [None]:
import nltk
# 'punkt' supplies the tokenizer data used by word_tokenize and 'stopwords' supplies the
# English stop-word list loaded later in the notebook. The recorded output shows both
# calls report "already up-to-date" when the data is cached.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud

In [None]:
!pip install scispacy
!pip install spacy_langdetect
import spacy
import scispacy
from spacy_langdetect import LanguageDetector
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz
import en_core_sci_lg

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz
Building wheels for collected packages: en-core-sci-lg
  Building wheel for en-core-sci-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-sci-lg: filename=en_core_sci_lg-0.3.0-cp36-none-any.whl size=502173407 sha256=d58ece2524b1d6fb2557754c058cc473a9554fb15f1b4049f52a58f13c182d75
  Stored in directory: /root/.cache/pip/wheels/86/e8/9b/7eca1465cb6997429390e40ac9d4e1890863b0395deaddf2f0
Successfully built en-core-sci-lg


In [None]:
# System package installed ahead of the faiss wheels below.
!apt install libomp-dev
# NOTE(review): two separate distributions are requested here (`faiss` and `faiss-gpu`);
# the recorded output lists them at different versions (1.5.3 and 1.6.5). Presumably both
# ship the `faiss` module imported on the next line - confirm which one is actually used.
!python -m pip install --upgrade faiss faiss-gpu
import faiss
!pip install sentence_transformers
import torch
from sentence_transformers import SentenceTransformer

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libomp-dev is already the newest version (5.0.1-1).
0 upgraded, 0 newly installed, 0 to remove and 13 not upgraded.
Requirement already up-to-date: faiss in /usr/local/lib/python3.6/dist-packages (1.5.3)
Requirement already up-to-date: faiss-gpu in /usr/local/lib/python3.6/dist-packages (1.6.5)
fatal: destination path 'vector_engine' already exists and is not an empty directory.


In [None]:
!git clone https://github.com/kstathou/vector_engine

Cloning into 'vector_engine'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 58 (delta 22), reused 51 (delta 15), pack-reused 0[K
Unpacking objects: 100% (58/58), done.


In [None]:
cd vector_engine

/content/vector_engine/vector_engine/vector_engine


In [None]:
# Installs the packages listed in requirements.txt. The path is relative, so this relies
# on the working directory having been switched to the checkout by the preceding cell.
!pip install -r requirements.txt

Obtaining file:///content/vector_engine/vector_engine/vector_engine (from -r requirements.txt (line 9))
Installing collected packages: vector-engine
  Found existing installation: vector-engine 0.1.0
    Can't uninstall 'vector-engine'. No files were found to uninstall.
  Running setup.py develop for vector-engine
Successfully installed vector-engine


In [None]:
from vector_engine.utils import vector_search, id2details



In [None]:
%load_ext google.colab.data_table

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table


# Mount Google Drive To Access Uploaded Dataset

In [None]:
# Mount Google Drive under /content/gdrive; the dataset CSV is read from that tree below.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load Dataset

In [133]:
# Location of the article subset on the mounted Drive.
articles_csv_path = '/content/gdrive/MyDrive/cord19-subset-500.csv'
my_articles = pd.read_csv(articles_csv_path)

OSError: ignored

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
my_articles.shape

In [None]:
my_articles.head(3)  # Preview the first three rows to check the columns that were loaded

# Detect Article Language

In [None]:
# Load the scispaCy model without its tagger and NER components.
disabled_components = ["tagger", "ner"]
nlp = en_core_sci_lg.load(disable=disabled_components)
# Raise the per-document character limit.
nlp.max_length = 2_000_000
# Append the language detector as the final pipeline component.
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [None]:
# Detect each article's language from at most the first 500 characters of its body.
# str() is applied BEFORE slicing: the previous `str(x[:500])` sliced the raw cell first,
# so a non-string value (e.g. NaN for a missing body) raised
# "TypeError: 'float' object is not subscriptable" before str() could help.
my_articles['text_language'] = my_articles.body_text.apply(
    lambda x: nlp(str(x)[:500])._.language['language']
)

In [None]:
# Number of articles per detected language code.
articles_by_lang = my_articles['text_language'].value_counts()

In [None]:
# Display the per-language article counts computed above.
articles_by_lang

In [None]:
# Shape check after the language step, which added the 'text_language' column.
my_articles.shape

# Example Heuristics - Regular Expression To Identify Papers Referencing Clinical Trials

In [None]:
# Clinical-trial identifier in NCT format: the letters "NCT" followed by exactly eight digits.
reg_expr_id_nct = r'NCT[0-9]{8}'

In [None]:
# Search the body text of every article with the clinical-trial id pattern; each row of
# the result holds the list of matches found in that article.
found_trials = my_articles.body_text.str.findall(reg_expr_id_nct)

In [None]:
# Display the per-article match lists.
found_trials

In [None]:
# Count the articles that mention at least one trial id. The vectorised .str.len() yields
# NaN for a row whose body was missing (findall also yields NaN there), and NaN > 0 is
# False, so such rows are simply not counted; the previous len(trial) call raised a
# TypeError on them. int() keeps the displayed value a plain Python integer.
number_trials_found = int(found_trials.str.len().gt(0).sum())

In [None]:
# Display how many articles reference at least one clinical trial.
number_trials_found

In [None]:
# Wrap the match lists in a one-column frame; the column is named 'trials'.
trials_by_paper = found_trials.to_frame('trials')

In [None]:
# Index the matches by paper id and keep only the papers with at least one match.
# The frame is rebuilt from `found_trials` in one chain instead of being mutated in place:
# the previous inplace set_index + filter failed with a length mismatch when the cell was
# run a second time, because the already-filtered frame no longer had one row per article.
trials_by_paper = (
    found_trials.to_frame('trials')
    .set_index(my_articles.paper_id)
    .loc[lambda frame: frame.trials.str.len() != 0]
)

In [None]:
# Preview the first papers that reference a clinical trial.
trials_by_paper.head()

# Cleansing - Define stop words and remove from article text

In [77]:
# Remove words that occur with high frequency but offer little substantive information about meaning of sentences
# The list is used twice: appended to the NLTK English list for text cleaning, and passed
# as `stop_words` to the CountVectorizer of the topic model.
# The `usepackage...` / `documentclassminimal` / `begindocument` entries are presumably
# LaTeX preamble residue left in the extracted article text - TODO confirm.
# A few entries ('level', 'case', 'result', 'day', 'change') are listed more than once.
custom_stop_words = ['et', 'al', 'nttttusepackageamsfonts', 'nttttusepackageamssymb', 'nttttusepackageamsbsynttttusepackagemathrsfsnttttusepackageupgreeknttttsetlengthoddsidemargin69ptnttttbegindocument',
                     'level', 'using', 'two', 'group', 'change', 'table', 'fig', 'time', 'one', 'state', 'data', 'der', 'use', 'higher', 'feature', 'rate', 'different', 'may',
                     'effect', 'first', 'reported', 'case', 'number', 'used', 'new', 'participant', 'analysis', 'day', 'among', 'probe', 'case', 'system', 'variable', 'compared',
                     'usepackageamsmath', 'usepackagewasysym', 'usepackageamsfonts', 'usepackageamssymb', 'usepackageamsbsy', 'result', 'value', 'shown', 'figure', 'level', 'including', 'due',
                     'usepackagemathrsfs', 'usepackageupgreek', 'λex', 'nm', 'λem', 'writing', 'review', 'editing', 'positive', 'coping', 'present', 'study', 'centrality', 'measures', 'united', 'states',
                     'result', 'results', 'saa', 'day', 'change', 'method', 'within', 'based', 'from', 'year', 'form', 'features', 'will', 'model',
                     'documentclassminimal', 'hz', 'begindocument', 'also', 'values', 'however', 'average', 'models']

# NOTE: this rebinds the name `stopwords`. The import cell bound it to the
# nltk.corpus.stopwords module; from here on it refers to the list of English stop words.
stopwords = nltk.corpus.stopwords.words('english')


In [78]:
# Append the custom words to the NLTK list, skipping any word that is already present.
# This makes the cell safe to re-run: the previous unconditional extend() appended the
# whole custom list again on every execution.
known_words = set(stopwords)
for word in custom_stop_words:
    if word not in known_words:
        stopwords.append(word)
        known_words.add(word)
# Same list object as `stopwords`, exposed under the name the cleaning function reads.
extended_stopwords = stopwords

In [None]:
# Display the combined stop-word list (NLTK English words plus the custom additions).
extended_stopwords

# Example Wordcloud Showing Word Dominance/Importance

In [None]:
def clean_text(s):
    """Normalise one article body into a space-separated string of content words.

    Steps, in order: lower-case the text; remove [bracketed] spans, URLs and
    <tag>-like spans; strip punctuation; replace newlines with spaces; remove every
    token that contains a digit; tokenize; keep only purely alphabetic tokens that
    are not in ``extended_stopwords``.

    Args:
        s: Raw text. The value is passed through ``str()`` first, so non-string
            input is cleaned as its string form rather than raising.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Set lookup is O(1) per token; scanning the stop-word list for every token is not.
    stop_words = set(extended_stopwords)

    text = str(s).lower()
    # Raw-string patterns: '\[', '\S', '\w' and '\d' are invalid escapes in a normal
    # string literal and trigger warnings on current Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Newlines become spaces. Deleting them outright glued the last word of one line
    # to the first word of the next, producing tokens that exist in neither line.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    tokens = word_tokenize(text)
    kept = [token for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(kept)

In [79]:
# Clean every article body; the cleaned text is stored alongside the raw text.
raw_bodies = my_articles['body_text']
my_articles['processed_body_text'] = raw_bodies.apply(clean_text)

In [None]:
# Inspect one row to compare the raw body text with its cleaned counterpart.
my_articles.head(1)

In [None]:
# Build one whitespace-separated corpus string from the cleaned articles. The previous
# ''.join(str(list)) passed the *repr* of the Python list (brackets, quotes and commas
# around every article) to WordCloud instead of the article text itself.
corpus_text = ' '.join(my_articles['processed_body_text'])

wordcloud = WordCloud(collocations=False, background_color='black', max_words=30,
                      contour_width=3, contour_color='steelblue', width=800, height=400)
wordcloud.generate(corpus_text)

# Explicit figure/axes; plt.show() renders the image without echoing the AxesImage repr.
cloud_fig, cloud_ax = plt.subplots(figsize=(30, 15))
cloud_ax.imshow(wordcloud, interpolation="bilinear")
cloud_ax.axis("off")
plt.show()

# Example N-Grams

In [None]:
N_TOP_TRIGRAMS = 10  # number of bars in the chart

# fit_transform learns the trigram vocabulary and counts it in a single pass over the
# corpus (the previous fit() followed by transform() tokenized every article twice).
vec = CountVectorizer(ngram_range=(3, 3))
bag_of_words = vec.fit_transform(my_articles['processed_body_text'])

# Corpus-wide count of every trigram, then the most frequent ones first.
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda pair: pair[1], reverse=True)
top_trigrams, top_counts = map(list, zip(*words_freq[:N_TOP_TRIGRAMS]))

plt.style.use('ggplot')
plt.figure(figsize=(25, 8))
# sns.barplot returns the Axes it drew on (not a Figure), hence the name `ax`.
ax = sns.barplot(x=top_counts, y=top_trigrams, color='deeppink')
ax.figure.suptitle('Top Trigrams', fontsize=25)
ax.set(xlabel='# Occurrences')
plt.show()

In [None]:
N_TOP_WORDS = 10  # words listed per topic

# Term-count matrix over the cleaned article text.
body_text = my_articles['processed_body_text'].tolist()
tf_vectorizer = CountVectorizer(max_df=0.80, min_df=3, max_features=1000,
                                stop_words=custom_stop_words)
tf = tf_vectorizer.fit_transform(body_text)

# Fixed random_state keeps the fitted topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=4,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=24)
lda.fit(tf)

# Newer scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); use the new accessor when it exists and fall back to the old
# one otherwise. str() keeps the printed words plain strings in both cases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = [str(name) for name in tf_vectorizer.get_feature_names_out()]
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

# For every topic, list its highest-weighted words in descending order of weight.
topic_collection = {}
for idx, topic in enumerate(lda.components_):
    top_word_ids = topic.argsort()[::-1][:N_TOP_WORDS]
    topic_words = [tf_feature_names[i] for i in top_word_ids]
    topic_collection[idx] = topic_words
    print('Topic {} : Words {}'.format(str(idx), topic_words))

In [114]:
# Per-article topic weights from the fitted LDA model, one row per row of `tf`.
doc_topic = lda.transform(tf)

In [None]:
N_PREVIEW = 20  # number of articles to list

# Pair each of the first articles with its highest-weighted topic. argmax over axis 1
# picks the dominant topic for all previewed rows at once, and the ids are taken as one
# column slice instead of a per-row iloc lookup.
preview_ids = my_articles['paper_id'].iloc[:N_PREVIEW]
preview_topics = doc_topic[:N_PREVIEW].argmax(axis=1)
for paper_id, topic_idx in zip(preview_ids, preview_topics):
    print('Article {} highest probability of being topic {}'.format(paper_id, topic_idx))

In [116]:
# Sentence-level DistilBERT encoder.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# Move the encoder to the GPU when one is present; otherwise leave it where it was loaded.
if torch.cuda.is_available():
    cuda_device = torch.device("cuda")
    model = model.to(cuda_device)
# Report the device the encoder ended up on.
print(model.device)

100%|██████████| 245M/245M [00:33<00:00, 7.32MB/s]


cpu


In [117]:
# Encode the raw body text of every article. Missing bodies are replaced by empty strings
# and everything is cast to str, so a NaN cell cannot reach the encoder as a float.
article_texts = my_articles.body_text.fillna('').astype(str).to_list()
embeddings = model.encode(article_texts, show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=16.0, style=ProgressStyle(description_width…




In [118]:
# Integer id per row (index value + 1); these ids are handed to the FAISS index below.
# NOTE(review): assumes the frame still carries its default 0..n-1 integer index - confirm.
my_articles['id'] = my_articles.index + 1

In [None]:
# Step 1: Convert the embeddings to one contiguous float32 matrix. This replaces the
# element-by-element list comprehension followed by a second astype() copy.
embeddings = np.ascontiguousarray(embeddings, dtype="float32")

# Step 2: Instantiate a flat L2 index sized to the embedding dimension
flat_index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Wrap it in IndexIDMap so vectors can be stored under our own ids
index = faiss.IndexIDMap(flat_index)

# Step 4: Add vectors and their IDs (cast explicitly to int64, whatever the dtype of
# the frame's index happened to be)
index.add_with_ids(embeddings, my_articles.id.values.astype("int64"))


In [129]:
# Free-text question used to query the index. The leading and trailing newline of the
# original triple-quoted literal are kept so the encoded text stays the same.
user_query = (
    "\n"
    "what are the transmission dynamics of the virus, including the basic reproductive number, "
    "incubation period, serial interval, modes of transmission and environmental factors"
    "\n"
)

In [None]:
# Query the index for the ten nearest articles; D holds the distances and I the ids.
D, I = vector_search([user_query], model, index, num_results=10)
distances = D.flatten().tolist()
paper_ids = I.flatten().tolist()
print(f'L2 distance: {distances}\n\nPaper IDs: {paper_ids}')

In [None]:
# Look up the 'title' field for the ids returned by the search above.
id2details(my_articles, I, 'title')