In [7]:
import ftfy
import pandas as pd
import spacy
import textacy.preprocessing as pp
import torch

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Report how many CUDA devices PyTorch can see on this machine.
torch.cuda.device_count()

3

In [3]:
# Load the sentence-embedding model. Use the first GPU when CUDA is available
# and fall back to the CPU otherwise, so the notebook still runs on machines
# without a GPU instead of raising when moving the model to 'cuda:0'.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO', device=device)

In [4]:
# Location of the Excel workbook, relative to the notebook's working directory.
path = '../data/full_data.xlsx'
# Load the workbook into a DataFrame (pandas reads the first sheet by default).
df = pd.read_excel(path)

In [5]:
# Grab text columns
text_cols = df.filter(regex='^full_text')
# Merge text columns into a single column
df['full_text'] = text_cols.apply(
    lambda x: ' '.join(x.dropna()),
    axis=1)

In [8]:
def preprocess(text):
    """Normalize distracting parts of raw text.

    Encoding glitches are repaired with ftfy, the text is lower-cased,
    whitespace is normalized and newlines are turned into spaces. URLs, phone
    numbers, email addresses and user handles are replaced with placeholder
    tokens to protect people's identities if that content ends up in our
    data. Finally, accents are removed to reduce the character and type
    vocabulary that we deal with downstream.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Repair mojibake first, then case-fold, so later steps see clean input.
    cleaned = pp.normalize.whitespace(ftfy.fix_text(text).lower())
    cleaned = cleaned.replace('\n', ' ')

    # (replacer, placeholder) pairs, applied in this exact order.
    substitutions = (
        (pp.replace.urls, 'URL'),
        (pp.replace.phone_numbers, 'PHONE'),
        (pp.replace.emails, 'EMAIL'),
        (pp.replace.user_handles, 'USER'),
    )
    for replace_fn, placeholder in substitutions:
        cleaned = replace_fn(cleaned, repl=placeholder)

    return pp.remove.accents(cleaned)

In [9]:
# Run `preprocess` on every merged document; the result is a Series of
# cleaned strings aligned with `df`.
processed = df['full_text'].apply(preprocess)

In [10]:
# Split every cleaned document into sentences and flatten them into a single
# list. Iterating over the Series values directly (instead of looking up
# `processed[idx]` for idx in range(len(processed))) avoids label-based lookups
# that only work when the index is the default 0..n-1 range.
sentences = [sentence
             for article in processed
             for sentence in sent_tokenize(article)]

In [12]:
# Embed only the first 100 sentences (a small sample of the full list).
embeddings = model.encode(sentences[:100])

In [13]:
# Sanity-check the embedding matrix dimensions (rows x embedding size).
print(embeddings.shape)

(100, 768)


In [14]:
# Pairwise cosine similarity between all rows of `embeddings`
# (the single-argument form compares the matrix with itself).
cosine_similarity(embeddings)

array([[0.99999946, 0.89983046, 0.93775976, ..., 0.866554  , 0.8755862 ,
        0.83921456],
       [0.89983046, 0.9999998 , 0.8850118 , ..., 0.87664735, 0.8483837 ,
        0.8276462 ],
       [0.93775976, 0.8850118 , 0.9999999 , ..., 0.8587393 , 0.8812106 ,
        0.82664096],
       ...,
       [0.866554  , 0.87664735, 0.8587393 , ..., 0.9999999 , 0.87030977,
        0.82861465],
       [0.8755862 , 0.8483837 , 0.8812106 , ..., 0.87030977, 0.99999976,
        0.7955004 ],
       [0.83921456, 0.8276462 , 0.82664096, ..., 0.82861465, 0.7955004 ,
        0.9999999 ]], dtype=float32)

In [16]:
# Eyeball two example sentences from the flattened sentence list.
# NOTE(review): index 300 lies outside the first 100 sentences that were
# embedded, so it has no row in the similarity matrix — confirm this is the
# intended example.
print(sentences[0])
print(sentences[300])

covid-19: failure to control pandemic and inequalities made england worst affected in europe, says report shaun griffin a failure to control the coronavirus pandemic, coupled with widening inequality over the past decade, has led to england having the highest rate of excess deaths from covid-19 in europe, a new report concluded.1 rather than focusing on narrow economic goals, health and wellbeing should be at the heart of government strategy, it says.
my favourite resonates with the need to care for oneself, and is "at a cardiac arrest, the first procedure is to take your own pulse."
