In [14]:
import ftfy
import pandas as pd
import spacy

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the SentenceTransformer model whose `encode` method is used further
# down to turn sentences into embedding vectors.
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

In [3]:
# Relative path to the source spreadsheet (resolved against the notebook's
# current working directory).
path = '../data/full_data.xlsx'
# Read the workbook into a DataFrame; no reader options are overridden.
df = pd.read_excel(path)

In [5]:
# Select every column whose name starts with "full_text".
# NOTE(review): the merged column written below is itself named "full_text",
# so it also matches this pattern -- re-running this cell without reloading
# `df` would merge already-merged text a second time.
text_cols = df.filter(regex='^full_text')
# Join each row's non-missing pieces into one space-separated string. Values
# are cast to str first so that a non-string cell (e.g. a purely numeric one)
# is merged instead of making str.join raise TypeError.
df['full_text'] = text_cols.apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1)

In [6]:
def preprocess(text):
    """Normalize some distracting parts of text data.

    URLs, phone numbers, email addresses and user handles are replaced with
    placeholder tokens to protect people's identities if that content ends
    up in our data. Accents are removed and everything is lower-cased to
    reduce the character and type vocab that we deal with downstream. All
    whitespace, including newlines, is collapsed to single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased, single-line text.
    """
    # NOTE(review): `pp` is not imported in any cell visible here; the calls
    # below look like textacy.preprocessing -- confirm and add the import to
    # the imports cell, otherwise this raises NameError on a fresh kernel.
    text = ftfy.fix_text(text)
    # Lower-case before the replacements so the upper-case placeholders
    # (URL, PHONE, EMAIL, USER) inserted below keep their casing.
    text = text.lower()
    text = pp.normalize.whitespace(text)
    # Replace newlines with spaces AND collapse the result: substituting a
    # space for a newline that already had spaces around it would otherwise
    # leave runs of several spaces in the output.
    text = ' '.join(text.split())
    text = pp.replace.urls(text, repl='URL')
    text = pp.replace.phone_numbers(text, repl='PHONE')
    text = pp.replace.emails(text, repl='EMAIL')
    text = pp.replace.user_handles(text, repl='USER')
    text = pp.remove.accents(text)
    return text

In [7]:
# Clean every merged document by passing each string through `preprocess`.
processed = df['full_text'].map(preprocess)

In [8]:
# Split every cleaned document into sentences and flatten the result into a
# single list. Iterating over the Series' values (rather than looking up
# `processed[idx]` for idx in range(len(processed))) does not depend on the
# Series carrying a default 0..n-1 integer index.
# NOTE(review): `processed` is already lower-cased at this point, and the
# first "sentence" printed further down spans a title, a byline and two
# sentences -- consider tokenizing before case-folding.
sentences = [sentence
             for article in processed
             for sentence in sent_tokenize(article)]

In [9]:
# The same encoder is already loaded in an earlier cell; only load it here
# if that cell has not been run, so the model is not initialized twice.
if 'model' not in globals():
    model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')

In [16]:
# Encode only the first five sentences -- a small sample for inspecting the
# embeddings, not the full corpus.
embeddings = model.encode(sentences[:5])

In [17]:
# Sanity check on the embedding matrix dimensions; the recorded output shows
# one row per encoded sentence and 768 columns.
print(embeddings.shape)

(5, 768)


In [18]:
# Pairwise cosine similarity between the rows of `embeddings`. Called with a
# single argument, so every row is compared against every row, itself
# included (hence the ~1.0 diagonal in the recorded output).
cosine_similarity(embeddings)

array([[0.9999997 , 0.8998304 , 0.9377596 , 0.9258569 , 0.89746404],
       [0.8998304 , 1.0000002 , 0.88501185, 0.92290795, 0.8345748 ],
       [0.9377596 , 0.88501185, 1.0000002 , 0.87836033, 0.8637343 ],
       [0.9258569 , 0.92290795, 0.87836033, 1.0000002 , 0.894078  ],
       [0.89746404, 0.8345748 , 0.8637343 , 0.894078  , 0.99999946]],
      dtype=float32)

In [19]:
# Show the first two tokenized sentences, one per line, for a visual check.
print(sentences[0], sentences[1], sep='\n')

covid-19: failure to control pandemic and inequalities made england worst affected in europe, says report shaun griffin a failure to control the coronavirus pandemic, coupled with widening inequality over the past decade, has led to england having the highest rate of excess deaths from covid-19 in europe, a new report concluded.1 rather than focusing on narrow economic goals, health and wellbeing should be at the heart of government strategy, it says.
the findings and recommendations from the report by michael marmot, director of university college london's institute of health equity, add to those made in a 10 year update, published in february,2 of the landmark 2010 marmot review of strategies for reducing health inequality.
