# Data Extraction

In [82]:
import spacy
import nltk
from spacy import displacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import wikipedia
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import nltk

## Extract nouns from the sentence

In [23]:
# Sample sentence used as input for the tokenizing / POS-tagging cells that follow.
message = "I heard Pfizer works pretty well"

In [24]:
def tag_sentence(sentence):
    """Split ``sentence`` into word tokens and part-of-speech tag them.

    The token list is printed as a side effect before tagging. Stop words
    are left in place.

    Args:
        sentence: Text of a single sentence.

    Returns:
        The result of ``nltk.pos_tag`` for the tokens, i.e. ``(token, tag)``
        pairs in token order.
    """
    tokens = word_tokenize(sentence)
    print(tokens)
    return nltk.pos_tag(tokens)

In [25]:
# Tag the message sentence by sentence and keep every token whose POS tag
# starts with "NN" (noun tags) or "CD" (cardinal number).
tokenized = sent_tokenize(message)
nouns = []
for sentence in tokenized:
    tagged = tag_sentence(sentence)
    print(tagged)
    for word, pos_tag in tagged:
        if pos_tag[:2] in ('NN', 'CD'):
            nouns.append(word)
print(nouns)
# Space-joined form of the collected tokens.
topic_search_str = ' '.join(nouns)

['I', 'heard', 'Pfizer', 'works', 'pretty', 'well']
[('I', 'PRP'), ('heard', 'VBP'), ('Pfizer', 'NNP'), ('works', 'NNS'), ('pretty', 'RB'), ('well', 'RB')]
['Pfizer', 'works']


In [85]:
# Search Wikipedia with a literal keyword string, requesting four results.
# NOTE(review): the query is hard-coded here rather than taken from
# `topic_search_str` built in the noun-extraction cell — confirm that is intended.
articles = wikipedia.search("game Barcelona match football yesterday day", results = 4)
# The first hit is the article scraped later; this raises IndexError if the
# search returned an empty list.
article = articles[0]
# Bare last expression so the notebook displays the full result list.
articles

['2010–11 FC Barcelona season',
 'Marc Overmars',
 'Captain Tsubasa',
 'Rose Bowl (stadium)']

In [11]:
from itertools import islice

def chunk(it, size):
    """Yield consecutive tuples of at most ``size`` items taken from ``it``.

    The last tuple may be shorter than ``size``; iteration stops as soon as
    the source is exhausted (an empty tuple is never produced).

    Args:
        it: Any iterable; it is consumed lazily.
        size: Maximum number of items per tuple.

    Returns:
        An iterator over the tuples.
    """
    # Bind the source iterator eagerly so a non-iterable argument fails at
    # call time rather than on first use.
    source = iter(it)

    def _batches():
        while True:
            batch = tuple(islice(source, size))
            if not batch:
                return
            yield batch

    return _batches()

# ========== 2. Using requests & BeautifulSoup ==========
# Import packages
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import nltk

# Fetch the rendered Wikipedia page for the selected article title.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# prevents an HTTP error page from being parsed as if it were the article.
# NOTE(review): the title is interpolated as-is; titles containing '?' or '#'
# would need URL-quoting — confirm whether such titles can occur.
page = requests.get(f"https://en.wikipedia.org/wiki/{article}", timeout=30)
page.raise_for_status()

# Parse the returned HTML
soup = BeautifulSoup(page.content, 'lxml')

# Extract the plain text content from paragraphs, skipping <p> tags whose
# class is 'mw-empty-elt'.
paras = []
all_paragraphs = soup.find_all('p', class_=lambda x: x != 'mw-empty-elt')
intro_para = ""

for p_id, paragraph in enumerate(all_paragraphs):
    # Drop bracketed footnote markers such as "[12]".
    p_text = re.sub(r"\[.*?\]+", '', paragraph.text)
    p_tok = nltk.tokenize.sent_tokenize(p_text)
    if p_id == 0:
        # The first paragraph is kept whole as the introduction.
        intro_para = p_text
    elif len(p_tok) > 1:
        # Single-sentence paragraphs are skipped; longer ones are split into
        # groups of at most 8 sentences, each group joined back into one string.
        paras.extend(' '.join(sentence_group) for sentence_group in chunk(p_tok, 8))

# Extract text from paragraph headers
heads = [str(head.text) for head in soup.find_all('span', class_='mw-headline')]

# The first paragraph is the introductory paragraph and doesn't have a heading
# Set its heading as the document title
heads.insert(0, article)
paras.insert(0, intro_para)

# Drop footnote superscripts in brackets
#text = 

# Replace '\n' (a new line) with '' and end the string at $1000.
#text = text.replace('\n', '')[:-11]
#print(text)
# Disabled code kept as a bare string literal: nothing inside it is executed.
# Because the literal is the cell's final expression, the notebook echoes its
# repr as the cell output.
"""
for i in range(len(paras)):
    if len(nltk.tokenize.sent_tokenize(paras[i])) > 1:
        print(paras[i], "\n")
"""

'\nfor i in range(len(paras)):\n    if len(nltk.tokenize.sent_tokenize(paras[i])) > 1:\n        print(paras[i], "\n")\n'

# Find paragraph similarity

## Finding paragraph similarity using TF-IDF

In [14]:
# Question used as the query in the paragraph-similarity cells.
# NOTE: this rebinds `message`, replacing the sample sentence assigned earlier
# in the notebook.
message = "How long does it take for symptoms to appear?"

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack
import time

def process_tfidf_similarity(base_document, documents, verbose=True):
    """Score how similar each document is to ``base_document`` using TF-IDF.

    The returned scores come from a binary character n-gram TF-IDF model
    (``analyzer='char_wb'``, n-grams of length 1-3) fitted on
    ``base_document`` together with ``documents``.

    When ``verbose`` is true, a second, word-level TF-IDF model fitted on
    ``documents`` only is evaluated as well, and its matrix type, shape and
    scores are printed for comparison, followed by the returned scores.
    That word-level model never influences the return value, so it is
    skipped entirely when ``verbose`` is false.

    Args:
        base_document: Query text.
        documents: Iterable of candidate texts; materialised once so a
            one-shot iterable is not exhausted before fitting.
        verbose: Print the diagnostics described above (default ``True``).

    Returns:
        1-D array with one cosine similarity per entry of ``documents``,
        in input order.
    """
    documents = list(documents)
    # To make uniformed vectors, the query and the documents share one corpus.
    corpus = [base_document] + documents

    if verbose:
        # Word-level comparison: vocabulary learned from the documents only,
        # then the query is projected into that space and stacked on top.
        word_vectorizer = TfidfVectorizer()
        doc_vectors = word_vectorizer.fit_transform(documents)
        embeddings = vstack((word_vectorizer.transform([base_document]), doc_vectors))
        print(type(embeddings))
        print(embeddings.shape, len(corpus))
        print(cosine_similarity(embeddings[0:1], embeddings[1:]).flatten())

    # `stop_words` is intentionally not passed: scikit-learn only applies it
    # when analyzer == 'word', so with 'char_wb' it had no effect beyond
    # triggering an "unused parameter" warning.
    char_vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3), analyzer='char_wb')
    eds = char_vectorizer.fit_transform(corpus)

    # Row 0 is the query; every remaining row is one candidate document.
    cosine_similarities = cosine_similarity(eds[0:1], eds[1:]).flatten()
    if verbose:
        print(cosine_similarities)
    return cosine_similarities

In [16]:
# Score every scraped paragraph chunk against the current `message` and time the call.
documents = paras
t1 = time.time()
c_sim = process_tfidf_similarity(message, documents)
# Position of the highest-scoring entry of `documents`.
selected_article_id = c_sim.argmax()
print(f"TIME: {time.time() - t1}")
# NOTE(review): heading lookup is disabled — presumably because `heads` holds one
# entry per headline span while `paras` holds sentence chunks, so the same index
# does not address matching items in both lists. Confirm before re-enabling.
#selected_article = heads[selected_article_id]
#print(f"'{heads[selected_article_id]}' has been selected as the most relevant article")
selected_document = documents[selected_article_id]
print(selected_document)

<class 'scipy.sparse._csr.csr_matrix'>
(117, 2350) 117
[0.         0.00595723 0.17353474 0.02860639 0.00385764 0.02271634
 0.03491407 0.07466478 0.10383455 0.05496484 0.         0.04936966
 0.10576509 0.02387282 0.02750236 0.01230172 0.00443719 0.01919357
 0.02881878 0.02218652 0.02740216 0.02918774 0.01569957 0.02268919
 0.00427336 0.02746613 0.03354062 0.00849238 0.02973214 0.02936456
 0.02349747 0.11530548 0.01210011 0.02067262 0.02744454 0.01983409
 0.02043159 0.05381977 0.0400098  0.03828553 0.00693764 0.01174831
 0.02354131 0.01464075 0.00419928 0.00845099 0.00494211 0.03807895
 0.00297784 0.0278317  0.03688372 0.00387621 0.0084644  0.00447949
 0.03369848 0.         0.0236053  0.0141225  0.05253197 0.03342193
 0.03572339 0.13659602 0.10684404 0.01981251 0.02236727 0.02584363
 0.02014085 0.0213917  0.         0.04091285 0.10466337 0.08619873
 0.00564629 0.04899473 0.10403157 0.         0.00524576 0.02128446
 0.00438791 0.00766479 0.01271297 0.         0.03436535 0.01120181
 0.0270

In [None]:
# Quick inspection: the collected headings, then the introductory paragraph
# (inserted at position 0 of `paras` by the scraping cell).
print(heads)
print(paras[0])

# Paragraph similarity using models

In [76]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Flatten the paragraph chunks into one list of sentences, preserving order.
sents = [
    sentence
    for para in paras
    for sentence in nltk.tokenize.sent_tokenize(para)
]
#print('\n'.join(sents))
print(len(sents), len(paras))

464 116


## Using Sentence Transformer models

### Using MpNet

In [18]:
# Load the all-mpnet-base-v2 checkpoint from a path relative to the notebook.
# NOTE(review): device='cuda' is hard-coded, so this presumably fails on a
# machine without a CUDA device — confirm.
model = SentenceTransformer('../../models/all-mpnet-base-v2', device='cuda')
t1 = time.time()
# Embed the query (as a one-element batch) and every paragraph chunk.
doc_embedding = model.encode([message])
candidate_embeddings = model.encode(paras)
top_n = 2
# Despite the name, `distances` holds cosine similarities: higher means closer.
distances = cosine_similarity(doc_embedding, candidate_embeddings).flatten()
# Negating before argsort orders indices from most to least similar;
# `keywords` therefore holds the top_n most similar paragraphs, not keywords.
keywords = [paras[index] for index in ((-distances).argsort())[:top_n]]
print(time.time() - t1)
# The arg-max paragraph printed here is the same entry as keywords[0] (barring
# ties), which is why the recorded output repeats the first paragraph.
print(paras[distances.argmax()])
print('\n'.join(keywords))

5.693247318267822
As is common with infections, there is a delay between the moment a person first becomes infected and the appearance of the first symptoms. The median delay for COVID-19 is four to five days possibly being infectious on 1-4 of those days. Most symptomatic people experience symptoms within two to seven days after exposure, and almost all will experience at least one symptom within 12 days.
As is common with infections, there is a delay between the moment a person first becomes infected and the appearance of the first symptoms. The median delay for COVID-19 is four to five days possibly being infectious on 1-4 of those days. Most symptomatic people experience symptoms within two to seven days after exposure, and almost all will experience at least one symptom within 12 days.
Some early studies suggest that 10–20% of people with COVID‑19 will experience symptoms lasting longer than a month. A majority of those who were admitted to hospital with severe disease report long

### Using all-MiniLM-L6-v2

In [77]:
# Load the all-MiniLM-L6-v2 checkpoint from a relative path onto the GPU.
# NOTE: this rebinds the notebook-level `model` used by later cells.
model = SentenceTransformer('../../models/all-MiniLM-L6-v2', device='cuda')

In [84]:
t1 = time.time()
#m1 = ["I love cricket!", "I don't know if the vaccines are effective", "I love riding horses!", "Do you watch football?"]
#m2 = "Did you watch Australia vs Pakistan?"
# Toy comparison: the single query string in m2 against each candidate string in m1.
m1 = ['match', 'football']
m2 = "game"
doc_embedding = model.encode([m2])
candidate_embeddings = model.encode(m1)
top_n = 2
# Cosine similarities (not distances), one value per entry of m1.
distances = cosine_similarity(doc_embedding, candidate_embeddings).flatten()
print(time.time() - t1)
print(distances)

# Disabled code kept as a bare string literal: nothing inside it runs. Being the
# cell's final expression, its repr is echoed as the cell output.
"""
keywords = [paras[index] for index in ((-distances).argsort())[:top_n]]
print(time.time() - t1)
print(paras[distances.argmax()])
print('\n'.join(keywords))
"""

0.34549570083618164
[0.55318475 0.54193956]


"\nkeywords = [paras[index] for index in ((-distances).argsort())[:top_n]]\nprint(time.time() - t1)\nprint(paras[distances.argmax()])\nprint('\n'.join(keywords))\n"

### Clustering cosine similarity

In [67]:
import numpy as np
import time

# Scratch experiment: group sorted similarity scores with k-means.
# NOTE(review): the saved output for this cell shows
# "NameError: name 'n' is not defined", but no `n` is referenced in this code —
# the recorded output looks stale; re-run the cell to refresh it.
#y = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
t1 = time.time()
# Toy similarity scores, sorted ascending.
y = sorted([0.12140948, 0.426371, 0.11862079, 0.44534147, 0.17006755, 0.55, 0.00, 0.00, 0.00, 0.00])
#y = [x*30 for x in y]
print(y)
# Rank positions 0..len(y)-1 for the sorted scores.
x = range(len(y))
# After the transpose there is one row per score: (rank, score).
m = np.matrix([x, y]).transpose()

from scipy.cluster.vq import kmeans
# Ask for 3 clusters; kclust[0] is the array of centroids.
kclust = kmeans(m, 3)

# First column of the centroids, i.e. each centroid's rank coordinate.
cluster_indices = kclust[0][:, 0]
# Each rank is assigned to the centroid whose rank coordinate is nearest;
# the score coordinate is not used in this assignment step.
assigned_clusters = [abs(cluster_indices - e).argmin() for e in x]
print(f"time elapsed: {time.time() - t1}")
print(assigned_clusters)

[0.0, 0.0, 0.0, 0.0, 0.11862079, 0.12140948, 0.17006755, 0.426371, 0.44534147, 0.55]


NameError: name 'n' is not defined

### Clustering cosine similarities: Proper Implementation

In [72]:
from scipy.cluster.vq import kmeans

def find_highest_similarity_scores(scores, n=3, seed=None, verbose=True):
    """Return the indices of the scores that fall in the top k-means cluster.

    The scores are sorted ascending, each one is paired with its rank, and
    the resulting (rank, score) points are clustered with k-means. Every
    rank is then assigned to the centroid whose rank coordinate is closest,
    and the indices that share a cluster with the best score are returned.

    NOTE(review): ranks span 0..len(scores)-1 while similarity scores
    presumably lie in [0, 1], so the rank coordinate dominates the distance
    and the clusters end up close to equal-width rank bands — confirm that
    this is the intended behaviour.

    Args:
        scores: Sequence of numeric scores, indexable by position.
        n: Requested number of clusters; capped at ``len(scores)``.
        seed: Optional seed forwarded to ``scipy.cluster.vq.kmeans`` to make
            the clustering reproducible. ``None`` keeps SciPy's default
            (unseeded) behaviour.
        verbose: Print the cluster assigned to each rank (default ``True``).

    Returns:
        List of indices into ``scores``, ordered from the highest score
        downwards. An empty list is returned for empty input.
    """
    s_len = len(scores)
    if s_len == 0:
        # k-means cannot be asked for zero clusters; nothing to select.
        return []

    # Original positions ordered by ascending score.
    s_idxs = sorted(range(s_len), key=scores.__getitem__)
    ranks = np.arange(s_len, dtype=float)
    sorted_scores = np.asarray([scores[i] for i in s_idxs], dtype=float)

    n = min(n, s_len)
    # A plain 2-D ndarray of (rank, score) rows replaces the discouraged np.matrix.
    observations = np.column_stack((ranks, sorted_scores))
    kmeans_kwargs = {} if seed is None else {"seed": seed}
    codebook, _ = kmeans(observations, n, **kmeans_kwargs)

    # Assign each rank to the centroid with the nearest rank coordinate.
    centroid_ranks = codebook[:, 0]
    assigned_clusters = [int(np.abs(centroid_ranks - rank).argmin()) for rank in ranks]
    if verbose:
        print(assigned_clusters)

    # Walk down from the best score until the cluster label changes.
    highest_cluster = assigned_clusters[-1]
    highest_idxs = []
    for position in range(s_len - 1, -1, -1):
        if assigned_clusters[position] != highest_cluster:
            break
        highest_idxs.append(s_idxs[position])
    return highest_idxs

#t = [0.12140948, 0.426371, 0.11862079, 0.44534147, 0.17006755, 0.55, 0.00, 0.00, 0.00, 0.00]
# Small toy input for a quick check of the helper.
t = [0, 0.25, 0.75, 1]
# Map the returned indices back to their scores; shown as the cell output.
[t[i] for i in find_highest_similarity_scores(t)]

[0, 2, 1, 1]


[1, 0.75]

In [56]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
t1 = time.time()
# Scale the sorted scores `y` from the k-means scratch cell so that they are
# still distinguishable after the integer cast below.
x = [e*30 for e in y]
# Pair every value with a zero second coordinate to get 2-D samples.
# The builtin `int` replaces `np.int`, which was deprecated in NumPy 1.20 and
# removed in NumPy 1.24 (where it raises AttributeError); the cast truncates
# towards zero exactly as before.
X = np.array(list(zip(x,np.zeros(len(x)))), dtype=int)
bandwidth = estimate_bandwidth(X, quantile=0.6)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print(f"time elapsed: {time.time() - t1}")

# Print the members of each cluster, selected with a boolean mask on the labels.
for k in range(n_clusters_):
    my_members = labels == k
    print ("cluster {0}: {1}".format(k, X[my_members]))

time elapsed: 0.008007287979125977
cluster 0: [[12  0]
 [13  0]
 [16  0]]
cluster 1: [[3 0]
 [3 0]
 [5 0]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.array(list(zip(x,np.zeros(len(x)))), dtype=np.int)


## Using BERT with averaged sentence vectors

In [62]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the bert-base-cased-squad2 checkpoint from a path relative to the
# notebook, on the CPU. NOTE: this rebinds the notebook-level `model`.
model = SentenceTransformer('../../models/bert-base-cased-squad2', device='cpu')

Some weights of the model checkpoint at ../../models/bert-base-cased-squad2 were not used when initializing BertModel: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [63]:
def calculate_bert_embeddings(docs):
    """Embed each document as the mean of its sentence embeddings.

    Every document is split into sentences with ``sent_tokenize``, the
    sentences are encoded with the notebook-level ``model``, and the
    sentence vectors are averaged into a single vector per document.
    The shape of each document vector (and of the running stack) is printed
    as a side effect.

    Args:
        docs: Iterable of document strings.

    Returns:
        ``None`` when ``docs`` is empty; a 1-D vector when it holds exactly
        one document; otherwise a 2-D array with one row per document, in
        input order. Callers that need a 2-D result for a single document
        must reshape it themselves.
    """
    docs_embeddings = None
    for doc in docs:
        sentences = sent_tokenize(doc)
        base_embeddings_sentences = model.encode(sentences)
        # Average over the sentence axis to get one vector for the document.
        base_embeddings = np.mean(np.array(base_embeddings_sentences), axis=0)
        print(base_embeddings.shape)
        if docs_embeddings is None:
            docs_embeddings = base_embeddings
        else:
            print(docs_embeddings.shape, base_embeddings.shape)
            # np.vstack is the function behind the np.row_stack alias, which
            # NumPy 2.0 deprecates; results are identical.
            docs_embeddings = np.vstack((docs_embeddings, base_embeddings))
    return docs_embeddings

In [64]:
# m = "I love football"
# x = "I like football"
# y = "Computer science involves cryptography"
# message = "Which team beat New Zealand in the 1992 World Cup?"
t1 = time.time()
# Candidate embeddings come from `m1`, the query embedding from `m2`.
docs_embds = calculate_bert_embeddings(m1)
# A single document comes back as a 1-D vector, hence the reshape to one row.
msg_embds = calculate_bert_embeddings([m2]).reshape(1,-1)
print(msg_embds.shape, docs_embds.shape)
c_sim = cosine_similarity(msg_embds, docs_embds).flatten()
print(f"TIME: {time.time() - t1}")
print(c_sim)
# NOTE(review): `c_sim` has one score per entry of `m1`, yet its arg-max is used
# to index `paras`. The recorded output (15 scores) suggests an earlier run
# embedded paragraphs instead — confirm which input is intended.
paras[c_sim.argmax()]

(768,)
(768,)
(768,) (768,)
(768,)
(2, 768) (768,)
(768,)
(3, 768) (768,)
(768,)
(4, 768) (768,)
(768,)
(5, 768) (768,)
(768,)
(6, 768) (768,)
(768,)
(7, 768) (768,)
(768,)
(8, 768) (768,)
(768,)
(9, 768) (768,)
(768,)
(10, 768) (768,)
(768,)
(11, 768) (768,)
(768,)
(12, 768) (768,)
(768,)
(13, 768) (768,)
(768,)
(14, 768) (768,)
(768,)
(1, 768) (15, 768)
TIME: 3.9620213508605957
[0.763141   0.7383532  0.7377845  0.7588626  0.76193196 0.75090337
 0.77208793 0.71912754 0.71682876 0.78318155 0.74859285 0.76855487
 0.7186981  0.73935056 0.71325564]


'Co-hosts New Zealand proved the surprise package of the tournament, winning their first seven consecutive games to finish on top of the table after the round-robin. The other hosts, Australia, one of the pre-tournament favourites lost their first two matches. They recovered somewhat to win four of the remaining six, but narrowly missed out on the semi-finals. The West Indies also finished with a 4–4 record, but were just behind Australia on run-rate. South Africa made a triumphant return to international cricket with a win over Australia at the Sydney Cricket Ground in their first match. They and England had solid campaigns and easily qualified for the semis, despite upset losses to Sri Lanka and Zimbabwe  respectively. India had a disappointing tournament and never looked likely to progress beyond the round-robin. Sri Lanka were still establishing themselves at the highest level and beat only Zimbabwe (who did not yet have Test status) and South Africa.'

In [20]:
import wikipedia
import nltk
# Fetch the 'COVID-19' article through the `wikipedia` package and print how
# many word tokens its text content contains.
# NOTE: this rebinds `page`, which earlier held the `requests` response.
page = wikipedia.page('COVID-19')
print(len(nltk.tokenize.word_tokenize(page.content)))

12293
