# Data Extraction

In [None]:
import spacy
import nltk
from spacy import displacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import wikipedia
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import nltk

In [3]:
import wikipedia

## Extract nouns from the sentence

In [None]:
# Sample input sentence; the noun-extraction cell below sentence-tokenizes and POS-tags it.
message = "I heard Pfizer works pretty well"

In [None]:
def tag_sentence(sentence):
    """Word-tokenize ``sentence`` and return its NLTK part-of-speech tags.

    The token list is printed before tagging. The value returned is whatever
    ``nltk.pos_tag`` produces for those tokens.
    """
    tokens = word_tokenize(sentence)
    print(tokens)
    return nltk.pos_tag(tokens)

In [None]:
# Split the message into sentences, tag each one, and keep every token whose
# POS tag starts with 'NN' or 'CD'. The kept tokens are joined into a single
# space-separated search string.
tokenized = sent_tokenize(message)
nouns = []
for sentence in tokenized:
    tagged = tag_sentence(sentence)
    print(tagged)
    for word, pos_tag in tagged:
        if pos_tag[:2] in ['NN', 'CD']:
            nouns.append(word)
print(nouns)
topic_search_str = ' '.join(nouns)

In [6]:
# Search Wikipedia with a fixed three-word query, keep the top hit as `article`,
# and display the full result list as the cell value.
query_terms = ['actor', 'character', 'Skywalker']
articles = wikipedia.search(' '.join(query_terms), results=4)
article = articles[0]
articles

['Lego Star Wars: The Skywalker Saga',
 'Skywalker family',
 'Star Wars: The Rise of Skywalker',
 'Darth Vader']

In [None]:
from itertools import islice

def chunk(it, size):
    """Return an iterator over consecutive tuples of up to ``size`` items of ``it``.

    The last tuple may be shorter than ``size``. Iteration stops as soon as
    an empty tuple would be produced.
    """
    # Obtain the iterator eagerly so a non-iterable argument fails at call time.
    source = iter(it)

    def _batches():
        while True:
            batch = tuple(islice(source, size))
            if not batch:
                return
            yield batch

    return _batches()

# ========== 2. Using requests & BeautifulSoup ==========
# Fetch the Wikipedia page for `article`, split its <p> text into sentence
# groups (`paras`) and collect its section headings (`heads`).
# Import packages
# (urlopen is imported here but this cell fetches the page with requests.get.)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import nltk

# Specify url of the web page
# NOTE(review): the article title is interpolated into the URL as-is — confirm
# that titles containing characters such as '?' or '#' are handled as intended.
page = requests.get(f"https://en.wikipedia.org/wiki/{article}")

# scrape webpage
soup = BeautifulSoup(page.content, 'lxml')



# Extract the plain text content from paragraphs
paras = []
# The class_ callable rejects the class value 'mw-empty-elt'.
all_paragraphs = soup.find_all('p', class_=lambda x: x != 'mw-empty-elt')
intro_para = ""
# Only read by the disabled snippet further down; the live code just sets it.
inIntroPara = False

for p_id, paragraph in enumerate(all_paragraphs):
    # Remove bracketed spans such as "[1]" from the paragraph text.
    p_text = re.sub(r"\[.*?\]+", '', paragraph.text)
    p_tok = nltk.tokenize.sent_tokenize(p_text)
    if p_id == 0:
        # The first matched paragraph is kept whole as the introduction.
        intro_para = p_text
        inIntroPara = True
    elif len(p_tok) > 1:
        # Paragraphs with more than one sentence are stored as groups of up to
        # 8 sentences; paragraphs with a single sentence (or none) are skipped.
        # NOTE(review): the comprehension variable reuses the name of the
        # `chunk` helper — presumably harmless, but confusing to read.
        paras.extend([' '.join(chunk) for chunk in chunk(p_tok, 8)])
        inIntroPara = False
# The bare string literal below is disabled code kept for reference; as an
# expression statement it has no effect.
"""
elif paragraph.previous_sibling is not None and paragraph.previous_sibling.name == 'p':
    if inIntroPara:
        intro_para = f"{intro_para} {str(p_text)}"
    else:
        paras[-1] = f"{paras[-1]} {str(p_text)}"
"""

# Extract text from paragraph headers
heads = []
# NOTE(review): `attrs` is given a set rather than a dict — presumably meant to
# match the CSS class 'mw-headline'; confirm against BeautifulSoup's handling
# of non-dict `attrs` values.
for head in soup.find_all('span', attrs={'mw-headline'}):
    heads.append(str(head.text))

# The first paragraph is the introductory paragraph and doesn't have a heading
# Set its heading as the document title
# NOTE(review): apart from index 0, `heads` (one entry per headline span) and
# `paras` (one entry per sentence group) are built independently, so their
# indices presumably do not correspond — confirm before pairing them.
heads.insert(0, article)
paras.insert(0, intro_para)

# Drop footnote superscripts in brackets
#text = 

# Replace '\n' (a new line) with '' and drop the last 11 characters.
#text = text.replace('\n', '')[:-11]
#print(text)
# Disabled debugging snippet kept as a bare string literal.
"""
for i in range(len(paras)):
    if len(nltk.tokenize.sent_tokenize(paras[i])) > 1:
        print(paras[i], "\n")
"""

# Find paragraph similarity

## Finding paragraph similarity using TF-IDF

In [None]:
# Query text that the similarity cells below score against the scraped paragraphs.
message = "How long does it take for symptoms to appear?"

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack
import time

def process_tfidf_similarity(base_document, documents):
    """Score every document against ``base_document`` with TF-IDF cosine similarity.

    The query and the candidates are vectorized together with binary,
    character-level (``char_wb``) 1- to 3-gram TF-IDF features, so all rows
    share one vocabulary.

    Args:
        base_document: The query string.
        documents: Iterable of candidate strings.

    Returns:
        A 1-D NumPy array with one similarity score per entry of
        ``documents``, in the same order. It is empty when ``documents`` is
        empty.
    """
    # Local import: numpy is only imported by later cells of this notebook.
    import numpy as np

    documents = list(documents)
    if not documents:
        # Nothing to compare against; return "no scores" instead of letting
        # the vectorizer fail on an empty corpus.
        return np.empty(0)

    # Fit on query + candidates so both sides are embedded in the same space.
    corpus = [base_document, *documents]

    # `stop_words` is intentionally not passed: it only applies to the 'word'
    # analyzer and is ignored for character n-grams.
    vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3), analyzer='char_wb')
    embeddings = vectorizer.fit_transform(corpus)

    # Row 0 is the query; the remaining rows are the candidates.
    return cosine_similarity(embeddings[0:1], embeddings[1:]).flatten()

In [None]:
# Score every scraped paragraph against `message` with TF-IDF, time the
# scoring plus argmax, and print the best-matching paragraph.
documents = paras
t1 = time.time()
c_sim = process_tfidf_similarity(message, documents)
selected_article_id = c_sim.argmax()
elapsed = time.time() - t1
print(f"TIME: {elapsed}")
# Heading lookup via `heads` is currently disabled:
#selected_article = heads[selected_article_id]
#print(f"'{heads[selected_article_id]}' has been selected as the most relevant article")
selected_document = documents[selected_article_id]
print(selected_document)

In [None]:
# Show the collected headings followed by the first stored paragraph.
print(heads, paras[0], sep="\n")

# Paragraph similarity using models

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
import numpy as np

In [None]:
# Flatten all stored paragraphs into one list of sentences and report the
# sentence count next to the paragraph count.
sents = [
    sentence
    for para in paras
    for sentence in nltk.tokenize.sent_tokenize(para)
]
#print('\n'.join(sents))
print(len(sents), len(paras))

## Using Sentence Transformer Models

### Using MpNet

In [None]:
# Embed the query and every paragraph with the locally stored
# all-mpnet-base-v2 model, rank paragraphs by cosine similarity, and print the
# elapsed time, the best paragraph, and the `top_n` best paragraphs.
model = SentenceTransformer('../../models/all-mpnet-base-v2', device='cuda')
t1 = time.time()
doc_embedding = model.encode([message])
candidate_embeddings = model.encode(paras)
top_n = 2
# Despite the name, `distances` holds cosine similarities (higher = closer).
distances = cosine_similarity(doc_embedding, candidate_embeddings).flatten()
ranked_indices = (-distances).argsort()
keywords = [paras[index] for index in ranked_indices[:top_n]]
print(time.time() - t1)
best_index = distances.argmax()
print(paras[best_index])
print('\n'.join(keywords))

### Using all-MiniLM-L6-V2

In [None]:
# Load the all-MiniLM-L6-v2 sentence-transformer from a local directory on the 'cuda' device.
model = SentenceTransformer('../../models/all-MiniLM-L6-v2', device='cuda')

In [None]:
# Timed sanity check: embed one query string and a few candidate strings with
# the currently loaded model and print their cosine similarities.
t1 = time.time()
#m1 = ["I love cricket!", "I don't know if the vaccines are effective", "I love riding horses!", "Do you watch football?"]
#m2 = "Did you watch Australia vs Pakistan?"
# Candidates (m1) and query (m2); the BERT cell further down reads both names.
m1 = ['match', 'football']
m2 = "game"
doc_embedding = model.encode([m2])
candidate_embeddings = model.encode(m1)
# Only referenced by the disabled snippet at the end of this cell.
top_n = 2
# Despite the name, `distances` holds cosine similarities (one per candidate).
distances = cosine_similarity(doc_embedding, candidate_embeddings).flatten()
print(time.time() - t1)
print(distances)

# Disabled paragraph-ranking snippet kept as a bare string literal.
"""
keywords = [paras[index] for index in ((-distances).argsort())[:top_n]]
print(time.time() - t1)
print(paras[distances.argmax()])
print('\n'.join(keywords))
"""

### Clustering cosine similarity

In [None]:
import numpy as np
import time

from scipy.cluster.vq import kmeans

# Experiment: cluster a sorted list of similarity scores with k-means and map
# every rank position to its nearest centroid (compared on the rank axis only).

#y = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
t1 = time.time()
y = sorted([0.12140948, 0.426371, 0.11862079, 0.44534147, 0.17006755, 0.55, 0.00, 0.00, 0.00, 0.00])
#y = [x*30 for x in y]
print(y)
x = range(len(y))
# One observation per row: (rank, score). A plain ndarray is used instead of
# np.matrix, which NumPy no longer recommends.
m = np.column_stack((x, y))

kclust = kmeans(m, 3)

# First column of the codebook = rank coordinate of each centroid.
cluster_indices = kclust[0][:, 0]
assigned_clusters = [abs(cluster_indices - e).argmin() for e in x]
print(f"time elapsed: {time.time() - t1}")
print(assigned_clusters)

### Clustering cosine similarities: Proper Implementation

In [None]:
from scipy.cluster.vq import kmeans

def find_highest_similarity_scores(scores, n=3):
    """Return the indices of the scores that fall in the top k-means cluster.

    Scores are sorted ascending and clustered as ``(rank, score)`` points into
    at most ``n`` clusters. Every rank is then assigned to the centroid whose
    rank coordinate is closest. The indices (into ``scores``) of the
    contiguous run of top-ranked entries sharing the best entry's cluster are
    returned.

    Args:
        scores: Indexable sequence of numeric similarity scores.
        n: Requested number of clusters; capped at ``len(scores)``.

    Returns:
        List of indices into ``scores``, ordered from the highest score
        downwards. Empty when ``scores`` is empty. The best-scoring index is
        always first, but the length of the list depends on the (randomly
        initialised) clustering.
    """
    s_len = len(scores)
    if s_len == 0:
        # No scores means no clusters; avoid calling kmeans on nothing.
        return []

    s_idxs = sorted(range(s_len), key=scores.__getitem__)
    s = [scores[i] for i in s_idxs]
    s_range = range(s_len)

    n = min(n, s_len)
    # Plain float ndarray (not the discouraged np.matrix); the float cast lets
    # integer-only scores be clustered as well.
    points = np.column_stack((s_range, s)).astype(float)
    centroids, _ = kmeans(points, n)
    # Assignment intentionally compares the rank coordinate only.
    centroid_ranks = centroids[:, 0]
    assigned_clusters = [abs(centroid_ranks - e).argmin() for e in s_range]

    highest_cluster = assigned_clusters[-1]
    highest_idxs = []
    # Walk down from the best rank until the cluster label changes.
    for i in reversed(s_range):
        if assigned_clusters[i] != highest_cluster:
            break
        highest_idxs.append(s_idxs[i])
    return highest_idxs

# Demo: display the scores selected as the top cluster for a small sample.
#t = [0.12140948, 0.426371, 0.11862079, 0.44534147, 0.17006755, 0.55, 0.00, 0.00, 0.00, 0.00]
t = [0, 0.25, 0.75, 1]
[t[idx] for idx in find_highest_similarity_scores(t)]

In [None]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

# Experiment: cluster the scaled scores from the k-means cell with MeanShift.
# Each score becomes the point (int(score * 30), 0).

#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
t1 = time.time()
x = [e*30 for e in y]
# Built-in `int` replaces the `np.int` alias, which has been removed from NumPy;
# the cast still truncates the scaled scores to integers.
X = np.array(list(zip(x,np.zeros(len(x)))), dtype=int)
bandwidth = estimate_bandwidth(X, quantile=0.6)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print(f"time elapsed: {time.time() - t1}")

# Print the members of every cluster via a boolean mask over the labels.
for k in range(n_clusters_):
    my_members = labels == k
    print ("cluster {0}: {1}".format(k, X[my_members]))

## Using BERT with averaged sentence vectors

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the model stored under the local path ../../models/bert-base-cased-squad2 on the CPU.
# NOTE(review): the previous comment said this downloads a UKPLab model; the path
# here looks local — confirm where the weights actually come from.
model = SentenceTransformer('../../models/bert-base-cased-squad2', device='cpu')

In [None]:
def calculate_bert_embeddings(docs):
    """Embed each document as the mean of its sentence embeddings.

    Every document is split into sentences, each sentence is encoded with the
    module-level ``model``, and the sentence vectors are averaged.

    Args:
        docs: Iterable of document strings.

    Returns:
        ``None`` when ``docs`` is empty, a 1-D vector when it holds exactly
        one document, otherwise a 2-D array with one row per document.
    """
    doc_vectors = []
    for doc in docs:
        sentences = sent_tokenize(doc)
        sentence_vectors = np.asarray(model.encode(sentences))
        doc_vectors.append(sentence_vectors.mean(axis=0))

    if not doc_vectors:
        return None
    if len(doc_vectors) == 1:
        # Keep the 1-D result for a single document; existing callers
        # reshape it themselves.
        return doc_vectors[0]
    # Stack once at the end (np.vstack replaces the deprecated np.row_stack
    # and avoids re-copying the accumulated matrix on every iteration).
    return np.vstack(doc_vectors)

In [None]:
# Compare the query `m2` against the candidates in `m1` using averaged BERT
# sentence embeddings, then display the best-matching candidate.
# m = "I love football"
# x = "I like football"
# y = "Computer science involves cryptography"
# message = "Which team beat New Zealand in the 1992 World Cup?"
t1 = time.time()
docs_embds = calculate_bert_embeddings(m1)
# A single document yields a 1-D vector; reshape it into a 1-row matrix.
msg_embds = calculate_bert_embeddings([m2]).reshape(1,-1)
print(msg_embds.shape, docs_embds.shape)
c_sim = cosine_similarity(msg_embds, docs_embds).flatten()
print(f"TIME: {time.time() - t1}")
print(c_sim)
# The scores were computed over `m1`, so the argmax must index `m1`
# (indexing `paras` here returned an unrelated paragraph).
m1[c_sim.argmax()]

In [None]:
import wikipedia
import nltk
# Fetch the 'COVID-19' Wikipedia page and print how many word tokens its
# content contains.
page = wikipedia.page('COVID-19')
page_tokens = nltk.tokenize.word_tokenize(page.content)
print(len(page_tokens))