# Document Selection

A notebook for offline document selection from the cached articles. It extends the functionality of the current KnowledgeSource class (KnowledgeSource.py) with offline methods for fetching the top 3 most relevant articles from the article DB stored in the given cache.

### Imports

In [1]:
# Import for scraping data off the web
import requests
import wikipedia
from bs4 import BeautifulSoup
import re
import nltk
import numpy as np

# Import SentenceTransformer for using a Sentence Embedding model
from sentence_transformers import SentenceTransformer

# Imports for Document Similarity computation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For dividing lists into smaller chunks
from itertools import islice

# For saving the KnowledgeSource object, so next time the cache can be loaded in a hot state
import pickle
from os.path import exists

# Importing the SentenceModel
from SentenceModel import SentenceModel

  from .autonotebook import tqdm as notebook_tqdm


### Constants

In [2]:
# File names of the pickled article caches; KnowledgeSource picks one based on num_results
# Cache used when a single article is requested per topic (num_results <= 1)
SINGLE_KS_FILENAME = 'knowledge_source.pkl'
# Cache used when several articles are requested per topic (num_results > 1)
MULTI_KS_FILENAME = 'multi_knowledge_source.pkl'

### Utility functions

In [3]:
def merge_db(db1_filename, db2_filename, target_filename):
    """
      Merges two pickled databases and writes the result to target_filename.

      Each input is loaded with read_object. When only one of them can be loaded it is written out
      unchanged; when both load, a copy of the first is updated with the second, so entries of the
      second database win on key clashes.

      Returns True when a merged database was saved, False when neither input could be loaded.
    """
    first = read_object(db1_filename)
    second = read_object(db2_filename)

    # Nothing to merge, nothing to write
    if first is None and second is None:
        return False

    if first is None:
        merged = second
    elif second is None:
        merged = first
    else:
        # Work on a copy so the loaded first database is not mutated
        merged = first.copy()
        merged.update(second)

    save_object(merged, target_filename)
    return True


def save_object(obj, filename):
    """
      Pickles obj into the file at filename using the highest pickle protocol available.
      An existing file at that location is overwritten.
    """
    with open(filename, mode='wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def read_object(filename):
    """
      Loads and returns the pickled object stored at filename.

      Returns None when the file does not exist, or when opening it fails with FileNotFoundError or
      PermissionError. Any other error (for example a corrupt pickle) propagates to the caller.

      NOTE: unpickling can execute arbitrary code, so this must only be pointed at trusted files.
    """
    try:
        if not exists(filename):
            return None

        print(f"{filename} exists")
        with open(filename, 'rb') as handle:
            return pickle.load(handle)
    except (FileNotFoundError, PermissionError):
        # The file vanished or became unreadable between the check and the open: treat it as missing
        return None


def chunk(it, size):
    """
      Lazily splits the iterable it into consecutive tuples of at most size items.
      The final tuple may be shorter; iteration stops at the first empty tuple.
    """
    source = iter(it)

    def _batches():
        while True:
            batch = tuple(islice(source, size))
            if batch == ():
                return
            yield batch

    return _batches()

### Extracting paragraphs and headings from the given Wikipedia article

In [4]:
def extract_paras_and_heads(doc_title, chunk_size=8, timeout=10):
    """
      Scrapes the English Wikipedia page for doc_title and returns a (heads, paras) pair.

      doc_title:  title of the Wikipedia article to fetch
      chunk_size: maximum number of sentences per returned paragraph chunk
      timeout:    seconds to wait for Wikipedia before requests raises a timeout error

      paras holds the intro paragraph first, followed by chunks of at most chunk_size sentences taken
      from every later paragraph that has more than one sentence (shorter paragraphs are ignored since
      insignificant paragraphs skew the results).
      heads holds doc_title first (standing in as the heading of the intro paragraph), followed by the
      text of every section headline found on the page. The two lists are built independently, so
      beyond the first entry they are not index-aligned.

      Returns None when the server answers with an error status or when no usable paragraph was found.
      Network errors raised by requests propagate to the caller.
    """
    # Imported locally so this cell does not depend on an edit to the imports cell
    from urllib.parse import quote

    # Percent-encode the title so characters such as '?' or '#' stay part of the page name instead of
    # being interpreted as the start of a query string or fragment
    url = f"https://en.wikipedia.org/wiki/{quote(doc_title)}"
    page = requests.get(url, timeout=timeout)

    # An error page (e.g. 404 for an unknown title) must not be parsed as if it were the article
    if not page.ok:
        return None

    soup = BeautifulSoup(page.content, 'lxml')

    # Extract paragraph text, ignoring any empty class paragraphs
    # Fetch the intro paragraph separately since it isn't associated with a heading
    # Handles the case of multiple paragraphs under a single heading
    paras = []
    intro_para = ""
    all_paragraphs = soup.find_all('p', class_=lambda x: x != 'mw-empty-elt')

    for p_id, paragraph in enumerate(all_paragraphs):
        # Strip bracketed markers such as citation references ("[1]")
        p_text = re.sub(r"\[.*?\]+", '', paragraph.text)
        if p_id == 0:
            intro_para = p_text
            continue

        p_tok = nltk.tokenize.sent_tokenize(p_text)
        if len(p_tok) > 1:
            paras.extend(' '.join(p_chunk) for p_chunk in chunk(p_tok, chunk_size))

    if len(paras) == 0:
        return None

    # Extract text from the section headlines
    heads = [str(head.text) for head in soup.find_all('span', class_='mw-headline')]

    # The first paragraph is the introductory paragraph and doesn't have a heading
    # Set its heading as the document title
    heads.insert(0, doc_title)
    paras.insert(0, intro_para)

    return heads, paras

### TF-IDF Similarity

In [5]:
def calculate_tfidf_similarity(base_document, documents):
    """
      Returns the cosine similarity between base_document and every entry of documents.

      base_document: text to compare against
      documents:     iterable of texts to score

      All texts are embedded together with one TF-IDF vectorizer over binary character n-grams of
      length 1 to 3, so they share the same vocabulary. The result is a 1-D numpy array with one score
      per document, in the order of documents; it is empty when documents is empty.
    """
    # To make uniformed vectors, both documents need to be combined first.
    d = [base_document, *documents]

    # With nothing to compare against, cosine_similarity would raise on the empty matrix
    if len(d) == 1:
        return np.array([])

    # TODO: Hyper-parameter tuning of TF-IDF vectorizer for calculating document embeddings
    # No stop_words here: scikit-learn only applies them when analyzer == 'word', so passing them
    # together with the character analyzer has no effect apart from emitting a warning
    vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3), analyzer='char', lowercase=True)
    embeddings = vectorizer.fit_transform(d)
    print(embeddings.shape, len(d))

    # Row 0 is the base document, the remaining rows are the candidates
    cosine_similarities = cosine_similarity(embeddings[0:1], embeddings[1:]).flatten()
    print(f"DOCUMENT COSINE SIM: {cosine_similarities}")
    return cosine_similarities

## KnowledgeSource class

In [6]:
class KnowledgeSource:
    """
      Cache of Wikipedia articles used to pick the paragraph most relevant to a message.

      Candidate articles come either from a live Wikipedia search or, in offline mode, from a TF-IDF
      ranking of the articles already held in the cache. The shape of a cache entry depends on the mode:
        - multi source  (num_results > 1):  article title -> list of paragraph strings
        - single source (num_results <= 1): article title -> (content, paragraph embeddings,
                                                               tokenized paragraphs)
    """

    def __init__(self, model=None, num_results=3, persist=True, persist_path='', use_hot_cache=True, offline=False):
        """
          model:         object exposing encode(list_of_texts); when None a SentenceTransformer is loaded
                         from the bundled model directory on the 'cuda' device
          num_results:   number of articles to consider per topic; more than one enables multi source mode
          persist:       whether close() should write the cache back to disk
          persist_path:  directory of the cache file, '' for the current directory
          use_hot_cache: whether to start from a previously persisted cache when one can be read
          offline:       whether to select articles from the cache instead of searching Wikipedia
        """
        self.use_multi_source = num_results > 1
        self.ks_filename = MULTI_KS_FILENAME if self.use_multi_source else SINGLE_KS_FILENAME

        # Initialize knowledge source, optionally from a previously persisted file
        # A '/' is only inserted when a directory was given and it does not already end with one
        separator = '/' if persist_path != '' and persist_path[-1] != '/' else ''
        persist_location = persist_path + separator + self.ks_filename
        ks = read_object(persist_location) if use_hot_cache else None
        if ks is None:
            print("CREATING NEW KNOWLEDGE SOURCE")
            self.article_db = {}
        else:
            print("USING PERSISTED KNOWLEDGE SOURCE")
            self.article_db = ks

        self.num_results = num_results

        self.sentence_model = SentenceTransformer('../../../models/all-MiniLM-L6-v2', device='cuda') if model is None \
            else model

        # Whether the knowledge source object should be persisted at the end of execution
        self.persist = persist
        self.persist_location = persist_location if persist else None

        # Whether to use Wikipedia or to use cached database
        self.offline = offline

    def build_db(self, topics):
        """
          Given a list of topics, each consisting of a list of keywords corresponding to the topic, build a
          database of articles holding up to num_results articles for each topic in the list. Every article
          found by the Wikipedia search is fetched and cached in the format of the current mode; articles
          that are already cached are left untouched.
        """
        for topic_keywords in topics:
            topic_search_str = ' '.join(topic_keywords)

            print(f"Fetching data for {topic_search_str}")
            articles = wikipedia.search(topic_search_str, results=self.num_results)
            print(f"Using the following relevant articles: {articles}")

            # The fetch helpers cache the article as a side effect; their return value is not needed here
            for article in articles:
                if self.use_multi_source:
                    self.__fetch_multi_source_article_data(article)
                else:
                    self.__fetch_single_source_article_data(article)

    def fetch_relevant_articles(self, topics):
        """
          Returns the titles of up to num_results articles relevant to topics, a list of keywords.

          Online, the keywords are sent to the Wikipedia search and a failed search yields an empty list.
          Offline, the cached articles are ranked by TF-IDF similarity between the keywords and the first
          element of each cache entry, most similar first. An empty keyword list or an empty cache yields
          an empty list.
        """
        if len(topics) == 0:
            return []

        topic_search_str = ' '.join(topics)

        # If we are not offline we search the keywords in Wikipedia
        if not self.offline:
            print(f"Fetching data for '{topic_search_str}'")
            try:
                articles = wikipedia.search(topic_search_str, results=self.num_results)
            except Exception as err:
                # Best effort: a failed search means no articles, but Ctrl-C / SystemExit still propagate
                print(f"Wikipedia search failed: {err}")
                articles = []
            print(f"Using the following relevant articles: {articles}")
            return articles

        # Searching from the local database
        stored_articles = list(self.article_db.keys())
        if len(stored_articles) == 0:
            return []

        all_articles = []
        for title in stored_articles:
            entry = self.article_db[title]
            # Element 0 is the article content (single source) or its first paragraph (multi source)
            all_articles.append(entry[0] if len(entry) > 0 else "")

        doc_c_sim = calculate_tfidf_similarity(topic_search_str, all_articles)

        # argsort is ascending, so flip it to get the most similar articles first
        ranked_ids = np.flip(doc_c_sim.argsort())
        return [stored_articles[_id] for _id in ranked_ids[:max(self.num_results, 0)]]

    def fetch_topic_data(self, topics, message):
        """
          Returns (sentences, article_title): the sentences of the cached paragraph most similar to
          message among the articles relevant to topics, and the title of the article it belongs to.
          Returns ([], "") when no article or paragraph is available.
        """
        if self.use_multi_source:
            selected_para_tok, selected_article_title = self.__fetch_multi_source_topic_data(topics, message)
        else:
            selected_para_tok, selected_article_title = self.__fetch_single_source_topic_data(topics, message)
        print(f"======= SELECTED PARAGRAPH:\n{selected_para_tok}")
        return selected_para_tok, selected_article_title

    def __fetch_multi_source_topic_data(self, topics, message):
        """
          Pools the paragraphs of every relevant article and selects the one whose sentence embedding is
          closest to the embedding of message.
        """
        articles = self.fetch_relevant_articles(topics)

        if len(articles) == 0:
            return [], ""

        topic_paras = []    # Mini paragraphs comprising all mini paragraphs of each article relevant to the topic
        para_titles = []    # para_titles[i] is the title of the article that topic_paras[i] came from
        for article in articles:
            article_paras = self.__fetch_multi_source_article_data(article)

            if article_paras is not None:
                topic_paras.extend(article_paras)
                para_titles.extend([article] * len(article_paras))

        if len(topic_paras) == 0:
            return [], ""

        # Once the topic paras have been accumulated, the embeddings for them, and the most similar para can be found
        topic_paras_embeddings = self.sentence_model.encode(topic_paras)

        # Encode the input message, and use the topic para embeddings to calculate cosine similarity
        # The resulting scores can be used to find the most similar paragraph in all articles relevant to the topic
        para_c_sim = cosine_similarity(self.sentence_model.encode([message]), topic_paras_embeddings).flatten()
        selected_para_id = para_c_sim.argmax()

        # Report the article the selected paragraph actually belongs to, not simply the first search hit
        return nltk.tokenize.sent_tokenize(topic_paras[selected_para_id]), para_titles[selected_para_id]

    def __fetch_multi_source_article_data(self, title):
        """
          Returns the list of paragraphs of the article title, fetching and caching them on a cache miss.
          Paragraphs of one sentence or less are dropped. Returns None when the article cannot be parsed.
        """
        article_paras = self.article_db.get(title, None)
        if article_paras is not None:
            print(f"Topic '{title}' has already been cached")
            return article_paras

        print(f"Topic '{title}' has not been cached, fetching and building...")
        article_data = extract_paras_and_heads(title, chunk_size=3)

        if article_data is None:
            print(f"Data for '{title}' could not be parsed, ignoring this article")
            return None

        # Obtained mini docs from the article data, and filter out paragraphs that are too short
        _, raw_paras = article_data
        article_paras = [para for para in raw_paras if len(nltk.tokenize.sent_tokenize(para)) > 1]

        # Cache article paras
        self.article_db[title] = article_paras
        return article_paras

    def __fetch_single_source_topic_data(self, topics, message):
        """
          Selects the article whose content is most TF-IDF-similar to message, then the paragraph of that
          article whose sentence embedding is closest to the embedding of message.
        """
        articles = self.fetch_relevant_articles(topics)

        if articles is None or len(articles) == 0:
            return [], ""

        # The three lists are index-aligned and only hold the articles that could actually be used
        usable_titles = []
        docs_content = []
        articles_data = []
        for article in articles:
            article_data = self.__fetch_single_source_article_data(article)

            # Ignore article if its data couldn't be scraped or it has no paragraph to select from
            if article_data is None or len(article_data[2]) == 0:
                continue

            usable_titles.append(article)
            docs_content.append(article_data[0])
            articles_data.append(article_data)

        # Obtain the cosine similarity of the message with the relevant articles for the current topic
        if len(docs_content) == 0:
            return [], ""

        doc_c_sim = calculate_tfidf_similarity(message, docs_content)

        # Select the most similar article; the index refers to the usable articles, not to the search result
        selected_article_id = doc_c_sim.argmax()
        selected_title = usable_titles[selected_article_id]
        print(f"'{selected_title}' has been selected as the most relevant article")

        # Now, select the most relevant mini doc (chunk of one or more paragraphs) in the most relevant article
        selected_article_data = articles_data[selected_article_id]
        mini_doc_embeddings, mini_docs = selected_article_data[1], selected_article_data[2]

        # Entries persisted by an earlier version embedded the unfiltered paragraphs, which leaves the
        # embeddings out of step with mini_docs; re-embed from the stored sentences in that case
        if len(mini_doc_embeddings) != len(mini_docs):
            mini_doc_embeddings = self.sentence_model.encode([' '.join(doc) for doc in mini_docs])

        # Use the embedding of the input message with the embeddings of the paragraphs in the article
        # to calculate cosine similarity
        para_c_sim = cosine_similarity(self.sentence_model.encode([message]), mini_doc_embeddings).flatten()
        selected_para_id = para_c_sim.argmax()

        return mini_docs[selected_para_id], selected_title

    def __fetch_single_source_article_data(self, title):
        """
          Returns the cache entry (content, mini_doc_embeddings, mini_docs) of the article title, fetching
          and caching it on a cache miss. Returns None when the article cannot be parsed or has no
          paragraph of more than one sentence.
        """
        article_db_entry = self.article_db.get(title, None)
        if article_db_entry is not None:
            print(f"Topic '{title}' has already been cached")
            return article_db_entry

        print(f"Topic '{title}' has not been cached, fetching and building...")

        article_data = extract_paras_and_heads(title, chunk_size=3)

        # If no paragraphs could be retrieved, this article is useless
        if article_data is None:
            return None

        _, paras = article_data

        # Keep only paragraphs of more than one sentence. kept_paras and mini_docs are index-aligned:
        # kept_paras holds the paragraph text, mini_docs its tokenized sentences used later for QA
        kept_paras = []
        mini_docs = []
        for para in paras:
            para_tok = nltk.tokenize.sent_tokenize(para)
            if len(para_tok) <= 1:
                continue
            kept_paras.append(para)
            mini_docs.append(para_tok)

        if len(kept_paras) == 0:
            return None

        # Accumulate the total content of the article; the leading space matches the format of
        # entries that were persisted before
        content = " " + " ".join(kept_paras)

        # Embed exactly the kept paragraphs so that embedding i always describes mini_docs[i]
        mini_doc_embeddings = self.sentence_model.encode(kept_paras)

        # For this new article, save the following:
        # entire article text content: used to calculate document-level embeddings when selecting most relevant document
        # embeddings of the mini docs of this article: used when selecting the most-relevant paragraph
        # the tokenized sentences of the mini docs of this article: used when applying QA to the most relevant paragraph
        article_db_entry = (content, mini_doc_embeddings, mini_docs)
        self.article_db[title] = article_db_entry

        return article_db_entry

    def close(self):
        """
         Save the knowledge acquired over the course of Knowledge Source's lifetime to disk
         Does nothing when the object was created with persist=False
        """
        if not self.persist:
            return
        save_object(self.article_db, self.persist_location)

## Prototyping

#### Creating models

In [7]:
# Load the sentence-embedding model with CUDA disabled
sentence_model = SentenceModel('../../models/all-mpnet-base-v2', use_cuda=False)

# Open the persisted multi-article knowledge source in offline mode, so that articles are selected
# from the cache instead of a live Wikipedia search
knowledge_db = KnowledgeSource(
    model=sentence_model.model,
    num_results=3,
    persist=True,
    persist_path='../../res/knowledge_presets',
    use_hot_cache=True,
    offline=True,
)

../../res/knowledge_presets/multi_knowledge_source.pkl exists
USING PERSISTED KNOWLEDGE SOURCE


In [8]:
# Select the cached paragraph that best matches the message, searching the articles ranked for the keywords
knowledge_text, knowledge_article = knowledge_db.fetch_topic_data(
    topics=['vaccine', 'Pfizer', 'one'],
    message="I heard Pfizer's vaccine was one that was particularly effective",
)

(18, 3092) 18
DOCUMENT COSINE SIM: [0.18987818 0.1425059  0.13238943 0.11496557 0.10501283 0.09158884
 0.04689385 0.0574866  0.07880864 0.0667633  0.04302766 0.04029826
 0.04946291 0.04740884 0.05135179 0.06128076 0.05677339]
Topic 'Pfizer–BioNTech COVID-19 vaccine' has already been cached
Topic 'Pfizer' has already been cached
Topic 'COVID-19 vaccination in Bangladesh' has already been cached
["On November 9, 2020, Pfizer announced that BioNTech's COVID-19 vaccine, tested on 43,500 people, was found to be 90% effective at preventing symptomatic COVID-19.", 'The efficacy was updated to 95% a week later.', 'Akiko Iwasaki, an immunologist interviewed by the New York Times, described the efficacy figure as "really a spectacular number."']


### Printing out all the stored data

In [107]:
# List the title of every article currently held in the cache, in insertion order
for cached_title in knowledge_db.article_db:
    print(cached_title)

Pfizer–BioNTech COVID-19 vaccine
Pfizer
COVID-19 vaccination in Bangladesh
Moderna COVID-19 vaccine
Moderna
MRNA vaccine
COVID-19 pandemic in the United States
COVID-19 misinformation
John F. Kennedy assassination conspiracy theories
Conspiracy theories related to the Trump–Ukraine scandal
Spygate (conspiracy theory)
George Soros
Mandatory Fun
TikTok
Conspiracy Theories and Interior Design
Toyota in Formula One
The Lunar Injection Kool Aid Eclipse Conspiracy
