<a href="https://colab.research.google.com/github/tranhoangnguyen03/BSHR_Loop/blob/main/demo01/notebook/BSHR_Loop_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip -q install openai loguru cohere tiktoken
# Allow print to wrap-line
from IPython.display import HTML, display

def set_css(*_event_args):
  """Emit a <style> block that makes text inside <pre> elements wrap.

  Registered below as a 'pre_run_cell' callback, so it is invoked before each
  cell runs. Any positional arguments are accepted and ignored: IPython may
  hand the callback an execution-info object, and a zero-argument function
  would fail when that happens.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

# Lift pandas' display limits: setting both options to None removes the cap
# on the number of columns shown and on the width of each column's contents.
import pandas as pd
for _display_option in ('display.max_columns', "max_colwidth"):
    pd.set_option(_display_option, None)

In [11]:
import openai
import requests
from loguru import logger
import re
import time

class PromptTemplates:
    """System prompts used by the pipeline, one constant per stage."""

    # Query generation; the literal "<number>" is a placeholder the caller
    # replaces with the desired number of queries.
    BRAINSTORM = "# MISSION \n" + " ".join([
        "Generate a list of <number> search queries based on a given topic.",
        "Focus on comprehensive and counterfactual queries, leveraging information foraging techniques.",
    ])

    # Turning search results into a candidate answer.
    HYPOTHESIZE = "Based on the provided search results, formulate a hypothesis as an answer to the main query."

    # Compressing text into a Sparse Priming Representation.
    CONDENSER = "# MISSION \n" + " ".join([
        "Convert the given input into a Sparse Priming Representation (SPR).",
        "Distill the content into succinct statements, capturing core concepts with minimal words.",
        "Target audience: Advanced NLP models.",
    ])

    # Forcing a yes/no verdict.
    YES_NO_MACHINE = "You are a binary validator. You respond strictly with either 'yes' or 'no'."

    # Merging two hypotheses into one.
    REFINE = "Merge and refine the provided two hypotheses to form a comprehensive and more accurate answer."

    # Producing the final user-facing answer.
    RESPOND = "Using the provided context and final hypothesis, craft a concise and conversational response to answer the query."

class Utils:
    """Small text helpers used by the pipeline."""

    @staticmethod
    def clean_search_query(query):
        """Return *query* without a leading list number or surrounding double quotes.

        A prefix of digits, a period and optional whitespace (e.g. "3. ") is
        dropped when it sits at the very start of the string; afterwards any
        double-quote characters at either end are stripped.
        """
        numbering = re.match(r"^\d+\.\s*", query)
        body = query[numbering.end():] if numbering else query
        return body.strip("\"")

class BSHRPipeline:
    """Brainstorm -> Search -> Hypothesize -> Refine pipeline.

    For a main query the pipeline asks a chat model for search queries, looks
    each one up on Wikipedia, turns every usable result into a hypothesis,
    merges the hypotheses pairwise until one remains, and finally asks the
    model for a conversational answer.

    Attributes:
        model: Name of the chat model sent with every completion request.
        headers: Extra HTTP headers sent with every completion request.
    """

    WIKIPEDIA_API_URL = 'https://en.wikipedia.org/w/api.php'
    # Seconds to wait for each Wikipedia response; without a timeout a stalled
    # connection would block the pipeline indefinitely.
    WIKIPEDIA_TIMEOUT = 30

    def __init__(self, api_key, model: str = None, mock_open_ai: bool = True):
        """Configure the `openai` module and pick the model.

        :param api_key: API key stored on the `openai` module (module-wide).
        :param model: Model name; defaults to 'mistralai/mistral-7b-instruct'
            when `mock_open_ai` is true, otherwise 'gpt-3.5-turbo'.
        :param mock_open_ai: When true, requests are pointed at the OpenRouter
            endpoint instead of the library's default base URL.
        """
        openai.api_key = api_key
        if mock_open_ai:
            # NOTE: this changes module-level state and is not undone by a
            # later instance created with mock_open_ai=False.
            openai.api_base = "https://openrouter.ai/api/v1"
            self.model = model or 'mistralai/mistral-7b-instruct'
            self.headers = {
                "HTTP-Referer": 'http://localhost:3000',  # To identify your app. Can be set to e.g. http://localhost:3000 for testing
                "X-Title": 'BSHRPipeline',  # Optional. Shows on openrouter.ai
            }
        else:
            self.model = model or 'gpt-3.5-turbo'
            self.headers = {}

        logger.info("Initialized BSHRPipeline")

    def _chat(self, system_prompt, user_prompt):
        """Send one system + user exchange to the model and return the reply text."""
        response = openai.ChatCompletion.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            headers=self.headers,
        )
        return response.choices[0].message['content']

    @staticmethod
    def _parse_search_queries(raw_text, n):
        """Split a model reply into at most *n* non-blank, stripped lines."""
        stripped_lines = (line.strip() for line in raw_text.splitlines())
        return [line for line in stripped_lines if line][:n]

    @staticmethod
    def _is_affirmative(answer):
        """Interpret a yes/no style reply; return True only for a positive verdict.

        The first standalone "yes" or "no" decides. If neither word occurs,
        the reply counts as positive only when it contains the standalone word
        "relevant" without "not" or "irrelevant". Whole-word matching matters:
        a plain substring test treats "irrelevant" as "relevant".
        """
        text = answer.strip().lower()
        verdict = re.search(r"\b(yes|no)\b", text)
        if verdict:
            return verdict.group(1) == "yes"
        mentions_relevant = bool(re.search(r"\brelevant\b", text))
        return mentions_relevant and not re.search(r"\b(not|irrelevant)\b", text)

    def brainstorm(self, main_query, n=5):
        """Ask the model for search queries about *main_query*.

        :param main_query: Topic or question to generate queries for.
        :param n: Number of queries requested; also the maximum returned.
        :return: List of up to *n* non-blank lines from the model's reply.
        """
        logger.info(f"Brainstorming search queries for: {main_query}")
        reply = self._chat(
            PromptTemplates.BRAINSTORM.replace('<number>', f'{n}'),
            f"Brainstorm {n} search queries based on the topic: {main_query}.",
        )
        # Blank lines in the reply must not consume one of the n slots or be
        # sent to Wikipedia as empty searches.
        search_queries = self._parse_search_queries(reply, n)
        logger.info(f"Generated search queries: {search_queries}")
        return search_queries

    def search_wikipedia(self, query):
        """Look up *query* on Wikipedia and return a condensed intro extract.

        The top search hit is used, but only if the model judges its title
        relevant to the query.

        :param query: Raw search query (list numbering and quotes are removed).
        :return: Condensed text of the page intro, or None when there is no
            search hit, the hit is judged irrelevant, or the page has no
            extract.
        :raises requests.RequestException: On timeout, connection failure or
            a non-success HTTP status.
        """
        cleaned_query = Utils.clean_search_query(query)
        logger.info(f"Searching Wikipedia for: {cleaned_query}")
        search_params = {
            'action': 'query',
            'list': 'search',
            'srsearch': cleaned_query,
            'format': 'json'
        }

        response = requests.get(
            self.WIKIPEDIA_API_URL, params=search_params, timeout=self.WIKIPEDIA_TIMEOUT
        )
        response.raise_for_status()
        hits = response.json().get('query', {}).get('search')

        if not hits:
            logger.warning(f"No results found on Wikipedia for: {cleaned_query}")
            return None

        # Only the first (top-ranked) result is considered.
        title = hits[0]['title']

        if not self.relevancy_check(cleaned_query, title):
            logger.warning(f"Search title '{title}' was deemed irrelevant to the query: '{cleaned_query}'")
            return None

        content_params = {
            'action': 'query',
            'prop': 'extracts',
            'exintro': True,
            'explaintext': True,
            'titles': title,
            'format': 'json'
        }

        response = requests.get(
            self.WIKIPEDIA_API_URL, params=content_params, timeout=self.WIKIPEDIA_TIMEOUT
        )
        response.raise_for_status()

        # 'pages' is keyed by page id; a single title was requested, so take
        # the first entry.
        pages = response.json().get('query', {}).get('pages', {})
        first_page = next(iter(pages.values()), {})
        extract = first_page.get('extract')
        if not extract:
            logger.warning(f"No extract available on Wikipedia for: {title}")
            return None
        logger.info(f"Extracted Wikipedia content for: {title}")

        return self.condense_information(extract)

    def condense_information(self, content):
        """Return the model's condensed (SPR) version of *content*."""
        logger.info(f"Condensing information")
        condensed_content = self._chat(PromptTemplates.CONDENSER, content)
        logger.info(f"Condensed content: {condensed_content}")
        return condensed_content

    def relevancy_check(self, search_query, search_title):
        """Ask the model whether an article title looks relevant to a query.

        :return: True if the reply is a positive verdict, False otherwise.
        """
        logger.info(f"Checking relevance of search title: '{search_title}' for the query: '{search_query}'")
        reply = self._chat(
            PromptTemplates.YES_NO_MACHINE,
            f"Is it likely that an article titled '{search_title}' contains relevant information to the search query '{search_query}'?",
        )
        time.sleep(1)  # avoid going over api limits
        return self._is_affirmative(reply)

    def generate_hypothesis(self, search_results):
        """Ask the model to formulate a hypothesis from *search_results*."""
        logger.info(f"Generating hypothesis for search results")
        hypothesis = self._chat(
            PromptTemplates.HYPOTHESIZE,
            f"Based on the following search results, formulate a hypothesis:\n{search_results}",
        )
        logger.info(f"Generated hypothesis: {hypothesis}")
        return hypothesis

    def hypothesis_tournament(self, hypotheses):
        """
        Reduce a list of hypotheses to a single one through pairwise rounds.

        The tournament is structured as follows:
        1. Pair up neighbouring hypotheses.
        2. Replace each pair with the result of `compete_hypotheses`
           (a model-written merge of the two).
        3. If there's an odd number of hypotheses, the last one gets a bye to the next round.
        4. Repeat the process until one hypothesis remains.

        The caller's list is not modified.

        :param hypotheses: Non-empty list of hypotheses to compete.
        :return: The single remaining hypothesis.
        :raises IndexError: If *hypotheses* is empty.
        """
        if not hypotheses:
            # IndexError (rather than ValueError) keeps the failure type that
            # indexing the empty list produced before this guard existed.
            raise IndexError("hypothesis_tournament requires at least one hypothesis")

        logger.info(f"Starting hypothesis tournament with {len(hypotheses)} hypotheses")
        while len(hypotheses) > 1:
            next_round_hypotheses = []
            for i in range(0, len(hypotheses), 2):
                if i + 1 < len(hypotheses):
                    winner_hypothesis = self.compete_hypotheses(hypotheses[i], hypotheses[i + 1])
                    next_round_hypotheses.append(winner_hypothesis)
                else:
                    # If an odd number of hypotheses, pass the last one to the next round.
                    next_round_hypotheses.append(hypotheses[i])
            hypotheses = next_round_hypotheses
        winner = hypotheses[0]
        logger.info(f"Winner of the tournament: {winner}")
        return winner

    def compete_hypotheses(self, hypothesis_a, hypothesis_b):
        """Ask the model to merge two hypotheses and return the refined text."""
        logger.info(f"Comparing hypothesis A: '{hypothesis_a}' with hypothesis B: '{hypothesis_b}'")
        refined_hypothesis = self._chat(
            PromptTemplates.REFINE,
            f"Refine and merge the following hypotheses to create a superior hypothesis:\n1. {hypothesis_a}\n2. {hypothesis_b}",
        )
        logger.info(f"Refined hypothesis: {refined_hypothesis}")
        return refined_hypothesis

    def conversational_response(self, context, final_hypothesis):
        """Ask the model to phrase *final_hypothesis* as a conversational answer."""
        logger.info(f"Crafting a conversational response for the final hypothesis")
        conversational_answer = self._chat(
            PromptTemplates.RESPOND,
            f"Context: {context}\n\nFinal Hypothesis: {final_hypothesis}\n\nPlease respond in a natural conversational manner.",
        )
        logger.info(f"Conversational response: {conversational_answer}")
        return conversational_answer

    def run(self, main_query):
        """Run the whole pipeline for *main_query* and print a summary.

        :param main_query: The question to answer.
        :return: The conversational answer, or None when no search query
            produced a usable Wikipedia result.
        """
        logger.info(f"Running the pipeline for main query: {main_query}")
        # Step 1: Brainstorming
        search_queries = self.brainstorm(main_query)

        # Step 2 & 3: Searching and Hypothesizing
        # Each query is searched exactly once: a search costs two HTTP requests
        # plus model calls, and a repeated search could disagree with the first
        # because the relevance verdict comes from the model.
        hypotheses = []
        for query in search_queries:
            search_results = self.search_wikipedia(query)
            if search_results is None:
                continue
            hypotheses.append(self.generate_hypothesis(search_results))

        if not hypotheses:
            logger.warning(f"No usable search results for the main query: {main_query}")
            return None

        condensed_hypotheses = [
            self.condense_information(hypothesis)
            for hypothesis in hypotheses
        ]

        # Step 4: Refining through Hypothesis Tournament
        final_hypothesis = self.hypothesis_tournament(condensed_hypotheses)
        logger.info(f"Final hypothesis for the query: {final_hypothesis}")

        final_response = self.conversational_response(
            context=f'Question: {main_query}',
            final_hypothesis=final_hypothesis
        )

        print(f"\n\nMain Query: {main_query}")
        print(f"Search Queries: {search_queries}")
        print(f"Hypotheses: {hypotheses}")
        print(f"Final Hypothesis: {final_hypothesis}")

        print('\n\n------------ FINAL RESPONSE ------------')
        print(final_response)
        return final_response

# Usage: build a pipeline routed through OpenRouter.
# Replace the placeholder below with a real key before running.
pipeline = BSHRPipeline(
    mock_open_ai=True,
    model='mistralai/mistral-7b-instruct',  # currently this is free
    api_key='go-to---openrouter.ai---to-sign-up-for-a-key',
)

[32m2023-10-26 17:41:22.533[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m61[0m - [1mInitialized BSHRPipeline[0m


In [12]:
import sys

# Drop the handlers loguru currently has and log to stderr at INFO and above
# (change the level to "DEBUG" for more detail).
logger.remove()
logger.add(sys.stderr, level="INFO")

# Run the full pipeline on a sample question and keep the answer.
final_response = pipeline.run("How many people were affected by the two world wars?")

[32m2023-10-26 17:41:38.609[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m224[0m - [1mRunning the pipeline for main query: How many people were affected by the two world wars?[0m
[32m2023-10-26 17:41:38.612[0m | [1mINFO    [0m | [36m__main__[0m:[36mbrainstorm[0m:[36m64[0m - [1mBrainstorming search queries for: How many people were affected by the two world wars?[0m
[32m2023-10-26 17:41:45.673[0m | [1mINFO    [0m | [36m__main__[0m:[36mbrainstorm[0m:[36m74[0m - [1mGenerated search queries: ['1. What were the total number of casualties in both world wars?', '2. How many people were displaced or forced to migrate due to the two world wars?', '3. What were the long-term effects of the two world wars on global population growth?', '4. How did the two world wars impact the distribution of wealth and resources globally?', '5. What were the demographic changes in different regions as a result of the two world wars?'][0m
[32m2023-10-26 17:41:45.677[0m



Main Query: How many people were affected by the two world wars?
Search Queries: ['1. What were the total number of casualties in both world wars?', '2. How many people were displaced or forced to migrate due to the two world wars?', '3. What were the long-term effects of the two world wars on global population growth?', '4. How did the two world wars impact the distribution of wealth and resources globally?', '5. What were the demographic changes in different regions as a result of the two world wars?']
Hypotheses: ['Hypothesis: Population transfer, as a type of mass migration, has been imposed by state policy or international authority throughout history, often on the basis of ethnicity or religion, resulting in the loss of immovable and movable property and causing substantial harm to the affected population.', 'Hypothesis: The population of Palestine has undergone changes in size and ethnic composition throughout history.']
Final Hypothesis: A comprehensive and more accurate hypo