In [20]:
from langchain.callbacks import get_openai_callback
from langchain.chains import LLMChain
from langchain_openai import OpenAI, ChatOpenAI
from langchain.prompts import PromptTemplate
# from langchain.chat_models import ChatOpenAI
from langchain.chains import SequentialChain
from langchain_experimental.pal_chain import PALChain
from sentence_transformers import SentenceTransformer, util

import asyncio
import nest_asyncio
import numpy as np
import os
import re

nest_asyncio.apply()
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [21]:
template ="""
Step1 :
 
Here is a question {input}. Could you brainstorm three distinct solutions? Please consider a variety of factors 
A:
"""

prompt = PromptTemplate(
    input_variables=["input"],
    template = template                      
)

chain1 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    output_key="solutions",
    verbose=False
)

template ="""
Step 2:

For each of the three proposed solutions, evaluate their potential. Consider their pros and cons, initial effort needed, implementation difficulty, potential challenges, and the expected outcomes. Assign a probability of success and a confidence level to each option based on these factors

{solutions}

A:"""

prompt = PromptTemplate(
    input_variables=["solutions"],
    template = template                      
)

chain2 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    output_key="review",
    verbose=False
)

template ="""
Step 3:

For each solution, deepen the thought process. Generate potential scenarios, strategies for implementation, any necessary partnerships or resources, and how potential obstacles might be overcome. Also, consider any potential unexpected outcomes and how they might be handled.

{review}

A:"""

prompt = PromptTemplate(
    input_variables=["review"],
    template = template                      
)

chain3 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    output_key="deepen_thought_process",
    verbose=False
)

template ="""
Step 4:

Based on the evaluations and scenarios, rank the solutions in order of promise. Provide a justification for each ranking and offer any final thoughts or considerations for each solution. Finally reword the response as possible steps to take to achieve the goal
{deepen_thought_process}

A:"""

prompt = PromptTemplate(
    input_variables=["deepen_thought_process"],
    template = template                      
)

chain4 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    verbose=False,
    output_key="ranked_solutions"
)

tot_chain = SequentialChain(
    chains=[chain1, chain2, chain3, chain4],
    input_variables=["input"],
    output_variables=["ranked_solutions"],
    verbose=False
)

In [22]:
weak_llm = "gpt-3.5-turbo-instruct"
strong_llm= "gpt-4"

# Load the pre-trained sentence embedding model
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2', cache_folder='/mnt/model_cache/') # change this location to where you want to store the embedding model

pal_llm_temperature = 0.7
llm = OpenAI(temperature=pal_llm_temperature, model=weak_llm)

pal_chain = PALChain.from_math_prompt(llm=llm, verbose=False)

In [23]:
template = """Question: {question}

Answer: Let's think step by step. If you do not know the answer or are not confident about the answer reply that you don't know the answer. Do not hallucinate an answer. Just return the number as an answer"""

cot_llm_temperature = 0.7
prompt = PromptTemplate.from_template(template)
cot_llm_chain = LLMChain(llm=OpenAI(temperature=cot_llm_temperature,
                                    model=weak_llm),
                         prompt=prompt)
cot_strong_llm_chain = LLMChain(llm=ChatOpenAI(model=strong_llm), 
                                prompt=prompt)

In [24]:
def get_final_answer(samples):
    """
    Identifies the most similar answer among a list of samples based on cosine similarity.

    Parameters:
    - samples (list of dicts): A list where each element is a dict containing an 'text' key.

    Returns:
    - tuple: The most similar answer and the matrix of similarity scores between all answers.
    """
    
    # Extract texts from samples
    answers = [sample['text'] for sample in samples]
    
    # Encode the answers into vectors and calculate similarity scores
    answer_embeddings = embedding_model.encode(answers, convert_to_tensor=True)
    similarity_scores = util.pytorch_cos_sim(answer_embeddings, answer_embeddings).numpy()

    # Determine the index of the most similar answer
    most_similar_idx = np.argmax(np.mean(similarity_scores, axis=1))
    most_similar_answer = answers[most_similar_idx]

    print(f"The answer that agrees the most with the majority is: {most_similar_answer}")
    return most_similar_answer, similarity_scores

In [25]:
async def decision_by_vote(question: str, n_samples: int = 3, mode: str = 'mot'):
    """
    Decides on an answer by collecting samples based using the specified thought representation.

    This function generates input questions based on the specified mode, processes samples from different language
    models, and then determines the most consistent answer along with its similarity score.

    Parameters:
    - question: The question to be answered by the models.
    - n_samples: The number of samples to generate for voting. Default is 3.
    - mode: The mode of decision making. Can be 'mot' (use mixture of thought), 'cot' (chain of thought),
      'pal' (program assisted learning), or 'tot' (tree of thought). The mode is case insensitive.

    Returns:
    - llm_answer: The answer selected as the most consistent across generated samples.
    - s_score: The s score for the most consistent answer, indicating agreement level among samples.
    """
    
    # Generate input questions based on mode
    mode = mode.lower()
    input_key = 'input' if mode == 'tot' else 'question'
    input_questions = [{input_key: question}] * n_samples

    # Initialize variables
    llm_answer = None
    similarity_matrix = None
    samples = []

    # Function to process samples and remove newlines
    def process_samples(raw_samples, key):
        return [{'text': sample[key]} for sample in raw_samples]

    # Fetch and process samples based on mode
    if mode in ["mot", "cot"]:
        cot_samples = await cot_llm_chain.aapply(input_questions)
        samples.extend(process_samples(cot_samples, 'text'))

    if mode in ["mot", "pal"]:
        pal_samples = pal_chain.batch(input_questions)  # Assuming apply can be awaited, if pal_chain supports async
        samples.extend(process_samples(pal_samples, 'result'))

    if mode == 'tot':
        tot_samples = tot_chain.batch(input_questions)  # Assuming apply can be awaited, if tot_chain supports async
        samples.extend(process_samples(tot_samples, 'ranked_solutions'))

    if samples:
        llm_answer, similarity_matrix = get_final_answer(samples)

    # Calculate the s-score for the most consistent answer
    s_score = np.mean(similarity_matrix[:, np.argmax(similarity_matrix.sum(axis=0))]) if similarity_matrix is not None else 0

    return llm_answer, s_score

In [26]:
question = "Gavin has 23 shirts. 6 are blue the rest are green. How many green shirts does Gavin have?"

In [27]:
"""
Get the most consistent answer from the PAL samples.
PAL is useful when the question needs reasoning like inferring transitive and associative properties
"""
with get_openai_callback() as cb:
    pal_results = asyncio.run(decision_by_vote(question=question, mode='pal'))
    print(cb)

The answer that agrees the most with the majority is: 17
Tokens Used: 3548
	Prompt Tokens: 3339
	Completion Tokens: 209
Successful Requests: 3
Total Cost (USD): $0.0054265


In [28]:
"""
Get the most consistent answer from the CoT samples
Good for simple reasoning questions like this one about green shirts
"""
with get_openai_callback() as cb:
    cot_results = asyncio.run(decision_by_vote(question=question, mode='cot'))
    print(cb)

The answer that agrees the most with the majority is: .



The answer is 17.
Tokens Used: 265
	Prompt Tokens: 213
	Completion Tokens: 52
Successful Requests: 3
Total Cost (USD): $0.0004235


In [29]:
"""
Get the most consistent answer from the MoT samples
Useful when we want to be really confident about the answer
"""
with get_openai_callback() as cb:
    mot_results = asyncio.run(decision_by_vote(question=question, mode='mot'))
    print(cb)

The answer that agrees the most with the majority is: 17
Tokens Used: 3779
	Prompt Tokens: 3552
	Completion Tokens: 227
Successful Requests: 6
Total Cost (USD): $0.005782


In [30]:
# accept the answer from the vote based method if the s_score is greater than a threshold
vote_threshold = 0.65

# for the green shirts question CoT should give a good answer, for more complex queries PAL/MoT are better
answer, s_score = cot_results

if s_score >= vote_threshold:
    print(f"Answer from vote using {weak_llm}: {answer}")
    
else:
    with get_openai_callback() as cb:
        result = cot_strong_llm_chain.invoke(question)
        print(cb)
    llm_answer = result['text'].replace("\n", "").replace(".", "")
    print(f"Answer from {strong_llm}: {llm_answer}")
    
# uncomment below to get the answer from gpt-4 (sample size=1)
# with get_openai_callback() as cb:
#     result = cot_strong_llm_chain.invoke(question)
#     print(cb)
# llm_answer = result['text'].replace("\n", "").replace(".", "")
# print(f"Answer from {strong_llm}: {llm_answer}")

Answer from vote using gpt-3.5-turbo-instruct: .



The answer is 17.


In [31]:
# Get the CoT answer and the PAL answer; check if they match
pal_answer, _ = pal_results
cot_answer, _ = cot_results

# Verification threshold
verification_threshold = 0.65

# Extract the last number in the cot text
last_digit = re.findall(r'(\d+)(?![\d\S])', cot_answer)
cot_answer = last_digit[-1] if last_digit else cot_answer
# get the embeddings for the pal and cot_answers
cot_embedding = embedding_model.encode(cot_answer, convert_to_tensor=True)
pal_embedding = embedding_model.encode(pal_answer, convert_to_tensor=True)

# compare the similarity between the embeddings
similarity_score = util.pytorch_cos_sim(cot_embedding, pal_embedding).numpy()

# accept answer if similarity score is greater than a threshold
if similarity_score >= verification_threshold:
    print(f"Answer from verification using {weak_llm} : {cot_answer}")
    
else:
    with get_openai_callback() as cb:
        result = cot_strong_llm_chain.invoke(question)
        print(cb)
    llm_answer = result['text'].replace("\n", "").replace(".", "")
    print(f"Answer from {strong_llm}: {llm_answer}")

Answer from verification using gpt-3.5-turbo-instruct : .



The answer is 17.


In [32]:
# Lets try ToT on a planning type question using the voting mechanism and compare the output with gpt-4
# Running ToT takes time

question = "How to colonize Mars?"

with get_openai_callback() as tot_cb:
    tot_answer, tot_s_score = asyncio.run(decision_by_vote(question=question, mode='tot'))
    print("\n gpt3.5 token usage and cost \n")
    print(tot_cb)

template = """Question: {question}

Do not hallucinate an answer

Answer:"""

prompt = PromptTemplate.from_template(template)
gpt4_cot_llm_chain = LLMChain(llm=ChatOpenAI(model="gpt-4"), prompt=prompt)
with get_openai_callback() as cb:
    gpt4_answer = gpt4_cot_llm_chain.invoke(question)
    print("\n\n gpt-4 token usage and cost \n\n")
    print(cb)

tot_embedding = embedding_model.encode(tot_answer, convert_to_tensor=True)
gpt4_embedding = embedding_model.encode(gpt4_answer['text'], convert_to_tensor=True)

The answer that agrees the most with the majority is: Steps to achieve the goal:

1. Prioritize sending robotic missions to Mars as the most promising solution. This involves partnering with private space companies, conducting extensive simulations and testing, and collaborating with space agencies and robotics companies for technology development. Be prepared for limited funding, technical failures, and the need for continuous software updates.

2. Invest in developing advanced propulsion technology as the second priority. This includes investing in research projects, collaborating with universities and research institutions, and partnering with aerospace companies and international space agencies. Anticipate technical setbacks, regulatory challenges, and public skepticism.

3. Establish international collaboration and partnerships as the third priority. Focus on clear communication channels, common goals, and agreements on data sharing and technology transfer. Address potential obsta

In [33]:
# Lets print out the answers and the similarity score
print(f"\n\n gpt3.5 ToT answer: \n\n {tot_answer}")
print(f"\n\n gpt4 answer: \n\n {gpt4_answer['text']}")
print(f"\n\n Similarity between gpt-4 answer and gpt3.5 tot answer : {util.pytorch_cos_sim(gpt4_embedding, tot_embedding).numpy()}")
print(f"\n \n ToT 's' score : {tot_s_score}")



 gpt3.5 ToT answer: 

 Steps to achieve the goal:

1. Prioritize sending robotic missions to Mars as the most promising solution. This involves partnering with private space companies, conducting extensive simulations and testing, and collaborating with space agencies and robotics companies for technology development. Be prepared for limited funding, technical failures, and the need for continuous software updates.

2. Invest in developing advanced propulsion technology as the second priority. This includes investing in research projects, collaborating with universities and research institutions, and partnering with aerospace companies and international space agencies. Anticipate technical setbacks, regulatory challenges, and public skepticism.

3. Establish international collaboration and partnerships as the third priority. Focus on clear communication channels, common goals, and agreements on data sharing and technology transfer. Address potential obstacles such as political tensio