In [1]:
from langchain.callbacks import get_openai_callback
from langchain.chains import LLMChain
from langchain_openai import OpenAI, ChatOpenAI
from langchain.prompts import PromptTemplate
# from langchain.chat_models import ChatOpenAI
from langchain.chains import SequentialChain
from langchain_experimental.pal_chain import PALChain
from sentence_transformers import SentenceTransformer, util

import asyncio
import nest_asyncio
import numpy as np
import os
import pandas as pd
import re

# Later cells call asyncio.run() from inside the notebook; nest_asyncio is applied so that
# call can be nested inside the notebook's already-running event loop.
nest_asyncio.apply()
# Set the tokenizers parallelism switch to 'false' for this process.
# NOTE(review): presumably silences the HuggingFace tokenizers fork warning - confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Note : This program has OPENAI_API_KEY set as an environment variable

### Let's set up the template for Tree of Thought. We'll use this for tasks that involve planning

In [2]:
# Tree-of-Thought pipeline: four prompt stages, each consuming the output key of the previous one.
# The templates are spelled out line by line so that significant whitespace stays visible.
tot_llm_kwargs = dict(temperature=0, model="gpt-3.5-turbo")

# Stage 1: brainstorm three candidate solutions for the incoming question.
template = (
    "\n"
    "Step1 :\n"
    " \n"
    "Here is a question {input}. Could you brainstorm three distinct solutions? Please consider a variety of factors \n"
    "A:\n"
)
prompt = PromptTemplate(template=template, input_variables=["input"])
chain1 = LLMChain(
    prompt=prompt,
    llm=ChatOpenAI(**tot_llm_kwargs),
    output_key="solutions",
    verbose=False,
)

# Stage 2: evaluate each proposed solution.
template = (
    "\n"
    "Step 2:\n"
    "\n"
    "For each of the three proposed solutions, evaluate their potential. Consider their pros and cons, initial effort needed, implementation difficulty, potential challenges, and the expected outcomes. Assign a probability of success and a confidence level to each option based on these factors\n"
    "\n"
    "{solutions}\n"
    "\n"
    "A:"
)
prompt = PromptTemplate(template=template, input_variables=["solutions"])
chain2 = LLMChain(
    prompt=prompt,
    llm=ChatOpenAI(**tot_llm_kwargs),
    output_key="review",
    verbose=False,
)

# Stage 3: expand each evaluated solution into scenarios and strategies.
template = (
    "\n"
    "Step 3:\n"
    "\n"
    "For each solution, deepen the thought process. Generate potential scenarios, strategies for implementation, any necessary partnerships or resources, and how potential obstacles might be overcome. Also, consider any potential unexpected outcomes and how they might be handled.\n"
    "\n"
    "{review}\n"
    "\n"
    "A:"
)
prompt = PromptTemplate(template=template, input_variables=["review"])
chain3 = LLMChain(
    prompt=prompt,
    llm=ChatOpenAI(**tot_llm_kwargs),
    output_key="deepen_thought_process",
    verbose=False,
)

# Stage 4: rank the solutions and reword the result as actionable steps.
template = (
    "\n"
    "Step 4:\n"
    "\n"
    "Based on the evaluations and scenarios, rank the solutions in order of promise. Provide a justification for each ranking and offer any final thoughts or considerations for each solution. Finally reword the response as possible steps to take to achieve the goal\n"
    "{deepen_thought_process}\n"
    "\n"
    "A:"
)
prompt = PromptTemplate(template=template, input_variables=["deepen_thought_process"])
chain4 = LLMChain(
    prompt=prompt,
    llm=ChatOpenAI(**tot_llm_kwargs),
    output_key="ranked_solutions",
    verbose=False,
)

# Wire the stages together: takes "input", exposes only the final "ranked_solutions".
tot_chain = SequentialChain(
    chains=[chain1, chain2, chain3, chain4],
    input_variables=["input"],
    output_variables=["ranked_solutions"],
    verbose=False,
)

### Set up the LLMs, embedding model and PAL chain

In [3]:
# Model names used throughout the notebook: the weak model is sampled several times,
# the strong model is the fallback when the weak answers are rejected.
weak_llm = "gpt-3.5-turbo-instruct"
strong_llm = "gpt-4"

# Load the pre-trained sentence embedding model.
# The cache directory can be overridden with the EMBEDDING_MODEL_CACHE environment variable;
# when it is not set the original location is used.
embedding_cache_dir = os.environ.get('EMBEDDING_MODEL_CACHE', '/mnt/artifacts/model_cache/')
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2', cache_folder=embedding_cache_dir)

# PAL chain built on the weak model with its math prompt.
pal_llm_temperature = 0.7
llm = OpenAI(temperature=pal_llm_temperature, model=weak_llm)

pal_chain = PALChain.from_math_prompt(llm=llm, verbose=False)

### Setup the CoT chain

In [4]:
# Chain-of-Thought prompt shared by the weak and the strong model.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's think step by step. If you do not know the answer or are not confident about the answer reply that you don't know the answer. Do not hallucinate an answer. Just return the number as an answer"
)

cot_llm_temperature = 0.7
prompt = PromptTemplate.from_template(template)

# Weak-model chain: this is the one sampled several times for voting.
cot_llm_chain = LLMChain(
    prompt=prompt,
    llm=OpenAI(model=weak_llm, temperature=cot_llm_temperature),
)

# Strong-model chain: used as the fallback when the weak answers are rejected.
cot_strong_llm_chain = LLMChain(
    prompt=prompt,
    llm=ChatOpenAI(model=strong_llm),
)

### Helper function to return the answer that agrees most with the samples

In [5]:
def get_final_answer(samples):
    """
    Identifies the answer that agrees most with all sampled answers, based on cosine similarity.

    Every answer is embedded with the module-level `embedding_model`; the answer whose row in the
    pairwise similarity matrix has the highest mean is selected and printed.

    Parameters:
    - samples (list of dicts): A non-empty list where each element is a dict containing a 'text' key.

    Returns:
    - tuple: The most similar answer and the NumPy matrix of similarity scores between all answers.

    Raises:
    - ValueError: If `samples` is empty.
    """
    if not samples:
        raise ValueError("samples must contain at least one answer")

    # Extract texts from samples
    answers = [sample['text'] for sample in samples]

    # Encode the answers into vectors and calculate similarity scores.
    # The tensor is moved to the CPU first because Tensor.numpy() fails for tensors
    # that live on an accelerator device.
    answer_embeddings = embedding_model.encode(answers, convert_to_tensor=True)
    similarity_scores = util.pytorch_cos_sim(answer_embeddings, answer_embeddings).cpu().numpy()

    # Determine the index of the answer with the highest mean similarity to all answers
    most_similar_idx = int(np.argmax(np.mean(similarity_scores, axis=1)))
    most_similar_answer = answers[most_similar_idx]

    print(f"The answer that agrees the most with the majority is: {most_similar_answer}")
    return most_similar_answer, similarity_scores

### Implementation of the decision by vote, where multiple answers are sampled and the answer that agrees the most with the sampled answers is returned

In [6]:
async def decision_by_vote(question: str, n_samples: int = 3, mode: str = 'mot'):
    """
    Decides on an answer by sampling answers with the specified thought representation.

    This function generates sample answers based on the specified mode and then determines the
    most consistent answer along with its similarity score.

    Parameters:
    - question: The question to be answered by the models.
    - n_samples: The number of samples to generate per thought representation. Default is 3.
      In 'mot' mode both CoT and PAL are sampled, giving 2 * n_samples answers.
    - mode: The mode of decision making. Can be 'mot' (mixture of thought), 'cot' (chain of thought),
      'pal' (program-aided language model), or 'tot' (tree of thought). The mode is case insensitive.

    Returns:
    - llm_answer: The answer selected as the most consistent across generated samples,
      or None when no samples were generated.
    - s_score: Mean similarity between the selected answer and all sampled answers (the answer's
      similarity with itself is included), or 0 when no samples were generated.

    Raises:
    - ValueError: If `mode` is not one of the supported modes.
    """
    mode = mode.lower()
    valid_modes = ('mot', 'cot', 'pal', 'tot')
    if mode not in valid_modes:
        # Previously an unknown mode silently produced (None, 0), hiding typos in the mode name.
        raise ValueError(f"Unknown mode {mode!r}; expected one of {valid_modes}")

    # No samples requested: same result the sampling path gives for an empty batch,
    # without calling any chain.
    if n_samples <= 0:
        return None, 0

    # The ToT chain expects its question under 'input'; the CoT and PAL chains under 'question'.
    input_key = 'input' if mode == 'tot' else 'question'
    # One dict per sample so the samples never share a mutable input object.
    input_questions = [{input_key: question} for _ in range(n_samples)]

    samples = []

    # Normalise raw chain outputs to the {'text': ...} shape get_final_answer expects
    def process_samples(raw_samples, key):
        return [{'text': sample[key]} for sample in raw_samples]

    # Fetch and process samples based on mode
    if mode in ("mot", "cot"):
        cot_samples = await cot_llm_chain.aapply(input_questions)
        samples.extend(process_samples(cot_samples, 'text'))

    if mode in ("mot", "pal"):
        # Synchronous call: it is not awaited, so it blocks until all PAL samples are back.
        pal_samples = pal_chain.batch(input_questions)
        samples.extend(process_samples(pal_samples, 'result'))

    if mode == 'tot':
        # Synchronous call, same as the PAL branch.
        tot_samples = tot_chain.batch(input_questions)
        samples.extend(process_samples(tot_samples, 'ranked_solutions'))

    if not samples:
        return None, 0

    llm_answer, similarity_matrix = get_final_answer(samples)

    # s-score: mean of the similarity column with the largest total agreement
    best_column = np.argmax(similarity_matrix.sum(axis=0))
    s_score = np.mean(similarity_matrix[:, best_column])

    return llm_answer, s_score

In [7]:
# Question used for the CoT, PAL and MoT runs and for the strong-model fallback.
# NOTE(review): the Tree-of-Thought cell later rebinds `question` to a different prompt,
# so cells that read `question` depend on execution order.

question  = """
Imagine you have a list of temperatures in Celsius from various cities on a particular day: [22, 18, 25, 30, 24].
convert these temperatures to Fahrenheit using the formula F = C * 9/5 + 32? """


# Get the answers from the vote based method for the different thought representations

In [8]:
# Get the most consistent answer from the PAL samples.
# PAL is useful when the question needs reasoning like inferring transitive and associative properties.
# The callback records token usage and cost of the calls made inside the block.
with get_openai_callback() as pal_cb:
    pal_results = asyncio.run(
        decision_by_vote(question=question, mode='pal')
    )
    print(pal_cb)

The answer that agrees the most with the majority is: [71.6, 64.4, 77.0, 86.0, 75.2]
Tokens Used: 3798
	Prompt Tokens: 3432
	Completion Tokens: 366
Successful Requests: 3
Total Cost (USD): $0.00588


In [9]:
# Get the most consistent answer from the CoT samples.
# CoT is a good fit for simple reasoning questions.
# The callback records token usage and cost of the calls made inside the block.
with get_openai_callback() as cot_cb:
    cot_results = asyncio.run(
        decision_by_vote(question=question, mode='cot')
    )
    print(cot_cb)

The answer that agrees the most with the majority is: .


First, let's create a list of temperatures in Fahrenheit to store our converted values:
Fahrenheit = []

Next, let's loop through each temperature in the list of Celsius temperatures and convert them to Fahrenheit using the formula:
for temp in Celsius:
    fahrenheit = temp * 9/5 + 32
    # add the converted temperature to the Fahrenheit list
    Fahrenheit.append(fahrenheit)

Finally, let's print out the new list of Fahrenheit temperatures to see the converted values:
print(Fahrenheit)

The output should be: [71.6, 64.4, 77.0, 86.0, 75.2]
Tokens Used: 905
	Prompt Tokens: 306
	Completion Tokens: 599
Successful Requests: 3
Total Cost (USD): $0.001657


#### Note : This will resample from CoT and PAL and will not reuse the samples from above

In [10]:
# Get the most consistent answer from the MoT samples (CoT and PAL are both sampled).
# Useful when we want to be really confident about the answer.
# The callback records token usage and cost of the calls made inside the block.
with get_openai_callback() as mot_cb:
    mot_results = asyncio.run(
        decision_by_vote(question=question, mode='mot')
    )
    print(mot_cb)

The answer that agrees the most with the majority is: [71.6, 64.4, 77.0, 86.0, 75.2]
Tokens Used: 4308
	Prompt Tokens: 3738
	Completion Tokens: 570
Successful Requests: 6
Total Cost (USD): $0.0067469999999999995


### Decide whether to accept/reject and call the stronger LLM to get an answer for the vote based method

In [11]:
# Accept the answer from the vote based method if the s_score reaches this threshold
vote_threshold = 0.65

# The CoT vote is used here; for more complex queries the PAL / MoT results are better candidates
answer, s_score = cot_results

if s_score >= vote_threshold:
    print(f"Answer from vote using {weak_llm}: {answer}")

else:
    # The weak-model samples disagree too much: fall back to the strong model
    with get_openai_callback() as cb:
        result = cot_strong_llm_chain.invoke(question)
        print(cb)
    # Flatten to a single line and drop only a trailing full stop.
    # Removing every "." would corrupt decimal values (71.6 would become 716).
    llm_answer = " ".join(result['text'].split()).rstrip(".")
    print(f"Answer from {strong_llm}: {llm_answer}")

# uncomment below to get the answer from gpt-4 (sample size=1) and to print the comparison later on
# with get_openai_callback() as gpt4_cb:
#     result = cot_strong_llm_chain.invoke(question)
#     print(gpt4_cb)
# llm_answer = " ".join(result['text'].split()).rstrip(".")
# print(f"Answer from {strong_llm}: {llm_answer}")

Answer from vote using gpt-3.5-turbo-instruct: .


First, let's create a list of temperatures in Fahrenheit to store our converted values:
Fahrenheit = []

Next, let's loop through each temperature in the list of Celsius temperatures and convert them to Fahrenheit using the formula:
for temp in Celsius:
    fahrenheit = temp * 9/5 + 32
    # add the converted temperature to the Fahrenheit list
    Fahrenheit.append(fahrenheit)

Finally, let's print out the new list of Fahrenheit temperatures to see the converted values:
print(Fahrenheit)

The output should be: [71.6, 64.4, 77.0, 86.0, 75.2]


### Decide whether to accept/reject and call the stronger LLM to get an answer for the verification based method

In [12]:
# Get the CoT answer and the PAL answer; check if they match
pal_answer, _ = pal_results
cot_answer, _ = cot_results

# Verification threshold
verification_threshold = 0.65

# Extract the last number in the cot text
last_digit = re.findall(r'(\d+)(?![\d\S])', cot_answer)
cot_answer = last_digit[-1] if last_digit else cot_answer
# get the embeddings for the pal and cot_answers
cot_embedding = embedding_model.encode(cot_answer, convert_to_tensor=True)
pal_embedding = embedding_model.encode(pal_answer, convert_to_tensor=True)

# compare the similarity between the embeddings
similarity_score = util.pytorch_cos_sim(cot_embedding, pal_embedding).numpy()

# accept answer if similarity score is greater than a threshold
if similarity_score >= verification_threshold:
    print(f"Answer from verification using {weak_llm} : {cot_answer}")
    
else:
    with get_openai_callback() as cb:
        result = cot_strong_llm_chain.invoke(question)
        print(cb)
    llm_answer = result['text'].replace("\n", "").replace(".", "")
    print(f"Answer from {strong_llm}: {llm_answer}")

Tokens Used: 136
	Prompt Tokens: 109
	Completion Tokens: 27
Successful Requests: 1
Total Cost (USD): $0.00489
Answer from gpt-4: The temperatures in Fahrenheit would be [716, 644, 77, 86, 752]


### Let's try the Tree of Thought for a more complex and subjective question using the vote based method

In [26]:
# Try ToT on a planning-type question using the voting mechanism and compare the output with gpt-4.
# Running ToT takes time: every sample goes through all four chain stages.
question = "How to colonize Mars?"

with get_openai_callback() as tot_cb:
    tot_answer, tot_s_score = asyncio.run(
        decision_by_vote(question=question, mode='tot')
    )
    print("\n gpt3.5 token usage and cost \n")
    print(tot_cb)

# Single-shot gpt-4 baseline with a plain prompt
template = (
    "Question: {question}\n"
    "\n"
    "Do not hallucinate an answer\n"
    "\n"
    "Answer:"
)
prompt = PromptTemplate.from_template(template)
gpt4_cot_llm_chain = LLMChain(prompt=prompt, llm=ChatOpenAI(model="gpt-4"))

with get_openai_callback() as gpt4_tot_cb:
    gpt4_answer = gpt4_cot_llm_chain.invoke(question)
    print("\n\n gpt-4 token usage and cost \n\n")
    print(gpt4_tot_cb)

# Embed both answers so later cells can compare them
tot_embedding = embedding_model.encode(tot_answer, convert_to_tensor=True)
gpt4_embedding = embedding_model.encode(gpt4_answer['text'], convert_to_tensor=True)

The answer that agrees the most with the majority is: Ranking of solutions in order of promise:

1. Building underground habitats on Mars
2. Sending robotic missions to Mars
3. Terraforming Mars

Justification for ranking:
- Building underground habitats on Mars offers a more feasible and sustainable solution for supporting human life on the planet, as it addresses the challenges of resource scarcity and environmental risks.
- Sending robotic missions to Mars provides valuable data and insights for future human missions, but may not directly address the immediate need for habitable conditions on the planet.
- Terraforming Mars is a long-term and highly complex solution that may face ethical, technological, and sustainability challenges.

Steps to achieve the goal:
1. Conduct feasibility studies and research on building underground habitats on Mars.
2. Collaborate with experts in space habitat design and construction to develop prototype habitats.
3. Test and refine construction methods

In [27]:
# Check how closely the vote based ToT answer and the GPT-4 answer match
tot_vs_gpt4_similarity = util.pytorch_cos_sim(gpt4_embedding, tot_embedding).numpy()[0][0]
print(f"Similarity between gpt-4 answer and gpt3.5 tot answer : {tot_vs_gpt4_similarity}")
print(f"ToT 's' score : {tot_s_score}")

Similarity between gpt-4 answer and gpt3.5 tot answer : 0.7499172687530518
ToT 's' score : 0.8435128331184387


### Let's compare the cost and the results for the ToT vote based method and GPT-4

In [28]:
# Side-by-side table of cost and answer for the ToT vote and the single GPT-4 call.
tot_cost = round(tot_cb.total_cost, 5)
gpt4_cost = round(gpt4_tot_cb.total_cost, 5)
answer_similarity = util.pytorch_cos_sim(gpt4_embedding, tot_embedding).numpy()[0][0]

data = {
    'Technique': ['Cost($ USD)', 'Answer'],
    'ToT(# samples = 3)': [tot_cost, tot_answer],
    'GPT-4 (# samples = 1)': [gpt4_cost, gpt4_answer['text']],
    # The similarity belongs to the answer row only; the cost row gets a blank
    'Answer similarity': [" ", answer_similarity],
}

df = pd.DataFrame(data)
print("How to colonize Mars?")
df.style.hide(axis='index')

How to colonize Mars?


Technique,ToT(# samples = 3),GPT-4 (# samples = 1),Answer similarity
Cost($ USD),0.011840,0.017040,
Answer,"Ranking of solutions in order of promise: 1. Building underground habitats on Mars 2. Sending robotic missions to Mars 3. Terraforming Mars Justification for ranking: - Building underground habitats on Mars offers a more feasible and sustainable solution for supporting human life on the planet, as it addresses the challenges of resource scarcity and environmental risks. - Sending robotic missions to Mars provides valuable data and insights for future human missions, but may not directly address the immediate need for habitable conditions on the planet. - Terraforming Mars is a long-term and highly complex solution that may face ethical, technological, and sustainability challenges. Steps to achieve the goal: 1. Conduct feasibility studies and research on building underground habitats on Mars. 2. Collaborate with experts in space habitat design and construction to develop prototype habitats. 3. Test and refine construction methods in extreme environments on Earth. 4. Establish partnerships with relevant stakeholders for funding and support. 5. Implement gradual experimentation and continuous maintenance to ensure the sustainability of underground habitats on Mars.","Colonizing Mars will be a complex process involving several steps. 1. Space Travel: We need to develop the technology for long duration space flights, including advanced propulsion systems to reduce travel time, and life support systems to sustain astronauts. 2. Mars Habitat: We need to create sustainable habitats that can provide life-supporting conditions. These habitats will need to be able to recycle air and water, and produce food. 3. Radiation Protection: Mars has a much thinner atmosphere than Earth, which means it doesn't shield the surface as well from radiation. We will need to develop technology to shield habitats and astronauts from harmful radiation. 4. Resource Utilization: We need to develop technology to utilize Mars' resources to minimize reliance on supplies from Earth. 
This includes extracting water from the ground, mining for materials to build with, and potentially even farming. 5. Human Factors: We need to address human factors like physical and mental health during long duration space flights and living on Mars. 6. Legal and Ethical Considerations: Lastly, we need to address legal and ethical considerations regarding colonization, such as who has the right to colonize and exploit resources, and how to prevent contamination of Mars with Earth life. Please note that this is a simplified overview and the actual process will be much more complex and time-consuming, requiring collaboration from various countries and scientific fields.",0.749917


### Let's compare the costs and answers for the different thought representations and GPT-4.
#### In most cases the cascade to follow would be CoT -> PAL -> MoT -> GPT-4. Once CoT and PAL have been run, MoT can be modified to reuse their samples and therefore not incur any additional LLM cost

In [20]:
# Restate the temperature question before anything reads it: the Tree-of-Thought cell
# rebinds `question` to a different prompt.
question  = """
Imagine you have a list of temperatures in Celsius from various cities on a particular day: [22, 18, 25, 30, 24].
convert these temperatures to Fahrenheit using the formula F = C * 9/5 + 32? """

# GPT-4 baseline (sample size = 1). It is run here so the table does not depend on names
# that only exist when optional or conditional code in earlier cells happened to execute.
with get_openai_callback() as gpt4_cb:
    gpt4_result = cot_strong_llm_chain.invoke(question)

# Cost and answer per technique; the CoT / PAL / MoT entries come from the earlier vote runs.
data = {
    'Technique': ['Cost($ USD)', 'Answer'],
    'CoT(# samples = 3)': [round(cot_cb.total_cost, 5), cot_answer],
    'PAL (# samples = 3)': [round(pal_cb.total_cost, 5), pal_answer],
    'MoT (# samples = 6)': [round(mot_cb.total_cost, 5), mot_results[0]],
    'GPT-4 (# samples = 1)': [round(gpt4_cb.total_cost, 5), gpt4_result['text']],
}

print(question)
df = pd.DataFrame(data)
df.style.hide(axis='index')


Imagine you have a list of temperatures in Celsius from various cities on a particular day: [22, 18, 25, 30, 24].
convert these temperatures to Fahrenheit using the formula F = C * 9/5 + 32? 


Technique,CoT(# samples = 3),PAL (# samples = 3),MoT (# samples = 6),GPT-4 (# samples = 1)
Cost($ USD),0.00166,0.005880,0.006750,0.012870
Answer,32.0,"[71.6, 64.4, 77.0, 86.0, 75.2]","[71.6, 64.4, 77.0, 86.0, 75.2]","To convert the temperatures from Celsius to Fahrenheit, we use the formula F = C * 9/5 + 32. For each temperature in the list: 22°C = 22 * 9/5 + 32 = 71.6°F 18°C = 18 * 9/5 + 32 = 64.4°F 25°C = 25 * 9/5 + 32 = 77°F 30°C = 30 * 9/5 + 32 = 86°F 24°C = 24 * 9/5 + 32 = 75.2°F So, the converted list of temperatures in Fahrenheit is [71.6, 64.4, 77, 86, 75.2]."


In [29]:
# NOTE(review): this cell repeats the ToT vs GPT-4 comparison table that is already built
# earlier in the notebook; it rebuilds `df` because the cascade comparison cell rebinds it.
similarity_cell = util.pytorch_cos_sim(gpt4_embedding, tot_embedding).numpy()[0][0]

data = {
    'Technique': ['Cost($ USD)', 'Answer'],
    'ToT(# samples = 3)': [
        round(tot_cb.total_cost, 5),
        tot_answer,
    ],
    'GPT-4 (# samples = 1)': [
        round(gpt4_tot_cb.total_cost, 5),
        gpt4_answer['text'],
    ],
    'Answer similarity': [" ", similarity_cell],
}

df = pd.DataFrame(data)
print("How to colonize Mars?")
df.style.hide(axis='index')

How to colonize Mars?


Technique,ToT(# samples = 3),GPT-4 (# samples = 1),Answer similarity
Cost($ USD),0.011840,0.017040,
Answer,"Ranking of solutions in order of promise: 1. Building underground habitats on Mars 2. Sending robotic missions to Mars 3. Terraforming Mars Justification for ranking: - Building underground habitats on Mars offers a more feasible and sustainable solution for supporting human life on the planet, as it addresses the challenges of resource scarcity and environmental risks. - Sending robotic missions to Mars provides valuable data and insights for future human missions, but may not directly address the immediate need for habitable conditions on the planet. - Terraforming Mars is a long-term and highly complex solution that may face ethical, technological, and sustainability challenges. Steps to achieve the goal: 1. Conduct feasibility studies and research on building underground habitats on Mars. 2. Collaborate with experts in space habitat design and construction to develop prototype habitats. 3. Test and refine construction methods in extreme environments on Earth. 4. Establish partnerships with relevant stakeholders for funding and support. 5. Implement gradual experimentation and continuous maintenance to ensure the sustainability of underground habitats on Mars.","Colonizing Mars will be a complex process involving several steps. 1. Space Travel: We need to develop the technology for long duration space flights, including advanced propulsion systems to reduce travel time, and life support systems to sustain astronauts. 2. Mars Habitat: We need to create sustainable habitats that can provide life-supporting conditions. These habitats will need to be able to recycle air and water, and produce food. 3. Radiation Protection: Mars has a much thinner atmosphere than Earth, which means it doesn't shield the surface as well from radiation. We will need to develop technology to shield habitats and astronauts from harmful radiation. 4. Resource Utilization: We need to develop technology to utilize Mars' resources to minimize reliance on supplies from Earth. 
This includes extracting water from the ground, mining for materials to build with, and potentially even farming. 5. Human Factors: We need to address human factors like physical and mental health during long duration space flights and living on Mars. 6. Legal and Ethical Considerations: Lastly, we need to address legal and ethical considerations regarding colonization, such as who has the right to colonize and exploit resources, and how to prevent contamination of Mars with Earth life. Please note that this is a simplified overview and the actual process will be much more complex and time-consuming, requiring collaboration from various countries and scientific fields.",0.749917
