In [102]:
from langchain.callbacks import get_openai_callback
from langchain.chains import LLMChain
from langchain_openai import OpenAI, ChatOpenAI
from langchain.prompts import PromptTemplate
# from langchain.chat_models import ChatOpenAI
from langchain.chains import SequentialChain
from langchain_experimental.pal_chain import PALChain
from sentence_transformers import SentenceTransformer, util

import asyncio
import nest_asyncio
import numpy as np
import os
import pandas as pd
import re

nest_asyncio.apply()
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Note : This program has OPENAI_API_KEY set as an environment variable

### Let's set up the template for Tree of Thought. We'll use this for tasks that involve planning

In [103]:
template ="""
Step1 :
 
Here is a question {input}. Could you brainstorm three distinct solutions? Please consider a variety of factors 
A:
"""

prompt = PromptTemplate(
    input_variables=["input"],
    template = template                      
)

chain1 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    output_key="solutions",
    verbose=False
)

template ="""
Step 2:

For each of the three proposed solutions, evaluate their potential. Consider their pros and cons, initial effort needed, implementation difficulty, potential challenges, and the expected outcomes. Assign a probability of success and a confidence level to each option based on these factors

{solutions}

A:"""

prompt = PromptTemplate(
    input_variables=["solutions"],
    template = template                      
)

chain2 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    output_key="review",
    verbose=False
)

template ="""
Step 3:

For each solution, deepen the thought process. Generate potential scenarios, strategies for implementation, any necessary partnerships or resources, and how potential obstacles might be overcome. Also, consider any potential unexpected outcomes and how they might be handled.

{review}

A:"""

prompt = PromptTemplate(
    input_variables=["review"],
    template = template                      
)

chain3 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    output_key="deepen_thought_process",
    verbose=False
)

template ="""
Step 4:

Based on the evaluations and scenarios, rank the solutions in order of promise. Provide a justification for each ranking and offer any final thoughts or considerations for each solution. Finally reword the response as possible steps to take to achieve the goal
{deepen_thought_process}

A:"""

prompt = PromptTemplate(
    input_variables=["deepen_thought_process"],
    template = template                      
)

chain4 = LLMChain(
    llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    prompt=prompt,
    verbose=False,
    output_key="ranked_solutions"
)

tot_chain = SequentialChain(
    chains=[chain1, chain2, chain3, chain4],
    input_variables=["input"],
    output_variables=["ranked_solutions"],
    verbose=False
)

### Set up the LLMs, embedding model and PAL chain

In [104]:
weak_llm = "gpt-3.5-turbo-instruct"
strong_llm= "gpt-4"

# Load the pre-trained sentence embedding model
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2', cache_folder='/mnt/artifacts/model_cache/') # change this location to where you want to store the embedding model

pal_llm_temperature = 0.7
llm = OpenAI(temperature=pal_llm_temperature, model=weak_llm)

pal_chain = PALChain.from_math_prompt(llm=llm, verbose=False)

### Setup the CoT chain

In [105]:
template = """Question: {question}

Answer: Let's think step by step. If you do not know the answer or are not confident about the answer reply that you don't know the answer. Do not hallucinate an answer. Just return the number as an answer"""

cot_llm_temperature = 0.7
prompt = PromptTemplate.from_template(template)
cot_llm_chain = LLMChain(llm=OpenAI(temperature=cot_llm_temperature,
                                    model=weak_llm),
                         prompt=prompt)
cot_strong_llm_chain = LLMChain(llm=ChatOpenAI(model=strong_llm), 
                                prompt=prompt)

### Helper function to return the answer that agrees most with the samples

In [106]:
def get_final_answer(samples):
    """
    Identifies the most similar answer among a list of samples based on cosine similarity.

    Parameters:
    - samples (list of dicts): A list where each element is a dict containing an 'text' key.

    Returns:
    - tuple: The most similar answer and the matrix of similarity scores between all answers.
    """
    
    # Extract texts from samples
    answers = [sample['text'] for sample in samples]
    
    # Encode the answers into vectors and calculate similarity scores
    answer_embeddings = embedding_model.encode(answers, convert_to_tensor=True)
    similarity_scores = util.pytorch_cos_sim(answer_embeddings, answer_embeddings).numpy()

    # Determine the index of the most similar answer
    most_similar_idx = np.argmax(np.mean(similarity_scores, axis=1))
    most_similar_answer = answers[most_similar_idx]

    print(f"The answer that agrees the most with the majority is: {most_similar_answer}")
    return most_similar_answer, similarity_scores

### Implementation of the decision by vote where multiple answers are sampled and the answer that agrees the most with the sampled answer is returned

In [107]:
async def decision_by_vote(question: str, n_samples: int = 3, mode: str = 'mot'):
    """
    Decides on an answer by collecting samples based using the specified thought representation.

    This function generates sample answers based on the specified mode and then determines the most consistent answer along with its similarity score.

    Parameters:
    - question: The question to be answered by the models.
    - n_samples: The number of samples to generate for voting. Default is 3.
    - mode: The mode of decision making. Can be 'mot' (use mixture of thought), 'cot' (chain of thought),
      'pal' (program assisted learning), or 'tot' (tree of thought). The mode is case insensitive.

    Returns:
    - llm_answer: The answer selected as the most consistent across generated samples.
    - s_score: The s score for the most consistent answer, indicating agreement level among samples.
    """
    
    # Generate input questions based on mode
    mode = mode.lower()
    input_key = 'input' if mode == 'tot' else 'question'
    input_questions = [{input_key: question}] * n_samples

    # Initialize variables
    llm_answer = None
    similarity_matrix = None
    samples = []

    # Function to process samples and remove newlines
    def process_samples(raw_samples, key):
        return [{'text': sample[key]} for sample in raw_samples]

    # Fetch and process samples based on mode
    if mode in ["mot", "cot"]:
        cot_samples = await cot_llm_chain.aapply(input_questions)
        samples.extend(process_samples(cot_samples, 'text'))

    if mode in ["mot", "pal"]:
        pal_samples = pal_chain.batch(input_questions)  # Assuming apply can be awaited, if pal_chain supports async
        samples.extend(process_samples(pal_samples, 'result'))

    if mode == 'tot':
        tot_samples = tot_chain.batch(input_questions)  # Assuming apply can be awaited, if tot_chain supports async
        samples.extend(process_samples(tot_samples, 'ranked_solutions'))

    if samples:
        llm_answer, similarity_matrix = get_final_answer(samples)

    # Calculate the s-score for the most consistent answer
    s_score = np.mean(similarity_matrix[:, np.argmax(similarity_matrix.sum(axis=0))]) if similarity_matrix is not None else 0

    return llm_answer, s_score

In [108]:
# Question to be used for the different thought representations

question  = """
Imagine you have a list of temperatures in Celsius from various cities on a particular day: [22, 18, 25, 30, 24].
convert these temperatures to Fahrenheit using the formula F = C * 9/5 + 32? """


# Get the answers from the vote based method for the different thought representations

In [109]:
"""
Get the most consistent answer from the PAL samples.
PAL is useful when the question needs reasoning like inferring transitive and associative properties
"""



with get_openai_callback() as pal_cb:
    pal_results = asyncio.run(decision_by_vote(question=question, mode='pal'))
    print(pal_cb)
# pal_chain.invoke(question)

The answer that agrees the most with the majority is: [71.6, 64.4, 77.0, 86.0, 75.2]
Tokens Used: 3789
	Prompt Tokens: 3432
	Completion Tokens: 357
Successful Requests: 3
Total Cost (USD): $0.005861999999999999


In [110]:
"""
Get the most consistent answer from the CoT samples
Good for simple reasoning questions like this one about green shirts
"""
with get_openai_callback() as cot_cb:
    cot_results = asyncio.run(decision_by_vote(question=question, mode='cot'))
    print(cot_cb)

The answer that agrees the most with the majority is: .

The list of temperatures in Fahrenheit would be [71.6, 64.4, 77, 86, 75.2].
Tokens Used: 492
	Prompt Tokens: 306
	Completion Tokens: 186
Successful Requests: 3
Total Cost (USD): $0.0008309999999999999


#### Note : This will resample from CoT and PAL and will not reuse the samples from above

In [111]:
"""
Get the most consistent answer from the MoT samples
Useful when we want to be really confident about the answer
"""
with get_openai_callback() as mot_cb:
    mot_results = asyncio.run(decision_by_vote(question=question, mode='mot'))
    print(mot_cb)

The answer that agrees the most with the majority is: [71.6, 64.4, 77.0, 86.0, 75.2]
Tokens Used: 4605
	Prompt Tokens: 3738
	Completion Tokens: 867
Successful Requests: 6
Total Cost (USD): $0.007341


### Decide whether to accept/reject and call the stronger LLM to get an answer for the vote based method

In [112]:
# accept the answer from the vote based method if the s_score is greater than a threshold
vote_threshold = 0.65

# for the green shirts question CoT should give a good answer, for more complex queries PAL/MoT are better
answer, s_score = cot_results

if s_score >= vote_threshold:
    print(f"Answer from vote using {weak_llm}: {answer}")
    
else:
    with get_openai_callback() as cb:
        result = cot_strong_llm_chain.invoke(question)
        print(cb)
    llm_answer = result['text'].replace("\n", "").replace(".", "")
    print(f"Answer from {strong_llm}: {llm_answer}")
    
# uncomment below to get the answer from gpt-4 (sample size=1)
# with get_openai_callback() as gpt4_cb:
#     result = cot_strong_llm_chain.invoke(question)
#     print(gpt4_cb)
# llm_answer = result['text'].replace("\n", "").replace(".", "")
# print(f"Answer from {strong_llm}: {llm_answer}")

Answer from vote using gpt-3.5-turbo-instruct: .

The list of temperatures in Fahrenheit would be [71.6, 64.4, 77, 86, 75.2].


### Decide whether to accept/reject and call the stronger LLM to get an answer for the decision based method

In [113]:
# Get the CoT answer and the PAL answer; check if they match
pal_answer, _ = pal_results
cot_answer, _ = cot_results

# Verification threshold
verification_threshold = 0.65

# Extract the last number in the cot text
last_digit = re.findall(r'(\d+)(?![\d\S])', cot_answer)
cot_answer = last_digit[-1] if last_digit else cot_answer
# get the embeddings for the pal and cot_answers
cot_embedding = embedding_model.encode(cot_answer, convert_to_tensor=True)
pal_embedding = embedding_model.encode(pal_answer, convert_to_tensor=True)

# compare the similarity between the embeddings
similarity_score = util.pytorch_cos_sim(cot_embedding, pal_embedding).numpy()

# accept answer if similarity score is greater than a threshold
if similarity_score >= verification_threshold:
    print(f"Answer from verification using {weak_llm} : {cot_answer}")
    
else:
    with get_openai_callback() as cb:
        result = cot_strong_llm_chain.invoke(question)
        print(cb)
    llm_answer = result['text'].replace("\n", "").replace(".", "")
    print(f"Answer from {strong_llm}: {llm_answer}")

Tokens Used: 301
	Prompt Tokens: 109
	Completion Tokens: 192
Successful Requests: 1
Total Cost (USD): $0.01479
Answer from gpt-4: To convert each temperature from Celsius to Fahrenheit, we use the formula F = C * 9/5 + 32 Let's apply this to each temperature:- For 22 degrees Celsius: F = 22 * 9/5 + 32 = 716 degrees Fahrenheit- For 18 degrees Celsius: F = 18 * 9/5 + 32 = 644 degrees Fahrenheit- For 25 degrees Celsius: F = 25 * 9/5 + 32 = 77 degrees Fahrenheit- For 30 degrees Celsius: F = 30 * 9/5 + 32 = 86 degrees Fahrenheit- For 24 degrees Celsius: F = 24 * 9/5 + 32 = 752 degrees FahrenheitSo the converted temperatures in Fahrenheit are [716, 644, 77, 86, 752]


### Let's try the Tree of Thought for a more complex and subjective question using the vote based method

In [114]:
# Lets try ToT on a planning type question using the voting mechanism and compare the output with gpt-4
# Running ToT takes time

question = "How to colonize Mars?"

with get_openai_callback() as tot_cb:
    tot_answer, tot_s_score = asyncio.run(decision_by_vote(question=question, mode='tot'))
    print("\n gpt3.5 token usage and cost \n")
    print(tot_cb)

template = """Question: {question}

Do not hallucinate an answer

Answer:"""

prompt = PromptTemplate.from_template(template)
gpt4_cot_llm_chain = LLMChain(llm=ChatOpenAI(model="gpt-4"), prompt=prompt)
with get_openai_callback() as gpt4_tot_cb:
    gpt4_answer = gpt4_cot_llm_chain.invoke(question)
    print("\n\n gpt-4 token usage and cost \n\n")
    print(gpt4_tot_cb)

tot_embedding = embedding_model.encode(tot_answer, convert_to_tensor=True)
gpt4_embedding = embedding_model.encode(gpt4_answer['text'], convert_to_tensor=True)

The answer that agrees the most with the majority is: 1. Sending robotic missions first
2. Building underground habitats
3. Terraforming Mars

Justification:
Sending robotic missions first is ranked highest as it allows for gathering essential data and preparing for future human missions. Building underground habitats is ranked second as it provides a more feasible and immediate solution for sustaining life on Mars. Terraforming Mars is ranked last as it presents significant challenges and uncertainties that may not be achievable in the near future.

Steps to achieve the goal:
1. Develop autonomous robots for Mars exploration
2. Conduct feasibility studies for underground habitats on Mars
3. Collaborate with international partners for terraforming research and development.

 gpt3.5 token usage and cost 

Tokens Used: 7083
	Prompt Tokens: 3586
	Completion Tokens: 3497
Successful Requests: 12
Total Cost (USD): $0.012373


 gpt-4 token usage and cost 


Tokens Used: 424
	Prompt Tokens: 24

In [115]:
# Let's see if the answers from the vote based ToT and GPT-4 match
print(f"Similarity between gpt-4 answer and gpt3.5 tot answer : {util.pytorch_cos_sim(gpt4_embedding, tot_embedding).numpy()[0][0]}")
print(f"ToT 's' score : {tot_s_score}")

Similarity between gpt-4 answer and gpt3.5 tot answer : 0.6983373165130615
ToT 's' score : 0.8805679678916931


### Let's compare the cost and the results for the ToT vote based method and GPT-4

In [116]:
data = {
    'Technique': ['Cost($ USD)', 'Answer'],
    'ToT(# samples = 3)': [round(tot_cb.total_cost, 5), tot_answer],
    'GPT-4 (# samples = 1)': [round(gpt4_tot_cb.total_cost, 5), gpt4_answer['text']],
    'Answer similarity':[" ",util.pytorch_cos_sim(gpt4_embedding, tot_embedding).numpy()[0][0] ]
}

df = pd.DataFrame(data)
df.style.hide(axis='index')

Technique,ToT(# samples = 3),GPT-4 (# samples = 1),Answer similarity
Cost($ USD),0.012370,0.024720,
Answer,1. Sending robotic missions first 2. Building underground habitats 3. Terraforming Mars Justification: Sending robotic missions first is ranked highest as it allows for gathering essential data and preparing for future human missions. Building underground habitats is ranked second as it provides a more feasible and immediate solution for sustaining life on Mars. Terraforming Mars is ranked last as it presents significant challenges and uncertainties that may not be achievable in the near future. Steps to achieve the goal: 1. Develop autonomous robots for Mars exploration 2. Conduct feasibility studies for underground habitats on Mars 3. Collaborate with international partners for terraforming research and development.,"Colonizing Mars is a complex process, but here are possible steps that could be involved: 1. Investigate and Understand: First, we need to gain a thorough understanding of Mars. This involves researching its geology, climate, potential resources, and any potential hazards. This can be done through robotic missions, satellites, and telescopes. 2. Develop Technology: We need to develop the technology necessary for travel to Mars, including advanced propulsion systems to shorten the journey, spacecraft capable of carrying humans and cargo, and systems to provide life support for the astronauts during the journey. 3. Test Missions: Before sending humans, several unmanned missions should be conducted to test equipment, conduct more research, and potentially set up initial infrastructure. 4. Human Missions: After all the tests and preparations are done, humans could be sent to Mars. The first few missions might be short-term, but they would pave the way for longer stays. 5. Establish a Base: The next step would be to establish a permanent base. This would likely involve the use of habitats that protect against radiation, supply breathable air, and can maintain a comfortable temperature. 6. Utilize Martian Resources: To sustain a colony, we would need to utilize resources found on Mars. This could involve mining for water ice, which could be used for drinking water, growing food, and even producing fuel. 7. Expand and Develop: Once a base is established and resources are being utilized, the colony could begin to expand. This would involve building more habitats, developing agriculture, and even setting up industries. 8. Terraforming (Optional): This is a hypothetical step that involves transforming Mars' environment to make it more Earth-like. It could involve warming the planet and thickening its atmosphere. Remember, these steps are just a broad outline. The actual process would be far more complex and would require the cooperation of many nations and potentially private companies. It would also take a long time - possibly centuries - and would require substantial resources.",0.698337


### Lets compare the costs and answers for the different thought representations and GPT-4. 
#### In most cases the cascade/chain to follow would be CoT -> PAL -> MoT -> GPT4. Once PAL is run, MoT will not incur any additional LLM costs as the PAL and CoT samples can be reused


In [117]:
data = {
    'Technique': ['Cost($ USD)', 'Answer'],
    'CoT(# samples = 3)': [round(cot_cb.total_cost, 5), cot_answer],
    'PAL (# samples = 3)': [round(pal_cb.total_cost, 5), pal_answer],
    'MoT (# samples = 6)': [round(mot_cb.total_cost, 5), mot_results[0]],
    'GPT-4 (# samples = 1)': [round(gpt4_cb.total_cost, 5), result['text']],
}

question  = """
Imagine you have a list of temperatures in Celsius from various cities on a particular day: [22, 18, 25, 30, 24].
convert these temperatures to Fahrenheit using the formula F = C * 9/5 + 32? """

print(question)
df = pd.DataFrame(data)
df.style.hide(axis='index')


Imagine you have a list of temperatures in Celsius from various cities on a particular day: [22, 18, 25, 30, 24].
convert these temperatures to Fahrenheit using the formula F = C * 9/5 + 32? 


Technique,CoT(# samples = 3),PAL (# samples = 3),MoT (# samples = 6),GPT-4 (# samples = 1)
Cost($ USD),0.000830,0.005860,0.007340,0.014190
Answer,". The list of temperatures in Fahrenheit would be [71.6, 64.4, 77, 86, 75.2].","[71.6, 64.4, 77.0, 86.0, 75.2]","[71.6, 64.4, 77.0, 86.0, 75.2]","To convert each temperature from Celsius to Fahrenheit, we use the formula F = C * 9/5 + 32. Let's apply this to each temperature: - For 22 degrees Celsius: F = 22 * 9/5 + 32 = 71.6 degrees Fahrenheit - For 18 degrees Celsius: F = 18 * 9/5 + 32 = 64.4 degrees Fahrenheit - For 25 degrees Celsius: F = 25 * 9/5 + 32 = 77 degrees Fahrenheit - For 30 degrees Celsius: F = 30 * 9/5 + 32 = 86 degrees Fahrenheit - For 24 degrees Celsius: F = 24 * 9/5 + 32 = 75.2 degrees Fahrenheit So the converted temperatures in Fahrenheit are [71.6, 64.4, 77, 86, 75.2]."
