In [1]:
# imports
import ast
import os

import numpy as np
import openai
import pandas as pd
import tiktoken
import yaml
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import get_embedding, cosine_similarity

# Embedding configuration shared by the cells below.
# NOTE(review): embedding_encoding and max_tokens are not referenced by any
# cell visible in this notebook — confirm whether truncation was intended.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [2]:
# Topic index; the batch helpers below read its 'Topic' column row by row.
df_topic = pd.read_csv("data/file_info.csv")

In [9]:
def make_embedding(topic:str):
    """Embed every article of ``topic`` and save the result as a CSV.

    Reads ``data/articles/{topic}.csv``, adds the columns listed below and
    writes the augmented frame to ``data/embedding/{topic}.csv``.

    Added columns:
        HTFXS: title, subtitle, lead + body text and source in one string.
        HTF: title, subtitle and lead only; this is the text that is embedded.
        embedding: result of ``get_embedding`` for the row's ``HTF`` text.

    Args:
        topic: Topic name, used as the file stem of both the input and the
            output CSV.

    Returns:
        None. If the input CSV does not exist a message is printed and the
        function returns without doing anything.
    """
    # check the existence of the topic
    input_datapath = f"data/articles/{topic}.csv"
    if not os.path.exists(input_datapath):
        print("Topic: "+topic+" does not exist." )
        return

    # Create the output folder before any embedding request is made, so a
    # missing directory cannot fail the final write after all API calls.
    output_dir = "data/embedding"
    os.makedirs(output_dir, exist_ok=True)

    # load dataset; missing cells become empty strings so the string
    # concatenations below never produce NaN
    df = pd.read_csv(input_datapath, header=0, encoding="utf8")
    df = df.fillna("")

    # Bracket access avoids any clash between column names and DataFrame
    # attributes.
    heading = df["Heading"].str.strip()
    subtitle = df["Subtitle"].str.strip()
    lead = df["First"].str.strip()
    body = df["Text"].str.strip()
    source = df["Source"].str.strip()

    # Lead and body are separate fields: join them with a space so the last
    # word of the lead does not fuse with the first word of the body.
    content = (lead + " " + body).str.strip()

    df["HTFXS"] = (
        "Title: " + heading
        + "; Subtitle:" + subtitle
        + "; Content: " + content
        + "; Source: " + source
    )
    df["HTF"] = "Title: " + heading + "; Subtitle:" + subtitle + "; Content: " + lead
    df["embedding"] = df["HTF"].apply(lambda text: get_embedding(text, engine=embedding_model))
    df.to_csv(f"{output_dir}/{topic}.csv", index=False)

In [10]:
def batch_embedding(start: int, end: int, df):
    """Run ``make_embedding`` for a positional slice of the topic table.

    Args:
        start: First row position (inclusive), as used by ``df.iloc``.
        end: Row position to stop at (exclusive), as used by ``df.iloc``.
        df: DataFrame whose rows provide a ``Topic`` value.
    """
    topics = (record['Topic'] for _, record in df.iloc[start:end].iterrows())
    for topic in topics:
        make_embedding(topic)

In [11]:
# Build embeddings for the rows at positions 0-4 of the topic index.
batch_embedding(0,5,df_topic)

In [25]:
# Prompt definitions: later cells look up prompts[<section>]['embed'] and
# prompts[<section>]['prompt'].
# The encoding is given explicitly so the read does not depend on the
# platform default; the CSV data in this notebook is likewise read as UTF-8.
with open('prompts.yml', encoding='utf-8') as f:
    prompts = yaml.safe_load(f)

# Report template, filled in later with str.format.
with open('template.md', encoding='utf-8') as f:
    template = f.read()

In [13]:
# search through the articles for a specific topic & Heading
def search_article(df, problem, n=8):
    """Return the ``n`` rows of ``df`` most similar to ``problem``.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.
        problem: Query text to embed and compare against every row.
        n: Number of best-matching rows to return.

    Returns:
        A new DataFrame with the top ``n`` rows sorted by descending
        ``similarity`` (cosine similarity to the query embedding). The input
        ``df`` is left unmodified.
    """
    # Use the shared model constant: the query must be embedded with the same
    # model as the stored article embeddings for the similarity to be meaningful.
    problem_embedding = get_embedding(problem, engine=embedding_model)

    # assign() returns a copy, so repeated searches on the same frame do not
    # leave a stale "similarity" column behind on the caller's data.
    scored = df.assign(
        similarity=df["embedding"].apply(
            lambda emb: cosine_similarity(emb, problem_embedding)
        )
    )

    return scored.sort_values("similarity", ascending=False).head(n)

In [14]:
def query_message(
    query: str,
    df: pd.DataFrame) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe.

    The message consists of a fixed instruction, then the ``HTFXS`` text of
    every row of ``df`` in row order, then the question. Each part is
    separated by a blank line so that articles do not run into the
    instruction or into each other.

    Args:
        query: The question to ask.
        df: DataFrame with an ``HTFXS`` column containing the article texts.

    Returns:
        The assembled prompt string.
    """
    intro = 'Use the below articles to answer the subsequent question.'
    question = f"\n\nQuestion: {query}"

    articles = [str(text) for text in df['HTFXS']]

    return "\n\n".join([intro, *articles]) + question

In [38]:
def ask(
    query: str,
    df: pd.DataFrame,
    model: str = "gpt-3.5-turbo",
    temperature: float = 0) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings.

    Args:
        query: The question to ask.
        df: DataFrame of relevant articles; passed to ``query_message`` to
            build the user message.
        model: Chat model name sent to the API. Defaults to the value that
            was previously hard-coded.
        temperature: Sampling temperature sent to the API. Defaults to 0, the
            value that was previously hard-coded.

    Returns:
        The content of the first choice in the chat completion response.
    """
    message = query_message(query, df)
    messages = [
        {"role": "system", "content": "You answer questions."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return response["choices"][0]["message"]["content"]

In [39]:
def solution(topic):
    """Generate the Markdown report for ``topic`` and write it to ``output/{topic}.md``.

    Loads ``data/embedding/{topic}.csv``, and for each of the four report
    sections retrieves the 8 most similar articles with ``search_article``
    and asks the chat model for the section text with ``ask``. The answers
    are inserted into ``template`` and the result is written to disk.

    Args:
        topic: Topic name; used as the file stem of the embedding CSV and of
            the output file, and substituted for ``{Topic}`` in the prompts.

    Returns:
        None. If the embedding CSV does not exist a message is printed and
        the function returns without doing anything.
    """
    input_datapath=f"data/embedding/{topic}.csv"
    if not os.path.exists(input_datapath):
        print("Topic: "+topic+" does not exist." )
        return

    # Create the output folder up front so the run cannot fail on the final
    # write after all API calls have already been made.
    os.makedirs("output", exist_ok=True)

    df = pd.read_csv(input_datapath, header=0, encoding="utf8")
    # Each embedding is stored in the CSV as the text of a Python list.
    # literal_eval parses that text without executing arbitrary code from the
    # file, which eval would do.
    df["embedding"] = df["embedding"].apply(ast.literal_eval).apply(np.array)

    # template placeholder -> section key in prompts
    sections = {
        "progress_made": "Progress Made",
        "lessons_learned": "Lessons Learned",
        "challenges_ahead": "Challenges Ahead",
        "best_path_forward": "Best Path Forward",
    }
    generated = {}
    for field, section in sections.items():
        embed_query = prompts[section]['embed'].replace('{Topic}', topic)
        chat_prompt = prompts[section]['prompt'].replace('{Topic}', topic)
        relevant = search_article(df, embed_query, n=8)
        generated[field] = ask(chat_prompt, relevant)

    # Populate the template with the generated content and image URL
    output = template.format(
        topic=topic,
        image_url='',  # may need adjustment
        credit_url='',  # may need adjustment
        **generated,
    )

    # Write output to file; explicit UTF-8 so model output containing
    # characters outside the platform default encoding cannot raise.
    with open(f"output/{topic}.md", 'w', encoding='utf-8') as f:
        f.write(output)

In [40]:
def generate_solution(start, end, df):
    """Run ``solution`` for a positional slice of the topic table.

    Args:
        start: First row position (inclusive), as used by ``df.iloc``.
        end: Row position to stop at (exclusive), as used by ``df.iloc``.
        df: DataFrame whose rows provide a ``Topic`` value.
    """
    topics = (record['Topic'] for _, record in df.iloc[start:end].iterrows())
    for topic in topics:
        solution(topic)

In [None]:
# Generate reports for the rows at positions 0-2 of the topic index.
generate_solution(0,3, df_topic)