<a href="https://colab.research.google.com/github/eileenliu953/NHERI-AVEE-2023/blob/main/Branch_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai
!pip install markdown2
!pip install transformers

import numpy as np
import openai
import os
import pandas as pd
import pickle
import pyarrow
import markdown2
from bs4 import BeautifulSoup
from transformers import GPT2TokenizerFast
from nltk.tokenize import sent_tokenize
from openai.embeddings_utils import cosine_similarity

# Model name passed to the completions endpoint when answering a question.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model used by get_embedding() for documents and queries.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Take the API key from the environment instead of hard-coding a secret in the
# notebook; falls back to the previous empty string when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
# GPT-2 tokenizer used by count_tokens() to measure text length in tokens.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def break_text(text, max_tokens):
    """Split ``text`` into chunks of whole words with a bounded length.

    The text is split on whitespace and words are greedily packed into chunks.
    NOTE: despite the parameter name, the limit is measured in *characters*
    (word lengths plus one separator per word), not in tokenizer tokens.

    Args:
        text: The text to split.
        max_tokens: Length budget for a chunk. A word is added to the running
            chunk only while ``used + len(word) < max_tokens``.

    Returns:
        A list of chunks, each made of words joined by single spaces. A single
        word longer than the budget becomes a chunk of its own. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_words = []
    current_len = 0  # characters used so far, counting one separator per word

    for word in text.split():
        if current_len + len(word) < max_tokens:
            current_words.append(word)
            current_len += len(word) + 1
            continue

        # The word does not fit, so close the running chunk. When nothing has
        # been collected yet (the very first word is already too long) there is
        # nothing to close; skipping the flush avoids emitting an empty chunk.
        if current_words:
            chunks.append(" ".join(current_words))
        current_words = [word]
        current_len = len(word) + 1

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks



# TODO: make the documentation directory an input value, e.g. directory = "/content/drive/MyDrive/Technical Documentation "

# Tags that start a new section when walking the rendered markdown.
_HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")


def _section_rows(file_label, headings, paragraphs, max_tokens=1024):
    """Build the table rows for one heading section, splitting over-long text."""
    heading_text = " ".join(headings)
    body = " ".join(paragraphs)
    body_tokens = count_tokens(body)
    if body_tokens > max_tokens:
        # Too long for a single row: emit one row per chunk from break_text.
        return [
            [file_label, heading_text, chunk, count_tokens(chunk)]
            for chunk in break_text(body, max_tokens)
        ]
    return [[file_label, heading_text, body, body_tokens]]


def _markdown_rows(file_label, markdown_text):
    """Convert one markdown document into [file, heading, content, tokens] rows."""
    html = markdown2.markdown(markdown_text)
    soup = BeautifulSoup(html, "html.parser")

    rows = []
    headings = []
    paragraphs = []
    for tag in soup.descendants:
        if tag.name in _HEADING_TAGS:
            # A new heading closes the previous section, but only once that
            # section has both a heading and text. Consecutive headings with
            # no text in between are accumulated into a single heading string.
            if headings and paragraphs:
                rows.extend(_section_rows(file_label, headings, paragraphs))
                headings = []
                paragraphs = []
            headings.append(tag.text)
        elif tag.name == "p":
            paragraphs.append(tag.text)

    # Flush the final section: no later heading exists to trigger it, so
    # without this the last section of every file would be silently dropped.
    if headings and paragraphs:
        rows.extend(_section_rows(file_label, headings, paragraphs))
    return rows


def reading_files(directory_path, file_prefix="tapipy/"):
    """Read every file in a directory as markdown and tabulate its sections.

    Each file is rendered to HTML, split into sections at headings, and every
    section becomes one row (or several rows when its text exceeds 1024
    tokens). Rows whose content has 40 tokens or fewer are discarded.

    Args:
        directory_path: Directory whose files are read. Entries that are not
            regular files (e.g. sub-directories) are skipped.
        file_prefix: String prepended to the file name (with a trailing
            ``.md`` removed) to form the ``file`` column.

    Returns:
        A DataFrame with columns ``file``, ``heading``, ``content`` and
        ``tokens`` and a fresh 0..n-1 index.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    columns = ["file", "heading", "content", "tokens"]
    rows = []

    # Sort so that row order (and therefore the index) is deterministic.
    for file_name in sorted(os.listdir(directory_path)):
        # Join with the directory: opening the bare name only works when the
        # current working directory happens to be ``directory_path``.
        file_path = os.path.join(directory_path, file_name)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        stem = file_name[:-len(".md")] if file_name.endswith(".md") else file_name
        rows.extend(_markdown_rows(file_prefix + stem, content))

    # Build the frame once instead of concatenating inside the loop.
    df = pd.DataFrame(rows, columns=columns)
    df = df[df.tokens > 40]
    return df.reset_index(drop=True)


def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    """Request an embedding for ``text`` from ``model`` and return the vector.

    Only the first entry of the response's ``data`` list is used.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_entry = response["data"][0]
    return first_entry["embedding"]


def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed the ``content`` field of every row in the dataframe with ``get_embedding``.

    Return a dictionary that maps each row's index label to that row's embedding.
    """
    embeddings_by_index = {}
    for row_label, row in df.iterrows():
        embeddings_by_index[row_label] = get_embedding(row.content)
    return embeddings_by_index


def order_documents_query_similarity(data, query_str, nres=3):
    """Rank the rows of ``data`` by similarity to ``query_str``.

    The query is embedded with ``get_embedding`` and compared against each
    row's ``vector_embedding`` using ``cosine_similarity``. The scores are
    written into a ``similarities`` column on ``data`` itself (the caller's
    frame is modified), and the ``nres`` highest-scoring rows are returned.
    """
    query_vector = get_embedding(query_str, model=EMBEDDING_MODEL)

    def _score(doc_vector):
        return cosine_similarity(doc_vector, query_vector)

    data['similarities'] = data.vector_embedding.apply(_score)

    ranked = data.sort_values('similarities', ascending=False)
    return ranked.head(nres)


def construct_prompt(question: str, df: pd.DataFrame, ncontents=3) -> str:
    """
    Build the completion prompt for ``question`` from the most similar sections.

    The ``ncontents`` best-matching rows of ``df`` are fetched with
    ``order_documents_query_similarity`` and appended to the context in
    ranking order until their cumulative ``tokens`` count would exceed 500;
    the section that crosses the budget and all later ones are left out.

    Args:
        question: The user's question; it is also the similarity query.
        df: Section table with ``content``, ``tokens`` and
            ``vector_embedding`` columns.
        ncontents: Maximum number of sections to consider for the context.

    Returns:
        The instruction header, the chosen sections, and the question,
        ending with an open answer marker for the model to complete.
    """
    MAX_SECTION_LEN = 500

    # Query the ranking once: each call costs an embedding API request.
    context = order_documents_query_similarity(df, question, nres=ncontents)

    chosen_sections = []
    chosen_section_len = 0
    for _, ctx in context.iterrows():
        chosen_section_len += ctx.tokens
        if chosen_section_len > MAX_SECTION_LEN:
            break

        # Flatten newlines so each section stays on one line in the prompt.
        chosen_sections.append(" " + ctx.content.replace("\n", " "))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"


# Keyword arguments forwarded to openai.Completion.create for every query.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    show_prompt: bool = False) -> str:
    """Answer ``query`` with the completions model, using ``df`` for context.

    The prompt comes from ``construct_prompt``; when ``show_prompt`` is true it
    is printed before the request is made. The text of the first returned
    choice is returned with leading/trailing spaces and newlines removed.
    """
    prompt = construct_prompt(query, df)
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    first_choice = response["choices"][0]
    return first_choice["text"].strip(" \n")


# Build the section table from the documentation folder and attach one
# embedding vector per row (aligned on the row index).
df_f = reading_files("/content/drive/MyDrive/Technical Documentation ")
vector_embedding = compute_doc_embeddings(df_f)
df_f['vector_embedding'] = pd.Series(vector_embedding)
# Evaluate `df_f` in a cell to inspect the resulting table.


# TODO: the question should be an input that the user controls; the input
# code hasn't been written yet. It is defined once here so every call below
# uses the same text.
question = "What is the purpose of tapis?"
res = order_documents_query_similarity(df_f, question)
# Last expression of the cell, so the notebook displays the answer.
answer_query_with_context(question, df_f)