# Create a Question Answering Chatbot for the Graduate Advising system 

### Install needed libraries

In [2]:
%pip install openai




In [3]:
%pip install transformers




In [26]:
%pip install pysimplegui

Collecting pysimplegui
  Downloading PySimpleGUI-4.60.4-py3-none-any.whl (509 kB)
Installing collected packages: pysimplegui
Successfully installed pysimplegui-4.60.4
Note: you may need to restart the kernel to use updated packages.


### Set up the model

In [27]:
# import needed libraries
import openai
import pandas as pd
import numpy as np
import pickle
import re
from transformers import GPT2TokenizerFast
from typing import Dict, List, Tuple
import PySimpleGUI as sg

In [5]:
openai.api_key = "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" #secret keys here: https://platform.openai.com/account/api-keys
COMPLETIONS_MODEL = "text-davinci-003"

In [6]:
#set the model for the task of document and query vector embedding
MODEL_NAME = "curie"

DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

In [7]:
#add relevant context from the doc sections to a prompt by prepending them to the supplied query
MAX_SECTION_LEN = 500
SEPARATOR = "\n* "

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [8]:
#use the completions API - this allows us to provide an answer instead of just ingesting the question 
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

### Write helper functions

In [9]:
#create functions to get the embedding, get the document and query embedding, and computer the embedding
def get_embedding(text: str, model: str) -> List[float]:
    print("inside get_embedding")
    result = openai.Embedding.create(
      model=model,
      input=text)
    return result["data"][0]["embedding"]

def get_doc_embedding(text: str) -> List[float]:
    print("inside get_doc_embedding")
    return get_embedding(text, DOC_EMBEDDINGS_MODEL)

def get_query_embedding(text: str) -> List[float]:
    return get_embedding(text, QUERY_EMBEDDINGS_MODEL)

def compute_doc_embeddings(df: pd.DataFrame) -> Dict[Tuple[str, str], List[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    
    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    print("inside compute_doc_embedding")
    return {
        idx: get_doc_embedding(r.content.replace("\n", " ")) for idx, r in df.iterrows()
    }

In [10]:
#create function called load_embeddings that reads the document embeddings and their keys from csv 
def load_embeddings(fname: str) -> Dict[Tuple[str, str], List[float]]:
    """
    Read the document embeddings and their keys from a CSV.
    
    fname is the path to a CSV with exactly these named columns: 
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    """
    
    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns if c != "title" and c != "heading"])
    return {
           (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }

In [11]:
#calculate the similarity between vectors 
def vector_similarity(x: List[float], y: List[float]) -> float:
    """
    We could use cosine similarity or dot product to calculate the similarity between vectors.
    In practice, we have found it makes little difference. 
    """
    return np.dot(np.array(x), np.array(y))

In [12]:
#find the query embedding and compare it to the document embeddings to find the most relevant sections
def order_document_sections_by_query_similarity(query: str, contexts: Dict[Tuple[str, str], np.array]) -> List[Tuple[float, Tuple[str, str]]]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_query_embedding(query)
    
    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)
    
    return document_similarities

In [13]:
#create function to construct the new prompt with the most relevant document sections
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [14]:
#create function to answer query with the retrieved document text
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: Dict[Tuple[str, str], np.array],
    show_prompt: bool = False
) -> str:
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )
    
    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

### Create dataset

Test other models (other than Davinci and Curie, as above)

In [15]:
df = pd.read_excel('grad_info.xlsx')

In [16]:
df.head()

Unnamed: 0,title,heading,content,tokens,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,Graduate Program Overview Introduction,Introduction,The Department of Computer Science at Texas Te...,,,,,,
1,Graduate Program Overview Course Transfers,Course Transfers,A graduate course may be accepted for transfer...,,,,,,
2,Graduate Program Overview Graduate Links,Graduate Links,Overview. Computer Science (Master of Science)...,,,,,,
3,M.S. in Computer Science Overview Degree Plan ...,Degree Plan & Admission to Candidacy,"The degree plan specifies information, such as...",,,,,,
4,"M.S. in Computer Science Overview Thesis, Proj...","Thesis, Project, or Report Advisory",Committee: All committee members must belong t...,,,,,,


In [17]:
def count_tokens(text):
    return len(text.split())

In [18]:
# clean up the dataframe 
df = df.drop(['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'], axis=1)
df['tokens'] = df['content'].apply(count_tokens)

In [19]:
df

Unnamed: 0,title,heading,content,tokens
0,Graduate Program Overview Introduction,Introduction,The Department of Computer Science at Texas Te...,362
1,Graduate Program Overview Course Transfers,Course Transfers,A graduate course may be accepted for transfer...,165
2,Graduate Program Overview Graduate Links,Graduate Links,Overview. Computer Science (Master of Science)...,51
3,M.S. in Computer Science Overview Degree Plan ...,Degree Plan & Admission to Candidacy,"The degree plan specifies information, such as...",346
4,"M.S. in Computer Science Overview Thesis, Proj...","Thesis, Project, or Report Advisory",Committee: All committee members must belong t...,373
5,M.S. in Computer Science Overview Course Trans...,Course Transfers,A graduate course may be accepted for transfer...,162
6,M.S. in Compute Science Curriculum and Courses...,Curriculum,General Requirements: The Master of Science in...,362
7,M.S. in Computer Science Curriculum and Course...,Example Plan of Full-Time Study (Thesis-Based),"Fall Semester: Graduate Core Course, Graduate ...",62
8,M.S. in Computer Science Curriculum and Course...,Example Plan of Full-Time Study (Non-Thesis-Ba...,"Fall Semester: Graduate Core Course, Graduate ...",76
9,M.S. in Software and Security Engineering Intr...,Introduction,Our Master of Science in Software and Security...,201


### Test on our data

In [20]:
#calcualte the embeddings from scratch
document_embeddings = compute_doc_embeddings(df)

inside compute_doc_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
inside get_doc_embedding
inside get_embedding
insid

In [21]:
#check out an example embedding 
example_entry = list(document_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

0 : [-0.02202020213007927, -0.0052879247814416885, -0.001247344072908163, 0.018745217472314835, -0.009870189242064953]... (4096 entries)


In [22]:
#find the most relevant sections related to a query
order_document_sections_by_query_similarity("What graduate degrees can I get?", document_embeddings)[:5]

inside get_embedding


[(0.3625775753306756, 32),
 (0.36205321429516313, 36),
 (0.3533794092444966, 17),
 (0.3517290422425496, 37),
 (0.3482119440623258, 29)]

In [23]:
prompt = construct_prompt(
    "How many faculty members are in the department of computer science at Texas Tech?",
    document_embeddings,
    df
)

print("===\n", prompt)

inside get_embedding
Selected 2 document sections:
0
19
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* The Department of Computer Science at Texas Tech University provides an excellent environment for furthering your studies. There are 18 faculty members, with research specialties across the entire spectrum of computer science. You can view the different research groups here (https://www.depts.ttu.edu/cs/research/index.php) or you can view our individual faculty member pages here (https://www.depts.ttu.edu/cs/faculty/). Our department offers five different graduate programs: Master of Science in Computer Science, Master of Science in Software and Security Engineering, Doctor of Philosophy in Computer Science, Graduate Certificate in Software Engineering, Graduate Certificate in Security. We also offer combined Bachelor's and Master's programs, i.e., the 150-hour co

In [24]:
#lets ask a question
answer_query_with_context("How many faculty members are in the department of computer science at Texas Tech?", df, document_embeddings)

inside get_embedding
Selected 2 document sections:
0
19


'18'

### Make UI

In [25]:
# Bare bones UI
session = input("What would you like to name your session? ")

print("type 'end' or 'quit' to end session")

while not re.search('end|End|quit|Quit', session): #problem: what if end or quit is part of the question
    question = input("Question: ")
    answer = answer_query_with_context(question, df, document_embeddings)
    print(answer)
    session = question

print("Ending session ... ")

What would you like to name your session? how many credits does a certificate require
type 'end' or 'quit' to end session
Question: how many credits does a certificate require
inside get_embedding
Selected 4 document sections:
32
30
29
27
12 semester credit hours.
Question: what kind of certifiacts are offered
inside get_embedding
Selected 5 document sections:
29
32
30
2
17
The Department of Computer Science offers the Certificate in Software Engineering for those who do not need or wish to have a full graduate degree in software engineering or computer science. In addition, the department offers a Master of Science in Software and Security Engineering and a Doctor of Philosophy in Computer Science.
Question: quit
inside get_embedding
Selected 5 document sections:
9
23
10
30
29
I don't know.
Ending session ... 


In [None]:
# GUI 
