In [None]:
'''
 * Copyright 2023 QuickAns
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 '''

# Question answering using embeddings-based search

In [None]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import os  # for reading the API key and data path from the environment
import time  # for pausing between API calls

import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
from scipy import spatial  # for calculating vector similarities for search


# models
EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

# Credentials are read from the environment so that no secret is stored in the notebook.
# SECURITY: a literal API key used to be assigned on this line; treat that key as
# compromised and revoke it.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

In [None]:
# Location of the precomputed embeddings CSV. The default is the original author's
# local path; set QUICKANS_EMBEDDINGS_PATH to point at the file on other machines.
embeddings_path = os.environ.get(
    "QUICKANS_EMBEDDINGS_PATH",
    "/home/ishikaa2/quickans/ans_generator/data/ir_txt/ir_book_embeddings.csv",
)

df = pd.read_csv(embeddings_path)

In [None]:
# convert embeddings from CSV str type back to list type
# (values that are already parsed are left untouched, so re-running this cell is safe)
df['embedding'] = df['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# the dataframe has two columns: "text" and "embedding"
# (preview only the first rows instead of rendering the whole frame)
df.head()

In [None]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
):
    """Rank the texts in `df` by their relatedness to `query`.

    Args:
        query: Text that is embedded with EMBEDDING_MODEL and compared against
            every row of `df`.
        df: Frame with a "text" column and an "embedding" column holding one
            vector per row.
        relatedness_fn: Callable ``(query_embedding, row_embedding) -> score``
            where a larger score means more related. Defaults to cosine
            similarity.
        top_n: Maximum number of results to return.

    Returns:
        A pair ``(strings, relatednesses)`` of equal-length tuples, sorted from
        most related to least. Both tuples are empty when `df` has no rows.
    """
    # Nothing to rank: skip the API call and avoid unpacking an empty zip(),
    # which would raise ValueError.
    if len(df) == 0:
        return (), ()

    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Walk the two columns directly instead of materialising a Series per row
    # the way DataFrame.iterrows() does.
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]


In [None]:
# example: show the five passages most related to a sample query, each preceded by its score
strings, relatednesses = strings_ranked_by_relatedness("Web Crawling", df, top_n=5)
for position, string in enumerate(strings):
    relatedness = relatednesses[position]
    print(f"{relatedness}")
    display(string)

In [None]:
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the GPT prompt for `query`, followed by related textbook snippets.

    The prompt consists of a fixed teaching-assistant introduction, the
    question itself, and then as many of the most related texts from `df` as
    fit within the size limit. Snippets are added in ranked order and the loop
    stops at the first snippet that would exceed the limit.

    Args:
        query: The student's question.
        df: Frame of texts and embeddings passed to
            strings_ranked_by_relatedness.
        model: Currently unused; accepted so existing callers keep working.
        token_budget: Currently unused; the limit is the fixed character
            count below, not a token count.

    Returns:
        The complete prompt text.
    """
    # Upper bound on the prompt size, measured in characters (not tokens).
    max_message_chars = 2500

    strings, _relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = "Suppose you are a teaching assistant for the course Advanced Information Retrieval and a student has posed the following question.\n"
    question = f"\n\nQuestion: {query}\n\n"
    # The original literal ended with a stray double-quote character that was sent to the model.
    end = 'How will you answer the question? \n Here are some snippets from the course textbook which may be useful.\n\n'
    preface = introduction + question + end

    # Track the running length instead of re-concatenating the whole prompt each time.
    snippets = []
    used_chars = len(preface)
    for string in strings:
        next_article = f'\n{string}\n'
        if used_chars + len(next_article) > max_message_chars:
            break
        snippets.append(next_article)
        used_chars += len(next_article)
    return preface + "".join(snippets)

def api_call(message, model: str = GPT_MODEL):
    """Send `message` as a single user turn to the chat model and return the reply text.

    The request uses temperature 0 and the returned value is the content of
    the first choice in the response.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": message}],
        temperature=0,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with GPT, using related texts from `df` as context.

    The prompt is built by query_message and sent through api_call. When
    `print_message` is true the prompt is printed before the request is made.

    Note: the default for `df` is bound when this function is defined, so
    re-run this cell after reloading the global `df`.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)

    return api_call(prompt, model)

In [None]:
# Baseline: send the raw question to GPT without any retrieved textbook context.
query = 'When modeling queries, how is multi-bernoulli different from multinomial?'

api_call(query)
# For the answer that includes textbook snippets in the prompt, call ask(query) instead.

In [None]:
def compare_responses(query):
    """Answer `query` with and without textbook context and print both replies.

    Returns:
        A tuple ``(emb_reply, base_reply)``: the reply from ask() followed by
        the reply from api_call() on the bare query.
    """
    # Tuple elements are evaluated left to right, so ask() runs before api_call().
    replies = (ask(query), api_call(query))
    emb_reply, base_reply = replies

    print(f'Query:{query}\n\n', "Embedding Method response\n", sep="\n")
    print(emb_reply+"\n\n", 'Base GPT response\n', base_reply, sep="\n")

    return replies

In [None]:
# Compare the embeddings-augmented answer with the plain GPT answer for this question.
query = 'How do you define a reference language model?'

compare_responses(query)

In [None]:
# Compare the embeddings-augmented answer with the plain GPT answer for this question.
query = 'How can we use language models for part of speech tagging?'

compare_responses(query)

In [None]:
# Compare the embeddings-augmented answer with the plain GPT answer for this question.
# ("langeth" in the original query text was a typo for "length".)
query = 'What happens to the Dirichlet smoothing when the document length goes to infinity'

compare_responses(query)

In [None]:
#['What order should you evaluate the kl divergence?'] #, 
'''queries = ['Why do we need a background language model?', 
           'How does a tokenizer work?', 
           'What are the components of an inverted index?', 
           'What is the document filtering problem?', 
           'What are the aspects of a search engine?', 
           'For computing the f1 score, why can\'t we take the mean of precision and recall?',
           'Why can’t we use IR models for ranking of webpages?',
           'How do you evaluate a filtering system?',
           'What are the components in context-based filtering?',
           'What is the exploration-exploitation tradeoff?',
           'What is beta-gamma threshold learning?',
           'What type of word relations are there?',
           'What is intrusion detection?',
           'What are some cluster similarity measures?']'''

# Delimiter placed between the three fields of every record in responses.csv
# (embedding-method reply, plain GPT reply, ground-truth answer).
response_split = 'RESPONSESPLIT'

queries = ['How is Jelinek-Mercer smoothing different from interpolation? The formulae are the same... Is JM a particular type of interpolation? If yes, then what type is it?',
           'Going back to the point that KL-divergence is not symmetric unlike Mutual Information, can we think of real-time applications that would prefer an unsymmetric measure over a symmetric measure? That is, are there cases where we intentionally prefer to use unsymmetric measures? It is always useful to improve our understanding of the data, but what are the application level use cases?',
           'In the following equation what if the denominator is zero i.e there are no counts for all the various sequences that appear in denominator. How is the probability calculated in this case? Is it a zero as even the numerator will be zero or is it not defined?',
           'Can blind feedback be applied repeatedly? Basically, use the original query model q0 to retrieve top k documents and estimate an updated model q1, then use q1 to again retrieve top k documents and update the model and so on. Can this lead to any improvement over applying the feedback just once?',
           'How do you tell if a retrieval function has IDF weighting? Do we basically look for a component that penalizes the frequent use of a term? Or is it when we compare the term against the collection/background probability to balance out the 2 probabilities?',
           'Hi, I was wondering about Divergence minimization in the lecture notes. On line 3, why lambda * H(theta ; theta_C) can be joined into the summation? Isn\'t it originally outside of the summation?',
           'Is there any difference between two quantities being proportional and being rank equivalent? I thought two quantities have to be proportional in order for them to be rank equivalent. But, I vaugely remember the Prof. mentioning there was some difference. I may be wrong. Pls advise/share your thoughts.',
           'Dirichlet prior smoothing. I can write the equation of score after multiply doucument k times. But how can we infer from this equation whether the score will increase or decrease? Since we don\'t know the influence of k to each part of the equation.',
           'In this picture, if w occurs very frequently in background model U, the p(w|U) will be large, right? I thought it should be if w occurs frequently then p(w|U) will be smaller so we can do the penalty. But here seems the opposite. Can you explain how we penalize the frequent word in detail?',
           'Hello, this is a dumb question but I was wondering how the log-likelihood value is calculated here:',
           'If we set a high value of  p ( θ B ) p(θ  B ​   )  (say, 0.9) in the above formula in Slide 32 of Mixture Language Models, we can get  p ( ‘ ‘ t e x t " ∣ θ d ) = 0.4 1 − 0.9 + 0.1 = 0.4 0.1 + 0.1 = 4 + 0.1 = 4.1 > 1 ? p(‘‘text"∣θ  d ​   )=  1−0.9 0.4 ​   +0.1=  0.1 0.4 ​   +0.1=4+0.1=4.1>1?  Can anyone please help me understand something I may be missing? Thanks']

# Reference answers, one per line. The context manager closes the file once it is read.
with open('ground_truth_responses.txt', 'r') as f:
    ground_truth = f.read().split('\n')
grouth_truth = ground_truth  # alias kept for the original (misspelled) variable name

# Flip to True to re-query the API and append fresh records to responses.csv.
REGENERATE_RESPONSES = False
# queries[i] is paired with ground_truth[i + 3].
# NOTE(review): presumably the first three lines belong to other questions; confirm.
GROUND_TRUTH_OFFSET = 3
# Pause between queries; presumably there to stay under API rate limits.
SECONDS_BETWEEN_QUERIES = 60

if REGENERATE_RESPONSES:
    for i, q in enumerate(queries):
        print(q)
        emb, base = compare_responses(q)
        # Collapse each reply onto one line so that every record is a single line.
        emb = ' '.join(emb.splitlines())
        base = ' '.join(base.splitlines())
        # Append and close per query so that finished records survive an interrupted run.
        with open('responses.csv', 'a') as out:
            out.write(
                f'{emb}{response_split}{base}{response_split}'
                f'{ground_truth[i + GROUND_TRUTH_OFFSET]}\n'
            )
        time.sleep(SECONDS_BETWEEN_QUERIES)

## Evaluation

In [None]:
# Load the raw responses file; the context manager closes it even if reading fails.
with open('responses.csv', 'r') as f:
    data = f.read()

data

In [None]:
# One record per non-blank line. Every record is written with a trailing newline,
# so a plain split leaves an empty final element that contains no fields.
lines = [line for line in data.split('\n') if line.strip()]
lines

In [None]:
# Split every record into its three fields. Blank lines are skipped because they
# contain no delimiter and would make the sl[1] lookup raise IndexError.
split_lines = [line.split(response_split) for line in lines if line.strip()]
ft = [sl[0] for sl in split_lines]
gpt = [sl[1] for sl in split_lines]
book = [sl[2] for sl in split_lines]
# 'fine_tuned' holds the first field of each record, 'gpt' the second, 'textbook' the third.
output = pd.DataFrame({'fine_tuned':ft, 'gpt':gpt, 'textbook':book})
output

In [None]:
# Index of the query to inspect; this cell shows the query text at that position.
ind = 0

queries[ind]

In [None]:
# 'gpt' column value for the row selected by `ind`
output.loc[ind, 'gpt']

In [None]:
def jaccard(list1, list2):
    """Return the Jaccard similarity of the distinct items in two sequences.

    The score is ``|A & B| / |A | B|`` where A and B are the sets of items in
    `list1` and `list2`. Repeated items count once, so the result always lies
    in [0, 1].

    Args:
        list1: Iterable of hashable items (for example the words of a text).
        list2: Iterable of hashable items.

    Returns:
        The similarity as a float. Two empty inputs are treated as identical
        and score 1.0.
    """
    set1, set2 = set(list1), set(list2)
    union = len(set1 | set2)
    if union == 0:
        # Both inputs are empty; avoid dividing by zero.
        return 1.0
    # The union is taken over distinct items. Using the raw list lengths
    # would count repeated words several times and deflate the score.
    return float(len(set1 & set2)) / union

In [None]:
# Row-wise iteration is acceptable here because the frame is small.
# For every query, print the Jaccard similarity between the textbook answer and
# each of the two stored answer columns.
for i, row in output.iterrows():
    print(f'------{queries[i]}------')
    reference_words = row['textbook'].split(' ')

    for column in ('fine_tuned', 'gpt'):
        candidate_words = row[column].split(' ')
        sim = jaccard(reference_words, candidate_words)
        print(f'textbook with {column}: {sim}')

    