In [2]:
# Swap the stdlib `sqlite3` module for `pysqlite3` before anything else imports it.
# NOTE(review): presumably required because a later import (Chroma) needs a newer
# SQLite than the system build provides — confirm against the target environment.
__import__('pysqlite3')
import sys
# Re-register the already-loaded pysqlite3 module under the name 'sqlite3'.
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [3]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Load environment variables from the project-level .envrc file.
# NOTE(review): presumably this supplies the OpenAI credentials used by the
# two clients constructed below, so it must run first — confirm.
load_dotenv(dotenv_path='../.envrc')
# Embedding model handed to the Chroma vector store.
embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")
# Chat-completions client shared by llm() and generate_questions().
client = OpenAI()

In [4]:
# Open the Chroma store persisted in the local "paper_info" directory,
# embedding queries with the same embedding function configured above.
db = Chroma(persist_directory="paper_info", embedding_function=embedding_function)
# Retriever over that store; search_kwargs requests k=5 results per query.
retriever = db.as_retriever(search_kwargs={'k': 5})

In [5]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        One string with the contents joined by two newlines; an empty string
        when ``docs`` yields nothing.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user question and the retrieved documents.

    Args:
        query: Question text placed under the ``QUESTION`` heading.
        search_results: Retrieved documents; rendered through ``format_docs``
            and placed under the ``CONTEXT`` heading.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    # Continuation lines keep their four-space indent and the headings keep
    # their trailing space, so the text sent to the model is unchanged.
    template_lines = [
        "You're a helpful deep learning mentor. Answer the QUESTION based on the CONTEXT from the FAQ database.",
        "    Use only the facts from the CONTEXT when answering the QUESTION.",
        "    ",
        "    QUESTION: ",
        "    {question}",
        "    CONTEXT: ",
        "    {context}",
    ]
    template = "\n".join(template_lines)

    rendered_context = format_docs(search_results)
    filled = template.format(context=rendered_context, question=query)
    return filled.strip()

def llm(prompt):
    """Send ``prompt`` as a single user message to ``gpt-4o-mini`` and return the reply text.

    Args:
        prompt: Full prompt text for the chat-completions request.

    Returns:
        The ``content`` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def search(query):
    """Return whatever the module-level ``retriever`` yields for ``query``."""
    return retriever.invoke(query)

def rag(query):
    """Answer ``query`` with the retrieve -> build prompt -> LLM pipeline.

    Args:
        query: The user's question.

    Returns:
        The answer text produced by ``llm`` for the prompt built from the
        documents that ``search`` returned.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))


In [6]:
# Prompt used to synthesise evaluation questions. `{question}` is the only
# placeholder; generate_questions() fills it with its argument. The model is
# asked to reply with a bare JSON array of five strings (no code fences).
dataset_template = """
You emulate a student who's taking deep learning class.
Formulate 5 questions this student might ask based on a deep learning articles or about deep learning concepts in general. 
The record should contain the answer to the questions, and the questions should be complete and not too short.
Make questions specific to the deep learning details.
If possible, use as fewer words as possible from the record. 

The record:

question: {question}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [68]:
def generate_questions(prompt):
    """Ask ``gpt-4o-mini`` for student-style questions about ``prompt``.

    Args:
        prompt: Text substituted into the ``{question}`` slot of
            ``dataset_template``.

    Returns:
        The raw message content of the first choice; it is not parsed here.
    """
    filled_template = dataset_template.format(question=prompt)
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": filled_template}],
    )
    return completion.choices[0].message.content

In [60]:
# One-off smoke test of generate_questions() on a single topic string.
questions = generate_questions("generative adversarial_networks")

In [61]:
# Sample question used to exercise the RAG pipeline end to end.
question = 'What are the fundamental components of a Generative Adversarial Network (GAN), and how do they interact during the training process?'

In [64]:
# Run the full retrieve -> prompt -> LLM pipeline on the sample question.
answer = rag(question)
# print() (rather than a bare expression) so newlines in the answer render as line breaks.
print(answer)

The fundamental components of a Generative Adversarial Network (GAN) are the generative model \( G \) and the discriminative model \( D \). 

1. **Generative Model (G)**: This model generates new data samples from a prior distribution \( p_z(z) \). It maps noise inputs \( z \) through a differentiable function \( G(z; \theta_g) \) to produce data-like outputs \( G(z) \).

2. **Discriminative Model (D)**: This model evaluates the probability that a sample \( x \) comes from the actual data distribution \( p_{\text{data}}(x) \) rather than from the generative model \( p_g(G) \). It outputs a single scalar, which represents this probability.

### Interaction During Training:

The training process of GANs involves a minimax game between the generator and the discriminator:

- \( G \) is trained to minimize the log probability of \( D \) classifying its generated samples as fake:
  \[
  \text{min}_G \log(1 - D(G(z)))
  \]
  This means \( G \) aims to produce samples that \( D \) will mistak

In [65]:
from tqdm.auto import tqdm

In [66]:
# Maps topic -> generated questions; populated by the per-topic generation loop.
results = {}

In [67]:
# Topics for which evaluation questions are generated, one request per entry.
list_of_topics = [
    "Deep Belief Network",
    "ImageNet Evolution",
    "Speech Recognition Evolution",
    "Optimization",
    "Unsupervised Learning / Deep Generative Model",
    "RNN",
    "Sequence-to-Sequence Model",
    "Neural Turing Machine",
    "Deep Reinforcement Learning",
    "Deep Transfer Learning",
    "Lifelong Learning",
    "One Shot Deep Learning",
    "Natural Language Processing",
    "Object Detection",
    "Visual Tracking",
    "Image Caption",
    "Machine Translation",
    "Object Segmentation",
]

In [71]:
def _as_question_records(raw_reply):
    """Parse one model reply into a list of ``{'question': str}`` records.

    The reply shape is not guaranteed: the prompt requests a bare JSON array
    of strings, but the model may also wrap the array in an object under the
    ``"questions"`` key and/or return each item as ``{"question": ...}``.
    All of these are normalised to the same record layout.

    Args:
        raw_reply: JSON text returned by the model.

    Returns:
        List of dicts, each with a single ``'question'`` key.

    Raises:
        json.JSONDecodeError: If ``raw_reply`` is not valid JSON.
        KeyError: If an object reply lacks ``'questions'`` or an object item
            lacks ``'question'``.
    """
    parsed = json.loads(raw_reply)
    if isinstance(parsed, dict):
        parsed = parsed['questions']

    records = []
    for item in parsed:
        text = item['question'] if isinstance(item, dict) else item
        records.append({'question': text})
    return records


# One request per topic; each topic gets its own entry in `results`.
# NOTE(review): the saved cell output shows object-segmentation questions under
# unrelated topics, which this loop would not produce — the output likely
# predates the current code; re-run to confirm.
for topic in tqdm(list_of_topics):
    questions = generate_questions(topic)
    results[topic] = _as_question_records(questions)

In [74]:
# Inspect the generated questions, keyed by topic.
results

{'Deep Belief Network': [{'question': 'What is the difference between semantic segmentation and instance segmentation in deep learning, and how do they impact the model architecture?'},
  {'question': 'Could you explain the role of the loss function in training deep learning models for object segmentation tasks, and what are common loss functions used for this purpose?'},
  {'question': 'In the context of deep learning, what are the advantages and disadvantages of using Fully Convolutional Networks (FCNs) for object segmentation compared to traditional CNNs?'},
  {'question': 'How does data augmentation help in improving the performance of object segmentation models, and what are some popular techniques used for this?'},
  {'question': 'What are some common pre-trained models for object segmentation in deep learning, and how can transfer learning be applied effectively in this domain?'}],
 'ImageNet Evolution': [{'question': 'What is the difference between semantic segmentation and ins

In [77]:
# Flatten {topic: [{'question': ...}, ...]} into one {'topic', 'question'} row
# per question. A comprehension is used so the iteration names stay local: a
# plain `for question in ...` loop would rebind the notebook-level `question`
# string to a dict and break any later re-run of `rag(question)`.
final_results = [
    {"topic": topic_name, "question": entry['question']}
    for topic_name, entries in results.items()
    for entry in entries
]
    

In [78]:
# Inspect the flattened topic/question rows.
final_results

[{'topic': 'Deep Belief Network',
  'question': 'What is the difference between semantic segmentation and instance segmentation in deep learning, and how do they impact the model architecture?'},
 {'topic': 'Deep Belief Network',
  'question': 'Could you explain the role of the loss function in training deep learning models for object segmentation tasks, and what are common loss functions used for this purpose?'},
 {'topic': 'Deep Belief Network',
  'question': 'In the context of deep learning, what are the advantages and disadvantages of using Fully Convolutional Networks (FCNs) for object segmentation compared to traditional CNNs?'},
 {'topic': 'Deep Belief Network',
  'question': 'How does data augmentation help in improving the performance of object segmentation models, and what are some popular techniques used for this?'},
 {'topic': 'Deep Belief Network',
  'question': 'What are some common pre-trained models for object segmentation in deep learning, and how can transfer learning

In [82]:
# Persist the flattened rows to eval.csv (no index column) for the evaluation section.
pd.DataFrame(final_results).to_csv("eval.csv", index=False)

## Retrieval evaluation

In [7]:
# Reload the evaluation questions from disk so this section can run without regenerating them.
df_questions = pd.read_csv("eval.csv")

### Chroma

In [8]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: One sequence per query; each holds booleans, in
            ranked order, marking which returned results were relevant.

    Returns:
        Number of queries with a ``True`` entry divided by the number of
        queries, or ``0.0`` when there are no queries (instead of raising
        ``ZeroDivisionError``).
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total_queries

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence per query; each holds booleans, in
            ranked order, marking which returned results were relevant.

    Returns:
        The average over queries of ``1 / rank`` of the first relevant
        result (contributing 0 for a query with none), or ``0.0`` when there
        are no queries. The value is always within ``[0, 1]``.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality with True (not truthiness) mirrors the `True in line`
            # membership test used by hit_rate.
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first relevant hit counts; summing later hits too
                # would let a single query score more than 1.
                break

    return total_score / total_queries

In [9]:
def evaluate(ground_truth, search_function):
    """Score a search function against labelled queries.

    Args:
        ground_truth: Iterable of records. Each record must support
            ``record['document']`` (the expected document id) and is passed
            whole to ``search_function``.
        search_function: Callable taking one record and returning an iterable
            of results, each supporting ``result['id']``.

    Returns:
        Dict with ``'hit_rate'`` and ``'mrr'`` computed over all queries.
    """
    # NOTE(review): eval.csv as written in this notebook has only 'topic' and
    # 'question' columns — confirm where the 'document' field (and results
    # with an 'id' key) are expected to come from.
    per_query_relevance = []
    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        per_query_relevance.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_relevance)
    metrics['mrr'] = mrr(per_query_relevance)
    return metrics