In [None]:
!pip install -qq "arize-phoenix[experimental,llama-index]" "openai>=1" gcsfs nest_asyncio
!pip install -qq  PyPDF2 langchain lang_chain_google_gen_ai


In [1]:
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import phoenix as px
import pandas as pd
import numpy as np

In [2]:
load_dotenv()
os.getenv("GOOGLE_API_KEY")
os.getenv("OPENAI_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))


In [3]:
def get_pdf_text(pdf_docs):
    text=""
    for pdf in pdf_docs:
        #print(pdf)
        pdf_reader= PdfReader(pdf)
        for page in pdf_reader.pages:
            text+= page.extract_text()
    return  text



def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
    chunks = text_splitter.split_text(text)
    return chunks


def get_vector_store(text_chunks):
    embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")

def get_conversational_chain():

    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """

    model = ChatGoogleGenerativeAI(model="gemini-pro",
                             temperature=0.3)

    prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)

    return chain
    
    
    
    
    
    


    
    

In [4]:

#!pip install pyngrok
#import getpass
"""
from pyngrok import ngrok, conf
#print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")
conf.get_default().auth_token = "2avo9mStYXThBAaeoguTfu98cjY_3DryQWqmQ9hS91HUDrHKR"
port = 37689
# Open a ngrok tunnel to the HTTP server
public_url = ngrok.connect(port).public_url
print(" * ngrok tunnel \"{}\" -> \"http://127.0.0.1:{}\"".format(public_url, port))
"""


from langchain.chains import RetrievalQA
# For the evaluation purpose, transform Google Gemini Report into text embedding 
raw_text = get_pdf_text(["gemini_1_report.pdf"])
text_chunks = get_text_chunks(raw_text)
get_vector_store(text_chunks)


embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
vector_store.save_local("faiss_index2")

#Loading Vector Database   
new_db = FAISS.load_local("faiss_index2", embeddings)


# Create a retriever object from the 'new_db' using the 'as_retriever' method.
# This retriever is likely used for retrieving data or documents from the database.
#faiss_retriever = new_db.as_retriever()

#print(faiss_retriever)

# Launch phoenix
#session = px.launch_app(port=37689)
session = px.launch_app()
# Once you have started a Phoenix server, you can start your LangChain application with the OpenInferenceTracer as a callback. To do this, you will have to instrument your LangChain application with the tracer:

from phoenix.trace.langchain import OpenInferenceTracer, LangChainInstrumentor

# If no exporter is specified, the tracer will export to the locally running Phoenix server
tracer = OpenInferenceTracer()
LangChainInstrumentor(tracer).instrument()


llm= ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3 , convert_system_message_to_human=True)
#prompt_template = """
#    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
#    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
#    Context:\n {context}?\n
#    Question: \n{question}\n
#
#    Answer:
#    """
#prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])    
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
)


from tqdm import tqdm

queries = [
    "What model is Gemini compared against?",
    "How does Gemini compare to GPT4?",
    "What are the different models avaialble for Gemini?",
    "What benchmark was used to evaluate Gemini?",
    "Is GPT4 better than Gemini?"
    "Does Gemini support image generation?"
]

for query in tqdm(queries):    
    #docs = new_db.similarity_search(query)
    temp=chain.run(query=query, return_only_outputs=True)
    print(query, temp)

🌍 To view the Phoenix app in your browser, visit http://127.0.0.1:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


  warn_deprecated(
 20%|████████████████▊                                                                   | 1/5 [00:08<00:32,  8.09s/it]

What model is Gemini compared against? Gemini is compared against the following models:
- GPT-4
- PaLM 2-L
- Claude 2
- Inflection-2
- Grok 1
- LLAMA-2
- MMLU

Source: Gemini: A Family of Highly Capable Multimodal Models


 40%|█████████████████████████████████▌                                                  | 2/5 [00:15<00:23,  7.93s/it]

How does Gemini compare to GPT4? Gemini Ultra improves its performance significantly from 84.0% with greedy sampling to 90.0% with uncertainty-routed chain-of-thought approach with 32 samples while it marginally improves to 85.0% with the use of 32 chain-of-thought samples only. In contrast, GPT-4’s performance improves from 84.2% with greedy sampling to 87.3% with uncertainty-routed chain-of-thought approach with 32 samples, but it already achieves these gains from using 32 chain-of-thought samples.


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:21<00:13,  6.89s/it]

What are the different models avaialble for Gemini? The Gemini family of models includes Gemini Nano-1, Gemini Nano-2, Gemini Pro, and Gemini Ultra.


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:29<00:07,  7.11s/it]

What benchmark was used to evaluate Gemini? Gemini was evaluated using the following benchmarks:
- MMLU
- GSM8K
- MATH
- American Mathematical Competitions
- MGSM
- XLSum
- WikiLingua
- WMT 23
- VQAv2
- TextVQA
- DocVQA
- ChartQA
- InfographicVQA
- MathVista
- AI2D
- MMMU
- VATEX
- VATEX ZH
- YouCook2
- NextQA
- ActivityNet-QA
- Perception Test MCQA


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:36<00:00,  7.25s/it]

Is GPT4 better than Gemini?Does Gemini support image generation? I do not have access to real-time information, therefore I cannot answer the question of whether GPT4 is better than Gemini.

Gemini is able to output images natively, without having to rely on an intermediate natural language description that can bottleneck the model’s ability to express images. This uniquely enables the model to generate images with prompts using interleaved sequences of image and text in a few-shot setting.





In [5]:
px.active_session().view()
     

📺 Opening a view to the Phoenix app. The app is running at http://127.0.0.1:6006/


In [6]:

spans_df = px.active_session().get_spans_dataframe()
spans_df[["name", "span_kind", "attributes.input.value"]].head()


Unnamed: 0_level_0,name,span_kind,attributes.input.value
context.span_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f6884afe-5289-4971-81d8-77a640f73f26,,LLM,"{""messages"": [[{""lc"": 1, ""type"": ""constructor""..."
d748f26c-787d-4fe1-9b52-b5e8d0671417,LLMChain,CHAIN,"{""question"": ""Is GPT4 better than Gemini?Does ..."
aab9b9ac-0035-4ef3-8bfb-8fc326c8d1b3,StuffDocumentsChain,CHAIN,"{""input_documents"": [{""page_content"": ""Megan B..."
5ee842bd-53b0-4393-bdc8-2ba1c5abbb62,Retriever,RETRIEVER,Is GPT4 better than Gemini?Does Gemini support...
e6d412a9-24e1-4447-a667-f2ab987f0128,RetrievalQA,CHAIN,"{""query"": ""Is GPT4 better than Gemini?Does Gem..."


In [7]:
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents

retrieved_documents_df = get_retrieved_documents(px.active_session())
queries_df = get_qa_with_reference(px.active_session())

In [None]:
queries_df

In [8]:

from langchain.chat_models import ChatOpenAI
from langchain.retrievers import KNNRetriever
from phoenix.experimental.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations


eval_model = OpenAIModel(
    model_name="gpt-4-1106-preview",
)
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)

hallucination_eval_df, qa_correctness_eval_df = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_correctness_evaluator],
    provide_explanation=True,
)
relevance_eval_df = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
)[0]

px.log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
)
px.log_evaluations(DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df))

🐌!! If running llm_classify inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


run_evals |          | 0/10 (0.0%) | ⏳ 00:00<? | ?it/s

🐌!! If running llm_classify inside a notebook, patching the event loop with nest_asyncio will allow asynchronous eval submission, and is significantly faster. To patch the event loop, run `nest_asyncio.apply()`.


run_evals |          | 0/20 (0.0%) | ⏳ 00:00<? | ?it/s



Sending Evaluations:   0%|                                                                      | 0/10 [00:00<?, ?it/s][A[A

Sending Evaluations: 100%|█████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 93.45it/s][A[A


Sending Evaluations:   0%|                                                                      | 0/20 [00:00<?, ?it/s][A[A

Sending Evaluations:  25%|███████████████▌                                              | 5/20 [00:00<00:00, 49.60it/s][A[A

Sending Evaluations: 100%|█████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 89.15it/s][A[A


In [10]:
px.active_session().view()

📺 Opening a view to the Phoenix app. The app is running at http://127.0.0.1:6006/
