In [None]:
!pip install pypdf -q
!pip install gradio==3.41.2 -q
!pip install langchain==0.0.274 -q
!pip install openai==0.27.9
!pip install llama-index==0.8.11

In [None]:
import os

from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI

from llama_index import (
    SimpleDirectoryReader,
    LLMPredictor,
    PromptHelper,
    StorageContext,
    ServiceContext,
    GPTVectorStoreIndex,
    LangchainEmbedding,
    load_index_from_storage,
    set_global_service_context)

from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import TokenTextSplitter
from llama_index.response.notebook_utils import display_response

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
# SECURITY: never commit an API key to the notebook. The key that used to be
# hardcoded in this cell must be treated as leaked and revoked.
# Use a key that is already exported in the environment; otherwise prompt for
# one without echoing it into the cell output.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
os.environ['OPENAI_API_VERSION'] = '2020-11-07'
os.environ['OPENAI_API_BASE'] = 'https://api.openai.com/v1'

In [None]:
def create_service_context(
        # Constraint parameters
        max_input_size=4096,        # Context window for the LLM.
        num_outputs=256,            # Number of output tokens for the LLM.
        chunk_overlap_ratio=0.1,    # Chunk overlap as a ratio of chunk size.
        chunk_size_limit=None,      # Maximum chunk size to use.
        chunk_overlap=20,           # Chunk overlap given to the text splitter.
        chunk_size=1024):           # Chunk size given to the text splitter.
    """Build the LlamaIndex ServiceContext used for indexing and querying.

    Args:
        max_input_size: First positional argument to PromptHelper.
        num_outputs: Second positional argument to PromptHelper.
        chunk_overlap_ratio: Third positional argument to PromptHelper.
        chunk_size_limit: Passed to PromptHelper as ``chunk_size_limit``.
        chunk_overlap: Passed to TokenTextSplitter as ``chunk_overlap``.
        chunk_size: Passed to TokenTextSplitter as ``chunk_size``.

    Returns:
        A ServiceContext combining a gpt-3.5-turbo LLM predictor, a
        text-embedding-ada-002 embedding model, the node parser and the
        prompt helper built from the arguments above.
    """
    # Imported locally: the notebook's import cell only brings in
    # AzureChatOpenAI, and ChatOpenAI is otherwise imported in a later cell,
    # so calling this function before that cell ran raised NameError.
    from langchain.chat_models import ChatOpenAI

    # The parser that converts documents into nodes.
    node_parser = SimpleNodeParser.from_defaults(
        # The text splitter used to split text into chunks.
        text_splitter=TokenTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    )

    # Allows the user to explicitly set certain constraint parameters.
    prompt_helper = PromptHelper(
        max_input_size,
        num_outputs,
        chunk_overlap_ratio,
        chunk_size_limit=chunk_size_limit)

    # LLMPredictor wraps the LangChain chat model for use by LlamaIndex.
    llm_predictor = LLMPredictor(
        ChatOpenAI(model="gpt-3.5-turbo"))

    # The embedding model used to generate vector representations of text.
    embedding_llm = LangchainEmbedding(
        langchain_embeddings=OpenAIEmbeddings(
            model="text-embedding-ada-002",
            chunk_size=1)
    )

    # Assemble the service context from the components built above.
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        embed_model=embedding_llm,
        node_parser=node_parser,
        prompt_helper=prompt_helper)

    return service_context


In [None]:
# import pandas as pd

# NOTE: everything below is disabled. It is wrapped in a bare string literal,
# so running this cell only evaluates (and echoes) that string; no PDFs are
# created. The disabled code writes one PDF per row of the CSV, and if it is
# re-enabled it needs pandas (as `pd`) and reportlab to be importable.
"""
from reportlab.pdfgen import canvas

def create_pdf(row, output_file):

    pdf_filename = f"{output_file}_{row['ID']}.pdf"

    # Create a PDF canvas
    c = canvas.Canvas(pdf_filename)

    # Set up the PDF content based on your requirements
    c.drawString(100, 800, f"Title: {row['Title']}")
    c.drawString(100, 780, f"Author: {row['Author']}")
    c.drawString(100, 760, f"Link: {row['Link']}")
    c.drawString(100, 740, f"ID: {row['ID']}")
    c.drawString(100, 720, f"Bookshelf: {row['Bookshelf']}")
    c.drawString(100, 700, f"Text: {row['Text']}")

    # Save the PDF
    c.save()

    print(f"PDF created: {pdf_filename}")

def create_pdfs_from_csv(csv_file, output_file_prefix):

    df = pd.read_csv(csv_file)

    for _, row in df.iterrows():
        create_pdf(row, output_file_prefix)

# Specify the path to your CSV file
csv_file_path = "/content/drive/MyDrive/Data Science Final Project/books/gutenberg_data.csv"

# Specify the prefix for output PDF files
output_file_prefix = "/content/drive/MyDrive/Data Science Final Project/books/doc/id"

# Create PDFs from the CSV file
create_pdfs_from_csv(csv_file_path, output_file_prefix)
"""


'\nfrom reportlab.pdfgen import canvas\n\ndef create_pdf(row, output_file):\n\n    pdf_filename = f"{output_file}_{row[\'ID\']}.pdf"\n\n    # Create a PDF canvas\n    c = canvas.Canvas(pdf_filename)\n\n    # Set up the PDF content based on your requirements\n    c.drawString(100, 800, f"Title: {row[\'Title\']}")\n    c.drawString(100, 780, f"Author: {row[\'Author\']}")\n    c.drawString(100, 760, f"Link: {row[\'Link\']}")\n    c.drawString(100, 740, f"ID: {row[\'ID\']}")\n    c.drawString(100, 720, f"Bookshelf: {row[\'Bookshelf\']}")\n    c.drawString(100, 700, f"Text: {row[\'Text\']}")\n\n    # Save the PDF\n    c.save()\n\n    print(f"PDF created: {pdf_filename}")\n\ndef create_pdfs_from_csv(csv_file, output_file_prefix):\n\n    df = pd.read_csv(csv_file)\n\n    for _, row in df.iterrows():\n        create_pdf(row, output_file_prefix)\n\n# Specify the path to your CSV file\ncsv_file_path = "/content/drive/MyDrive/Data Science Final Project/books/gutenberg_data.csv"\n\n# Specify the p

In [None]:

from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
import pandas as pd
import sys
import tensorflow as tf
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
import openai
import json

# Enable on-demand GPU memory allocation on every visible GPU. TensorFlow
# requires the memory-growth setting to be the same across all GPUs, so
# configuring only the first device would fail on a multi-GPU host.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for gpu in physical_devices:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("No GPU detected. Make sure your TensorFlow installation supports GPU.")


No GPU detected. Make sure your TensorFlow installation supports GPU.


In [None]:
# NOTE: disabled batch-ingestion code. It is wrapped in a bare string literal,
# so running this cell only evaluates (and echoes) that string. The disabled
# code loads the persisted index, inserts every document found in the
# Batch_10 .. Batch_19 directories, and persists the index back to the same
# directory after each batch.
"""
def data_ingestion_indexing(directory_path):
  storage_context = StorageContext.from_defaults(persist_dir="/content/drive/MyDrive/Data Science Final Project/books/ada_embed")
  index = load_index_from_storage(storage_context, service_context=create_service_context())

  # Loads data from the specified directory path
  documents = SimpleDirectoryReader(directory_path).load_data()
  #index = GPTVectorStoreIndex.from_documents(
  #    documents, service_context=create_service_context()
  #)

  for d in documents:
    print(d.metadata)
    index.insert(d, service_context=create_service_context())

  # Persist index to disk, default "storage" folder
  index.storage_context.persist(persist_dir="/content/drive/MyDrive/Data Science Final Project/books/ada_embed")


base_path = '/content/drive/MyDrive/Data Science Final Project/books/batches/Batch_'
start = 10
end = 20

for i in range(start,end):
  path = base_path+str(i)
  print(path)
  data_ingestion_indexing(path)

"""

'\ndef data_ingestion_indexing(directory_path):\n  storage_context = StorageContext.from_defaults(persist_dir="/content/drive/MyDrive/Data Science Final Project/books/ada_embed")\n  index = load_index_from_storage(storage_context, service_context=create_service_context())\n\n  # Loads data from the specified directory path\n  documents = SimpleDirectoryReader(directory_path).load_data()\n  #index = GPTVectorStoreIndex.from_documents(\n  #    documents, service_context=create_service_context()\n  #)\n\n  for d in documents:\n    print(d.metadata)\n    index.insert(d, service_context=create_service_context())\n\n  # Persist index to disk, default "storage" folder\n  index.storage_context.persist(persist_dir="/content/drive/MyDrive/Data Science Final Project/books/ada_embed")\n\n\nbase_path = \'/content/drive/MyDrive/Data Science Final Project/books/batches/Batch_\'\nstart = 10\nend = 20\n\nfor i in range(start,end):\n  path = base_path+str(i)\n  print(path)\n  data_ingestion_indexing(pat

In [None]:
# Build the service context with create_service_context()'s default
# parameters and register it as the global service context for LlamaIndex.
service_context = create_service_context()
set_global_service_context(service_context)

In [None]:
import re
# Rebuild the storage context from the index persisted on Google Drive.
storage_context = StorageContext.from_defaults(persist_dir="/content/drive/MyDrive/Data Science Final Project/books/ada_embed")
# Reuse the service context created (and registered globally) in the previous
# cell instead of constructing a second, separate one just for loading.
index = load_index_from_storage(storage_context, service_context=service_context)


In [None]:
# Create a query engine over the loaded index, using as_query_engine()'s defaults.
query_engine = index.as_query_engine()

In [None]:
# Full-passage query: ask which book a long verbatim quote comes from.
# The adjacent string literals concatenate into a single query string.
response = query_engine.query(
    "what book is this quote from: "
    "An hour later the guardians assembled and, upon hearing the "
    "circumstances of the newcomer's admission, and the death of the tramp, "
    "they decided that the child should be entered in the books as "
    "William Gale,--the name being chosen with a reference to the weather "
    "during which he came into the house --and against his name a note was "
    "written, to the effect that his mother--a tramp, name unknown--had, "
    "after leaving him at the door "
)

# Formatted source snippets first, then the raw retrieved nodes.
print(response.get_formatted_sources(), response.source_nodes, sep="\n")

# Render the final answer in the notebook.
display_response(response)


KeyboardInterrupt: ignored

In [None]:
# 25 Words Query
# Same lookup as the full-passage query, but with the quote cut short.
# The adjacent string literals concatenate into a single query string.
response_25 = query_engine.query(
    "what book is this quote from: "
    "An hour later the guardians assembled and, upon hearing the "
    "circumstances of the newcomer's admission, and the death of the tramp, "
    "they decided that the"
)

# Formatted source snippets first, then the raw retrieved nodes.
print(response_25.get_formatted_sources(), response_25.source_nodes, sep="\n")

# Render the final answer in the notebook.
display_response(response_25)



> Source (Doc id: 03086007-a281-4a9e-8904-686cea32f6a0): the five men had become callous.They sang or talked as unconcernedly as they might have done in t...

> Source (Doc id: e244f97d-2880-42e6-bc8c-7fe82b37e406): us; he offers this sum, but says that if the boys are notreleased before sunrise, he will come an...
[NodeWithScore(node=TextNode(id_='03086007-a281-4a9e-8904-686cea32f6a0', embedding=None, metadata={'page_label': '1', 'file_name': "Out of Time's Abyss.pdf"}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9af501f3-ade4-45bb-8e14-a5bb96489789', node_type=None, metadata={'page_label': '1', 'file_name': "Out of Time's Abyss.pdf"}, hash='e3d0d0dcbe52bbe168ce4a47852cd4cf806b9fecd50c5f5ea91530269bf6332d'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='98183da0-344d-44ac-a4ae-f286ce07b733', node_type=None, metadata={'page_label': '1', 'file_name': "Out of Time's Abyss.pdf"}, hash=

**`Final Response:`** Out of Time's Abyss.pdf

In [None]:
# 75 Words Query
# Ask which book the longer version of the quote comes from.
# The adjacent string literals concatenate into a single query string.
response_75 = query_engine.query(
    "what book is this quote from: "
    "An hour later the guardians assembled and, upon hearing the "
    "circumstances of the newcomer's admission, and the death of the tramp, "
    "they decided that the child should be entered in the books as "
    "William Gale,--the name being chosen with a reference to the weather "
    "during which he came into the house --and against his name a note was "
    "written, to the effect that his mother--a tramp, name unknown--had, "
    "after leaving him at the door"
)

# Formatted source snippets first, then the raw retrieved nodes.
print(response_75.get_formatted_sources(), response_75.source_nodes, sep="\n")

# Render the final answer in the notebook.
display_response(response_75)

> Source (Doc id: fd12c3df-7c6e-46ba-91ca-54aa1ae68129): had better take that child down, andlet it see the tramp they have found, frozen to death. The ch...

> Source (Doc id: e85c4ca7-01fc-41b9-8779-8dfc784b0aa9): months. It's enough to make a fellow feelblue, listenin' to her complainin' and groanin' all the ...
[NodeWithScore(node=TextNode(id_='fd12c3df-7c6e-46ba-91ca-54aa1ae68129', embedding=None, metadata={'page_label': '1', 'file_name': 'For Name and Fame; Or, Through Afghan Passes.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c03d1ab7-7e31-458a-9c6c-aac210b607c0', node_type=None, metadata={'page_label': '1', 'file_name': 'For Name and Fame; Or, Through Afghan Passes.pdf'}, hash='369ea224f96fa8a26a60819326bd9ade80463dfab1855ac9a03c507a6edbc9c9'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='fb0d1754-1204-48fc-9895-174a55a7b8c3', node_type=None, metadata={'page_label': '

**`Final Response:`** Based on the given context, the quote is from the book "For Name and Fame; Or, Through Afghan Passes.pdf".

In [None]:
# Paraphrased Query
# Describe a scene in different words instead of quoting it verbatim.
# The adjacent string literals concatenate into a single query string.
response_para = query_engine.query(
    "What is the title of the book where Paulus, absorbed in his own "
    "spiritual practices, neglects Stephanus' urgent need for water, "
    "only to realize his mistake and reflect on human egoism?"
)

# Formatted source snippets first, then the raw retrieved nodes.
print(response_para.get_formatted_sources(), response_para.source_nodes, sep="\n")

# Render the final answer in the notebook.
display_response(response_para)

> Source (Doc id: 02526df8-eac3-4dd4-a5ce-2784eb6fc4ae): over again the most miserable hour of his life, anhour now long since past and gone.He thought he...

> Source (Doc id: c5b9a38b-0b12-4622-bd4e-464fa899db24): snatches it from me, as the wind swept back the fruit-ladenboughs which Tantalus, parched with th...
[NodeWithScore(node=TextNode(id_='02526df8-eac3-4dd4-a5ce-2784eb6fc4ae', embedding=None, metadata={'page_label': '1', 'file_name': 'Homo Sum — Complete.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='2f266029-f7ee-4693-be8e-1355ea984010', node_type=None, metadata={'page_label': '1', 'file_name': 'Homo Sum — Complete.pdf'}, hash='8947228779d4570875750f9b99884c351f9fba0d2ef7c484601f868a51ffeaa6'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='87ab5182-bfeb-44de-8b3f-c1a55fd25936', node_type=None, metadata={'page_label': '1', 'file_name': 'Homo Sum — Complete.pdf'}, hash=

**`Final Response:`** The title of the book where Paulus, absorbed in his own spiritual practices, neglects Stephanus' urgent need for water, only to realize his mistake and reflect on human egoism is "Homo Sum".

In [None]:
# NOTE: disabled Gradio front end. It is wrapped in a bare string literal, so
# running this cell only evaluates (and echoes) that string; no app starts.
# NOTE(review): `gr` is not imported in any visible cell, so re-enabling this
# needs `import gradio as gr`. The interface also has a single Textbox input,
# so `follow_up_questions` would presumably always keep its default of False.
"""
def data_querying(input_text, follow_up_questions = False):
  # Rebuild storage context
  storage_context = StorageContext.from_defaults(persist_dir="/content/drive/MyDrive/Data Science Final Project/books/ada_embed")

  # Loads index from storage
  index = load_index_from_storage(storage_context, service_context=create_service_context())

  # Check if it's a follow up chat ot not
  # Queries the index with the input text
  if follow_up_questions:
    response = index.as_chat_engine().chat(input_text)
  else:
    response = index.as_query_engine().query(input_text)
  return response.response

iface = gr.Interface(fn=data_querying,
                     inputs=gr.components.Textbox(lines=7, placeholder="Enter your question here"),
                     outputs="text",
                     title="Books",
                     description="Ask about old books",
                     )
iface.launch(share=True)
"""

'\ndef data_querying(input_text, follow_up_questions = False):\n  # Rebuild storage context\n  storage_context = StorageContext.from_defaults(persist_dir="/content/drive/MyDrive/Data Science Final Project/books/ada_embed")\n\n  # Loads index from storage\n  index = load_index_from_storage(storage_context, service_context=create_service_context())\n\n  # Check if it\'s a follow up chat ot not\n  # Queries the index with the input text\n  if follow_up_questions:\n    response = index.as_chat_engine().chat(input_text)\n  else:\n    response = index.as_query_engine().query(input_text)\n  return response.response\n\niface = gr.Interface(fn=data_querying,\n                     inputs=gr.components.Textbox(lines=7, placeholder="Enter your question here"),\n                     outputs="text",\n                     title="Books",\n                     description="Ask about old books",\n                     )\niface.launch(share=True)\n'