In [30]:
from openai import OpenAI

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS

import faiss

from io import BytesIO
import pandas as pd
import numpy as np

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.http import MediaIoBaseDownload

from datetime import datetime
import pickle
import math
import os

In [2]:
# Read the OpenAI key from the environment; the value is None when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# Client used for the direct (non-LangChain) chat-completion calls below,
# configured with the key read from the environment.
client = OpenAI(api_key=api_key)

In [4]:
# Question asked without any supporting context (baseline for the RAG comparison).
prompt = "What was uber's revenue in 2022?"

In [5]:
# Baseline 1: send the bare question to gpt-3.5-turbo as a single user message.
openai_response = client.chat.completions.create(
    messages=[dict(role='user', content=prompt)],
    model='gpt-3.5-turbo',
)

In [6]:
# Display the text of the first completion choice.
openai_response.choices[0].message.content

"I do not have information on Uber's revenue for 2022 as it is a future date. My training data only goes up until September 2021."

In [7]:
# Baseline 2: same bare question, answered by gpt-4o-mini.
openai_response = client.chat.completions.create(
    messages=[dict(role='user', content=prompt)],
    model='gpt-4o-mini',
)

In [8]:
# Display the text of the first completion choice.
openai_response.choices[0].message.content

"Uber's revenue for the full year of 2022 was approximately $31.88 billion. This represented a significant increase from previous years, reflecting the company's growth in the ride-hailing and delivery sectors. If you need more details or specific figures, feel free to ask!"

In [9]:
# Hand-written excerpt standing in for the output of a retrieval step.
# NOTE(review): the line break and the indentation of the second line are
# inside the triple-quoted string, so they are sent to the model verbatim.
retrieved_context = '''Revenue was $37.3 billion, up 17% year-over-year. Mobility revenue increased $5.8 billion primarily attributable to an increase in
               Mobility Gross Bookings of 31% year-over-year.'''

In [10]:
# Augmented prompt: the question followed by the retrieved excerpt to check against.
prompt = "What was Uber's revenue in 2022? Check in {}".format(retrieved_context)

In [11]:
# Ask gpt-3.5-turbo again, this time with the context-augmented prompt.
openai_response = client.chat.completions.create(
    messages=[dict(role='user', content=prompt)],
    model='gpt-3.5-turbo',
)

In [12]:
# Display the text of the first completion choice.
openai_response.choices[0].message.content

"In 2022, Uber's revenue was $37.3 billion."

In [16]:
# OAuth scope requested from Google Drive.
# NOTE(review): this is the full-access scope; the notebook only lists and
# downloads files, so a read-only scope may be sufficient — confirm.
SCOPES = ['https://www.googleapis.com/auth/drive']

# Reuse credentials cached by a previous run, if any.
# NOTE(review): pickle.load can execute arbitrary code; this is acceptable only
# because token.pickle is a local file written by this same cell.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)

# No usable credentials: refresh them when possible, otherwise run the
# interactive browser login, then cache the result.
if not (creds and creds.valid):
    if not (creds and creds.expired and creds.refresh_token):
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)
    else:
        creds.refresh(Request())

    # Persist the credentials so later runs can skip the login
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

# Drive v3 API client used by the following cells
drive_service = build('drive', 'v3', credentials=creds)

In [17]:
# Sanity check: fetch up to 10 Drive entries, requesting only their id and name.
results = (
    drive_service.files()
    .list(pageSize=10, fields="files(id, name)")
    .execute()
)
files = results.get('files', [])

# One line per entry returned by the API
for file in files:
    print(f"File: {file['name']}, ID: {file['id']}")

File: Copy of Review Genie_Solution.ipynb, ID: 1QNo1c4lCjH1gm_eCx3eMnWjzloWuUZGW
File: Movies to download, ID: 1QIXssYJy3qWHqAIQeZMd-qUkvgYERhrh3pN82m8maxo
File: Copy of week3_template [FinQuery/SalesTrend].ipynb, ID: 13dBjmLMPx5a7liuMscpnuJ9XIvcCMGtf
File: Copy of Week2[SalesTrends and FinQuery].ipynb, ID: 1DrTZWHKO5VCwRUyUq-50pMKYDdByUxcQ
File: Copy of Review_Genie_Week_4_Evaluation.ipynb, ID: 1tWvl6YQwRccuxbv_lrfKgLMix3DEJgC9
File: Copy of Live_Class_Review_Genie_Week_3.ipynb, ID: 13dHdxASHuNWfmu7ULTMEte8SZzhWS4d9
File: Copy of Template_Review_Genie_Week_2_Agents.ipynb, ID: 1q6N1EylCVwp2BlKZzLIpZCsFA00ziVUv
File: FinQuery, ID: 1thyzKOPzhA3pq9LJ5qiO4B2T-Nr7KJfG
File: Review Genie, ID: 1_5o__qGXdlrXEeUSVlt6rMVA9u9a3Y_H
File: IK, ID: 1NTkDWLqSR6tkGj8Z-lV6Vt3fzn70VZef


In [18]:
# Look up the dataset by exact file name and remember the id of the first match.
results = drive_service.files().list(
    spaces='drive',
    q="name='train.csv'",  # Drive query: match on file name
    fields="files(id, name)",
).execute()

files = results.get("files", [])

if files:
    # file_id is consumed by the download cell
    file_id = files[0]['id']
    print(f"File ID: {file_id}")
else:
    print("No file found.")

File ID: 1NFJ1uypGmMZSA6MOvDdiZA3aegCceNK0


In [19]:
# Stream the Drive file into an in-memory buffer, one chunk at a time.
request = drive_service.files().get_media(fileId=file_id)
file = BytesIO()
downloader = MediaIoBaseDownload(file, request)

# next_chunk() reports progress and whether the download has finished
done = False
while True:
    status, done = downloader.next_chunk()
    percent = int(status.progress() * 100)
    print(f"Download {percent}%.")
    if done:
        break

# Rewind so pandas reads from the first byte
file.seek(0)

# Parse the CSV, using its first column as the index
df = pd.read_csv(file, index_col=0)

# Preview the first rows
df.head()


Download 6%.
Download 12%.
Download 19%.
Download 25%.
Download 31%.
Download 38%.
Download 44%.
Download 50%.
Download 57%.
Download 63%.
Download 69%.
Download 76%.
Download 82%.
Download 88%.
Download 95%.
Download 100%.


Unnamed: 0_level_0,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [20]:
%%time
product_description = []
product_description_len = []

row_num = 0
# Iterating through each row in the dataframe df2
for row in df.iterrows():
    if row_num % 100000 == 0:
        print(f"processing row {row_num}")
    row_num += 1
    product = ""  # Initialize an empty string to accumulate product details

    # Extracting the product title from the current row
    title = row[1]["TITLE"]

    # Checking if the title is valid (not NaN or missing)
    if type(title) != float or not math.isnan(title):
        product += "Title\n" + title + "\n"  # Append the title to the product description

    # Extracting the product description from the current row
    description = row[1]["DESCRIPTION"]

    # Checking if the description is valid (not NaN or missing)
    if type(description) != float or not math.isnan(description):
        product += "Description\n" + description + "\n"  # Append the description to the product details

    # Check if either title or description was added
    added_content = title or description
    if added_content:
        product = product.strip()  # Remove any leading/trailing whitespace
        product_description.append(product)  # Add the formatted product details to the list
        product_description_len.append(len(product))  # Store the length of the product description

processing row 0
processing row 100000
processing row 200000
processing row 300000
processing row 400000
processing row 500000
processing row 600000
processing row 700000
processing row 800000
processing row 900000
processing row 1000000
processing row 1100000
processing row 1200000
processing row 1300000
processing row 1400000
processing row 1500000
processing row 1600000
processing row 1700000
processing row 1800000
processing row 1900000
processing row 2000000
processing row 2100000
processing row 2200000
CPU times: user 2min 29s, sys: 2.1 s, total: 2min 31s
Wall time: 2min 32s


In [21]:
# Summary of the assembled product texts: counts, then length statistics.
print(f"Number of elements {len(product_description)}")
print("Number of items", len(product_description_len))
for label, reducer in (
    ("Min length of the description", np.min),
    ("Avg length of the description", np.mean),
    ("Max length of the description", np.max),
):
    print(label, reducer(product_description_len))

Number of elements 2249698
Number of items 2249698
Min length of the description 0
Avg length of the description 404.86550550340536
Max length of the description 4656


In [22]:
%%time
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
documents = text_splitter.create_documents(product_description)

CPU times: user 2min 58s, sys: 6.09 s, total: 3min 4s
Wall time: 3min 7s


In [23]:
# Embedding client with default settings; used to vectorise the document chunks.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings()

## TODO: add loading logic if index exists on disk

In [31]:
%%time
# code suggested by ChatGPT, for reducing memory requirements

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.schema import Document
import faiss
import numpy as np

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings()

# Determine embedding dimension
dummy_embedding = embeddings.embed_query("dummy")  
dimension = len(dummy_embedding)
print(f'{dimension=}')

# Create an empty FAISS index
index = faiss.IndexFlatL2(dimension)  

# Initialize a docstore (needed for LangChain's FAISS wrapper)
docstore = InMemoryDocstore()
index_to_docstore_id = {}

# Wrap the FAISS index with LangChain
vector_store = FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, embedding_function=embeddings)

# Example: Batch Ingestion
batch_size = 10000
for i in range(0, len(documents), batch_size):
    print(f'{datetime.now()} processing doc {i}')
    batch = documents[i:i+batch_size]
    texts = [doc.page_content for doc in batch]
    
    # Generate embeddings
    vectors = embeddings.embed_documents(texts)
    
    # Convert to LangChain document format
    langchain_docs = [Document(page_content=text) for text in texts]
    
    # Add to FAISS index
    vector_store.add_documents(langchain_docs)

# Save FAISS index for later use
vector_store.save_local("faiss_index")


dimension=1536
2025-03-19 20:10:55.142255 processing doc 0
2025-03-19 20:11:59.253138 processing doc 10000
2025-03-19 20:13:01.605302 processing doc 20000
2025-03-19 20:14:10.663023 processing doc 30000
2025-03-19 20:15:17.872574 processing doc 40000
2025-03-19 20:16:23.358752 processing doc 50000
2025-03-19 20:17:35.105664 processing doc 60000
2025-03-19 20:18:42.297259 processing doc 70000
2025-03-19 20:19:49.668162 processing doc 80000
2025-03-19 20:20:56.957328 processing doc 90000
2025-03-19 20:22:06.588971 processing doc 100000
2025-03-19 20:23:14.334245 processing doc 110000
2025-03-19 20:24:21.639089 processing doc 120000
2025-03-19 20:25:34.029694 processing doc 130000
2025-03-19 20:26:41.258066 processing doc 140000
2025-03-19 20:27:44.164939 processing doc 150000
2025-03-19 20:28:44.462792 processing doc 160000
2025-03-19 20:30:27.001031 processing doc 170000
2025-03-19 20:31:33.777947 processing doc 180000
2025-03-19 20:32:43.776449 processing doc 190000
2025-03-19 20:33:51

In [None]:
%%time
# Disabled: one-shot index build over all documents. The batched ingestion
# cell above builds `vector_store` instead.
#vector = FAISS.from_documents(documents, embeddings)

In [32]:
# Chat model that writes the final answers. The key is read with
# os.environ[...], so a missing OPENAI_API_KEY raises KeyError here.
llm = ChatOpenAI(
    model='gpt-4o-mini',
    api_key=os.environ["OPENAI_API_KEY"],
)

In [33]:
# Parser that converts the chat model's message into a plain string.
output_parser = StrOutputParser()

# Prompt template for answering strictly from retrieved context.
# It takes two parameters:
#   1. {context} - Relevant information retrieved from the document store.
#   2. {input} - The user's question.
# The model is instructed to base its answer solely on the provided context.
prompt = ChatPromptTemplate.from_template(
    """Answer the following question based only on the provided context:

    <context>
    {context}
    </context>

    Question: {input}"""
)

# Document chain: formats the retrieved documents into {context}, fills the
# prompt, calls the LLM and parses the reply.
# The parser is passed here because this is where it takes effect; an
# output_parser given to the prompt template is not applied to the model's
# response.
document_chain = create_stuff_documents_chain(llm, prompt, output_parser=output_parser)

# Alternative chain creation method:
# Using the "|" (pipe) operator to link the prompt with the language model (llm),
# meaning the input first goes to the prompt and then to the model for response generation.
# document_chain = prompt | llm

In [34]:
# Expose the FAISS store as a retriever using the wrapper's default search settings.
retriever = vector_store.as_retriever()

In [35]:
# Full RAG pipeline: the retriever fetches documents for the input and
# document_chain answers from them.
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [37]:
%%time
# End-to-end query; the result dict carries 'input', the retrieved 'context'
# documents and the generated 'answer'.
# NOTE(review): the recorded run took minutes of wall time, presumably because
# the index is a brute-force IndexFlatL2 held in RAM — confirm.
retrieval_chain.invoke({"input": "what are some of the best shoes available?"})

CPU times: user 7.75 s, sys: 1min 3s, total: 1min 10s
Wall time: 2min 24s


{'input': 'what are some of the best shoes available?',
 'context': [Document(id='ee86fe5f-2772-48dd-a101-1ea2bde933ee', metadata={}, page_content='and most affordable footwear available.'),
  Document(id='36639e1d-5173-4b49-b7ac-91b8d17e00a7', metadata={}, page_content='shoes, sneakers, boots, etc.'),
  Document(id='0407812b-5f76-4c93-b40c-03ef66ffe982', metadata={}, page_content='shoes, sneakers, boots, etc.'),
  Document(id='0c41a944-a846-4eb9-906a-db3bf5887bf2', metadata={}, page_content='shoes, sneakers, boots, etc.')],
 'answer': 'The context does not provide specific names or brands of shoes. It only mentions "shoes, sneakers, boots, etc." as general categories of footwear. Therefore, I cannot determine what the best shoes available are based on the provided information.'}

In [41]:
def now():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS'.

    The timestamp is formatted directly to second precision. Cutting
    str(datetime.now()) at the first '.' is wrong when the microsecond field
    is exactly 0: the string then has no '.', find() returns -1, and the last
    digit of the seconds is dropped.
    """
    return datetime.now().isoformat(sep=' ', timespec='seconds')


In [43]:
%%time
#  Convert IndexFlatL2 to On-Disk IndexIVFFlat
import faiss
import numpy as np

# Load existing index
print(f'{now()} Loading existing index...')
index_flat = faiss.read_index("faiss_index/index.faiss")

# Get embedding dimension
dimension = index_flat.d  

# Number of clusters (tune based on dataset size)
nlist = 1000  

# Create on-disk IndexIVFFlat
quantizer = faiss.IndexFlatL2(dimension)
index_ivf = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

# Train with a subset of existing embeddings (required for IVFFlat)
print(f'{now()} training on subset of existing embeddings...')
sample_size = min(100000, index_flat.ntotal)  # Use 100K samples for training
vectors = np.array([index_flat.reconstruct(i) for i in range(sample_size)], dtype=np.float32)
index_ivf.train(vectors)

# Use on-disk storage
index_ivf = faiss.index_cpu_to_index_ivf_on_disk(index_ivf, "faiss_index_ivf")

# Add full dataset
print(f'{now()} add full dataset...')
all_vectors = np.array([index_flat.reconstruct(i) for i in range(index_flat.ntotal)], dtype=np.float32)
index_ivf.add(all_vectors)

# Save index
print(f'{now()} saving...')
faiss.write_index(index_ivf, "faiss_index_ivf")
print("✅ FAISS index converted to on-disk storage!")


2025-03-20 12:27:45 Loading existing index...
2025-03-20 12:28:49 training on subset of existing embeddings...


AttributeError: module 'faiss' has no attribute 'index_cpu_to_index_ivf_on_disk'