In [1]:
import math
import os
import pickle
from datetime import datetime
from io import BytesIO

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI

In [2]:
# Read the OpenAI key from the environment; the result is None when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY')

In [3]:
# Create the OpenAI SDK client using the key read from the environment.
client = OpenAI(api_key=api_key)

In [4]:
# Question sent to the models without any supporting context.
prompt = "What was uber's revenue in 2022?"

In [5]:
# Send the bare question to gpt-3.5-turbo as a single user message (no extra context).
openai_response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[{'role':'user', 'content':prompt}])

In [6]:
# Show the text of the first returned choice.
openai_response.choices[0].message.content

"As of the current year 2021, Uber has not released its revenue figures for 2022 yet. Typically, companies release their annual financial reports the following year. You may have to wait until 2023 to find out Uber's revenue for 2022."

In [7]:
# Send the same prompt to gpt-4o-mini.
openai_response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{'role':'user', 'content':prompt}])

In [8]:
# Show the text of the first returned choice.
openai_response.choices[0].message.content

"Uber's revenue for the full year 2022 was approximately $31.9 billion. This figure reflects the company's continued growth in ride-hailing and delivery services, including Uber Eats. If you need more specific information or insights about Uber's performance, feel free to ask!"

In [9]:
# Context snippet to be injected into the prompt; it is hard-coded here rather than retrieved.
# The line break and the indentation of the second line are part of the string.
retrieved_context = '''Revenue was $37.3 billion, up 17% year-over-year. Mobility revenue increased $5.8 billion primarily attributable to an increase in
               Mobility Gross Bookings of 31% year-over-year.'''

In [10]:
# Re-ask the question, this time with the context snippet appended to the prompt text.
prompt = "What was Uber's revenue in 2022? Check in {}".format(retrieved_context)

In [11]:
# Send the context-augmented prompt to gpt-3.5-turbo.
openai_response = client.chat.completions.create(
    model = 'gpt-3.5-turbo',
    messages = [{'role': 'user', 'content': prompt}])

In [12]:
# Show the text of the first returned choice.
openai_response.choices[0].message.content

"In 2022, Uber's revenue was $37.3 billion."

In [13]:
# Least-privilege scope: this notebook only lists and downloads Drive files,
# so read-only access is sufficient (previously the full read/write 'auth/drive'
# scope was requested).
# NOTE: a token.pickle cached under the old scope keeps that broader grant;
# delete the file to re-consent with the narrower scope.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# Reuse cached OAuth credentials from a previous run, if present.
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load a token.pickle that this notebook wrote itself; never one obtained from
# elsewhere.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)

# Authenticate if no valid credentials exist
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        # Expired but refreshable: renew without user interaction
        creds.refresh(Request())
    else:
        # No usable credentials: run the interactive browser consent flow.
        # port=0 is passed so the local redirect server is not tied to a fixed port.
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)

    # Save the credentials for future use
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

# Initialize Google Drive API service
drive_service = build('drive', 'v3', credentials=creds)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=562544657444-e8o0tl8emg9s2p5hancbc7a4pbucbvod.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A57291%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=8AtCmXXSp4PFkRZgplSjyIeCmoJuL6&access_type=offline


In [14]:
# Fetch up to 10 Drive entries, requesting only each file's id and name
results = (
    drive_service.files()
    .list(pageSize=10, fields="files(id, name)")
    .execute()
)
files = results.get('files', [])

# Print one line per returned file
for entry in files:
    print(f"File: {entry['name']}, ID: {entry['id']}")

File: train_100.csv, ID: 1f1E8hjgOWFEXIRsK5gV-j-XD2gYBwPN2
File: Copy of Review Genie_Solution.ipynb, ID: 1QNo1c4lCjH1gm_eCx3eMnWjzloWuUZGW
File: Movies to download, ID: 1QIXssYJy3qWHqAIQeZMd-qUkvgYERhrh3pN82m8maxo
File: Copy of week3_template [FinQuery/SalesTrend].ipynb, ID: 13dBjmLMPx5a7liuMscpnuJ9XIvcCMGtf
File: Copy of Week2[SalesTrends and FinQuery].ipynb, ID: 1DrTZWHKO5VCwRUyUq-50pMKYDdByUxcQ
File: Copy of Review_Genie_Week_4_Evaluation.ipynb, ID: 1tWvl6YQwRccuxbv_lrfKgLMix3DEJgC9
File: Copy of Live_Class_Review_Genie_Week_3.ipynb, ID: 13dHdxASHuNWfmu7ULTMEte8SZzhWS4d9
File: Copy of Template_Review_Genie_Week_2_Agents.ipynb, ID: 1q6N1EylCVwp2BlKZzLIpZCsFA00ziVUv
File: FinQuery, ID: 1thyzKOPzhA3pq9LJ5qiO4B2T-Nr7KJfG
File: Review Genie, ID: 1_5o__qGXdlrXEeUSVlt6rMVA9u9a3Y_H


In [15]:
# Look up the dataset file in Google Drive by its exact name
DATASET_FILENAME = 'train_100.csv'

results = drive_service.files().list(
    # Match on name and ignore files sitting in the trash
    q=f"name = '{DATASET_FILENAME}' and trashed = false",
    spaces='drive',
    fields="files(id, name)"
).execute()

files = results.get("files", [])

# Fail here, with a clear message, instead of leaving `file_id` undefined
# (or stale from an earlier run) for the download cell to trip over.
if not files:
    raise FileNotFoundError(f"No file named {DATASET_FILENAME!r} found in Google Drive.")

# Several files may share a name; the first match is used
file_id = files[0]['id']
print(f"File ID: {file_id}")

File ID: 1f1E8hjgOWFEXIRsK5gV-j-XD2gYBwPN2


In [16]:
# Stream the file's bytes from Drive into an in-memory buffer
request = drive_service.files().get_media(fileId=file_id)
csv_buffer = BytesIO()
downloader = MediaIoBaseDownload(csv_buffer, request)

# next_chunk() yields (status, done); keep pulling chunks until done is truthy
while True:
    status, done = downloader.next_chunk()
    print(f"Download {int(status.progress() * 100)}%.")
    if done:
        break

# Rewind so pandas reads from the first byte
csv_buffer.seek(0)

# Parse the CSV, using its first column as the index
df = pd.read_csv(csv_buffer, index_col=0)

# Preview the first rows
df.head()


Download 100%.


Unnamed: 0_level_0,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [17]:
%%time
def _format_product(title, description):
    """Build the text block for one product.

    Args:
        title: Value of the TITLE column; may be missing (NaN/None).
        description: Value of the DESCRIPTION column; may be missing (NaN/None).

    Returns:
        "Title\\n<title>" and/or "Description\\n<description>" joined by a
        newline and stripped of surrounding whitespace, or "" when both
        values are missing.
    """
    parts = []
    # pd.notna covers float NaN, numpy NaN and None alike
    if pd.notna(title):
        parts.append(f"Title\n{title}")
    if pd.notna(description):
        parts.append(f"Description\n{description}")
    return "\n".join(parts).strip()


product_description = []      # one formatted text per product
product_description_len = []  # character length of each formatted text

# Iterate over the TITLE and DESCRIPTION columns of df, row by row
for row_num, (title, description) in enumerate(zip(df["TITLE"], df["DESCRIPTION"])):
    # Progress marker, printed every 100,000 rows
    if row_num % 100000 == 0:
        print(f"processing row {row_num}")

    product = _format_product(title, description)

    # Skip rows where both fields are missing. The earlier check
    # (`title or description`) let them through as empty strings because
    # NaN is truthy.
    if not product:
        continue

    product_description.append(product)
    product_description_len.append(len(product))

processing row 0
CPU times: user 12.9 ms, sys: 1.63 ms, total: 14.5 ms
Wall time: 14.2 ms


In [18]:
# Report how many product texts were built and summarise their lengths
print(f"Number of elements {len(product_description)}")
print("Number of items", len(product_description_len))

length_summaries = (
    ("Min length of the description", np.min),
    ("Avg length of the description", np.mean),
    ("Max length of the description", np.max),
)
for label, reducer in length_summaries:
    print(label, reducer(product_description_len))

Number of elements 100
Number of items 100
Min length of the description 18
Avg length of the description 385.9
Max length of the description 1834


In [19]:
%%time
# Splitter settings: chunk size 250 with an overlap of 20, lengths measured
# with len(), and separators treated as plain strings rather than regexes.
splitter_options = {
    "chunk_size": 250,
    "chunk_overlap": 20,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every product text into chunked documents
documents = text_splitter.create_documents(product_description)

CPU times: user 14.1 ms, sys: 2.28 ms, total: 16.3 ms
Wall time: 17 ms


In [20]:
# Embedding model wrapper, created with its default settings (no arguments passed).
embeddings = OpenAIEmbeddings()

In [21]:
# Build a FAISS vector store from the chunked documents using the embedding model.
vector = FAISS.from_documents(documents, embeddings)

In [22]:
# Chat model passed to the document chain; the key is read straight from the
# environment here, so a missing OPENAI_API_KEY raises KeyError on this line.
llm = ChatOpenAI(api_key=os.environ["OPENAI_API_KEY"], model = 'gpt-4o-mini')

In [23]:
# Parser that turns the chat model's message output into a plain string.
output_parser = StrOutputParser()

# Prompt template with two placeholders:
#   1. {context} - the retrieved documents, filled in by the document chain.
#   2. {input}   - the user's question.
# The model is instructed to base its answer solely on the provided context.
prompt = ChatPromptTemplate.from_template(
    """Answer the following question based only on the provided context:

    <context>
    {context}
    </context>

    Question: {input}"""
)

# Document chain: takes the list of retrieved documents, stuffs them into
# {context}, calls the LLM, and parses the reply.
# The parser is supplied here, through the chain's own `output_parser`
# argument; passing it to the prompt template (as before) did not feed it
# into this chain.
document_chain = create_stuff_documents_chain(llm, prompt, output_parser=output_parser)

In [24]:
# Expose the vector store through the retriever interface (no search options overridden).
retriever = vector.as_retriever()

In [25]:
# Wire the retriever and the document chain into a single retrieval chain.
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [26]:
%%time
# Run the chain on a sample query; the recorded result is a dict with 'input', 'context' and 'answer'.
retrieval_chain.invoke({"input": "what are some of the best shoes available?"})

CPU times: user 45.1 ms, sys: 10.9 ms, total: 56 ms
Wall time: 2.33 s


{'input': 'what are some of the best shoes available?',
 'context': [Document(id='c0cd7492-4c9f-467c-b00d-ba42fd6cd18c', metadata={}, page_content="Title\nadidas Men's Predator 18+ FG Firm Ground Soccer Cleats\nDescription\nadidas Predator 18+ FG- Black 7.5"),
  Document(id='8615b689-b2da-4278-8b75-236da0ccf24d', metadata={}, page_content="Title\nKenneth Cole REACTION Men's Crespo Loafer B Shoe, Cognac, 10 M US"),
  Document(id='c85b8322-4692-4a8c-9290-440c3288a085', metadata={}, page_content="Title\nPUMA Cali Sport Clean Women's Sneakers White Leather (37540701)"),
  Document(id='6f78bf1b-2277-4cea-b417-a26c4af22417', metadata={}, page_content="The Remora Climbing Shoe is Mad Rock's do-it-all slipper for climbers who can't have separate shoes for boulders, sport routes, and gyms. With a moderately stiff, slightly downturned design, the Remora performs on any climb at steep to vertical")],
 'answer': "Based on the provided context, some of the best shoes available include: \n\n1. adida

In [27]:
# Run the same query again, keeping only the generated answer text.
retrieval_chain.invoke({"input": "what are some of the best shoes available?"})['answer']

"Based on the provided context, some of the best shoes available include:\n\n1. adidas Men's Predator 18+ FG Firm Ground Soccer Cleats\n2. Kenneth Cole REACTION Men's Crespo Loafer B Shoe\n3. PUMA Cali Sport Clean Women's Sneakers\n4. Mad Rock Remora Climbing Shoe\n\nThese options cater to different activities such as soccer, casual wear, and climbing."

In [28]:
def final_response(user_query):
    """Answer a product question and return only the product names.

    The query is first answered by the retrieval chain; that answer is then
    sent to gpt-4o-mini with an instruction to return just the product names.

    Args:
        user_query: The user's question as text. Blank or missing input is
            tolerated.

    Returns:
        The model's reformatted reply as a string, or a short prompt to enter
        a question when `user_query` is blank.
    """
    # A blank submission would still trigger two model calls on a meaningless
    # query; answer locally instead.
    if not user_query or not user_query.strip():
        return "Please enter a question."

    # Invoking the retrieval chain with the user's query to fetch relevant product information
    response = retrieval_chain.invoke({"input": user_query})['answer']

    # Instruct the model to reduce the retrieved answer to product names only
    prompt = f"Format the responses properly in {response}. Just return the product names, no other text"

    # Second pass: send the formatting instruction to gpt-4o-mini
    openai_response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}]
    )

    # Fall back to an empty string if the reply has no content, so the return value is always text
    return openai_response.choices[0].message.content or ""

In [29]:
# Try the two-step helper on the sample query and print its text reply.
print(final_response("what are some of the best shoes available?"))

1. adidas Men's Predator 18+ FG Firm Ground Soccer Cleats  
2. Kenneth Cole REACTION Men's Crespo Loafer B Shoe  
3. PUMA Cali Sport Clean Women's Sneakers  
4. Mad Rock Remora Climbing Shoe  


In [30]:
# Gradio (`gr`) is imported in the notebook's imports cell with the other dependencies.

# Creating the Gradio interface for the product recommendation system
app = gr.Interface(
    fn=final_response,        # The function that processes user input and returns recommendations
    inputs="text",            # Input component: a text box for users to enter their query
    outputs="text",           # Output component: a text box to display the AI-generated response
    title="Review Genie",     # The title of the web interface
    description="Type your question below to get the recommendations",  # A brief description displayed to users
    theme="Ocean",
    allow_flagging="never"    # Disabling the flagging feature to remove the "Flag" button
)

# Launching the Gradio app to start the interface and make it accessible via web browser
app.launch()




* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


