<a href="https://colab.research.google.com/github/flyingsaucer123/UPC_product_search/blob/main/webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the LangChain OpenAI integration into the *kernel's* environment.
# %pip (rather than !pip) targets the interpreter this notebook runs on, and the
# version is pinned to the release this notebook was last executed with so that
# a fresh "Restart & Run All" resolves the same package.
%pip install -q langchain-openai==0.2.0


Collecting langchain-openai
  Downloading langchain_openai-0.2.0-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core<0.4,>=0.3 (from langchain-openai)
  Downloading langchain_core-0.3.1-py3-none-any.whl.metadata (6.2 kB)
Collecting openai<2.0.0,>=1.40.0 (from langchain-openai)
  Downloading openai-1.46.0-py3-none-any.whl.metadata (24 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4,>=0.3->langchain-openai)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<0.2.0,>=0.1.117 (from langchain-core<0.4,>=0.3->langchain-openai)
  Downloading langsmith-0.1.122-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-core<0.4,>=0.3->langchain-openai)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting httpx<1,>=0.23.0 (from 

In [None]:
# Install CrewAI and its tool collection into the kernel's environment.
# Versions are pinned to the releases this notebook was last executed with;
# bump them deliberately rather than picking up whatever is newest at run time.
%pip install -q crewai==0.61.0 langchain-google-genai==2.0.0 crewai_tools==0.12.1



Collecting crewai
  Using cached crewai-0.61.0-py3-none-any.whl.metadata (15 kB)
Collecting langchain-google-genai
  Using cached langchain_google_genai-2.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting crewai_tools
  Using cached crewai_tools-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting appdirs<2.0.0,>=1.4.4 (from crewai)
  Using cached appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting auth0-python<5.0.0,>=4.7.1 (from crewai)
  Using cached auth0_python-4.7.2-py3-none-any.whl.metadata (8.9 kB)
Collecting embedchain<0.2.0,>=0.1.114 (from crewai)
  Downloading embedchain-0.1.121-py3-none-any.whl.metadata (9.3 kB)
Collecting instructor==1.3.3 (from crewai)
  Downloading instructor-1.3.3-py3-none-any.whl.metadata (13 kB)
Collecting json-repair<0.26.0,>=0.25.2 (from crewai)
  Downloading json_repair-0.25.3-py3-none-any.whl.metadata (7.9 kB)
Collecting jsonref<2.0.0,>=1.1.0 (from crewai)
  Downloading jsonref-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting langch

In [None]:
# Import necessary libraries
from crewai import Agent, Task, Crew, Process
from crewai_tools import SerperDevTool, ScrapeWebsiteTool
from langchain_openai import ChatOpenAI  # Updated import
from dotenv import load_dotenv
import os
load_dotenv()

def _require_secret(env_var: str) -> str:
    """Return the secret stored in ``env_var``, prompting for it when unset.

    The value is looked up in ``os.environ`` first (which includes anything
    loaded from a ``.env`` file by ``load_dotenv()``). If it is missing or
    blank, the user is asked for it interactively with hidden input, which
    keeps the secret out of the notebook source and its saved output.

    Args:
        env_var: Name of the environment variable holding the secret.

    Returns:
        The non-empty secret, also written back to ``os.environ[env_var]``.

    Raises:
        RuntimeError: If no value is available from the environment or prompt.
    """
    from getpass import getpass  # local import: only needed for this fallback

    value = os.environ.get(env_var, "").strip()
    if not value:
        value = getpass(f"Enter {env_var}: ").strip()
    if not value:
        raise RuntimeError(f"{env_var} is required but was not provided.")
    os.environ[env_var] = value
    return value


# Credentials are never written into the notebook: they come from the
# environment / .env file, or from a hidden prompt as a last resort.
openai_api_key = _require_secret("OPENAI_API_KEY")
# The Serper key is only exported to the environment, before the search tool
# is constructed below.
_require_secret("SERPER_API_KEY")

# NOTE(review): this model name differs from the one passed to ChatOpenAI
# below. Presumably agents created without an explicit `llm=` fall back to
# this environment variable -- confirm the difference is intentional.
os.environ["OPENAI_MODEL_NAME"] = "gpt-4-0125-preview"

# Explicit chat model handed to agents that are built with `llm=llm`.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.5,
    openai_api_key=openai_api_key,
    verbose=True
)

# Tools shared by the agents: web search (Serper) and page scraping.
search_tool = SerperDevTool()
scrape_tool = ScrapeWebsiteTool()

In [None]:

# --- Agents -------------------------------------------------------------
# Search agent: equipped with the web-search tool and the explicit `llm`.
upc_search_agent = Agent(
    llm=llm,
    role="UPC Search Specialist",
    goal="Search for the UPC code based on the product description, manufacturer, and part number.",
    backstory="You are skilled at retrieving UPC codes based on product data from various online sources.",
    tools=[search_tool],
    allow_delegation=True,
    memory=True,
    verbose=True,
)

# Scraping agent: equipped with the scraping tool only; delegation disabled.
scraping_agent = Agent(
    llm=llm,
    role="Scraping Expert",
    goal="Scrape relevant web pages to extract UPC codes from search results.",
    backstory="You specialize in scraping and extracting data from websites.",
    tools=[scrape_tool],
    allow_delegation=False,
    memory=True,
    verbose=True,
)

# --- Tasks --------------------------------------------------------------
# Step 1: search. The {placeholders} are filled from `inputs` at kickoff.
upc_search_task = Task(
    agent=upc_search_agent,
    tools=[search_tool],
    description=(
        "Use the product details (manufacturer: {manufacturer_name}, description: {product_description}, part number: {manufacturer_part_number}) "
        "to search the web for a matching UPC code. Use all the product details for accurate search results."
    ),
    expected_output="List of relevant web links from search results.",
)

# Step 2: scrape the pages surfaced by the search step.
upc_scraping_task = Task(
    agent=scraping_agent,
    tools=[scrape_tool],
    description=(
        "Scrape the relevant web pages found during the search for UPC codes based on the search results. "
        "Make sure to extract the UPC code or related product information from the scraped data."
    ),
    expected_output="Scraped content that contains the UPC code or related product information.",
)

# --- Crew ---------------------------------------------------------------
# Tasks are listed in execution order and run sequentially: search, then scrape.
crew = Crew(
    process=Process.sequential,
    agents=[upc_search_agent, scraping_agent],
    tasks=[upc_search_task, upc_scraping_task],
    memory=True,
    verbose=True,
)

# --- Run ----------------------------------------------------------------
# Sample product whose UPC we want to look up.
inputs = dict(
    manufacturer_name="ABCO PRODUCTS",
    product_description="32 GAL ROUND WASTE RECEPTACLE",
    manufacturer_part_number="BS32G",
)

# Kick off the crew with the sample product and show what it returns.
result = crew.kickoff(inputs=inputs)
print(result)




[1m[95m# Agent:[00m [1m[92mUPC Search Specialist[00m
[95m## Task:[00m [92mUse the product details (manufacturer: ABCO PRODUCTS, description: 32 GAL ROUND WASTE RECEPTACLE, part number: BS32G) to search the web for a matching UPC code. Use all the product details for accurate search results.[00m


[1m[95m# Agent:[00m [1m[92mUPC Search Specialist[00m
[95m## Thought:[00m [92mI should use the tool "Search the internet" to search for the UPC code based on the manufacturer, description, and part number provided for the waste receptacle by ABCO PRODUCTS.[00m
[95m## Using tool:[00m [92mSearch the internet[00m
[95m## Tool Input:[00m [92m
{
  "search_query": "ABCO PRODUCTS 32 GAL ROUND WASTE RECEPTACLE BS32G UPC code"
}[00m
[95m## Tool Output:[00m [92m

Search results: Title: Round Waste Receptacles and Lids - ABCO Cleaning Products
Link: https://abcoproducts.com/products/waste-cans/waste-cans-and-accessories/round-waste-receptacles-and-lids
Snippet: Round Waste Re

In [None]:

# --- Agents -------------------------------------------------------------
# No `llm=` is passed in this cell, so the agents use the library's default
# model selection (presumably driven by the environment -- confirm).
# Search agent: equipped with the web-search tool.
upc_search_agent = Agent(
    role="UPC Search Specialist",
    goal="Search for the UPC code based on the product description, manufacturer, and part number.",
    backstory="You are skilled at retrieving UPC codes based on product data from various online sources.",
    tools=[search_tool],
    allow_delegation=True,
    memory=True,
    verbose=True,
)

# Scraping agent: equipped with the scraping tool only; delegation disabled.
scraping_agent = Agent(
    role="Scraping Expert",
    goal="Scrape relevant web pages to extract UPC codes from search results.",
    backstory="You specialize in scraping and extracting data from websites.",
    tools=[scrape_tool],
    allow_delegation=False,
    memory=True,
    verbose=True,
)

# --- Tasks --------------------------------------------------------------
# Step 1: search. The {placeholders} are filled from `inputs` at kickoff.
upc_search_task = Task(
    agent=upc_search_agent,
    tools=[search_tool],
    description=(
        "Use the product details (manufacturer: {manufacturer_name}, description: {product_description}, part number: {manufacturer_part_number}) "
        "to search the web for a matching UPC code. Use all the product details for accurate search results."
    ),
    expected_output="List of relevant web links from search results.",
)

# Step 2: scrape the pages surfaced by the search step.
upc_scraping_task = Task(
    agent=scraping_agent,
    tools=[scrape_tool],
    description=(
        "Scrape the relevant web pages found during the search for UPC codes based on the search results. "
        "Make sure to extract the UPC code or related product information from the scraped data."
    ),
    expected_output="Scraped content that contains the UPC code or related product information.",
)

# --- Crew ---------------------------------------------------------------
# Tasks are listed in execution order and run sequentially: search, then scrape.
crew = Crew(
    process=Process.sequential,
    agents=[upc_search_agent, scraping_agent],
    tasks=[upc_search_task, upc_scraping_task],
    memory=True,
    verbose=True,
)

# --- Run ----------------------------------------------------------------
# Sample product; the part number is intentionally left as an empty string.
inputs = dict(
    manufacturer_name="Trader joes",
    product_description="pound milk chocolate",
    manufacturer_part_number="",
)

# Kick off the crew with the sample product and show what it returns.
result = crew.kickoff(inputs=inputs)
print(result)




[1m[95m# Agent:[00m [1m[92mUPC Search Specialist[00m
[95m## Task:[00m [92mUse the product details (manufacturer: Trader joes, description: pound milk chocolate, part number: ) to search the web for a matching UPC code. Use all the product details for accurate search results.[00m


[1m[95m# Agent:[00m [1m[92mUPC Search Specialist[00m
[95m## Thought:[00m [92mThought: I will use the provided product details to search the internet for the matching UPC code.[00m
[95m## Using tool:[00m [92mSearch the internet[00m
[95m## Tool Input:[00m [92m
{
  "search_query": "Trader Joe's pound milk chocolate UPC code"
}[00m
[95m## Tool Output:[00m [92m

Search results: Title: Pound Plus Milk Chocolate Bar | Trader Joe's
Link: https://www.traderjoes.com/home/products/pdp/pound-plus-milk-chocolate-bar-020468
Snippet: Why? Because they're more than a pound of delicious, creamy, dreamy chocolate. In this case, it's Milk Chocolate with 33% cocoa solids, neatly packaged and easy .