# Product Extraction Project
The following code uses Pillow, the Tesseract OCR engine, and OpenCV to identify text on products shown on screen, and then uses BeautifulSoup to crawl popular marketplaces like Amazon.

## Image Extraction
The following code deals with the capture and preprocessing of webcam images, then the extraction of text from the product.

In [4]:
import cv2
import pytesseract
import requests

In [6]:
# Open the default webcam (device index 0) and show a live preview window.
# Pressing the space bar saves the current frame to disk; pressing 'q' quits.
cap = cv2.VideoCapture(0)
# Reference to OpenCV's text-detection model class (not instantiated yet);
# kept for the planned "draw rectangle around visible text" step below.
east = cv2.dnn.TextDetectionModel

cv2.namedWindow("Scan Product Here")
imageNum = 0      # running index used to build unique file names
imageList = []    # file names of every frame saved during this session

try:
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            print("Can't receive frame. Exiting...")
            break
        cv2.imshow("Scan Product Here", frame)

        # draw rectangle around visible text

        # Poll the keyboard exactly once per frame. Calling waitKey() twice
        # (once per branch) lets the first call swallow the keypress, so the
        # second branch would miss it most of the time.
        # Mask to the low byte because some platforms set extra high bits.
        key = cv2.waitKey(1) & 0xFF

        # quit if q pressed
        if key == ord('q'):
            break
        # save the current frame if space pressed (32 is the space bar code)
        if key == 32:
            filename = f"opencv_img_{imageNum}.png"
            cv2.imwrite(filename, frame)
            print(f"{filename} written!")
            imageNum += 1
            imageList.append(filename)
finally:
    # Always free the camera and close the preview, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

opencv_img_0.png written!


In [12]:
def barcode_scan(filename):
    """Detect barcodes in an image and look each one up on UPCitemdb.

    The image is displayed in a window first; the function blocks until a
    key is pressed, then closes the window and runs the detector.

    Args:
        filename: Path to the image file to scan.

    Returns:
        A dict mapping each decoded barcode string to the ``items`` list of
        the UPCitemdb lookup response. Empty if no barcode could be decoded.
        Barcodes whose lookup fails are reported and left out.

    Raises:
        FileNotFoundError: If the image cannot be read from ``filename``.
    """
    image = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {filename}")

    cv2.imshow("Barcode Image", image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    scanner = cv2.barcode.BarcodeDetector()
    scanned_items = {}

    found, decoded, _kinds, _points = scanner.detectAndDecodeWithType(image)
    if not found or not decoded:
        print("Couldn't find a barcode to read!")
        return scanned_items

    for code in decoded:
        # NOTE(review): barcodes that are detected but not decoded appear to
        # come back as empty strings; there is nothing to look up for those.
        if not code:
            continue
        try:
            response = requests.get(
                'https://api.upcitemdb.com/prod/trial/lookup',
                params={'upc': code},
                timeout=10,
            )
            # The body is JSON; the matched products live under "items".
            # (response.text is a plain str and has no ``items`` attribute.)
            scanned_items[code] = response.json().get('items', [])
        except (requests.RequestException, ValueError) as err:
            # Network failure or a non-JSON body (e.g. rate limiting).
            print(f"Lookup failed for barcode {code}: {err}")

    if not scanned_items:
        print("Couldn't find a barcode to read!")

    return scanned_items
                
                

In [4]:
def preprocess_image(filename):
    """Load an image and clean it up for text extraction.

    The image is read as grayscale, binarized with Gaussian adaptive
    thresholding, and then denoised with non-local means.

    Args:
        filename: Path to the image file to preprocess.

    Returns:
        The processed single-channel image array.

    Raises:
        FileNotFoundError: If the image cannot be read from ``filename``.
    """
    image = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {filename}")

    # binarization with adaptive thresholding (block size 9, constant 2)
    image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 9, 2)
    # denoise image (h=2, template window 7, search window 21)
    # The function is fastNlMeansDenoising with a lowercase "l", not "I".
    image = cv2.fastNlMeansDenoising(image, None, 2, 7, 21)

    # Show the result *after* filtering so the window matches its title.
    # The window is only painted once the caller pumps events via cv2.waitKey.
    cv2.imshow("Filtered Image", image)
    return image

In [13]:
# Scan the first frame saved by the capture loop ('opencv_img_0.png') for
# barcodes and print the resulting {barcode: lookup result} mapping.
items = barcode_scan('opencv_img_0.png')
print(items)

Couldn't find a barcode to read!
{}


## Web Crawling Functions
The next lines of code will deal with scraping information from Amazon and eBay, in order to extract prices and determine ratings.

In [1]:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

In [2]:
# we want to get the product listings from the first page of search results
# NOTE: Amazon may block automated requests, in which case driver.get() raises
# a WebDriverException (e.g. net::ERR_CONNECTION_RESET).
def search_products(text):
    """Search Amazon for ``text`` and scrape names and prices from page one.

    A headless Chrome session loads the search results page, and the rendered
    HTML is parsed with BeautifulSoup.

    Args:
        text: Free-form search query, e.g. a product name.

    Returns:
        A ``(names, prices)`` tuple of lists holding the stripped text of the
        matching elements. The lists may differ in length and may be empty.
    """
    from urllib.parse import quote_plus

    opts = webdriver.ChromeOptions()
    opts.add_argument('--headless')
    driver = webdriver.Chrome(options=opts, service=ChromeService(ChromeDriverManager().install()))

    try:
        # quote_plus turns spaces into '+' and escapes characters such as
        # '&' or '#' that would otherwise corrupt the query string.
        url = f'https://www.amazon.com/s?k={quote_plus(text)}'

        # Navigate first, then read the page source; reading it before
        # get() only yields the browser's blank start page.
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always shut the browser down so Chrome processes are not leaked.
        driver.quit()

    # get the divs for which we can get the product name from
    # NOTE(review): these class names depend on Amazon's current markup --
    # confirm the tag/class selectors against a live results page.
    name_tags = soup.find_all('div', class_='a-size-medium a-color-base a-text-normal')
    price_tags = soup.find_all('div', class_='a-price-whole')

    names = [tag.get_text(strip=True) for tag in name_tags]
    prices = [tag.get_text(strip=True) for tag in price_tags]
    return names, prices

In [3]:
# Try the Amazon search with a sample query.
search_products("Lady Gaga")

WebDriverException: Message: unknown error: net::ERR_CONNECTION_RESET
  (Session info: chrome-headless-shell=121.0.6167.185)
Stacktrace:
	GetHandleVerifier [0x011BE123+48179]
	(No symbol) [0x01145D01]
	(No symbol) [0x0102D72D]
	(No symbol) [0x01029EAD]
	(No symbol) [0x010202CA]
	(No symbol) [0x0101F442]
	(No symbol) [0x0101E97E]
	(No symbol) [0x0101E924]
	(No symbol) [0x0101D3CD]
	(No symbol) [0x0101DB4B]
	(No symbol) [0x0102F716]
	(No symbol) [0x0109B171]
	(No symbol) [0x01083C8C]
	(No symbol) [0x0109AA1E]
	(No symbol) [0x01083A26]
	(No symbol) [0x0105B7BC]
	(No symbol) [0x0105C62D]
	GetHandleVerifier [0x014D7C33+3299139]
	GetHandleVerifier [0x01515BF2+3553026]
	GetHandleVerifier [0x01510BCC+3532508]
	GetHandleVerifier [0x0125494E+664670]
	(No symbol) [0x01150AB4]
	(No symbol) [0x0114BF08]
	(No symbol) [0x0114C02D]
	(No symbol) [0x0113DD00]
	BaseThreadInitThunk [0x761CFCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77E57C5E+286]
	RtlGetAppContainerNamedObjectPath [0x77E57C2E+238]
