### Basic library imports

In [2]:
import os
import pandas as pd
import easyocr
import matplotlib.pyplot as plt
import cv2
import numpy as np
import csv

### Read Dataset

In [3]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

### Run Sanity check using src/sanity.py

In [4]:
!python3 sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Parsing successfull for file: ../dataset/sample_test_out.csv


In [5]:
!python3 sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'metre', 'cubic inch', 'ton', 'millimetre', 'microlitre', 'watt', 'litre', 'pound', 'imperial gallon', 'cup', 'inch', 'yard', 'gallon', 'gram', 'foot', 'quart', 'millilitre', 'centimetre', 'fluid ounce', 'kilowatt', 'pint', 'microgram', 'cubic foot', 'millivolt', 'kilovolt', 'ounce', 'volt', 'centilitre', 'milligram', 'decilitre', 'kilogram'}


In [6]:
def load_image(image_path):
    return cv2.imread(image_path)

# Function to perform OCR and display results
def perform_ocr(image_path):
    # Initialize the EasyOCR reader
    reader = easyocr.Reader(['en'])  # 'en' for English

    # Read the image
    image = load_image(image_path)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Perform OCR
    results = reader.readtext(image_rgb)

    # Create a copy of the image for drawing
    output_image = image_rgb.copy()

    # Plot the results
    # for (bbox, text, prob) in results:
    #     # Unpack the bounding box
    #     (tl, tr, br, bl) = bbox
    #     tl = (int(tl[0]), int(tl[1]))
    #     br = (int(br[0]), int(br[1]))

    #     # Draw the bounding box and text
    #     cv2.rectangle(output_image, tl, br, (0, 255, 0), 2)
    #     cv2.putText(output_image, text, (tl[0], tl[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

    # # Display the image with bounding boxes
    # plt.figure(figsize=(20,20))
    # plt.imshow(output_image)
    # plt.axis('off')
    # plt.show()

    # Print the extracted text
    #print("Extracted Text:")
    #for (bbox, text, prob) in results:
        #print(f"{text} (Confidence: {prob:.2f})")

    # Function to extract specific information (example for weight and voltage)
    def extract_info(results):
        info = {'weight': None, 'voltage': None}
        for (bbox, text, prob) in results:
            if 'g' in text.lower() or 'kg' in text.lower():
                info['weight'] = text
            elif 'v' in text.lower():
                info['voltage'] = text
        return info

    # Extract and print specific information
    extracted_info = extract_info(results)
    print("\nExtracted Information:")
    print(f"Weight: {extracted_info['weight']}")
    print(f"Voltage: {extracted_info['voltage']}")
    return results

### Download images

In [9]:
from utils import download_image
a=0
for i in train['image_link']:
    imgpath=download_image(i, save_folder='../images')
    extr=perform_ocr(imgpath)
    os.remove(imgpath)
    s = str(extr)
    print(s)
    with open("new.csv", mode='a',encoding='utf-8') as ft:
        ftwriter = csv.writer(ft)
        ftwriter.writerow(s)
    a+=1
    if a==100:
        break    



Extracted Information:
Weight: UCUGRLDC . [uacoJi JHH4E04
Voltage: None
[([[748, 540], [870, 540], [870, 570], [748, 570]], "PROPOS'", 0.5410940829045936), ([[748, 568], [864, 568], [864, 598], [748, 598]], 'NATUREJ', 0.6326722188030237), ([[724, 604], [890, 604], [890, 628], [724, 628]], 'INGREDIENT MENAGER', 0.7749532369806997), ([[742, 640], [864, 640], [864, 666], [742, 666]], 'MULTI-USAGE', 0.9945168125325531), ([[721, 669], [881, 669], [881, 709], [721, 709]], 'TERRE dE', 0.5322155113918523), ([[703, 707], [899, 707], [899, 751], [703, 751]], 'SOMMIERES', 0.9865835101982899), ([[745, 763], [859, 763], [859, 783], [745, 783]], '400% NATUREL', 0.7727462428216421), ([[673, 807], [709, 807], [709, 823], [673, 823]], 'Argile', 0.7337089203998773), ([[713, 807], [747, 807], [747, 821], [713, 821]], '10036', 0.625724532857882), ([[751, 807], [779, 807], [779, 821], [751, 821]], 'pure', 0.9999691843986511), ([[785, 807], [855, 807], [855, 821], [785, 821]], 'et   naturelle', 0.738622959

KeyboardInterrupt: 

: 

In [10]:
pwd()

'/Users/goutham/Documents/GitHub/Amazon-hackathon/src'

In [8]:
assert len(os.listdir('../images')) > 0

In [10]:
#rm -rf ../images