In [1]:
# import libraries
import os
import numpy as np
import pandas as pd
from pytesseract import pytesseract
from sklearn.model_selection import train_test_split
from sys import platform
import cv2
from Levenshtein import distance as levenshtein_distance
from model_num_extractor.pytesseract_extractor import pytesseract_mn_extractor
from logger import get_logger
import time

In [2]:
# Setting up pytesseract: on Windows, tell it where the Tesseract executable is.
# The TESSERACT_CMD environment variable overrides the built-in default so the
# notebook can run on another machine without editing this cell.
# NOTE(review): the default below looks like the Tesseract *installer* rather
# than an installed tesseract.exe -- confirm, and point TESSERACT_CMD at the
# real binary if so.
if platform == "win32":
    pytesseract.tesseract_cmd = os.environ.get(
        "TESSERACT_CMD",
        "C:/Users/rrai34/Downloads/tesseract-ocr-w64-setup-v5.1.0.20220510.exe",
    )

In [3]:
# Required to run easyocr on Windows.
# NOTE(review): presumably this works around the "duplicate OpenMP runtime"
# initialization error -- confirm. easyocr is not imported in the cells shown
# here, so this setting may no longer be needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [4]:
# reader = easyocr.Reader(['en'])

In [5]:
logger = get_logger("pytesseract-extractor-app")

In [6]:
df = pd.read_csv('./new_data/product_model_serial_numbers.csv')

In [7]:
df.head()

Unnamed: 0,SERVICE_JOB_ID,FILE_NAME,example,PRODUCT_LINE,MODEL_NUMBER,SERIAL_NUMBER
0,SJ48132474,image-pre-inspection-proof-of-purchase-72fddee...,data/Laundry/SJ48132474/image-pre-inspection-p...,HOME LAUNDRY GAS,DVG54R7600C/A3,0B1T5BCNC00144M
1,SJ48132474,image-pre-inspection-proof-of-purchase-bcd8901...,data/Laundry/SJ48132474/image-pre-inspection-p...,HOME LAUNDRY GAS,DVG54R7600C/A3,0B1T5BCNC00144M
2,SJ48132474,image-pre-inspection-serial-number-a2dc2e0b-7a...,data/Laundry/SJ48132474/image-pre-inspection-s...,HOME LAUNDRY GAS,DVG54R7600C/A3,0B1T5BCNC00144M
3,SJ48132474,image-pre-inspection-serial-number-d843dfb6-fb...,data/Laundry/SJ48132474/image-pre-inspection-s...,HOME LAUNDRY GAS,DVG54R7600C/A3,0B1T5BCNC00144M
4,SJ48144276,image-pre-inspection-proof-of-purchase-8e82720...,data/Laundry/SJ48144276/image-pre-inspection-p...,HOME LAUNDRY GAS,GTD42GASJ2WW,GTD42GASJ2WW


In [8]:
df.describe()

Unnamed: 0,SERVICE_JOB_ID,FILE_NAME,example,PRODUCT_LINE,MODEL_NUMBER,SERIAL_NUMBER
count,3347,3347,3347,2429,2425,2424
unique,1573,3309,3309,4,358,1141
top,SJ48595774,image-pre-inspection-serial-number-105d636a-08...,data/Laundry/SJ48547655/image-pre-inspection-s...,DISHWASHER,DW80R2031US/AA,0ERBGDAK201631J
freq,11,4,4,1031,169,11


Drop the data points that don't have a model number.

In [9]:
# Keep only the rows that carry a MODEL_NUMBER label, then summarise them.
has_model_number = df['MODEL_NUMBER'].notna()
df0 = df[has_model_number]
df0.describe()

Unnamed: 0,SERVICE_JOB_ID,FILE_NAME,example,PRODUCT_LINE,MODEL_NUMBER,SERIAL_NUMBER
count,2425,2425,2425,2425,2425,2424
unique,1172,2387,2387,4,358,1141
top,SJ48595774,image-pre-inspection-serial-number-105d636a-08...,data/Laundry/SJ48547655/image-pre-inspection-s...,DISHWASHER,DW80R2031US/AA,0ERBGDAK201631J
freq,11,4,4,1031,169,11


Check how many unique service job IDs the dataframe has.

In [10]:
len(df0.SERVICE_JOB_ID.unique())

1172

Group the rows by service job ID and create a new column `files` that holds all of that job's image paths as a list. Each service job ID is treated as one data point.

In [11]:
# Collect every image path ('example') of a service job into one list per job.
paths_by_job = df0.groupby(['SERVICE_JOB_ID',])['example']
df_files = paths_by_job.agg(files=list).reset_index()
df_files.head()

Unnamed: 0,SERVICE_JOB_ID,files
0,SJ48132474,[data/Laundry/SJ48132474/image-pre-inspection-...
1,SJ48144276,[data/Laundry/SJ48144276/image-pre-inspection-...
2,SJ48148704,[data/Laundry/SJ48148704/image-pre-inspection-...
3,SJ48155130,[data/Laundry/SJ48155130/image-pre-inspection-...
4,SJ48165170,[data/Laundry/SJ48165170/image-pre-inspection-...


In [12]:
# One row per distinct (service job, model number) pair; the group size is
# only a by-product of the grouping and is dropped again.
pair_sizes = df0.groupby(['SERVICE_JOB_ID', 'MODEL_NUMBER']).size()
df_mn = pair_sizes.reset_index(name='count').drop(columns=['count'])
df_mn.head()

Unnamed: 0,SERVICE_JOB_ID,MODEL_NUMBER
0,SJ48132474,DVG54R7600C/A3
1,SJ48144276,GTD42GASJ2WW
2,SJ48148704,GTX22GASK0WW
3,SJ48155130,GTD33GASK0WW
4,SJ48165170,00000000


In [13]:
# Attach each service job's model number to its list of image files.
model_by_job = df_mn.set_index('SERVICE_JOB_ID')
df1 = df_files.join(model_by_job, on="SERVICE_JOB_ID")
df1.head()

Unnamed: 0,SERVICE_JOB_ID,files,MODEL_NUMBER
0,SJ48132474,[data/Laundry/SJ48132474/image-pre-inspection-...,DVG54R7600C/A3
1,SJ48144276,[data/Laundry/SJ48144276/image-pre-inspection-...,GTD42GASJ2WW
2,SJ48148704,[data/Laundry/SJ48148704/image-pre-inspection-...,GTX22GASK0WW
3,SJ48155130,[data/Laundry/SJ48155130/image-pre-inspection-...,GTD33GASK0WW
4,SJ48165170,[data/Laundry/SJ48165170/image-pre-inspection-...,00000000


In [14]:
df1.describe()

Unnamed: 0,SERVICE_JOB_ID,files,MODEL_NUMBER
count,1172,1172,1172
unique,1172,1172,358
top,SJ48132474,[data/Laundry/SJ48132474/image-pre-inspection-...,DW80R2031US/AA
freq,1,1,71


In [15]:
def predict_mn(df, rotate_pic=False):
    """Predict one model number per row of ``df`` from that row's images.

    Every path in the row's ``files`` list is passed to
    ``pytesseract_mn_extractor``; ``None`` results are discarded and the
    longest remaining candidate becomes the prediction for the row.

    Args:
        df: DataFrame with a ``files`` column whose cells are iterables of
            image paths. The frame is not modified.
        rotate_pic: Forwarded unchanged as the third argument of
            ``pytesseract_mn_extractor``.

    Returns:
        A list with one entry per row, in row order: the longest extracted
        candidate, or ``None`` when no image of that row yielded one.
    """
    pred = []
    # enumerate() yields positional row numbers whatever the frame's index is,
    # so the caller's frame no longer has to be re-indexed in place.
    for i, files in enumerate(df['files']):
        mod_nums = []
        for fn in files:
            mn = pytesseract_mn_extractor(fn, logger, rotate_pic)
            if mn is not None:
                mod_nums.append(mn)
        if i % 100 == 0:
            print(f"Processed {i}th datapoint...")
        # Longest candidate wins; rows without any candidate stay None.
        pred.append(max(mod_nums, key=len) if mod_nums else None)
    return pred

In [16]:
def get_accuracy(given, predicted, ld=5):
    """Return the percentage of predictions that are close to the given labels.

    A prediction counts as accurate when its Levenshtein distance to the
    corresponding given value is at most ``ld``.

    Args:
        given: Sequence of expected values.
        predicted: Sequence of predicted values, aligned with ``given``.
            Entries may be ``None``.
        ld: Maximum Levenshtein distance still counted as accurate.

    Returns:
        Accuracy as a percentage of ``len(given)``, rounded to two decimals;
        ``0.0`` when ``given`` is empty.
    """
    total = len(given)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    accurate = 0
    # Walk over every pair, including the last one.
    for expected, guess in zip(given, predicted):
        # Missing or non-text values (None, NaN, ...) cannot be compared by
        # levenshtein_distance; they are counted as misses instead of raising.
        if not isinstance(expected, str) or not isinstance(guess, str):
            continue
        if levenshtein_distance(guess, expected) <= ld:
            accurate += 1
    return round(accurate * 100 / total, 2)

In [17]:
# only taking 10 data points for starter
# df2 = df1.iloc[:10, :]

In [27]:
# Label: the given model number. Features: every column except the last one.
label_column = 'MODEL_NUMBER'
y = df1[label_column]
X = df1.iloc[:, :-1]

Split the data into train and test sets.

In [28]:
# 75% of the service jobs go to the train split; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=0,
)

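Run the extractor over the training set and time it. The predicted model numbers are returned as a list (`train_pred`); the earlier approach of creating an `MN` column initialized with `None` to store the predictions is kept only in the commented-out cells at the end of the notebook.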

In [20]:
# Predict on the train split and report how long the OCR pass took; accuracy
# is computed in a later cell.
# perf_counter() is a monotonic clock, so the elapsed time is not distorted by
# system clock adjustments during this long-running cell.
start_time = time.perf_counter()
train_pred = predict_mn(X_train, True)
print(f"time taken: {time.perf_counter() - start_time} s")

Processed 0th datapoint...
Processed 100th datapoint...
Processed 200th datapoint...
Processed 300th datapoint...
Processed 400th datapoint...
Processed 500th datapoint...
Processed 600th datapoint...
Processed 700th datapoint...
Processed 800th datapoint...


TypeError: distance expected two Strings or two Unicodes

In [29]:
get_accuracy(y_train.to_list(), train_pred)

23.09

In [30]:
# Predict on the test split and report how long the OCR pass took.
# perf_counter() is a monotonic clock, so the elapsed time is not distorted by
# system clock adjustments during this long-running cell.
start_time = time.perf_counter()
test_pred = predict_mn(X_test, True)
print(f"time taken: {time.perf_counter() - start_time} s")

Processed 0th datapoint...
Processed 100th datapoint...
Processed 200th datapoint...
time taken: 1203.4373400211334 s


In [34]:
get_accuracy(y_test.to_list(), test_pred)

24.23

In [None]:
# pd.options.mode.chained_assignment = None
# df1['MN'] = None
# df1.head()

In [None]:
# start_time = time.time()
# # using model to extract model number for all service_job_id
# for i, row in df1.iterrows():
#     mod_nums = []
#     files = row['files']
#     for fn in files:
#         mn = pytesseract_mn_extractor(fn, logger, True)
#         if mn is not None:
#             mod_nums.append(mn)
#     if (i % 100 == 0):
#         print(f"Printing model number for {i} datapoint...")
#     if len(mod_nums) > 0:
#         df1.loc[i, 'MN'] = max(mod_nums, key=len)
# print(f"time taken: {time.time() - start_time} s")

In [None]:
# df1.head()

In [None]:
# given = df1['MODEL_NUMBER'].to_list()
# pred = df1['MN'].to_list()

In [None]:
# accurate = 0
# for i in range(len(given) - 1):
# #     print("pred = ", pred[i], " given = ", given[i])
#     if pred[i] is not None:
#         if levenshtein_distance(pred[i], given[i]) <= 5:
# #             print(f"Given = {given[i]}, \t\tPred = {pred[i]}")
#             accurate += 1

In [None]:
# accurate

In [None]:
# accuracy = round(accurate*100/len(given), 2)
# accuracy