In [19]:
import os
import numpy as np
import pandas as pd
import pytesseract
import enchant

In [20]:
import ocr
import preprocess_images

In [21]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

In [22]:
path = "E:/ADB_Project/code/data/cs_sample"

---

In [23]:
def get_special_chars(text_column):
    """"identify special characters that need to be removed before evaluatoin"""
    
    #converting to a single string
    text = ' '.join(text_column)
    
    # get a list of unique characters
    text_char = list(set(text))
    
    # get a list removing alpha numeric
    text_char_sp = [char for char in text_char if not(char.isalnum())]
    
    return text_char_sp

In [24]:
def strip_special_chars(text, schar_list, char_keep):
    """
    Strips the unwanted special characters from a given list of job descriptions

    Parameters:
    ----------
    text : list of job descriptions
    schar_list : relevant special character list
    char_keep : the special characters to be retained

    Returns:
    -------
    The list of job descriptions stripped of unwanted special characters
    """
    char_set = set([c for c in schar_list if c not in char_keep])
    
    # i2t_stripped -> stripped of special chars
    text_stripped = [''.join([c for c in item if c not in char_set]) for item in text]
    
    return text_stripped

In [31]:
def accuracy_calculator(string):
    """Checks a list of words against a dictionary and returns a ratio of valid words"""
    
    dic = enchant.Dict("en_US")
    
    valid_count = 0
    
    for word in string.split():
        if dic.check(word) == True:
            valid_count += 1
            
    return (valid_count/max(1,len(string.split())))

---

In [26]:
text = ocr.extract_text(path, n=100)
df = pd.DataFrame(text, index=np.arange(100))

In [27]:
i2t = list(df["ocrd_text"])

In [28]:
# execute the function on the i2t list to get a list of special characters
special = get_special_chars(i2t)

# define characters you want to retain
char_keep = [' ', '#', '+', '\n', '/']

# execute the function and obtain ocr output stripped of special characters
stripped = strip_special_chars(i2t, special, char_keep)

In [32]:
df["clean"] = pd.Series(stripped)
#accuracy calculation
df["plain_accuracy"] = df["ocrd_text"].apply(accuracy_calculator)
df["clean_accuracy"] = df["clean"].apply(accuracy_calculator)

In [33]:
df.head(7)

Unnamed: 0,job_id,ocrd_text,clean,plain_accuracy,clean_accuracy
0,590492.jpg,Net Architect\n\n8+ years of Hands on experien...,Net Architect\n\n8+ years of Hands on experien...,0.837209,0.883721
1,600087.png,\n\niOS DEVELOPER\n\nS.A. Knowledge Services ...,\n\niOS DEVELOPER\n\nSA Knowledge Services SA...,0.834586,0.936255
2,601828.jpg,(M\n\nMOBIZZ\nWE UNDERSTAND\n= People\n]@ss Jo...,M\n\nMOBIZZ\nWE UNDERSTAND\n People\nss Job Op...,0.817814,0.922078
3,601838.jpg,ps\n\nWE UNDERSTAND\n= People\n\n]@ss Job Oppo...,ps\n\nWE UNDERSTAND\n People\n\nss Job Opportu...,0.858934,0.97351
4,602009.jpg,Senior Software Engineer - Java\n\n \n\nOur cl...,Senior Software Engineer Java\n\n \n\nOur cli...,0.838323,0.890909
5,602186.png,WE ARE\nHIRING!\n\n-NET SOFTWARE ENGINEERS\n\n...,WE ARE\nHIRING\n\nNET SOFTWARE ENGINEERS\n\nFi...,0.733333,0.85
6,602397.jpg,OQ\n\nQUESS\n\nDELIVERING GROWTH\n\n \n\neer —...,OQ\n\nQUESS\n\nDELIVERING GROWTH\n\n \n\neer ...,0.863636,0.923664


In [None]:
# FULL MODULE

# def updated_ocr(df):
    # iterate through the plain accuracy list
        # when you find value less than X:
            # run through the image pre_processing module & replace the image 
            # for each of those images
                # do ocrd_tex
                # clean
                # plain_accuracy & clean
                

In [34]:
# TEMPORARY MODULE
# For each image less than a certain value, run the binarization, calculate the metrics

# ______IMPLEMENTATION__________

for index in df.index:
    if df.loc[index,'clean_accuracy'] < 0.9:
        vacancy = df.loc[index, 'job_id']
        binarized =  preprocess_images.binarization(os.path.join(path, vacancy))
        df.loc[index, 'ocrd_text'] = pytesseract.image_to_string(binarized)
        special = get_special_chars([df.loc[index, 'ocrd_text']])
        df.loc[index, 'clean'] = strip_special_chars([df.loc[index, 'ocrd_text']], special, char_keep)[0]
        df.loc[index, 'plain_accuracy'] = accuracy_calculator(df.loc[index, 'ocrd_text'])
        df.loc[index, 'clean_accuracy'] = accuracy_calculator(df.loc[index, 'clean'])
        


In [35]:
df.head(7)

Unnamed: 0,job_id,ocrd_text,clean,plain_accuracy,clean_accuracy
0,590492.jpg,We have opportunities for:\n\n-.Net Architect\...,We have opportunities for\n\nNet Architect\n\n...,0.75,0.862745
1,600087.png,\n\niOS DEVELOPER\n\nS.A. Knowledge Services ...,\n\niOS DEVELOPER\n\nSA Knowledge Services SA...,0.834586,0.936255
2,601828.jpg,(M\n\nMOBIZZ\nWE UNDERSTAND\n= People\n]@ss Jo...,M\n\nMOBIZZ\nWE UNDERSTAND\n People\nss Job Op...,0.817814,0.922078
3,601838.jpg,ps\n\nWE UNDERSTAND\n= People\n\n]@ss Job Oppo...,ps\n\nWE UNDERSTAND\n People\n\nss Job Opportu...,0.858934,0.97351
4,602009.jpg,Senior Software Engineer - Java\n\n \n\nOur cl...,Senior Software Engineer Java\n\n \n\nOur cli...,0.785311,0.87574
5,602186.png,\n\n \n\n \n\n \n\n \n\n \n\n-NET SOFTWARE EN...,\n\n \n\n \n\n \n\n \n\n \n\nNET SOFTWARE ENG...,0.830189,0.943396
6,602397.jpg,OQ\n\nQUESS\n\nDELIVERING GROWTH\n\n \n\neer —...,OQ\n\nQUESS\n\nDELIVERING GROWTH\n\n \n\neer ...,0.863636,0.923664


In [36]:
save_path = "../data/pipeline_sample.csv"

df.to_csv(save_path, index=False)