# Find a keyword in a document

In [21]:
import pandas as pd
import numpy as np
import PIL
import pytesseract
import spacy
import cv2
import os
from pdf2image import convert_from_path
from glob import glob

## Logic for keyword search in a document
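1. Convert each page of the PDF to an image and open it using cv2
2. Run OCR on the page image with pytesseract to extract the text
3. Store the OCR result in a dataframe
4. Filter the dataframe using the keyword
5. Get the left, top, width and height of the keyword in the document
6. Draw bounding boxes around the keyword and save the annotated image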

In [22]:
# Path of the PDF that generateDataFrame() converts to page images.
pdfPath = r"../data/2.pdf"

# Keyword to locate; it is compared for exact equality against each OCR'd word.
searchInput = 'विकिपीडिया'

In [23]:
## Generate dataframe
def generateDataFrame():
    """Render the PDF to page images, OCR each page and collect keyword hits.

    Reads the module-level ``pdfPath`` and ``searchInput``. Every page of the
    PDF is rendered at 350 DPI and saved as
    ``../data/temp/finder/Page_<n>.jpg``. Each saved image is then run through
    ``pytesseract.image_to_data`` (languages ``mar+eng``) and only the
    word-level rows (``level == 5``) whose text equals ``searchInput`` are kept.

    Returns:
        pd.DataFrame: one row per matching word, with the columns reported by
        Tesseract. The integer columns listed in ``col_int`` and ``conf`` are
        converted to ``int``. ``page_num`` is overwritten with the 1-based
        page number inside the PDF, because Tesseract reports 1 for every
        single-image call. An empty DataFrame is returned when nothing was
        processed.
    """
    outputDir = "../data/temp/finder"
    # page.save() does not create missing directories.
    os.makedirs(outputDir, exist_ok=True)

    pages = convert_from_path(pdfPath, 350)

    # Remember exactly the files written by this run, in page order. Globbing
    # the directory would also pick up stale images from earlier runs, and a
    # lexicographic sort would place Page_10 before Page_2.
    imagePathList = []
    for pageNumber, page in enumerate(pages, start=1):
        image_name = outputDir + "/Page_" + str(pageNumber) + ".jpg"
        page.save(image_name, "JPEG")
        imagePathList.append(image_name)

    col_int = ['level', 'page_num', 'block_num', 'par_num',
                'line_num', 'word_num', 'left', 'top', 'width', 'height']

    matches = []
    for pageNumber, path in enumerate(imagePathList, start=1):
        img = cv2.imread(path)
        data = pytesseract.image_to_data(img, lang='mar+eng')

        # The output is TSV text: a header line followed by one line per box.
        lines = [line for line in data.splitlines() if line]
        if not lines:
            continue
        header = lines[0].split('\t')
        # Limit the split so a tab inside the trailing text field cannot
        # produce more values than there are columns.
        rows = [line.split('\t', len(header) - 1) for line in lines[1:]]
        df = pd.DataFrame(rows, columns=header)

        ## Data cleaning
        df.dropna(inplace=True)  # drop rows that had fewer fields than the header
        df[col_int] = df[col_int].astype(int)
        df['conf'] = df['conf'].astype(float).astype(int)

        # Record the real page so matches stay attributable after concatenation.
        df['page_num'] = pageNumber

        matches.append(df[(df['level'] == 5) & (df['text'] == searchInput)])

    if not matches:
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(matches, ignore_index=True)


In [24]:
# Run the OCR pipeline once; filteredDf holds the word-level rows whose text
# equals searchInput.
filteredDf = generateDataFrame()

In [25]:
def generateBoundedImage(image, index:int, df:pd.DataFrame, level:str = 'word'):
    """Draw bounding boxes for the rows of ``df`` on ``image`` and save it.

    Args:
        image: image array as returned by ``cv2.imread``. It is drawn on in
            place, so pass a copy if the original pixels are still needed.
        index: number used in the output file name
            (``../data/bounded/boundingbox_<index>.jpeg``).
        df: OCR rows with at least the columns ``level``, ``left``, ``top``,
            ``width``, ``height`` and ``text``.
        level: which layout level to outline: ``'page'``, ``'block'``,
            ``'para'``, ``'line'`` or ``'word'`` (default, the previous
            hard-coded behaviour). Only rows whose ``level`` column holds the
            matching code (1-5) are drawn.

    Raises:
        ValueError: if ``level`` is not one of the supported names.
        OSError: if the annotated image could not be written.
    """
    # level name -> (value of the 'level' column, box colour in BGR order)
    levelStyles = {
        'page': (1, (0, 0, 0)),
        'block': (2, (255, 0, 0)),
        'para': (3, (0, 255, 0)),
        'line': (4, (0, 0, 255)),
        'word': (5, (0, 255, 0)),
    }
    if level not in levelStyles:
        raise ValueError(
            "level must be one of " + ", ".join(levelStyles) + "; got " + repr(level))
    levelCode, boxColour = levelStyles[level]

    ## Draw lines on the text in the document
    boxes = df.loc[df['level'] == levelCode,
                   ['left', 'top', 'width', 'height', 'text']]
    for x, y, w, h, t in boxes.itertuples(index=False, name=None):
        # cv2 drawing functions need plain Python ints for coordinates.
        x, y, w, h = int(x), int(y), int(w), int(h)
        cv2.rectangle(image, (x, y), (x+w, y+h), boxColour, 2)
        if level == 'word':
            # NOTE: per the OpenCV documentation, symbols the Hershey font
            # cannot render are drawn as question marks, so non-Latin keywords
            # will not be legible in this label.
            cv2.putText(image, str(t), (x, y), cv2.FONT_HERSHEY_PLAIN,
                        1, (255, 0, 0), 2)

    outputDir = "../data/bounded"
    # cv2.imwrite does not create missing directories; it just returns False.
    os.makedirs(outputDir, exist_ok=True)
    boundedImagePath = outputDir + "/boundingbox_" + str(index) + ".jpeg"
    if not cv2.imwrite(boundedImagePath, image):
        raise OSError("could not write " + boundedImagePath)

In [26]:
# Load the first rendered page and draw the keyword matches on it; the result
# is written by generateBoundedImage as ../data/bounded/boundingbox_0.jpeg.
# NOTE(review): filteredDf concatenates matches from every page, but only
# Page_1.jpg is loaded here, so hits found on later pages would be drawn at
# their coordinates on page 1 — confirm whether per-page output was intended.
img = cv2.imread('../data/temp/finder/Page_1.jpg')
generateBoundedImage(img, 0, filteredDf)