In [16]:
# Generic Libraries
from PIL import Image
import os
import pandas as pd
import numpy as np
import re,string,unicodedata
import cv2

#Tesseract Library
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


#Warnings
import warnings
warnings.filterwarnings("ignore")

#Garbage Collection
import gc

#Gensim Library for Text Processing
import gensim.parsing.preprocessing as gsp
from gensim import utils

#TextBlob Library (Sentiment Analysis)
from textblob import TextBlob, Word

#Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#WordCloud Generator
from wordcloud import WordCloud,STOPWORDS


In [17]:
#Define Directory Path
#sample_images = r'C:\Users\calli\Downloads\train_images'
test_images = r'C:\Users\calli\Documents\MATLAB\archive\Sample'

In [18]:
#Custom Function to Traverse the folder
def traverse(directory):
    path, dirs, files = next(os.walk(directory))
    fol_nm = os.path.split(os.path.dirname(path))[-1]
    print(f'Number of files found in "{fol_nm}" : ',len(files))

In [20]:
#Traversing the folders
#traverse(sample_images)
traverse(test_images)

Number of files found in "archive" :  5


In [52]:
ex_txt = []   #list to store the extracted text

#Function to Extract Text
def TxtExtract(directory):
    """
    This function will handle the core OCR processing of images.
    """
    
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            filepath = subdir + os.sep + file
            
            # Grayscale image
            img = Image.open(filepath).convert('L')
            ret,img = cv2.threshold(np.array(img), 125, 255, cv2.THRESH_BINARY)

            # Older versions of pytesseract need a pillow image
            # Convert back if needed
            img = Image.fromarray(img.astype(np.uint8))
            
            text = pytesseract.image_to_string(Image.open(filepath), timeout=5)
            if text == '\n':
                ex_txt.extend([[file, 'blank']])
            else:   
                ex_txt.extend([[file, text]])
                
    fol_nm = os.path.split(os.path.dirname(subdir))[-1]
    
    print(f"Text Extracted from the files in '{fol_nm}' folder & saved to list..")

In [53]:
#Extracting Text from JPG files in Sample Image Folder
#TxtExtract(sample_images)

#Extracting Text from JPG files in Dataset Folder
TxtExtract(test_images)

Text Extracted from the files in 'archive' folder & saved to list..


In [54]:
#Converting the list to dataframe for further analysis
ext_df = pd.DataFrame(ex_txt,columns=['FileName','Text'])

In [55]:
pd.set_option('display.max_rows', None)
#Inspect the dataframe
print(ext_df)

               FileName                                               Text
0   Sample_Negative.jpg  Of course gay men\n\ndress well. They didn't\n...
1   Sample_Positive.jpg                                                  
2     Sample_Random.jpg                                                  
3           Test133.jpg                                                \n
4          Test1620.jpg    \n   \n\nLOVE\n\nIS TOO BEAU\nTO BE HID\nIN ...
5          Test1767.jpg                                                  
6          Test1785.jpg  wom ol-Mehicel (om\n‘to behave as\nif the trut...
7          Test2411.jpg   \n  \n \n\nTSN G\nWON'T HAVE TO\n\n“COME OUT ...
8          Test2424.jpg              \n\nLa not dfva'd we\nWO. Tan\n\n \n
9          Test2870.jpg     \n\n“People don't know if i'm gay, straight...
10         Test3118.jpg  A KEEP CALM\nAND LOVE\n\nWHO YOU WANT\n\nYoram\n
11         Test3706.jpg  ‘Shine your soul with the same egoless humilit...
12         Test3827.jpg  

In [27]:
ext_df.to_csv (r'C:\Users\calli\Documents\ocr_text.csv', index = False, header=True);

In [None]:
print("Total Records: ", ext_df.shape[0])

In [None]:
# Create list of pre-processing func (gensim)
processes = [
               gsp.strip_tags, 
               gsp.strip_punctuation,
               gsp.strip_multiple_whitespaces,
               gsp.strip_numeric,
               gsp.remove_stopwords, 
               gsp.strip_short, 
               gsp.stem_text
            ]

# Create func to pre-process text
def proc_txt(txt):
    text = txt.lower()
    text = utils.to_unicode(text)
    for p in processes:
        text = p(text)
    return text

In [None]:
#Creating a new column with processed text
ext_df['Text_Pr'] = ext_df['Text'].apply(lambda x: proc_txt(x))

In [None]:
#Creating a seperate dataframe with non-blank Text 
ext_df_txt = ext_df[(  ext_df['Text_Pr'] != 'blank'  )]

In [None]:
print("Total Records in Text Only Dataframe: ", ext_df_txt.shape[0])

In [None]:
#Free up memory
gc.collect()