In [58]:
import os
from dotenv import load_dotenv
import logging
import traceback
import sys

# Write log records of level DEBUG and above to the file 'extraction.log'.
logging.basicConfig(level=logging.DEBUG, filename='extraction.log')

# Load environment variables from the '.env' file located in the current
# working directory.
env_file = os.path.join(os.getcwd(), '.env')
dotenv_path = os.path.abspath(env_file)
load_dotenv(dotenv_path)

from extraction.line import line_detection
from extraction.pytesseract_detection import pytesseract_detection
from extraction.pytesseract_ocr import pytesseract_ocr
from nltk.tokenize import sent_tokenize
import pandas as pd

from pdf2image import convert_from_bytes
import pytesseract
from tqdm import tqdm
import cv2
import nltk


class Extractor:
    """Extract text from PDF pages and save it as per-page PNG/CSV/TXT files.

    Each page is rasterised with pdf2image (poppler), analysed by the
    project helpers ``line_detection``, ``pytesseract_detection`` and
    ``pytesseract_ocr``, and the resulting table is written below
    ``<cwd>/extracted_data/<page>/``.

    Attributes set by ``setup_ocr``:
        size_: value passed as ``size=`` to ``convert_from_bytes``.
        file_page_format: template for per-page file names.
        tesseract_dir / extract_dir / all_page_filename: working paths.
        poppler_path: poppler ``bin`` directory, or ``None`` to let
            pdf2image look poppler up on ``PATH``.
    """

    def __init__(self):
        """Use the current working directory as base and configure OCR paths."""
        self.base_dir = os.getcwd()
        self.setup_ocr()
        # sent_tokenize (used by isTitle) needs the 'punkt' tokenizer data.
        nltk.download('punkt')

    def setup_ocr(self):
        """Initialise output paths and locate the poppler/tesseract binaries.

        The platform is chosen from the ``OS`` environment variable
        (default ``'WINDOWS'``). Any value starting with "windows"
        (case-insensitive) selects the binaries bundled under ``base_dir``;
        this covers the ``Windows_NT`` value Windows itself sets, which the
        previous exact comparison with ``'WINDOWS'`` missed, leaving
        ``poppler_path`` undefined. Every other value uses the
        ``POPPLER_DIRECTORY`` / ``TESSERACT_DIRECTORY`` environment variables.
        """
        self.size_ = 7000
        self.file_page_format = 'page_{0}.{1}'
        self.tesseract_dir = os.path.abspath(os.path.join(self.base_dir, 'TESSERACT-OCR'))
        self.extract_dir = os.path.abspath(os.path.join(self.base_dir, 'extracted_data'))
        self.mkdir_if_not_exist(self.extract_dir)
        # Path of the file that combine_text appends every page's text to.
        self.all_page_filename = os.path.abspath(os.path.join(self.extract_dir, 'all_page.txt'))

        os_name = os.environ.get('OS', 'WINDOWS').upper()
        if os_name.startswith('WINDOWS'):
            self.poppler_path = os.path.abspath(os.path.join(self.base_dir, 'poppler-0.90.1', 'bin'))
            pytesseract.pytesseract.tesseract_cmd = os.path.join(self.tesseract_dir, 'tesseract.exe')
        else:
            # None makes pdf2image search PATH for the poppler binaries.
            self.poppler_path = os.environ.get('POPPLER_DIRECTORY')
            tesseract_cmd = os.environ.get('TESSERACT_DIRECTORY')
            # Keep pytesseract's own default command when the variable is
            # unset instead of overwriting it with None.
            if tesseract_cmd:
                pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    @staticmethod
    def mkdir_if_not_exist(dir):
        """Create directory ``dir`` (and missing parents) if it is absent."""
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(dir, exist_ok=True)

    def get_file_path(self, num_page, ext):
        """Return ``(file_path, directory)`` for page ``num_page``.

        The directory is ``<extract_dir>/<num_page>`` and the file inside it
        is named by ``file_page_format`` with extension ``ext``.
        """
        path = os.path.join(self.extract_dir, str(num_page))
        file_path = os.path.join(path, self.file_page_format.format(num_page, ext))
        return file_path, path

    @staticmethod
    def _read_pdf_bytes(pdf_file):
        """Return the raw bytes of ``pdf_file``, closing the handle promptly."""
        with open(pdf_file, 'rb') as handle:
            return handle.read()

    def save_page_pdf(self, image, num_page):
        """Save the rendered page ``image`` as PNG in the page's directory."""
        file_path, path = self.get_file_path(num_page, 'png')
        self.mkdir_if_not_exist(path)
        image.save(file_path, "PNG", quality=95, optimize=True, progressive=True)

    def read_page_pdf(self, num_page):
        """Load the saved page PNG with OpenCV.

        Returns:
            ``(img_ori, img_gray)``: the image read with default flags and
            the same file read with flag ``0`` (grayscale). ``cv2.imread``
            returns ``None`` instead of raising when a file cannot be read.
        """
        file_path, _ = self.get_file_path(num_page, 'png')
        img_ori = cv2.imread(file_path)
        img_gray = cv2.imread(file_path, 0)
        return img_ori, img_gray

    def page_extractor(self, pdf_file, num_page):
        """Rasterise page ``num_page`` of ``pdf_file`` and run the OCR helpers.

        Args:
            pdf_file: path of the PDF document.
            num_page: page number, passed as ``first_page``/``last_page``.

        Returns:
            Whatever ``pytesseract_ocr`` returns (used as a pandas DataFrame
            by ``extract``).
        """
        image = convert_from_bytes(
            self._read_pdf_bytes(pdf_file),
            size=self.size_,
            poppler_path=self.poppler_path,
            first_page=num_page,
            last_page=num_page,
        )[0]
        self.save_page_pdf(image, num_page)
        img_ori, img_gray = self.read_page_pdf(num_page)
        height_, width_ = img_gray.shape

        table, nontable = line_detection(img_ori, img_gray)
        undetected_bbox = pytesseract_detection(img_ori)
        df_combined_final = pytesseract_ocr(img_ori, img_gray, height_, width_, table, nontable, undetected_bbox)
        return df_combined_final

    def remove_footer(self, df, footer_top=6600):
        """Return ``df`` without the rows whose ``top`` exceeds ``footer_top``.

        Rows are filtered by position, so a frame with duplicated index
        labels only loses the footer rows themselves (dropping by label
        would remove every row sharing a footer row's label).

        Args:
            df: frame with a ``top`` column.
            footer_top: threshold for ``top``; presumably a pixel coordinate
                at the ``size_`` rendering scale - TODO confirm.
        """
        # Negating the '>' test keeps rows whose 'top' is NaN, as before.
        return df[~(df['top'] > footer_top)]

    def isTitle(self, text):
        """Heuristically tell whether ``text`` is a heading.

        True when the text has no comma and is either shorter than 50
        characters or tokenizes into fewer than two sentences.
        """
        n_sentence = len(sent_tokenize(text))
        return (len(text) < 50 or n_sentence < 2) and ',' not in text

    def process_text(self, text):
        """Return ``text`` with every ``'-, '`` sequence removed.

        In the OCR output that sequence marks a word hyphenated across a
        line break (e.g. ``'invest-, ment'`` becomes ``'investment'``).
        """
        return text.replace('-, ', '')

    def write_text(self, f, df, col):
        """Write the non-title entries of column ``col`` to ``f``.

        Each entry is cleaned with ``process_text``; entries that
        ``isTitle`` classifies as titles are skipped, the others are written
        followed by a blank line. The cleaned text is what gets written
        (previously the raw ``lines_combine`` value was written, discarding
        the clean-up and ignoring ``col``).
        """
        for raw_text in df[col]:
            text = self.process_text(raw_text)
            if not self.isTitle(text):
                f.write(f'{text}\n\n')

    def get_columns(self, df, column_split=2000):
        """Split ``df`` into the left and right page columns by ``left``.

        Args:
            df: frame with a ``left`` column.
            column_split: rows with ``left`` below this value go to the left
                column, all others (including ``left == column_split``,
                which used to be dropped) to the right column.

        Returns:
            ``(left_column, right_column)``.
        """
        left_column = df[df['left'] < column_split]
        right_column = df[df['left'] >= column_split]
        return left_column, right_column

    def save_text(self, df, num_page):
        """Write the page text file: left column first, then right column.

        Within each column the rows are ordered by ``top``.
        """
        filename, _ = self.get_file_path(num_page, 'txt')
        df_left, df_right = self.get_columns(df)
        # Explicit UTF-8: OCR output may contain characters the platform
        # default encoding cannot represent.
        with open(filename, 'w', encoding='utf-8') as f:
            self.write_text(f, df_left.sort_values('top'), 'lines_combine')
            self.write_text(f, df_right.sort_values('top'), 'lines_combine')

    def save_csv(self, df, num_page):
        """Save ``df`` as the page's CSV file, without the index."""
        filename, _ = self.get_file_path(num_page, 'csv')
        df.to_csv(filename, index=False)

    def combine_text(self, start_page, end_page, skip_if_error=False):
        """Append the text files of pages ``start_page..end_page`` (inclusive)
        to ``all_page_filename``.

        Args:
            skip_if_error: when true, a page that cannot be read is logged
                and skipped; otherwise the process exits.

        Raises:
            SystemExit: with status 1 on the first failing page unless
                ``skip_if_error`` is set.
        """
        with open(self.all_page_filename, 'a', encoding='utf-8') as outfile:
            for num_page in tqdm(range(start_page, end_page + 1)):
                try:
                    filename, _ = self.get_file_path(num_page, 'txt')
                    with open(filename, encoding='utf-8') as infile:
                        for line in infile:
                            outfile.write(line)
                except Exception as e:
                    logging.error(traceback.format_exc())
                    logging.error(str(e))
                    if not skip_if_error:
                        # Non-zero status so callers can detect the failure.
                        sys.exit(1)

    def extract(self, pdf_file, page_start, page_end, skip_if_error=False):
        """Extract pages ``page_start..page_end`` (inclusive) of ``pdf_file``.

        For every page the OCR table is saved as CSV and the footer-free
        text is saved as TXT.

        Args:
            skip_if_error: when true, a failing page is logged and skipped;
                otherwise the process exits.

        Raises:
            SystemExit: with status 1 on the first failing page unless
                ``skip_if_error`` is set.
        """
        # Low-resolution conversion of the whole document; only the page
        # count is used, for logging.
        all_images = convert_from_bytes(self._read_pdf_bytes(pdf_file), size=10, poppler_path=self.poppler_path)
        n_pages = len(all_images)
        logging.info(f'Extract {n_pages} images')
        for num_page in tqdm(range(page_start, page_end + 1)):
            try:
                df_text = self.page_extractor(pdf_file, num_page)
                self.save_csv(df_text, num_page)
                df_text = df_text.reset_index().drop('index', axis=1)
                df_text = self.remove_footer(df_text)
                self.save_text(df_text, num_page)
                logging.info('Successfully extract page {}'.format(num_page))
            except Exception:
                logging.error(traceback.format_exc())
                logging.error('Fail to extract page {}'.format(num_page))
                if not skip_if_error:
                    # Non-zero status so callers can detect the failure.
                    sys.exit(1)

In [60]:
extractor = Extractor()

# Overwrite the tool locations with the binaries bundled under base_dir
# (same paths as the WINDOWS branch of Extractor.setup_ocr).
bundled_poppler_bin = os.path.join(extractor.base_dir, 'poppler-0.90.1', 'bin')
extractor.poppler_path = os.path.abspath(bundled_poppler_bin)
bundled_tesseract_exe = os.path.join(extractor.tesseract_dir, 'tesseract.exe')
pytesseract.pytesseract.tesseract_cmd = bundled_tesseract_exe

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [30]:
extractor.poppler_path

'C:\\Users\\Admin\\Documents\\Fauzan\\Test\\extract_text\\poppler-0.90.1\\bin'

In [31]:
# Convert the whole of csi1.pdf at size=10, using the poppler binaries
# configured on the extractor.
all_images = convert_from_bytes(open('csi1.pdf', "rb").read(), size=10, poppler_path=extractor.poppler_path)

In [32]:
# One image is returned per converted page, so this is the page count.
n_pages = len(all_images)

In [35]:
# PDF document used by the cells below (path relative to the working directory).
pdf_file = 'csi1.pdf'

In [38]:
# Run the full single-page pipeline (render, save PNG, detect, OCR) on page 12.
df_text = extractor.page_extractor(pdf_file, 12)

In [40]:
df_text.head(10)

Unnamed: 0,left,top,width,height,lines_combine,detected_as,group
0,240,897,516,118,Strategy,text,1
0,2394,932,518,4029,over-the-counter ((OTC) derivative products an...,text,2
0,239,1122,502,282,Credit Suisse International strategy,text,3
0,237,1333,464,4517,CSi's strategy is to provide a comprehensive r...,title,4
1,236,2130,395,934,Acquisitions (M&A) and underwriting and arrang...,text,4
0,2395,1831,376,2677,"For corporate clients, CSi provides a wide spe...",text,5
0,237,2430,642,4089,CSi believes that it is well-positioned for th...,text,6
0,2395,2430,521,1809,CSi also provides institutional-style solution...,text,7
0,2396,2918,340,409,Growth driven by one principal division,text,8
0,2397,3128,369,1872,Following the structural changes at the CSG le...,text,9


In [41]:
# Replace the repeating index labels with a fresh 0..n-1 index and discard
# the old labels (same step as in Extractor.extract).
df_text = df_text.reset_index().drop('index', axis=1)

In [42]:
df_text.head(10)

Unnamed: 0,left,top,width,height,lines_combine,detected_as,group
0,240,897,516,118,Strategy,text,1
1,2394,932,518,4029,over-the-counter ((OTC) derivative products an...,text,2
2,239,1122,502,282,Credit Suisse International strategy,text,3
3,237,1333,464,4517,CSi's strategy is to provide a comprehensive r...,title,4
4,236,2130,395,934,Acquisitions (M&A) and underwriting and arrang...,text,4
5,2395,1831,376,2677,"For corporate clients, CSi provides a wide spe...",text,5
6,237,2430,642,4089,CSi believes that it is well-positioned for th...,text,6
7,2395,2430,521,1809,CSi also provides institutional-style solution...,text,7
8,2396,2918,340,409,Growth driven by one principal division,text,8
9,2397,3128,369,1872,Following the structural changes at the CSG le...,text,9


In [43]:
# Drop the rows that remove_footer treats as page footer (by their 'top' value).
df_text = extractor.remove_footer(df_text)

In [47]:
df_text.iloc[3, :]['lines_combine']

"CSi's strategy is to provide a comprehensive range of invest-, ment banking services and to build on its strengths as a global, hub for CS group's derivative products and as a registered swap, dealer for Dodd-Frank clients, to support securities and non-se-, curities sales, trading, risk management and settlement services, for Investment Banking clients. The strategy encompasses the, provision of solutions for other divisions, and businesses, includ-, ing wealth management clients; and the provision of Merger and"

In [48]:
# Split the rows into the left and right page columns by their 'left' value.
df_left, df_right = extractor.get_columns(df_text)

In [51]:
df_left.iloc[2, :]['lines_combine']

"CSi's strategy is to provide a comprehensive range of invest-, ment banking services and to build on its strengths as a global, hub for CS group's derivative products and as a registered swap, dealer for Dodd-Frank clients, to support securities and non-se-, curities sales, trading, risk management and settlement services, for Investment Banking clients. The strategy encompasses the, provision of solutions for other divisions, and businesses, includ-, ing wealth management clients; and the provision of Merger and"

In [61]:
# Show every left-column text block after process_text has been applied.
for raw_text in df_left['lines_combine']:
    cleaned = extractor.process_text(raw_text)
    print(cleaned)

Strategy
Credit Suisse International strategy
CSi's strategy is to provide a comprehensive range of investment banking services and to build on its strengths as a global, hub for CS group's derivative products and as a registered swap, dealer for Dodd-Frank clients, to support securities and non-securities sales, trading, risk management and settlement services, for Investment Banking clients. The strategy encompasses the, provision of solutions for other divisions, and businesses, including wealth management clients; and the provision of Merger and
Acquisitions (M&A) and underwriting and arrangement services,, and bilateral or syndicated loans, for corporate clients.
CSi believes that it is well-positioned for the post COVID-19, market environment, and will benefit from the expected recovery, in M&A and asset finance and increased capital markets activity, driven by leveraged finance. CSi intends to continue to strengthen, the connectivity to the CS group Wealth Management-related, bu