In [1]:
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io
import os
import re

In [2]:
# Extracting filenames, directories and years
pdf_dict = {'filename':[], 'year':[], 'dir':[], 'text':[]}
for subdir, dirs, files in os.walk('pdf_files'):
    for file in files:
        pdf_dict['filename'].append(file)
        pdf_dict['year'].append(subdir[-4:])
        pdf_dict['dir'].append(os.path.join(subdir, file))

In [3]:
#Reading pdfs
def read_pdf(dir_chunk):
    text_chunk = []

    for pdf_dir in dir_chunk:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        print('Reading: ' + pdf_dir)

        with open(pdf_dir, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):

                page_interpreter.process_page(page)

            text = fake_file_handle.getvalue()
            text_chunk.append(text)

        converter.close()
        fake_file_handle.close()
    print('-'*50)
    
    return text_chunk

In [4]:
# Creating output folders
# categorized by years
def create_folders(year_chunk):
    for year in set(year_chunk):
        out_filename = 'text_files/' + year + '/'
        os.makedirs(os.path.dirname(out_filename), exist_ok=True)

In [5]:
# Creating txt files
# text_files/year/filename.txt

def write_txt(filename_chunk, year_chunk, text_chunk):
    for filename, year, text in zip(filename_chunk,
                                    year_chunk,
                                    text_chunk):

        txt_name = re.findall('(.*)\.', filename)[0] + '.txt'
        output_path = 'text_files/' + year + '/' + txt_name

        with open(output_path, 'w') as output:
            output.write(text)

In [6]:
## Partitioning to not exceed recursion limit
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

chunked_dir = list(chunks(pdf_dict['dir'], 12))
chunked_year = list(chunks(pdf_dict['year'], 12))
chunked_filename = list(chunks(pdf_dict['filename'], 12))

In [7]:
for c in range(len(chunked_dir)):
    print(f'Batch: {c}')
    text_chunk = read_pdf(chunked_dir[c])
    create_folders(chunked_year[c])
    write_txt(chunked_filename[c], chunked_year[c], text_chunk)

Batch: 0
Reading: pdf_files/2015/ekim-2015.pdf
Reading: pdf_files/2015/kasim-2015.pdf
Reading: pdf_files/2015/ocak-2015.pdf
Reading: pdf_files/2015/subat-2015.pdf
Reading: pdf_files/2015/haziran-2015.pdf
Reading: pdf_files/2015/agustos-2015.pdf
Reading: pdf_files/2015/mart-2015.pdf
Reading: pdf_files/2015/nisan-2015.pdf
Reading: pdf_files/2015/aralik-2015.pdf
Reading: pdf_files/2015/temmuz-2015.pdf
Reading: pdf_files/2015/mayis-2015.pdf
Reading: pdf_files/2015/eylul-2015.pdf
--------------------------------------------------
Batch: 1
Reading: pdf_files/1999/mayis-1999.pdf
Reading: pdf_files/1999/ekim-1999.pdf
Reading: pdf_files/1999/ocak-1999.pdf
Reading: pdf_files/1999/nisan-1999.pdf
Reading: pdf_files/1999/agustos-1999.pdf
Reading: pdf_files/1999/aralik-1999.pdf
Reading: pdf_files/1999/subat-1999.pdf
Reading: pdf_files/1999/haziran-1999.pdf
Reading: pdf_files/1999/temmuz-1999.pdf
Reading: pdf_files/1999/mart-1999.pdf
Reading: pdf_files/1999/eylul-1999.pdf
Reading: pdf_files/1999/kasi