# Processing of downloaded PDF books

The plan:
- Check for missing metadata, and correct it by hand where possible
- Check whether the PDFs can be read as text
- Create character/word statistics for the texts
- Spell-check all the texts with hunspell
  - Collect statistics
- Display all kinds of statistics
- Decide on thresholds for the different statistics to select which books to use and process further

In [1]:
!pip install pdfmerge
!pip install pdfminer-six



## Check for missing metadata

In [1]:
import os
import glob
from pathlib import Path
from typing import List
from datetime import date, datetime
import json

class Statistics:
    """Per-book text statistics that can be round-tripped through JSON.

    The attributes mirror the constructor arguments one-to-one. ``to_json``
    serialises the instance ``__dict__`` and ``from_json`` feeds the decoded
    mapping back into the constructor as keyword arguments.
    """

    def __init__(self, is_valid=False, size=0, pages=0, words=0, word_counts=None, number_of_different_words=0, common_misspelled_words=None, number_of_spell_errors=0):
        # ``None`` stands for "no list given"; a fresh list is created per
        # instance so that instances never share one mutable default.
        if word_counts is None:
            word_counts = []
        if common_misspelled_words is None:
            common_misspelled_words = []

        # The assignment order defines the key order of the JSON document
        # produced by to_json(), so it is kept stable.
        self.is_valid = is_valid
        self.size = size
        self.pages = pages
        self.words = words
        self.word_counts = word_counts
        self.number_of_different_words = number_of_different_words
        self.common_misspelled_words = common_misspelled_words
        self.number_of_spell_errors = number_of_spell_errors

    def words_per_pages(self):
        """Return ``words / pages``, or ``None`` when ``pages`` is not positive."""
        return self.words / self.pages if self.pages > 0 else None

    def avg_word_len(self):
        """Return ``size / words``, or ``None`` when ``words`` is not positive."""
        return self.size / self.words if self.words > 0 else None

    def to_json(self):
        """Serialise all instance attributes to a JSON string."""
        return json.dumps(vars(self))

    @classmethod
    def from_json(cls, string):
        """Build an instance from a JSON object whose keys are constructor arguments."""
        return cls(**json.loads(string))

class Metadata:
    """Bibliographic metadata of one book, serialisable to and from JSON.

    Attributes:
        tags: List of tag strings.
        author: Author name.
        title: Book title.
        release_date: Release date. Normally a ``date``; a value loaded from
            JSON that is not an ISO-format date string is kept unchanged.
    """

    def __init__(self, tags: List[str], author: str, title: str, release_date: date):
        self.tags = tags
        self.author = author
        self.title = title
        self.release_date = release_date

    def to_dict(self):
        """Return the metadata as a JSON-compatible dictionary.

        ``release_date`` is converted with ``isoformat()`` when it offers that
        method; any other value (for example an unparsed string or ``None``)
        is passed through unchanged instead of raising.
        """
        release_date = self.release_date
        if hasattr(release_date, "isoformat"):
            release_date = release_date.isoformat()  # Convert date to string
        return {
            "tags": self.tags,
            "author": self.author,
            "title": self.title,
            "release_date": release_date,
        }

    def to_json(self):
        """Serialise the metadata to an indented JSON string (non-ASCII kept as-is)."""
        return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)

    @classmethod
    def from_json(cls, string):
        """Build an instance from a JSON object whose keys are constructor arguments.

        An ISO-format ``release_date`` string is parsed back into a ``date`` so
        that ``from_json(m.to_json())`` yields an object that can be serialised
        again. Strings that are not valid ISO dates are left untouched.
        """
        data = json.loads(string)
        if isinstance(data, dict):
            raw_date = data.get("release_date")
            if isinstance(raw_date, str):
                try:
                    data["release_date"] = date.fromisoformat(raw_date)
                except ValueError:
                    # Not an ISO date: keep the original string rather than
                    # rejecting an otherwise usable metadata file.
                    pass
        return cls(**data)

def discover_files(base_path):
    """Collect the PDFs found in the immediate sub-directories of *base_path*.

    Only one directory level is inspected: PDFs lying directly in
    *base_path* or in deeper levels are not reported.

    Args:
        base_path: Directory whose direct sub-directories hold the PDFs.

    Returns:
        A list of ``(pdf, json_file, stats_file)`` tuples sorted by path.
        ``json_file`` / ``stats_file`` are the paths obtained by swapping the
        PDF's extension for ``.json`` / ``.stats``, or ``None`` when that
        file does not exist. The list is empty when *base_path* cannot be
        walked.
    """
    # The first item yielded by os.walk describes base_path itself; its
    # directory list is exactly the set of direct sub-directories.
    _, subdirs, _ = next(os.walk(base_path), (base_path, [], []))

    pdf_files = []
    # Sorting makes the result independent of the file system's listing order.
    for subdir in sorted(subdirs):
        pattern = os.path.join(base_path, subdir, '*.pdf')
        for pdf in sorted(glob.glob(pattern)):
            # Swap only the extension: str.replace would also rewrite a
            # ".pdf" occurring in a directory name.
            stem, _ = os.path.splitext(pdf)
            json_file = stem + '.json'
            stats_file = stem + '.stats'
            pdf_files.append((
                pdf,
                json_file if os.path.exists(json_file) else None,
                stats_file if os.path.exists(stats_file) else None,
            ))
    return pdf_files

base_path = 'hun_books/'
files = discover_files(base_path)

# PDFs that have no metadata file next to them. The tuple fields are named
# json_file / stats so that the loop does not rebind the imported `json`
# module at notebook scope.
missing_metadata = [pdf for pdf, json_file, stats in files if json_file is None]
print(f"Total number of PDF / missing metadata: {len(files)}/{len(missing_metadata)}")
for pdf in missing_metadata:
    # The file name is the numeric book id; the URL is built from the id
    # rounded down to a multiple of 100, followed by the id itself, both
    # zero-padded to five digits.
    filename = int(Path(pdf).stem)
    print(f"{pdf}:  https://mek.oszk.hu/{filename // 100 * 100:05}/{filename:05}/")

Total number of PDF / missing metadata: 22534/45
hun_books/004/00437.pdf:  https://mek.oszk.hu/00400/00437/
hun_books/004/00438.pdf:  https://mek.oszk.hu/00400/00438/
hun_books/004/00439.pdf:  https://mek.oszk.hu/00400/00439/
hun_books/005/00578.pdf:  https://mek.oszk.hu/00500/00578/
hun_books/010/01049.pdf:  https://mek.oszk.hu/01000/01049/
hun_books/047/04707.pdf:  https://mek.oszk.hu/04700/04707/
hun_books/059/05971.pdf:  https://mek.oszk.hu/05900/05971/
hun_books/071/07123.pdf:  https://mek.oszk.hu/07100/07123/
hun_books/109/10901.pdf:  https://mek.oszk.hu/10900/10901/
hun_books/186/18696.pdf:  https://mek.oszk.hu/18600/18696/
hun_books/222/22255.pdf:  https://mek.oszk.hu/22200/22255/
hun_books/222/22256.pdf:  https://mek.oszk.hu/22200/22256/
hun_books/222/22260.pdf:  https://mek.oszk.hu/22200/22260/
hun_books/232/23212.pdf:  https://mek.oszk.hu/23200/23212/
hun_books/253/25310.pdf:  https://mek.oszk.hu/25300/25310/
hun_books/253/25311.pdf:  https://mek.oszk.hu/25300/25311/
hun_boo

Note: all of the above should be the result of HTTP 404 responses (missing data).

## Read in all PDF files
And since we read them all in anyway, we also collect and compute all the statistics on them:
- Detect bad/unreadable PDFs
- Number of pages
- Char count
- Word count
    - Individual word count
    - Least used words list
- Running HUNSpell on text and collect the errors
    - Number of errors
    - Corrected Words list

In [14]:
import IPython
import json
import pypdf
import concurrent.futures
import re
from pathlib import Path
from collections import Counter
from pdfminer.high_level import extract_text
import subprocess

def execute_bash_command(command, input_string):
    """Run *command* through the shell and return what it writes to stdout.

    Args:
        command: Shell command line to execute.
        input_string: Text sent to the command on standard input.

    Returns:
        The captured standard output as text. Standard error is captured but
        not returned, and the exit status is not checked.
    """
    completed = subprocess.run(
        command,
        shell=True,
        text=True,
        capture_output=True,
        input=input_string,
    )
    return completed.stdout

def process_file(data):
    """Extract the text of one PDF, compute its statistics and store them.

    The statistics are written as JSON to a ``.stats`` file next to the PDF
    (same path, extension swapped), also when the PDF turns out to be
    unusable.

    Args:
        data: A ``(pdf, json_file, stats_file)`` tuple; only the PDF path is
            used here.

    Returns:
        ``True`` when statistics could be computed, ``False`` when the
        extracted text contains ``(cid:`` markers or processing raised.
    """
    pdf, _json_file, _stats_file = data

    try:
        with open(pdf, 'rb') as file:
            # The reader stays inside the `with` block because the page count
            # is taken from it further down.
            reader = pypdf.PdfReader(file)

            # Read in PDF text
            original_text = extract_text(pdf)
            # Re-join words that were hyphenated across a line ending
            original_text = re.sub(r'-(\s*\n\s*)+', '', original_text)
            # Lower case the whole thing
            text = original_text.lower()

            if "(cid:" in text:
                # Text containing "(cid:" markers is treated as unreadable.
                stat = Statistics(is_valid=False)
            else:
                # Spellcheck on the original (not lower-cased) text; with -l
                # hunspell is expected to list the rejected words one per line.
                misspelled = execute_bash_command("hunspell -d hu-HU -l", original_text).splitlines()
                corrected_counts = sorted(Counter(misspelled).items(), key=lambda item: item[1])

                # Regular expression to match URLs
                url_pattern = re.compile(r'https?://\S+|www\.\S+')
                # Regular expression to match numbers
                number_pattern = re.compile(r'\d')

                filtered_words = []
                for raw_word in text.split():
                    # Strip leading and trailing non-word characters.
                    word = re.sub(r'\W+$', '', re.sub(r'^\W+', '', raw_word))
                    # Drop empty tokens, URLs and anything containing a digit.
                    if word != "" and not url_pattern.search(word) and not number_pattern.search(word):
                        filtered_words.append(word)

                # Count the occurrences of each word
                word_counts = Counter(filtered_words)
                # Sort the words by their count (increasing order)
                sorted_words = sorted(word_counts.items(), key=lambda item: item[1])

                stat = Statistics(
                    is_valid=True,
                    # Total number of characters over all kept words.
                    size=sum(len(word) * count for word, count in sorted_words),
                    pages=len(reader.pages),
                    words=len(filtered_words),
                    number_of_different_words=len(word_counts),
                    word_counts=sorted_words,
                    common_misspelled_words=corrected_counts,
                    # NOTE(review): this is the number of *distinct* rejected
                    # words, not the number of occurrences - confirm that this
                    # is what the downstream error percentage expects.
                    number_of_spell_errors=len(corrected_counts),
                )
    except Exception as e:
        # Best effort: any failure marks this PDF as bad instead of stopping
        # the whole batch.
        print()
        print(f"Bad PDF: {pdf}, {e}")
        stat = Statistics(is_valid=False)

    # Swap only the extension; str.replace would also rewrite a ".pdf" that
    # occurs elsewhere in the path.
    with open(Path(pdf).with_suffix(".stats"), 'w', encoding='utf-8') as file:
        file.write(stat.to_json())
    return stat.is_valid


# Process the files in a pool of worker processes (default pool size).
# map() yields the outcomes in the order of `files`, so results[i] belongs to files[i].
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = [*executor.map(process_file, files)]
def replace_cid_with_char(text):
    """Return *text* with every ``(cid:N)`` marker replaced by ``chr(N)``.

    ``N`` is the decimal number inside the marker; text without markers is
    returned unchanged.
    """
    return re.sub(r'\(cid:(\d+)\)', lambda found: chr(int(found.group(1))), text)
# One progress mark per processed file: "." for a usable PDF, "X" for a bad one.
for data, stat in zip(files, results):
    print("." if stat else "X", end="")


The PDF <_io.BufferedReader name='hun_books/012/01209.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/012/01214.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/012/01240.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/013/01316.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedRea


Bad PDF: hun_books/040/04059.pdf, list index out of range

Bad PDF: hun_books/040/04087.pdf, list index out of range


The PDF <_io.BufferedReader name='hun_books/040/04097.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/041/04104.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/041/04122.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/041/04167.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedRea


Bad PDF: hun_books/044/04432.pdf, list index out of range

Bad PDF: hun_books/045/04578.pdf, list index out of range

Bad PDF: hun_books/046/04638.pdf, list index out of range


The PDF <_io.BufferedReader name='hun_books/047/04793.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/048/04814.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/048/04863.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/048/04864.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedRea


Bad PDF: hun_books/064/06455.pdf, unpack requires a buffer of 64 bytes


The PDF <_io.BufferedReader name='hun_books/068/06853.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/069/06979.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/071/07157.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/071/07174.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedRea


Bad PDF: hun_books/116/11673.pdf, bytes must be in range(0, 256)


The PDF <_io.BufferedReader name='hun_books/118/11815.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/118/11820.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/118/11840.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/119/11921.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Ignoring wrong pointing 


Bad PDF: hun_books/161/16102.pdf, ('Unhandled', 6)


The PDF <_io.BufferedReader name='hun_books/166/16617.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Ignoring wrong pointing object 1417 0 (offset 8400134)
Ignoring wrong pointing object 3234 0 (offset 19291127)
Ignoring wrong pointing object 3235 0 (offset 19292084)
Ignoring wrong pointing object 4133 0 (offset 0)
Ignoring wrong pointing object 4134 0 (offset 0)
Ignoring wrong pointing object 4135 0 (offset 0)
Ignoring wrong pointing object 4786 0 (offset 0)
Ignoring wrong pointing object 4787 0 (offset 0)
Ignoring wrong pointing object 4788 0 (offset 0)
Ignoring wrong pointing object 4789 0 (offset 0)
Ignoring wrong pointing object 4790 0 (offset 0)
The PDF <_io.BufferedReader name='hun_books/169/16907.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extracta


Bad PDF: hun_books/176/17654.pdf, bytes must be in range(0, 256)


The PDF <_io.BufferedReader name='hun_books/179/17926.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Overwriting cache for 0 1941
The PDF <_io.BufferedReader name='hun_books/180/18002.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
incorrect startxref pointer(3)
The PDF <_io.BufferedReader name='hun_books/181/18180.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/181/18181.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you 


Bad PDF: hun_books/223/22385.pdf, bytes must be in range(0, 256)


The PDF <_io.BufferedReader name='hun_books/225/22517.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/225/22523.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/225/22536.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='hun_books/225/22541.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedRea


Bad PDF: hun_books/256/25645.pdf, [Errno 22] Invalid argument


The PDF <_io.BufferedReader name='hun_books/256/25683.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


..XX..X................X..............X......X.X........X.X..............................X..XX.............................X.......X.............................................................................................................................X........................X.XX.....X...X........XX...XX.X........................X.................................................X...................................................................X.XXX.XXXX...................X.......................................................................................................................................................XXX....XXX...X.........X.........XX...X.XX......X...X.XXXXXXXXXXXXXXXXX...X....XXX........X....X..X..XX..X.....XXXXXX...X.X.......X.............X....XX.....X.........................X............X...........X..X.X.XXXXX.........X........X.....XX.......XX..X.XX.X.......X.....X..........XXXX...........X.....XX.X.X.X....X..X...XX.X.XXXXXXX..X....X.XX........X..XXX.

Now run the ./prety_print_stats utility from the command shell to get rid of those Unicode character annotations.

In [15]:
!./prety_print_stats

# Iteration
Spell checking reveals a lot of errors, everywhere. After filtering the books down to those with an error percentage below 4%, we are left with only 28% of the books. Something odd is happening. We need to compile a list of the most common mistakes and check why that is the case, and even add them to a custom dictionary after careful inspection. First hunch: the names are probably missing from the database?

In [2]:
import json
from collections import Counter

# Sum the per-book misspelling counts straight into one Counter. Expanding
# every (word, count) pair into `count` list entries first would allocate one
# list element per misspelling occurrence across the whole corpus.
counted = Counter()
for cnt, (pdf, json_file, stats) in enumerate(files, start=1):
    # Progress indicator, refreshed in place every 50 files.
    if cnt % 50 == 0:
        print(f"{cnt/len(files)*100:5.1f}%", end="\r")
    if stats:
        with open(stats, 'r', encoding='utf-8') as stats_file:
            statistics = Statistics.from_json(stats_file.read())
        for word, count in statistics.common_misspelled_words:
            # Skip non-positive counts so they do not create empty entries.
            if count > 0:
                counted[word] += count

print("\nCounting...")
print("Ordering...")
# Most frequent first; words with equal counts keep their first-seen order.
ordered_counted = sorted(counted.items(), key=lambda item: item[1], reverse=True)

print("100 most common misspelled words:")
for word, count in ordered_counted[:100]:
    print(f"{count:4}: {word}")

 99.8%
Counting...
Ordering...
100 most common misspelled words:
2555323: the
1747120: of
1551296: in
1111732: and
858276: to
588792: et
540871: der
528783: te
424427: die
350080: The
298230: that
295384: In
292708: des
291246: on
283424: for
268052: őket
262134: a'
255362: an
246608: en
241423: nekem
238609: by
237656: was
228211: with
219889: őt
208829: it
190680: den
175065: es
168116: at
162691: or
161613: from
159537: are
156724: not
149795: zu
144511: Te
142001: ők
139797: engem
138545: he
137953: ta
135737: this
135299: which
134947: za
131974: his
130182: ben
125144: das
123355: er
123012: im
122028: si
119623: di
115969: Die
115170: ban
113458: so
112922: о
112734: their
110365: и
107408: dem
106421: nak
105610: have
104937: &
104343: pp.
103877: me
103309: je
103128: were
103091: ugy
102996: non
101779: igy
100507: nek
99153: em
99052: ist
96825: à
95298: nekik
95294: őket.
95235: du
93977: neked
92708: Magyarországon
91837: auf
91609: nicht
90908: ra
89864: sich
88968: but
8

## Convert all stats + metadata to CSV
To make the data analyzable, we need to convert the collected data to CSV

In [3]:
from pathlib import Path
import json

# Write one semicolon-separated row per PDF, combining its metadata and its
# statistics; fields without data are left empty.
with open("hun_books.csv", 'w', encoding='utf-8') as csv:
    csv.write("id;filename;title;author;size;pages;words;unique words;spell errors;is-valid\n")
    for pdf, json_file, stats in files:
        filename = int(Path(pdf).stem)

        # Load whatever side files exist for this PDF.
        statistics = None
        if stats:
            with open(stats, 'r', encoding='utf-8') as stats_file:
                statistics = Statistics.from_json(stats_file.read())

        meta = None
        if json_file:
            with open(json_file, 'r', encoding='utf-8') as meta_file:
                meta = Metadata.from_json(meta_file.read())

        # Semicolons inside text fields would break the column layout, so
        # they are replaced by commas.
        if meta is None:
            title = author = ""
        else:
            title = meta.title.strip().replace(";", ",")
            author = meta.author.strip().replace(";", ",")

        # Numeric columns are only filled in for valid statistics.
        if statistics is not None and statistics.is_valid:
            counts = [
                statistics.size,
                statistics.pages,
                statistics.words,
                statistics.number_of_different_words,
                statistics.number_of_spell_errors,
            ]
        else:
            counts = [""] * 5
        validity = "" if statistics is None else statistics.is_valid

        row = [filename, pdf, title, author, *counts, validity]
        csv.write(";".join(str(field) for field in row) + "\n")
        print(".", end="")
print()
print("Ready")
        

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................