# In[6]:
import os
import re
import string
import tempfile
import unicodedata

import cv2
import numpy as np
import pandas as pd
import pdfplumber
import pytesseract
import requests
import tqdm as tqdm  # NOTE: binds the tqdm *module*; the progress-bar class is tqdm.tqdm
from pdf2image import convert_from_path
from PIL import Image

# In[7]:
def load_vietnamese_dictionary(dic_file_path):
    """Read a word-list file and return its entries as a set.

    Each kept line contributes the text before its first ``/`` (with
    surrounding whitespace removed). Lines that begin with ``#`` and lines
    that are empty after stripping are skipped.

    Args:
        dic_file_path: Path to a UTF-8 encoded word-list file.

    Returns:
        A ``set`` of the extracted entries.
    """
    with open(dic_file_path, 'r', encoding='utf-8') as dic_file:
        return {
            # Everything after the first "/" on a line is discarded.
            entry.partition('/')[0].strip()
            for entry in dic_file
            if entry.strip() and not entry.startswith('#')
        }
# Loaded once at module level; process_all_pdfs reads this set when checking
# each document's words.
vietnamese_dictionary = load_vietnamese_dictionary(dic_file_path='vi_VN.dic')

# Every lowercase Vietnamese letter: "đ" plus each of the twelve vowels
# combined with each tone (level, grave, acute, hook above, tilde, dot below).
# Generating the set instead of typing it out guarantees nothing is missed.
_VI_BASE_VOWELS = unicodedata.normalize("NFC", "aăâeêioôơuưy")
_VI_TONE_MARKS = ("", "\u0300", "\u0301", "\u0309", "\u0303", "\u0323")
_VI_LETTERS = "đ" + "".join(
    unicodedata.normalize("NFC", vowel + tone)
    for vowel in _VI_BASE_VOWELS
    for tone in _VI_TONE_MARKS
)

# One or more characters that are not a lowercase letter, a digit, or one of
# the punctuation marks . , ; : /
_DISALLOWED_RUN = re.compile(rf"[^a-z0-9.,;:/{_VI_LETTERS}]+")


def clean_text(text):
    """Lowercase *text* and reduce it to letters, digits and basic punctuation.

    The text is composed to Unicode NFC and lowercased; every run of
    characters other than ASCII letters, Vietnamese letters, digits and
    ``. , ; : /`` is then replaced by a single space, and leading/trailing
    spaces are removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercase string with words separated by single spaces.
    """
    # Compose first: a decomposed letter (base + combining mark) would
    # otherwise lose its marks to the filter below.
    text = unicodedata.normalize("NFC", text).lower()
    # All whitespace is outside the allowed set, so each gap collapses to
    # exactly one space here and no separate whitespace pass is needed.
    return _DISALLOWED_RUN.sub(" ", text).strip()

def process_pdf(url, timeout=60):
    """Download a PDF, OCR each page as Vietnamese, and return the cleaned text.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds handed to ``requests.get`` so that an unresponsive
            server cannot stall the caller indefinitely.

    Returns:
        The OCR output of all pages, joined with newlines and passed through
        ``clean_text``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: For other download failures (e.g. timeout).
    """
    # Imported locally under a distinct name: the file-level
    # `import tqdm as tqdm` binds the module, which is not callable.
    from tqdm import tqdm as progress_bar

    response = requests.get(url, timeout=timeout)
    # Fail here rather than saving an error page and trying to render it.
    response.raise_for_status()

    # A private temporary directory replaces the fixed "temp_document.pdf" in
    # the working directory, so nothing is overwritten or left behind.
    # NOTE(review): assumes convert_from_path returns fully in-memory pages
    # when no output folder is given, so the file can be removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "temp_document.pdf")
        with open(pdf_path, "wb") as file:
            file.write(response.content)
        pages = convert_from_path(pdf_path)

    page_texts = []
    for page in progress_bar(pages, desc="Processing Pages"):
        img = np.array(page)
        # Arrays built from PIL images are in RGB channel order; the BGR
        # constant would swap the red and blue weights of the grey conversion.
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        # Stretch contrast before OCR (pixel * 1.5, saturated to 8 bits).
        contrast_img = cv2.convertScaleAbs(gray, alpha=1.5, beta=0)
        page_texts.append(pytesseract.image_to_string(contrast_img, lang='vie'))

    return clean_text("\n".join(page_texts))

def detect_misspellings(text, dictionary):
    """Return the words of *text* that are not found in *dictionary*.

    The text is split on whitespace. Punctuation attached to either end of a
    token is removed before the lookup, so "word," is checked as "word".
    Tokens that contain no letters at all (numbers, stray punctuation) are
    not words and are skipped.

    Args:
        text: The text to check.
        dictionary: A container of known words supporting ``in``.

    Returns:
        A list of the unknown words, in order of appearance; a word that
        occurs several times is listed each time.
    """
    misspelled_words = []
    for token in text.split():
        word = token.strip(string.punctuation)
        if not any(ch.isalpha() for ch in word):
            continue
        if word not in dictionary:
            misspelled_words.append(word)
    return misspelled_words

def process_all_pdfs(csv_file_path, output_file):
    """OCR every PDF listed in a CSV and write out the unknown words found.

    For each link in the CSV's ``URL`` column the cleaned OCR text is saved
    to ``pdf/<last URL segment>.txt`` and its words are checked against the
    module-level ``vietnamese_dictionary``. The distinct unknown words from
    all documents are written to *output_file*, sorted, one per line.

    Args:
        csv_file_path: Path to a CSV file that has a ``URL`` column.
        output_file: Path of the text file that receives the unknown words.
    """
    # Imported locally under a distinct name: the file-level
    # `import tqdm as tqdm` binds the module, which is not callable.
    from tqdm import tqdm as progress_bar

    df = pd.read_csv(csv_file_path)
    # Empty cells are read as NaN (a float), which has no .split(); drop them.
    pdf_links = df['URL'].dropna().tolist()

    # open(..., "w") does not create missing parent directories.
    os.makedirs("pdf", exist_ok=True)

    all_misspelled_words = set()

    for pdf_url in progress_bar(pdf_links):
        print(f"Processing {pdf_url}")
        pdf_id = pdf_url.split('/')[-1]
        cleaned_text = process_pdf(pdf_url)

        with open(f"pdf/{pdf_id}.txt", "w", encoding="utf-8") as f_cleaned:
            f_cleaned.write(cleaned_text)

        all_misspelled_words.update(
            detect_misspellings(cleaned_text, vietnamese_dictionary)
        )

    with open(output_file, 'w', encoding='utf-8') as f_out:
        f_out.writelines(f"{word}\n" for word in sorted(all_misspelled_words))

# Batch run: read the PDF links from the CSV and write the combined list of
# unknown words to the output file.
csv_file_path, output_file = 'pdfLink.csv', 'all_misspelled_words.txt'
process_all_pdfs(csv_file_path, output_file)


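# Recorded output of the original notebook run:
#   TypeError: 'module' object is not callable
# A bare `tqdm(...)` call raises this when `tqdm` names the module (as bound
# by `import tqdm as tqdm`) rather than the `tqdm.tqdm` progress-bar class.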