### Science direfct htmls to pdf

In [11]:
import os
from bs4 import BeautifulSoup
from fpdf import FPDF
import html2text
import re

def clean_text(text):
    # Remove multiple newlines and spaces
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

def html_to_pdf(html_file, output_pdf):
    # Read HTML file
    with open(html_file, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract title from the specific span class
    title = soup.find('span', class_='title-text')
    title_text = title.text if title else "No title found"
    title_text = f"## Title: {title_text}"
    
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()

    # Convert HTML to plain text
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    text = h.handle(str(soup))
    
    # Clean the text
    text = clean_text(text)

    # Create PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    
    # Add title with larger font size and bold
    pdf.set_font("Arial", 'B', size=16)  # 'B' for bold
    pdf.multi_cell(0, 10, txt=f"{title_text}")
    pdf.ln(5)  # Add some space after title
    
    # Set font back to normal for main text
    pdf.set_font("Arial", size=11)
    
    # Add text to PDF
    # Split text into lines to avoid overflow
    lines = text.split('\n')
    for line in lines:
        if line.strip():  # Only add non-empty lines
            try:
                # Skip the title if it appears again in the text
                # Replace common special characters


                                    
                if title_text not in line:
                    try:
                        pdf.multi_cell(0, 5, txt=line.encode('latin-1', 'replace').decode('latin-1'))
                    except:
                        pdf.multi_cell(0, 5, txt=line.encode('latin-1', 'replace').decode('latin-1', 'ignore'))

            except:
                # If encoding fails, try to clean the text further
                clean_line = ''.join(char for char in line if ord(char) < 128)
                # clean_line = line.encode('ascii', 'replace').decode('ascii')
                # clean_line = line.encode('ascii', 'ignore').decode('ascii')





                # clean_line = ''.join(char if ord(char) < 128 else '-' for char in line)


                if title_text not in clean_line:
                    pdf.multi_cell(0, 5, txt=clean_line)

    # Save PDF
    pdf.output(output_pdf)

def process_html_files(input_directory, output_directory):
    # Create output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    # Process all HTML files in the input directory
    for filename in os.listdir(input_directory):
        if filename.endswith('.html'):
            html_path = os.path.join(input_directory, filename)
            pdf_path = os.path.join(output_directory, filename.replace('.html', '.pdf'))
            print(f"Converting {filename} to PDF...")
            try:
                html_to_pdf(html_path, pdf_path)
                print(f"Successfully created {pdf_path}")
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")

# Example usage
if __name__ == "__main__":
    input_dir = "ScienceDirectRaw/" # Directory containing HTML files
    output_dir = "ScienceDirectPDF/"  # Directory for output PDFs
    process_html_files(input_dir, output_dir)

Converting science_direct_1.html to PDF...
Successfully created ScienceDirectPDF/science_direct_1.pdf
Converting science_direct_10.html to PDF...
Successfully created ScienceDirectPDF/science_direct_10.pdf
Converting science_direct_11.html to PDF...
Successfully created ScienceDirectPDF/science_direct_11.pdf
Converting science_direct_12.html to PDF...
Successfully created ScienceDirectPDF/science_direct_12.pdf
Converting science_direct_13.html to PDF...
Successfully created ScienceDirectPDF/science_direct_13.pdf
Converting science_direct_14.html to PDF...
Error processing science_direct_14.html: 'latin-1' codec can't encode character '\u2013' in position 160: ordinal not in range(256)
Converting science_direct_15.html to PDF...
Successfully created ScienceDirectPDF/science_direct_15.pdf
Converting science_direct_16.html to PDF...
Successfully created ScienceDirectPDF/science_direct_16.pdf
Converting science_direct_17.html to PDF...
Successfully created ScienceDirectPDF/science_direct_1