In [None]:
# @title Libraries and Packages Imports
# Install necessary libraries
!apt-get install -y poppler-utils
!pip install --upgrade deepdoctection
!pip install tensorflow
!pip install torch
!apt-get install -y tesseract-ocr
!pip install pdfplumber
!pip install pdf2image
!pip install flask flask-cors

# Import libraries
import deepdoctection as dd
from pdf2image import convert_from_path
from pathlib import Path
from matplotlib import pyplot as plt
from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import json

# Upload settings: where uploaded files are stored and which extensions
# allowed_file() accepts
UPLOAD_FOLDER = 'uploads'
ALLOWED_EXTENSIONS = {'pdf', 'docx'}

# Flask application, with CORS enabled for all routes
app = Flask(__name__)
CORS(app)
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)

# Function to check allowed file extensions
def allowed_file(filename):
    """Report whether *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared
    case-insensitively. A name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS

# Example GET endpoint
@app.route('/api/hello', methods=['GET'])
def hello():
    """Answer GET /api/hello with a fixed greeting as a JSON object."""
    greeting = {'message': 'Hello from the backend!'}
    return jsonify(greeting)

# Example POST endpoint
@app.route('/api/data', methods=['POST'])
def receive_data():
    """Greet the caller using the ``name`` field of a JSON request body.

    Returns:
        A JSON greeting, using 'Unknown' when the body has no ``name`` key.
        A 400 response when the body is missing, is not valid JSON, or is
        valid JSON but not an object.
    """
    # silent=True yields None instead of aborting on a missing or malformed
    # body, so every kind of bad input ends up on the same error path below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify(message='Request body must be a JSON object'), 400

    name = data.get('name', 'Unknown')
    return jsonify(message=f'Hello, {name}!')

# Endpoint to handle file uploads from the front end
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Store an uploaded document and return its page-count note and analysis.

    The file is expected in the ``file`` field of a multipart request.

    Returns:
        200 with ``message`` (page-count note) and ``analysis_results``
        (the value returned by analyze_pdf) on success.
        400 when no file was sent, no file was selected, the extension is
        not in ALLOWED_EXTENSIONS, or the stored file cannot be read as a
        PDF (which includes uploads that pass the extension check but are
        not PDFs).
    """
    # Local imports keep the new names in scope without touching the
    # file-level import block.
    from pdf2image import pdfinfo_from_path
    from pdf2image.exceptions import PDFPageCountError

    # Check if a file is part of the request
    if 'file' not in request.files:
        return jsonify(message='No file part in the request'), 400

    file = request.files['file']

    # Check if a file was submitted (covers both '' and a missing name)
    if not file.filename:
        return jsonify(message='No file selected'), 400

    # The client controls the submitted name. Keep only its last path
    # component (treating backslashes as separators too) so that "../" or
    # absolute paths cannot escape the upload folder.
    filename = os.path.basename(file.filename.replace('\\', '/'))
    if not allowed_file(filename):
        return jsonify(message='File type not allowed'), 400

    # The folder is otherwise created only under the __main__ guard, which
    # is skipped when this module is imported rather than run as a script.
    upload_dir = app.config['UPLOAD_FOLDER']
    os.makedirs(upload_dir, exist_ok=True)

    # Save the file to the upload folder
    file_path = os.path.join(upload_dir, filename)
    file.save(file_path)

    # Ask poppler for the page count instead of rasterizing every page at
    # 300 dpi just to count them; analyze_pdf does its own conversion.
    try:
        page_count = pdfinfo_from_path(file_path)["Pages"]
    except PDFPageCountError:
        return jsonify(message='The uploaded file could not be read as a PDF'), 400

    # Report which side of the 150-page threshold the document falls on
    if page_count >= 150:
        response_message = "The document has 150 or more pages."
    else:
        response_message = "The document has fewer than 150 pages."

    # Analyze the uploaded PDF
    results = analyze_pdf(file_path)

    # Return the analysis result
    return jsonify(message=response_message, analysis_results=results), 200

# Function to analyze the PDF file
def analyze_pdf(pdf_path):
    """Run the deepdoctection analyzer once per page of a PDF.

    Every page is rasterized at 300 dpi and written as a PNG into a
    temporary directory that is removed before the function returns, so
    repeated or concurrent calls neither pile up ``page_N.png`` files in the
    working directory nor overwrite each other's files.

    Args:
        pdf_path: Path of the PDF file to analyze.

    Returns:
        A list holding one analyzer result per page, in page order.
    """
    import tempfile

    # Convert PDF to a list of images (one per page)
    pages = convert_from_path(pdf_path, dpi=300)
    results = []

    # Build the analyzer once and reuse it for every page
    analyzer = dd.get_dd_analyzer(config_overwrite=["LANGUAGE='eng'"])

    with tempfile.TemporaryDirectory(prefix="analyze_pdf_") as scratch_dir:
        for i, page_image in enumerate(pages):
            # Save the page image under a per-call scratch path
            page_image_path = Path(scratch_dir) / f"page_{i}.png"
            page_image.save(page_image_path)

            # Analyze the page.
            # NOTE(review): the analyzer is given the whole PDF plus a page
            # number, not page_image_path, so the PNG above is currently
            # unused. Confirm against the deepdoctection documentation that
            # analyze() honors page_number and what type it returns.
            analysis_result = analyzer.analyze(path=pdf_path, page_number=i+1)
            results.append(analysis_result)
            print(f"Analysis result for page {i}:", analysis_result)

            # Additional checks for abstract and font size.
            # NOTE(review): the membership tests and key lookups below assume
            # a dict-like result with 'font_size' and 'abstract' keys - TODO
            # confirm that the analyzer result actually has this shape.
            if 'font_size' in analysis_result:
                font_size = analysis_result['font_size']
                print(f"Page {i} font size:", font_size)

                if font_size != 12:
                    print(f"Warning: Page {i} does not have the correct font size. Expected 12pt.")
            else:
                print(f"Font size information not available for page {i}.")

            if 'abstract' in analysis_result:
                abstract_content = analysis_result['abstract']
                # A non-string abstract is treated as a sequence of strings
                # and measured after joining its parts with single spaces.
                abstract_length_chars = len(abstract_content) if isinstance(abstract_content, str) else len(" ".join(abstract_content))
                print(f"Abstract length: {abstract_length_chars} characters")

    return results

# Run the Flask app
if __name__ == '__main__':
    # Make sure uploads have a directory to land in before serving requests
    if not Path(UPLOAD_FOLDER).exists():
        Path(UPLOAD_FOLDER).mkdir(parents=True)

    # Start the Flask server on port 3000
    app.run(port=3000)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 1s (310 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123597 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting deepdoctection
  Downloading deepdoctection-0.33-py3-none-any.whl.metadata (19 kB)
Collecting jsonlines==3.1.0 (from deepdoctection)
  Downloading jsonlines-3.1.0-py3-none-any.wh

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (5,595 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123627 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
[0912 11:13.12 @utils.py:161]  INF  NumExpr defaulting to 2 threads.
[0912 11:13.12 @env_info.py:449]  WRN  Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different behaviour, set DD_USE_TORCH to None before importing deepdoctection.


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:3000
[0912 11:13.13 @_internal.py:97]  INF  Press CTRL+C to quit
