In [1]:
# Required Libraries
import os
import shutil
import subprocess
from pathlib import Path

from google.colab import drive
from IPython import display

In [2]:
# Download Facebook's Nougat OCR Model
!pip install git+https://github.com/facebookresearch/nougat
display.clear_output()

In [3]:
######
# Mount Google Drive, Unzip Dataset, and Sync Markdown Files
######
# Mount Google Drive
drive.mount('/content/drive')

# Path to the zip file in Google Drive and the extraction directory
dataset_zip_path = '/content/drive/My Drive/Colab Notebooks/ML Project/ML-Textbook-Dataset.zip'
extracted_dataset_path = '/content/ML-Textbook-Dataset'

# Check if the dataset directory already exists
if not os.path.exists(extracted_dataset_path):
    # If not, unzip the dataset into the specified directory
    !unzip "$dataset_zip_path" -d "$extracted_dataset_path"
else:
    print("Dataset already extracted.")

# Google Drive directory where the existing Markdown files are stored
drive_markdown_dir = '/content/drive/My Drive/Colab Notebooks/ML Project/Markdown/'

# Local directory in Colab to store the Markdown files
local_markdown_dir = '/content/OCR/'

# Ensure the local OCR directory exists
os.makedirs(local_markdown_dir, exist_ok=True)

# Sync the Markdown files from Google Drive to the local OCR directory
!rsync -a --info=progress2 "$drive_markdown_dir" "$local_markdown_dir"

Mounted at /content/drive
Archive:  /content/drive/My Drive/Colab Notebooks/ML Project/ML-Textbook-Dataset.zip
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/2307.12008 - LK99 Room Temperature Superconductor.pdf  
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/Android Apprentice (Fourth Edition).pdf  
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/Android Programming The Big Nerd Ranch Guide (3rd Edition).pdf  
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/Command-Line Rust - A Project-Based Primer for Writing Rust CLIs.pdf  
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/Computer Organization and Design - MIPS Edition.pdf  
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/Computer Science - Education Perspectives.pdf  
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/Cracking C Programming Interview.pdf  
  inflating: /content/ML-Textbook-Dataset/ML-Textbook-Dataset/Cracking Job Inter

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): this repeats the import and the mount already done in the
# setup cells earlier in the notebook; presumably it is kept so Drive can be
# re-attached on its own before the OCR cell is run — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
######
# Process Textbooks with Nougat OCR and Update Log File
######
# For every PDF in the dataset directory that has no OCR output yet, run the
# Nougat command-line tool, mirror its console output into the Drive log file,
# and copy the resulting .mmd file to Google Drive.

# Directory containing textbooks in Colab
textbooks_dir = "/content/ML-Textbook-Dataset/ML-Textbook-Dataset/"

# Local directory in Colab to store the Markdown files
local_markdown_dir = '/content/OCR/'

# Google Drive directory to save Markdown files and log file
drive_markdown_dir = "/content/drive/My Drive/Colab Notebooks/ML Project/Markdown/"
drive_log_file = "/content/drive/My Drive/Colab Notebooks/ML Project/Markdown/nougat_ocr_log.txt"

# Make sure the output directory exists even when this cell is run on its own
os.makedirs(local_markdown_dir, exist_ok=True)

# Open the log file in append mode; this handle is the only writer of the log,
# so entries appear in the order in which they happen
with open(drive_log_file, 'a') as log_file_writer:
    log_file_writer.write("Starting OCR process...\n")

    # Iterate over the files in a stable (sorted) order so runs are comparable
    for file in sorted(os.listdir(textbooks_dir)):
        if not file.endswith(".pdf"):
            continue

        pdf_path = os.path.join(textbooks_dir, file)
        # The same ".mmd" path is used for the skip check and for the
        # "was it created?" check after the run, so the two cannot disagree
        markdown_file_name = os.path.splitext(file)[0] + ".mmd"
        local_markdown_file_path = os.path.join(local_markdown_dir, markdown_file_name)

        # Skip books whose MMD file already exists locally
        if os.path.exists(local_markdown_file_path):
            print(f"Skipping '{file}' as MMD file already exists.")
            log_file_writer.write(f"Skipped processing of '{file}' as MMD file already exists.\n")
            continue

        print(f"Processing '{file}'...")
        # Persist what has been logged so far before the long-running OCR starts
        log_file_writer.flush()

        # Arguments are passed as a list (no shell), so file names containing
        # quotes, "$" or backticks cannot break or alter the command.
        # NOTE(review): the literal "pdf" token is carried over unchanged from
        # the original command; it looks like the placeholder from the usage
        # string rather than a real argument — confirm with `nougat --help`.
        nougat_command = ["nougat", "--markdown", "pdf", pdf_path, "--out", local_markdown_dir]

        exit_code = None
        try:
            with subprocess.Popen(
                nougat_command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # capture errors together with normal output
                text=True,
                errors="replace",
            ) as nougat_process:
                # Show Nougat's output in the notebook and append it to the log
                for output_line in nougat_process.stdout:
                    print(output_line, end="")
                    log_file_writer.write(output_line)
            exit_code = nougat_process.returncode
        except OSError as launch_error:
            # Raised when the nougat executable cannot be started at all
            print(f"Could not run nougat for '{file}': {launch_error}")
            log_file_writer.write(f"Could not run nougat for '{file}': {launch_error}\n")

        # Copy the Markdown file to Google Drive if it was created
        if os.path.exists(local_markdown_file_path):
            shutil.copy(local_markdown_file_path, drive_markdown_dir)
            log_file_writer.write(f"Processed and saved Markdown for '{file}'.\n")
        else:
            print(f"Failed to process '{file}' (exit code: {exit_code}).")
            log_file_writer.write(f"Failed to process '{file}'.\n")

Processing 'Python Practice Tests-Interview Questions.pdf'...
