In [376]:
import os
import shutil
import easyocr
from pdf2image import convert_from_path
from PyPDF2 import PdfReader, PdfWriter
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import numpy as np
from PyPDF2 import PdfReader, PdfWriter
import warnings
from tqdm import tqdm

In [377]:
# Root of the generated output tree. It is rebuilt from scratch on every run.
base_dir = 'lesson_plans'

# One sub-folder per classroom is created inside every date folder.
classrooms = ['Orchard', 'Grove', 'Thicket']

# Start from a clean slate so PDFs from a previous run never linger.
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
    print(f"Removed existing directory: {base_dir}")

os.makedirs(base_dir)
print(f"Created directory: {base_dir}")

# Input tree: every sub-directory of the dump is treated as one date.
file_dump_directory = 'file_dump/'

files = os.listdir(file_dump_directory)

# Keep directories only and skip hidden entries (.DS_Store and friends).
# A stray file here would otherwise get an output tree of its own and then
# break any later os.listdir() call made on it. Sorting makes the processing
# order deterministic, because os.listdir() returns entries in arbitrary order.
date_list = sorted(
    entry
    for entry in files
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(file_dump_directory, entry))
)

# Mirror the date folders under base_dir, each with one folder per classroom.
for date in date_list:
    date_directory = os.path.join(base_dir, date)
    os.makedirs(date_directory)
    for classroom in classrooms:
        classroom_directory = os.path.join(date_directory, classroom)
        os.makedirs(classroom_directory)

Removed existing directory: lesson_plans
Created directory: lesson_plans


In [378]:
# Suppress the specific FutureWarning related to torch.load in EasyOCR
warnings.filterwarnings("ignore", category=FutureWarning, message=".*torch.load*")

# Crop regions (left, upper, right, lower), in pixels of the rendered page.
# They are only valid for pages rasterised at _RENDER_DPI.
_SECTION_1_CROP_BOX = (810, 111, 995, 175)
_SECTION_2_CROP_BOX = (180, 380, 1220, 480)

# Resolution used to rasterise pages; 200 is pdf2image's documented default,
# spelled out here because the crop boxes above depend on it.
_RENDER_DPI = 200

# File-name markers that identify the classroom a leader guide belongs to.
# Checked in insertion order.
_CLASSROOM_BY_MARKER = {'_ok': 'Orchard', '_op': 'Grove', '_yp': 'Thicket'}

# Label used in the file name of the second PDF produced for each classroom.
_HANDOUT_LABEL_BY_CLASSROOM = {
    'Orchard': 'Small Group Questions',
    'Grove': 'Parent Handout',
    'Thicket': 'Parent Handout',
}

# Section-1 texts whose pages are removed from the leader guide.
_SECTION_1_TEXTS_TO_DROP = ('welcome', 'live', 'love')

# Section-1 text that marks a page belonging to the handout.
_HANDOUT_SECTION_1_TEXT = 'love'

# Lazily created EasyOCR reader shared by every do_magic() call.
_OCR_READER = None


def _get_ocr_reader():
    """Return the shared EasyOCR reader, creating it on first use."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['en'])
    return _OCR_READER


def _classroom_from_path(pdf_path):
    """Map the marker found in the PDF's file name to a classroom name."""
    file_name = os.path.basename(pdf_path)
    for marker, classroom in _CLASSROOM_BY_MARKER.items():
        if marker in file_name:
            return classroom
    raise ValueError(
        f"Cannot determine classroom for {pdf_path!r}: file name contains none of "
        f"{list(_CLASSROOM_BY_MARKER)}"
    )


def _first_ocr_line(ocr_lines):
    """Return the first OCR result normalised for comparison, or '' if none."""
    return ocr_lines[0].strip().lower() if ocr_lines else ''


def _write_selected_pages(pdf_reader, page_numbers, output_path):
    """Write the given pages of pdf_reader, in order, to output_path."""
    pdf_writer = PdfWriter()
    for page_number in page_numbers:
        pdf_writer.add_page(pdf_reader.pages[page_number])
    with open(output_path, "wb") as output_pdf:
        pdf_writer.write(output_pdf)


def do_magic(pdf_path, date, output_root='lesson_plans'):
    """Split a leader-guide PDF into a trimmed leader guide and a handout.

    Every page is rasterised and two fixed regions are OCR'd. Based on the
    first text line found in each region:

    * pages whose section 1 reads 'prepare' and section 2 reads
      'gather supplies', and pages whose section 1 reads 'welcome', 'live'
      or 'love', are left out of the leader guide;
    * pages whose section 1 reads 'love' make up the handout.

    Both PDFs are written to ``<output_root>/<date>/<classroom>/``.

    Args:
        pdf_path: Path of the source leader-guide PDF. Its file name must
            contain '_ok', '_op' or '_yp' to select the classroom.
        date: Name of the date folder; also embedded in the output file names.
        output_root: Root of the output tree.

    Raises:
        ValueError: If the file name carries no known classroom marker.
    """
    # Resolve the classroom first so a bad file name fails before the slow OCR pass.
    classroom = _classroom_from_path(pdf_path)

    page_images = convert_from_path(pdf_path, dpi=_RENDER_DPI)
    ocr_reader = _get_ocr_reader()

    pages_to_delete = set()
    parent_handout_pages = []

    for page_num, page_image in enumerate(page_images):
        # EasyOCR takes numpy arrays; detail=0 returns only the recognised text.
        section_1_text = _first_ocr_line(
            ocr_reader.readtext(np.array(page_image.crop(_SECTION_1_CROP_BOX)), detail=0)
        )
        section_2_text = _first_ocr_line(
            ocr_reader.readtext(np.array(page_image.crop(_SECTION_2_CROP_BOX)), detail=0)
        )

        if section_1_text == 'prepare' and section_2_text == 'gather supplies':
            pages_to_delete.add(page_num)

        if section_1_text in _SECTION_1_TEXTS_TO_DROP:
            pages_to_delete.add(page_num)

        if section_1_text == _HANDOUT_SECTION_1_TEXT:
            parent_handout_pages.append(page_num)

    pdf_reader = PdfReader(pdf_path)

    classroom_directory = os.path.join(output_root, date, classroom)
    os.makedirs(classroom_directory, exist_ok=True)

    # Leader guide: every page that was not flagged for removal.
    leader_guide_pages = [
        page_num for page_num in range(len(pdf_reader.pages))
        if page_num not in pages_to_delete
    ]
    _write_selected_pages(
        pdf_reader,
        leader_guide_pages,
        os.path.join(classroom_directory, f"{classroom} - Leader Guide - {date}.pdf"),
    )

    # Handout: only the flagged pages. An empty result is still written, but it
    # is reported so that a failed OCR match does not go unnoticed.
    handout_label = _HANDOUT_LABEL_BY_CLASSROOM[classroom]
    if not parent_handout_pages:
        warnings.warn(
            f"No handout page detected in {pdf_path!r}; "
            f"the {handout_label} PDF for {classroom} will be empty"
        )
    _write_selected_pages(
        pdf_reader,
        parent_handout_pages,
        os.path.join(classroom_directory, f"{classroom} - {handout_label} - {date}.pdf"),
    )



In [379]:
def move_activity_pages(pdf_path, date, output_root='lesson_plans'):
    """Publish an activity PDF as Grove's activity sheet and Thicket's color sheet.

    The complete PDF is written to
    ``<output_root>/<date>/Grove/Grove - Activity Sheet - <date>.pdf`` and its
    second page alone is written to
    ``<output_root>/<date>/Thicket/Thicket - Color Sheet - <date>.pdf``.

    Args:
        pdf_path: Path of the source activity-pages PDF.
        date: Name of the date folder; also embedded in the output file names.
        output_root: Root of the output tree.

    Raises:
        IndexError: If the PDF has fewer than two pages, so there is no page
            to use as the color sheet.
    """
    # Zero-based index of the page that becomes Thicket's color sheet.
    color_sheet_page_index = 1

    source_pdf = PdfReader(pdf_path)
    page_count = len(source_pdf.pages)

    # Check before writing anything so a short PDF does not leave partial output.
    if page_count <= color_sheet_page_index:
        raise IndexError(
            f"{pdf_path!r} has {page_count} page(s); page "
            f"{color_sheet_page_index + 1} is required for the color sheet"
        )

    grove_directory = os.path.join(output_root, date, 'Grove')
    thicket_directory = os.path.join(output_root, date, 'Thicket')
    os.makedirs(grove_directory, exist_ok=True)
    os.makedirs(thicket_directory, exist_ok=True)

    # Grove receives every page.
    activity_writer = PdfWriter()
    for page in source_pdf.pages:
        activity_writer.add_page(page)
    activity_path = os.path.join(grove_directory, f"Grove - Activity Sheet - {date}.pdf")
    with open(activity_path, "wb") as output_pdf:
        activity_writer.write(output_pdf)

    # Thicket receives only the color-sheet page.
    color_writer = PdfWriter()
    color_writer.add_page(source_pdf.pages[color_sheet_page_index])
    color_path = os.path.join(thicket_directory, f"Thicket - Color Sheet - {date}.pdf")
    with open(color_path, "wb") as output_pdf:
        color_writer.write(output_pdf)
        
    

In [380]:
def move_activity_orchard(pdf_path, date, output_root='lesson_plans'):
    """Publish an activity PDF as Orchard's parent handout.

    Every page of the source PDF is re-written to
    ``<output_root>/<date>/Orchard/Orchard - Parent Handout - <date>.pdf``.

    Args:
        pdf_path: Path of the source activity-pages PDF.
        date: Name of the date folder; also embedded in the output file name.
        output_root: Root of the output tree.
    """
    source_pdf = PdfReader(pdf_path)

    handout_writer = PdfWriter()
    for page in source_pdf.pages:
        handout_writer.add_page(page)

    orchard_directory = os.path.join(output_root, date, 'Orchard')
    os.makedirs(orchard_directory, exist_ok=True)

    handout_path = os.path.join(orchard_directory, f"Orchard - Parent Handout - {date}.pdf")
    with open(handout_path, "wb") as output_pdf:
        handout_writer.write(output_pdf)


In [381]:
# Minimum number of source files a date folder should contain.
# NOTE(review): the threshold of 6 is carried over unchanged; confirm which
# files make up the expected set.
EXPECTED_FILES_PER_DATE = 6

# Route every source PDF of every date to the function that handles its type.
for date in tqdm(date_list):
    date_dump_directory = os.path.join(file_dump_directory, date)
    final_file_directory = os.listdir(date_dump_directory)
    final_file_list = [file for file in final_file_directory if file != '.DS_Store']

    if len(final_file_list) < EXPECTED_FILES_PER_DATE:
        # tqdm.write prints without corrupting the progress bar.
        tqdm.write(
            f"Warning: {date} contains {len(final_file_list)} file(s); "
            f"expected at least {EXPECTED_FILES_PER_DATE}"
        )

    for file in final_file_list:
        pdf_path = os.path.join(date_dump_directory, file)
        # Files matching none of the patterns below are intentionally ignored.
        if 'leader_guide' in file:
            do_magic(pdf_path=pdf_path, date=date)
        elif 'yp_activity_pgs' in file:
            move_activity_pages(pdf_path=pdf_path, date=date)
        elif 'ok_activity_pages' in file:
            move_activity_orchard(pdf_path=pdf_path, date=date)

100%|██████████| 3/3 [00:48<00:00, 16.11s/it]
