In [153]:
import pdfplumber

with pdfplumber.open('FundamentalsofCooking10.pdf') as pdf:
    for page in pdf.pages:
        # Exploratory preview: dump the text layer of every page so the
        # layout (headers, section numbering, glossary) can be inspected.
        # Table extraction was dropped from this cell because its result was
        # never used here.
        print(page.extract_text())

Textbook of
FUNDAMENTALS OF
COOKING
Grade X
National Vocational & Technical Training Commission (NAVTTC)
Textbook of
Fundamentals of Cooking
Grade –X
National Vocational and Technical Training commission
H-9, Islamabad
i
Author: Ms. Mariyam Usman, Senior food Preparation and Culinary Arts – Patisserie
Principle Trainer and Owner of Business ANAYA’s PATISSERIE
Reviewers:
1. Engr Abdul Maqsood, DACUM Facilitator
2. Mr. Saifullah Butt, Trainer
3. Mr. Umer Farooq, Instructor
4. Miss Farhana Ahmad Hashmi, Food consultant
5. Miss Zoha Ameen, Instructress
Designing: Gul Awan Printers, Blue Area, Islamabad.
Edition: Test Edition, 2022
ISBN:
Publishers: National Vocational & Technical Training Commission H-9, Islamabad.
Website: www.navttc.gov.pk,
All rights are preserved with the National Vocational and Technical Training Commission.
No part of this book can be copied, translated, reproduced or used for guide books, key notes,
helping books etc. without permission of NAVTTC.
ii
PREFACE
This bo

In [2]:
import pdfplumber
import pandas as pd
import numpy as np
import re
import json

## cooking methods

In [155]:
# One DataFrame per table found in the PDF, in page order.
all_tables = []

with pdfplumber.open('cooking methods.pdf') as pdf:
    for page in pdf.pages:
        # extract_tables() gives one list-of-rows per table; an empty list
        # simply means the loop body never runs for this page.
        for table in page.extract_tables():
            # Flatten in-cell line breaks and replace the U+F0B7 glyph
            # (presumably the PDF's bullet symbol) with a space. Empty cells
            # come back as None and are left untouched.
            table = [
                [txt.replace("\n", " ").replace("\uf0b7", " ") if txt is not None else txt
                 for txt in row]
                for row in table
            ]
            # No header inference here: the first row stays a data row and
            # columns are labelled 0..n-1 so tables can be stacked.
            df = pd.DataFrame(table)
            all_tables.append(df)

# Stack all tables vertically; tables with fewer columns are padded with NaN.
combined_df = pd.concat(all_tables, ignore_index=True, sort=False)
# Kept from the original; presumably normalises remaining None cells to NaN.
combined_df = combined_df.fillna(np.nan)

In [156]:
# Left shift for rows whose cells were extracted one column too far right.
# Rows where column 2 is empty but column 3 has text are treated as shifted.
shifted = combined_df[2].isna() & combined_df[3].notna()
combined_df[2] = combined_df[2].fillna(combined_df[3])
# Blank the cell that was just moved into column 2 before back-filling from
# column 4; otherwise the same text would appear in both column 2 and
# column 3 and the real column-4 value would be lost.
combined_df[3] = combined_df[3].mask(shifted).fillna(combined_df[4])
combined_df = combined_df.drop(columns=4)

In [157]:
# Name the four columns, then discard the first extracted row.
combined_df.columns = [
    "COOKING METHODS",
    "DESCRIPTION",
    "MERITS",
    "DEMERITS",
]
combined_df = combined_df.iloc[1:]

In [158]:
# Combine rows that were separated by a page break.
# A row with an empty "COOKING METHODS" cell is treated as the continuation of
# the closest preceding row that does have a method name.
combined_df = combined_df.fillna('')
rows_to_be_combined = []
merge_target = None  # index label of the last row that has a method name
for row in combined_df.index:
    if combined_df.loc[row, "COOKING METHODS"] != '':
        merge_target = row
        continue
    if merge_target is None:
        # Continuation row with nothing before it: leave it in place instead
        # of failing on a non-existent previous row.
        continue
    # Merge into the last *real* row rather than into label `row - 1`, so a
    # run of several continuation rows is not appended to a row that is
    # itself about to be dropped.
    for col in combined_df.columns:
        combined_df.loc[merge_target, col] = (
            combined_df.loc[merge_target, col] + combined_df.loc[row, col]
        ).strip()
    rows_to_be_combined.append(row)
combined_df = combined_df.drop(rows_to_be_combined, axis=0)

In [159]:
# Add a "TYPE OF METHOD" column.
# Rows with an empty DESCRIPTION are category headings: their
# "COOKING METHODS" text is carried forward onto the rows below them, after
# which the heading rows themselves are removed.
type_of_method_idx_mask = combined_df["DESCRIPTION"] == ''
type_of_method_idx = list(combined_df.index[type_of_method_idx_mask])
combined_df['TYPE OF METHOD'] = (
    combined_df['COOKING METHODS']
    .where(type_of_method_idx_mask)  # NaN everywhere except heading rows
    .ffill()
)
combined_df = combined_df.drop(type_of_method_idx, axis=0)

In [160]:
# Write the cleaned table to disk. The row index is written as the first CSV
# column (pandas default, made explicit here).
combined_df.to_csv(path_or_buf="cooking_methods.csv", index=True)

## fundamentals of cooking

In [195]:
def extract_ingredients(df):
    """Turn a recipe-card table into a dict.

    Cells are read by position:
      * row 0: dish name (col 1), preparation time (col 2), cooking time (col 3)
      * row 1: portions (col 1), unit size (col 3)
      * rows after the one whose first cell is 'Items': two item/quantity
        pairs per row, in columns (0, 1) and (2, 3)

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least four columns; blank cells may be '' or NaN.

    Returns
    -------
    dict
        Keys 'dish_name', 'prep_time', 'cooking_time', 'portions',
        'unit_size' and 'ingredients' (a list of {'item', 'quantity'} dicts,
        left pair before right pair for each row). 'ingredients' is empty
        when no 'Items' row exists. A blank quantity is returned as ''.
    """

    def _clean(value):
        # Blank cells are NaN at this point; calling .replace on them would
        # raise AttributeError, so map them to an empty string instead.
        if pd.isna(value):
            return ""
        return str(value).replace("\n", " ").replace("\\", " ")

    recipe = {
        'dish_name': df.iloc[0, 1].replace("\n", " "),
        'prep_time': df.iloc[0, 2].replace("\n", " "),
        'cooking_time': df.iloc[0, 3].replace("\n", " "),
        'portions': df.iloc[1, 1],
        'unit_size': df.iloc[1, 3],
        'ingredients': []
    }

    # Locate the 'Items' header row by *position*, so the slice below is
    # correct even when the frame's index does not start at 0.
    is_items_row = (df.iloc[:, 0] == 'Items').tolist()
    if True in is_items_row:
        table_start = is_items_row.index(True) + 1

        body = df.iloc[table_start:, :].copy()
        body.columns = range(body.shape[1])
        # Keep only rows that name an item in both the left and right pair.
        body = body.replace('', np.nan).dropna(subset=[0, 2])

        for _, row in body.iterrows():
            for item_col, quantity_col in ((0, 1), (2, 3)):
                recipe['ingredients'].append({
                    'item': _clean(row[item_col]),
                    'quantity': _clean(row[quantity_col])
                })

    return recipe

In [196]:
def get_info(df):
    """Collapse adjacent column pairs into single columns and promote row 0 to header.

    Columns are consumed two at a time (0-1, 2-3, ...). Within a pair, gaps in
    the left column are filled from the right column and line breaks in the
    text are replaced by spaces. A trailing unpaired column is ignored.

    Returns a DataFrame whose column names are the first merged row and whose
    rows are the remaining merged rows.
    """
    merged_columns = {}
    # Stopping at len - 1 guarantees every `left` has a right-hand neighbour.
    for pair_number, left in enumerate(range(0, len(df.columns) - 1, 2)):
        primary = df.iloc[:, left]
        fallback = df.iloc[:, left + 1]
        merged_columns[pair_number] = (
            primary.fillna(fallback).str.replace("\n", " ").tolist()
        )

    res = pd.DataFrame(merged_columns)

    # First merged row carries the list titles: use it as the header.
    res.columns = res.iloc[0, :]
    return res[1:]

In [197]:
def handle_table(table, i):

    df = (pd.DataFrame(table)
        .map(lambda x: np.nan if x in ["", None] else x)
        .dropna(how="all", axis=0)
        .dropna(how="all", axis=1))
    df = df.reset_index(drop=True)
    df.columns = range(len(df.columns))
    
    if df.shape == (1,1) and df.iloc[0, 0]:
        return

    # template one: recipe ingredient
    if df.iloc[0,0] == "Name of dish":
        res = extract_ingredients(df)
        # result is json
        dish_name = res["dish_name"].replace(" ", "_")
        with open(f"foc/foc_ingredients_{dish_name}.json", "w") as f:
            json.dump(res, f, indent=2, ensure_ascii=False)
        return

    # template two: 2 lists
    elif pd.isna(df.iloc[0, 0]) and pd.isna(df.iloc[1, 1]):
        res = get_info(df)
        res.to_csv(f"foc/foc_table_{i}.csv")
        return

    # rest will be considered in text
    return

In [198]:
def _parse_numbered_sections(text):
    """Return {heading title: flattened body text} for every numbered heading in text."""
    # A heading is a dotted number ("1.2", "3.4.1") followed by the rest of its line.
    pattern = r'(\d+(?:\.\d+)+)([^\n]+)'
    matches = list(re.finditer(pattern, text))

    # A section body is cut at the first of these words (case-insensitive).
    stop_keywords = r'\b(Exercise|Activity|Teacher\'s guide|Glossary)\b'

    sections = {}
    for i, match in enumerate(matches):
        section_title = match.group(2).strip()

        # The body runs up to the next heading, or to the end of the text.
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        content = text[match.end():end]

        stop_match = re.search(stop_keywords, content, re.IGNORECASE)
        if stop_match:
            content = content[:stop_match.start()]

        # Flatten to a single line and collapse runs of spaces.
        content = content.strip().replace('\n', ' ')
        content = re.sub(r' {2,}', ' ', content)

        if content:
            # Titles are the dict keys: a repeated title overwrites the earlier one.
            sections[section_title] = content
    return sections


def _parse_glossary(glossary_text):
    """Return {term: definition} parsed from 'Term - definition' entries."""
    # NOTE(review): `[--]` matches only the ASCII hyphen. If the glossary in
    # the PDF uses en/em dashes this will find nothing - confirm against the
    # source document before widening the class.
    glossary_pattern = r'([A-Z][A-Za-z\s]+?)\s*[--]\s*((?:(?![A-Z][a-z]+\s*[--]).)+)'

    glossary = {}
    for match in re.finditer(glossary_pattern, glossary_text, re.DOTALL):
        term = match.group(1).strip()
        definition = match.group(2).strip().replace('\n', ' ')
        glossary[term] = re.sub(r' {2,}', ' ', definition)
    return glossary


def handle_text(text):
    """Split the book text into numbered sections plus a glossary and save them.

    Steps:
      1. Remove page-header lines of the form "<digits>FUNDAMENTALS OF COOKING".
      2. Collect every numbered heading and the text that follows it.
      3. If the word "Glossary" occurs, parse everything after its first
         occurrence into term/definition pairs stored under the 'Glossary' key.
      4. Write the result to foc/foc_sections.json (UTF-8).

    Parameters
    ----------
    text : str
        Concatenated page text.

    Returns
    -------
    dict
        The same mapping that is written to disk.
    """
    # Remove page headers like "1FUNDAMENTALS OF COOKING", "2FUNDAMENTALS OF COOKING", etc.
    text = re.sub(r'^\d+FUNDAMENTALS OF COOKING\s*$', '', text, flags=re.MULTILINE)

    sections = _parse_numbered_sections(text)

    glossary_match = re.search(r'\bGlossary\b', text, re.IGNORECASE)
    if glossary_match:
        glossary = _parse_glossary(text[glossary_match.end():].strip())
        if glossary:
            sections['Glossary'] = glossary

    with open("foc/foc_sections.json", "w", encoding='utf-8') as f:
        json.dump(sections, f, indent=2, ensure_ascii=False)

    return sections

In [200]:
all_tables = []
all_text = ""

with pdfplumber.open('FundamentalsofCooking10.pdf') as pdf:
    i = 0  # running table number across the whole document
    page_texts = []
    for page in pdf.pages:
        # Tables are classified and written to disk one by one.
        for table in page.extract_tables():
            handle_table(table, i)
            i += 1

        # Older pdfplumber releases return None for pages without text;
        # treat that as an empty page instead of failing on .split().
        text = page.extract_text() or ""

        # Only pages whose first three words are the running header are kept.
        if " ".join(text.split()[:3]) == "FUNDAMENTALS OF COOKING":
            page_texts.append(text)

    # Pages are joined with NO separator, exactly as before. handle_text
    # strips headers of the form "<digits>FUNDAMENTALS OF COOKING", which
    # presumably arise from a page-ending number fused to the next header.
    all_text = "".join(page_texts)

    # saves section json
    handle_text(all_text)
