In [44]:
import re
import fitz 
from pathlib import Path
import pandas as pd

In [45]:
# Input directory, resolved to an absolute path.
interim_dir = Path("../data/sample/interim").resolve()

# rglob yields entries in filesystem order, which is not guaranteed to be stable
# across machines or runs. Sorting keeps `pdf_filepaths[0]` and the row order of
# anything built from this list reproducible.
pdf_filepaths = sorted(p.resolve() for p in interim_dir.rglob("*.pdf"))

In [46]:
def extract_text_from_main_page(pdf, current_row):
    """Read the header fields from the first page of ``pdf`` into ``current_row``.

    Args:
        pdf: Indexable document whose item 0 exposes ``get_text()``.
        current_row: Dict that is updated in place with the keys "Int. No",
            "Location", "Date", "Signal ID", "GOMS", "Audio Tactile" and "GMP".

    Regex-backed fields are set to ``None`` when their pattern does not match.
    The two flags are "Yes"/"No" depending on whether the keyword appears
    anywhere in the page text (case-insensitive).
    """
    main_page = pdf[0].get_text()
    lowered_page = main_page.lower()

    def first_capture(pattern):
        # Stripped group 1 of the first match, or None when nothing matches.
        found = re.search(pattern, main_page)
        if found is None:
            return None
        return found.group(1).strip()

    def keyword_flag(keyword):
        # Plain substring test against the lower-cased page text.
        if keyword in lowered_page:
            return "Yes"
        return "No"

    extracted = {
        "Int. No": first_capture(r"Int\.?\s*No[:\s]*([^\n]+?)\s{2,}"),
        "Location": first_capture(r"Location:\s*([^\n]+?)\s{2,}"),
        "Date": first_capture(r"Date:\s*([0-9\s/]+?)\s{2,}"),
        "Signal ID": first_capture(r"Signal ID[:\s]*([0-9]+)"),
        "GOMS": first_capture(r"GOMS[:\s]*([0-9\-]+)"),
        "Audio Tactile": keyword_flag("audio tactile"),
        "GMP": keyword_flag("gm+"),
    }

    # Write the fields into the caller's row in the same order as before.
    for column, value in extracted.items():
        current_row[column] = value

In [47]:
def _section_has_entries(section, headers, ignored_prefix=None):
    """Return "Yes" if ``section`` has a non-blank line that is not a header, else "No".

    Lines starting with ``ignored_prefix`` (when given) are skipped as well.
    """
    for raw_line in section.strip().splitlines():
        line = raw_line.strip()
        if not line or line in headers:
            continue
        if ignored_prefix is not None and line.startswith(ignored_prefix):
            continue
        return "Yes"
    return "No"


def extract_text_from_intergreen_page(pdf, current_row):
    """Record special-facility, pre-emption and walking-speed data in ``current_row``.

    The first page whose text contains
    "INTERGREEN, PEDESTRIAN TIMES AND SPECIAL FUNCTIONS" is used.

    Args:
        pdf: Iterable of pages exposing ``get_text()``.
        current_row: Dict updated in place with the keys "Special Facility",
            "Preemption" and "Pedestrian Walking Speed".

    Values written:
        * "Special Facility": "Yes"/"No" for whether the text between
          "SPECIAL FACILITIES" and "PRE-EMPTION" has any non-header line.
        * "Preemption": "Yes"/"No" for whether the text after "PRE-EMPTION" has
          any line that is neither a header nor starts with "Pg.".
        * "Pedestrian Walking Speed": first number following the label on the
          same line, as a string.

    A value is ``None`` when it cannot be determined, i.e. the page, the section
    marker or the number is missing. Previously a missing page raised
    UnboundLocalError and a missing marker raised IndexError.
    """
    intergreen_page = None
    for page in pdf:
        text = page.get_text()
        if "INTERGREEN, PEDESTRIAN TIMES AND SPECIAL FUNCTIONS" in text:
            # Reuse the text already extracted instead of extracting it again.
            intergreen_page = text
            break

    special_facility = None
    preemption = None
    pedestrian_walking_speed = None

    if intergreen_page is not None:
        special_parts = intergreen_page.split("SPECIAL FACILITIES")
        if len(special_parts) > 1:
            special_section = special_parts[1].split("PRE-EMPTION")[0]
            special_facility = _section_has_entries(
                special_section,
                {"SIGNAL GROUP", "HOUR", "MINUTE", "SECOND", "FUNCTION"},
            )

        preemption_parts = intergreen_page.split("PRE-EMPTION")
        if len(preemption_parts) > 1:
            preemption = _section_has_entries(
                preemption_parts[1],
                {"SIGNAL GROUP", "PHASE", "FUNCTION", "REMARKS"},
                ignored_prefix="Pg.",
            )

        speed_match = re.search(
            r"Pedestrian Walking Speed:.*?([0-9]+(?:\.[0-9]+)?)", intergreen_page
        )
        if speed_match:
            pedestrian_walking_speed = speed_match.group(1)

    # Always write all three keys so every row has the same columns.
    current_row["Special Facility"] = special_facility
    current_row["Preemption"] = preemption
    current_row["Pedestrian Walking Speed"] = pedestrian_walking_speed


In [48]:
def extract_text_from_timesetting_page(pdf, current_row):
    """Copy the controller time-setting rows from ``pdf`` into ``current_row``.

    Every page whose text contains both "CONTROLLER TIMESETTING" and "RED/YELLOW"
    is processed; its first detected table is converted to a DataFrame. For each
    known row label, the first row whose "Col0" cell equals the label is stored
    under that label as ``{column: value}`` for the columns from "A" onward.
    Blank or missing cells become ``None``.

    Args:
        pdf: Iterable of pages exposing ``get_text()`` and ``find_tables()``.
        current_row: Dict updated in place, one key per row label.

    Robustness:
        * A matching page with no detected table is skipped (previously IndexError).
        * A label with no row in the table is recorded as ``None`` unless an
          earlier page already supplied it (previously IndexError).

    Raises:
        KeyError: if the table has no "Col0" or "A" column.
    """
    timing_labels = (
        "RED/YELLOW",
        "LATE START",
        "MINIMUM GREEN",
        "INCREMENT",
        "MAX. V. I. G.",
        "MAX. EXT. GREEN",
        "EARLY CUT-OFF",
        "AMBER",
        "ALL RED",
        "SPECIAL ALL RED",
    )

    for page in pdf:
        text_page = page.get_text()
        if "CONTROLLER TIMESETTING" not in text_page or "RED/YELLOW" not in text_page:
            continue

        tables = page.find_tables().tables
        if not tables:
            # The keywords are on the page but no table was detected.
            continue
        df = tables[0].to_pandas()

        # Value columns run from "A" to the end (presumably one per signal
        # group - confirm). The slice is the same for every label, so take it once.
        value_columns = df.columns[df.columns.get_loc("A"):]

        for val in timing_labels:
            matching_rows = df[df["Col0"] == val]
            if matching_rows.empty:
                # Keep the columns consistent without overwriting a value that
                # an earlier matching page may already have provided.
                current_row.setdefault(val, None)
                continue

            row = matching_rows.iloc[0]
            current_row[val] = {
                col: row[col] if pd.notna(row[col]) and str(row[col]).strip() != "" else None
                for col in value_columns
            }

In [49]:
def extract_text_from_coordination_page(pdf, current_row):
    """Store the signals flagged "auto call push button" under ``phase_sequence``.

    Only the first page whose text contains "CO-ORDINATION DATA" is used. Its
    first table is converted to a DataFrame, the first two columns are treated
    as "Signal" and "Function", and blank "Function" cells inherit the value
    above them. The "Signal" values of rows whose function contains
    "auto call push button" (case-insensitive) are joined with ", ".

    Args:
        pdf: Iterable of pages exposing ``get_text()`` and ``find_tables()``.
        current_row: Dict updated in place with the key "phase_sequence". The
            key is not written when no page matches.

    Raises:
        IndexError: if the matching page has no detected table.
        ValueError: if the table has fewer than two columns.
    """
    for page in pdf:
        if "CO-ORDINATION DATA" not in page.get_text():
            continue

        df = page.find_tables().tables[0].to_pandas()

        # Assign a new column index rather than writing into
        # `df.columns.values`: an Index is meant to be immutable, and mutating
        # its backing array in place is unsupported.
        df.columns = ["Signal", "Function", *df.columns[2:]]

        df["Function"] = df["Function"].ffill()

        is_auto_call = df["Function"].str.contains(
            "auto call push button", case=False, na=False
        )
        # Skip empty Signal cells; joining None/NaN would raise TypeError.
        signals = [str(signal) for signal in df.loc[is_auto_call, "Signal"] if pd.notna(signal)]

        current_row["phase_sequence"] = ", ".join(signals)
        break

In [50]:
def extract_text_from_remarks_page(pdf, current_row):
    """Flag whether the remarks page describes right/left turn green arrows.

    The second page (index 1) is read. If its text contains "REMARKS", two
    sentence templates are searched for, and a flag is "Yes" when at least one
    occurrence of the template has a non-blank value filled into any of its
    slots:

    * "Right Turn Green Arrow": "If phase <x> is not introduced, SG <y> will
      flash for 3 seconds (TSM 14)".
    * "Left Turn Green Arrow": "1. It is introduced in <x> phase. 2. SG <y>
      terminates with SG/Phase <z> with green arrow flashing for 3 seconds".

    Args:
        pdf: Document supporting ``len()`` and indexing, with pages exposing
            ``get_text()``.
        current_row: Dict updated in place with both flag keys.

    Both flags are "No" when the document has fewer than two pages (previously
    an IndexError) or when the page does not contain "REMARKS".
    """
    # Guard the fixed page index so single-page documents do not abort the run.
    remarks_page = pdf[1].get_text() if len(pdf) > 1 else ""

    right_turn = "No"
    left_turn = "No"

    if "REMARKS" in remarks_page:
        right_matches = re.findall(
            r"If phase\s+(\S*)\s+is not introduced,\s+SG\s+(\S*)\s+will flash for 3 seconds \(TSM 14\)",
            remarks_page,
        )
        if any(phase.strip() or sg.strip() for phase, sg in right_matches):
            right_turn = "Yes"

        left_pattern = (
            r"1\.\s+It is introduced in\s*(.*?)\s*phase\.\s*"
            r"2\.\s+SG\s*(.*?)\s*terminates with SG/Phase\s*(.*?)\s*with green arrow flashing for 3 seconds"
        )
        # DOTALL lets a slot span a line break in the extracted text.
        left_matches = re.findall(left_pattern, remarks_page, re.DOTALL)
        if any(p1.strip() or p2.strip() or p3.strip() for p1, p2, p3 in left_matches):
            left_turn = "Yes"

    current_row["Right Turn Green Arrow"] = right_turn
    current_row["Left Turn Green Arrow"] = left_turn

In [51]:
# Scratch check: run the left-turn green arrow pattern against the second page
# of the first PDF and show the raw matches plus the resulting flag.
filepath = pdf_filepaths[0]

with fitz.open(filepath) as pdf:
    remarks_page = pdf[1].get_text()

# The page text is a plain string, so the matching can run after the document is closed.
pattern = (
    r"1\.\s+It is introduced in\s*(.*?)\s*phase\.\s*"
    r"2\.\s+SG\s*(.*?)\s*terminates with SG/Phase\s*(.*?)\s*with green arrow flashing for 3 seconds"
)
matches = re.findall(pattern, remarks_page, re.DOTALL)
print(matches)

# True when at least one match has a non-blank value in any of its three slots.
any_filled = any(
    any(slot.strip() for slot in slots)
    for slots in matches
)
print(any_filled)



[]
False


In [52]:
# One dict per PDF. Each extractor adds its own keys to the row in place, and
# they run in this fixed order.
knowledge_base = []

extractors = (
    extract_text_from_main_page,
    extract_text_from_remarks_page,
    extract_text_from_intergreen_page,
    extract_text_from_timesetting_page,
    extract_text_from_coordination_page,
)

for filepath in pdf_filepaths:
    current_row = {}

    with fitz.open(filepath) as pdf:
        for extract in extractors:
            extract(pdf, current_row)

        knowledge_base.append(current_row)

In [54]:
# Collect the "Int. No" of every entry flagged with a left turn green arrow.
int_nos_with_left_turn_green = []
for entry in knowledge_base:
    if entry.get("Left Turn Green Arrow") == "Yes":
        int_nos_with_left_turn_green.append(entry["Int. No"])

print(int_nos_with_left_turn_green)

[]


In [55]:
import pandas as pd
from pathlib import Path

# Convert knowledge_base (a list of row dicts) to a DataFrame
df = pd.DataFrame(knowledge_base)

# Create the output directory if it doesn't exist.
# It gets its own name: the earlier cell uses `interim_dir` for the input PDF
# directory, and reusing that name here would silently repoint it at the
# output directory for any cell run afterwards.
processed_dir = Path("../data/sample/processed").resolve()
processed_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame as a CSV file
csv_path = processed_dir / "knowledge_base.csv"
df.to_csv(csv_path, index=False)

print(f"Saved knowledge base with {len(df)} entries to {csv_path}")

Saved knowledge base with 11 entries to C:\Users\chuaj\OneDrive\Desktop\Folder\5. Work\10. Ops Sheet Knowledge Base\data\sample\processed\knowledge_base.csv


In [None]:
import pandas as pd
from pathlib import Path


In [None]:
class PDFExtractor:
    """Collects one row of extracted fields per PDF and saves the result as a CSV.

    Usage: construct, call ``extract_text_from_pdf()``, then ``save_knowledge_base()``.

    The page-level helpers are static methods taking the open document and the
    row dict to fill in place. Yes/no flags are written as "Y"/"N".

    NOTE(review): the stand-alone pipeline in this notebook also runs a
    remarks-page extractor; this class does not. Confirm whether that is intended.

    Attributes:
        interim_dir: Absolute path of the directory searched for PDFs.
        processed_dir: Absolute path of the directory the CSV is written to.
        pdf_filepaths: Sorted list of absolute PDF paths found under ``interim_dir``.
        knowledge_base: List of row dicts, one per processed PDF.
    """

    def __init__(self, interim_dir="../data/sample/interim", processed_dir="../data/sample/processed"):
        """Locate the PDFs to process.

        Args:
            interim_dir: Directory searched recursively for ``*.pdf`` files.
            processed_dir: Directory ``save_knowledge_base`` writes into.
        """
        self.interim_dir = Path(interim_dir).resolve()
        self.processed_dir = Path(processed_dir).resolve()
        # rglob order depends on the filesystem; sort so row order is reproducible.
        self.pdf_filepaths = sorted(p.resolve() for p in self.interim_dir.rglob("*.pdf"))
        self.knowledge_base = []

    @staticmethod
    def _section_has_entries(section, headers, ignored_prefix=None):
        """Return "Y" if ``section`` has a non-blank, non-header line, else "N".

        Lines starting with ``ignored_prefix`` (when given) are skipped as well.
        """
        for raw_line in section.strip().splitlines():
            line = raw_line.strip()
            if not line or line in headers:
                continue
            if ignored_prefix is not None and line.startswith(ignored_prefix):
                continue
            return "Y"
        return "N"

    @staticmethod
    def _extract_text_from_main_page(pdf, current_row):
        """Read the header fields from page 0 of ``pdf`` into ``current_row``.

        Writes "Int. No", "Location", "Date", "Signal ID" and "GOMS" (``None``
        when the pattern does not match), plus "Audio Tactile" and "GMP" as
        "Y"/"N" depending on whether the keyword appears in the page text
        (case-insensitive).
        """
        main_page = pdf[0].get_text()
        lowered_page = main_page.lower()

        def first_capture(pattern):
            # Stripped group 1 of the first match, or None when nothing matches.
            found = re.search(pattern, main_page)
            return found.group(1).strip() if found else None

        current_row["Int. No"] = first_capture(r"Int\.?\s*No[:\s]*([^\n]+?)\s{2,}")
        current_row["Location"] = first_capture(r"Location:\s*([^\n]+?)\s{2,}")
        current_row["Date"] = first_capture(r"Date:\s*([0-9\s/]+?)\s{2,}")
        current_row["Signal ID"] = first_capture(r"Signal ID[:\s]*([0-9]+)")
        current_row["GOMS"] = first_capture(r"GOMS[:\s]*([0-9\-]+)")
        current_row["Audio Tactile"] = "Y" if "audio tactile" in lowered_page else "N"
        current_row["GMP"] = "Y" if "gm+" in lowered_page else "N"

    @staticmethod
    def _extract_text_from_intergreen_page(pdf, current_row):
        """Record special-facility, pre-emption and walking-speed data.

        Uses the first page whose text contains
        "INTERGREEN, PEDESTRIAN TIMES AND SPECIAL FUNCTIONS". Writes
        "Special Facility" and "Preemption" as "Y"/"N" and
        "Pedestrian Walking Speed" as the matched number string. A value is
        ``None`` when the page, its section marker or the number is missing;
        these cases previously raised UnboundLocalError or IndexError.
        """
        intergreen_page = None
        for page in pdf:
            text = page.get_text()
            if "INTERGREEN, PEDESTRIAN TIMES AND SPECIAL FUNCTIONS" in text:
                intergreen_page = text
                break

        special_facility = None
        preemption = None
        pedestrian_walking_speed = None

        if intergreen_page is not None:
            special_parts = intergreen_page.split("SPECIAL FACILITIES")
            if len(special_parts) > 1:
                special_facility = PDFExtractor._section_has_entries(
                    special_parts[1].split("PRE-EMPTION")[0],
                    {"SIGNAL GROUP", "HOUR", "MINUTE", "SECOND", "FUNCTION"},
                )

            preemption_parts = intergreen_page.split("PRE-EMPTION")
            if len(preemption_parts) > 1:
                preemption = PDFExtractor._section_has_entries(
                    preemption_parts[1],
                    {"SIGNAL GROUP", "PHASE", "FUNCTION", "REMARKS"},
                    ignored_prefix="Pg.",
                )

            speed_match = re.search(
                r"Pedestrian Walking Speed:.*?([0-9]+(?:\.[0-9]+)?)", intergreen_page
            )
            if speed_match:
                pedestrian_walking_speed = speed_match.group(1)

        # Always write all three keys so every row has the same columns.
        current_row["Special Facility"] = special_facility
        current_row["Preemption"] = preemption
        current_row["Pedestrian Walking Speed"] = pedestrian_walking_speed

    @staticmethod
    def _extract_text_from_timesetting_page(pdf, current_row):
        """Copy the controller time-setting rows into ``current_row``.

        Every page containing both "CONTROLLER TIMESETTING" and "RED/YELLOW" is
        processed. For each known label, the first row of the page's first
        table whose "Col0" equals the label is stored as ``{column: value}``
        for the columns from "A" onward; blank or missing cells become ``None``.
        Matching pages with no table are skipped, and labels with no row are
        recorded as ``None`` (both previously raised IndexError).

        Raises:
            KeyError: if the table has no "Col0" or "A" column.
        """
        timing_labels = (
            "RED/YELLOW",
            "LATE START",
            "MINIMUM GREEN",
            "INCREMENT",
            "MAX. V. I. G.",
            "MAX. EXT. GREEN",
            "EARLY CUT-OFF",
            "AMBER",
            "ALL RED",
            "SPECIAL ALL RED",
        )

        for page in pdf:
            text_page = page.get_text()
            if "CONTROLLER TIMESETTING" not in text_page or "RED/YELLOW" not in text_page:
                continue

            tables = page.find_tables().tables
            if not tables:
                continue
            df = tables[0].to_pandas()

            # The slice of value columns is the same for every label; take it once.
            value_columns = df.columns[df.columns.get_loc("A"):]

            for val in timing_labels:
                matching_rows = df[df["Col0"] == val]
                if matching_rows.empty:
                    # Do not overwrite a value an earlier matching page supplied.
                    current_row.setdefault(val, None)
                    continue

                row = matching_rows.iloc[0]
                current_row[val] = {
                    col: row[col] if pd.notna(row[col]) and str(row[col]).strip() != "" else None
                    for col in value_columns
                }

    @staticmethod
    def _extract_text_from_coordination_page(pdf, current_row):
        """Store the signals flagged "auto call push button" under ``phase_sequence``.

        Uses the first page containing "CO-ORDINATION DATA". The first two
        table columns are treated as "Signal" and "Function"; blank functions
        inherit the value above. Matching signals are joined with ", ". The key
        is not written when no page matches.

        Raises:
            IndexError: if the matching page has no detected table.
            ValueError: if the table has fewer than two columns.
        """
        for page in pdf:
            if "CO-ORDINATION DATA" not in page.get_text():
                continue

            df = page.find_tables().tables[0].to_pandas()

            # Replace the column index instead of mutating `df.columns.values`
            # in place, which is unsupported for an immutable Index.
            df.columns = ["Signal", "Function", *df.columns[2:]]
            df["Function"] = df["Function"].ffill()

            is_auto_call = df["Function"].str.contains(
                "auto call push button", case=False, na=False
            )
            # Skip empty Signal cells; joining None/NaN would raise TypeError.
            signals = [str(signal) for signal in df.loc[is_auto_call, "Signal"] if pd.notna(signal)]

            current_row["phase_sequence"] = ", ".join(signals)
            break

    def _build_row(self, pdf):
        """Run every page extractor of this class on ``pdf`` and return the row dict."""
        current_row = {}
        self._extract_text_from_main_page(pdf, current_row)
        self._extract_text_from_intergreen_page(pdf, current_row)
        self._extract_text_from_timesetting_page(pdf, current_row)
        self._extract_text_from_coordination_page(pdf, current_row)
        return current_row

    def extract_text_from_pdf(self):
        """Open each PDF in ``pdf_filepaths`` and append its row to ``knowledge_base``.

        Uses this class's own extractors, not the notebook-level functions of
        similar name, so the class does not depend on other cells having been run.
        """
        for filepath in self.pdf_filepaths:
            with fitz.open(filepath) as pdf:
                self.knowledge_base.append(self._build_row(pdf))

    def save_knowledge_base(self):
        """Write ``knowledge_base`` to ``<processed_dir>/knowledge_base.csv``.

        The directory is created if needed. Dict-valued cells are written by
        ``DataFrame.to_csv`` in their string form.
        """
        # Convert knowledge_base to a DataFrame
        df = pd.DataFrame(self.knowledge_base)

        # Create the directory if it doesn't exist
        self.processed_dir.mkdir(parents=True, exist_ok=True)

        # Save the DataFrame as a CSV file
        csv_path = self.processed_dir / "knowledge_base.csv"
        df.to_csv(csv_path, index=False)

        print(f"Saved knowledge base with {len(df)} entries to {csv_path}")
                    

# Run the class-based pipeline end to end:
# the constructor collects the PDF paths, extract_text_from_pdf() fills
# pdf_extractor.knowledge_base with one row per PDF, and save_knowledge_base()
# writes those rows to knowledge_base.csv.
pdf_extractor = PDFExtractor()
pdf_extractor.extract_text_from_pdf()
pdf_extractor.save_knowledge_base()