In [13]:
import pdfplumber
import csv
import re
import os

# Input: the conference program PDF that the parsing cell opens.
filename = "FIE_2025_Program_v4.pdf"
# Output: the CSV file that the extracted paper records are written to.
csv_filename = "fie_2025_papers.csv"

In [32]:
# Line-classification patterns for the program text. They are compiled once
# and unpacked in the same order as the raw pattern strings listed below.
day_pattern, session_pattern, room_pattern, time_pattern = map(
    re.compile,
    (
        # Day header, e.g. "Technical Program: Monday, November 3";
        # captures the weekday name and the day of the month.
        r"Technical Program: (Monday|Tuesday|Wednesday), November (\d+)",
        # Session header anchored at the start of the line, e.g.
        # "M1-RBA: Computing Undergraduate 1: Introductory Levels";
        # captures the session code and the session title.
        r"^([A-Z]\d+-\w{3,4}): (.*)",
        # Room line, e.g. "Room: Riverbed A"; captures the room name.
        r"Room: (.*)",
        # A line made up only of a clock time, e.g. "8:00" or "10:30".
        r"^\d{1,2}:\d{2}$",
    ),
)

In [33]:

# Walk the technical-program pages of the PDF line by line and build one
# record per paper in `papers_data`. Each line is classified, in order, as a
# day header, a session header, a room line, a paper start time, or otherwise
# title/author text belonging to the paper currently being assembled.
papers_data = []

# Session titles containing any of these markers are not paper sessions;
# everything inside such a session is ignored.
skipped_session_markers = (
    "Special Session",
    "Panel Session",
    "Focus on Exhibitors",
    "Lunch & Keynote",
    "Awards Luncheon",
)

# Current state variables (carried across lines and across page boundaries)
current_day = ""
current_session_code = ""
current_session_title = ""
current_room = ""
in_paper_session = False

# Temp holder for the paper being parsed
current_paper = {}

# Set when extraction stops early, so the summary at the end of this cell
# does not report a partial (or empty) result as a success.
extraction_failed = False

try:
    with pdfplumber.open(filename) as pdf:
        # The technical paper sessions start on page 25 (index 24)
        # and end on page 100 (index 99).
        start_page = 24
        end_page = 99

        for page_num in range(start_page, end_page + 1):
            page = pdf.pages[page_num]

            # Extract text line by line. Guard against a None result, which
            # older pdfplumber releases reportedly return for pages without
            # any text, so one empty page does not abort the whole run.
            page_text = page.extract_text(x_tolerance=2, y_tolerance=2) or ""
            lines = page_text.split('\n')

            print(f"Processing Page {page_num + 1}/{len(pdf.pages)}...")

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # 1. Check for a new Day
                day_match = day_pattern.search(line)
                if day_match:
                    current_day = f"{day_match.group(1)}, Nov {day_match.group(2)}"
                    in_paper_session = False  # Reset session on new day
                    continue

                # 2. Check for a new Session
                session_match = session_pattern.match(line)
                if session_match:
                    # Before starting a new session, save the last paper
                    # from the previous session.
                    if current_paper:
                        papers_data.append(current_paper)
                        current_paper = {}

                    new_session_code = session_match.group(1)
                    current_session_title = session_match.group(2)

                    # Continuation headers repeat the title with a "(cont.)"
                    # suffix; strip it so all rows share one session title.
                    if current_session_title.endswith("(cont.)"):
                        current_session_title = current_session_title[:-7].strip()

                    # Only sessions whose title carries none of the skip
                    # markers are treated as paper sessions.
                    in_paper_session = not any(
                        marker in current_session_title
                        for marker in skipped_session_markers
                    )

                    # Don't reset room if continuation
                    if new_session_code != current_session_code:
                        current_room = ""  # Reset room

                    current_session_code = new_session_code
                    continue

                # 3. Check for a Room (assumes it follows a session)
                room_match = room_pattern.search(line)
                if room_match and in_paper_session:
                    current_room = room_match.group(1)
                    continue

                # 4. Check for a Paper Time
                time_match = time_pattern.match(line)
                if time_match and in_paper_session:
                    # Save the previous paper's data
                    if current_paper:
                        papers_data.append(current_paper)

                    # Start a new paper record
                    current_paper = {
                        "Day": current_day,
                        "SessionCode": current_session_code,
                        "SessionTitle": current_session_title,
                        "Room": current_room,
                        "PaperTime": line,
                        "PaperTitle": "",
                        "Authors": ""
                    }
                    continue

                # 5. If we are in a paper and this is not a time/session,
                #    it must be a title or author line.
                if in_paper_session and current_paper:
                    if not current_paper["PaperTitle"]:
                        # First line after time is the title
                        current_paper["PaperTitle"] = line
                    else:
                        # Subsequent lines are authors
                        # Append with a space
                        if current_paper["Authors"]:
                            current_paper["Authors"] += " " + line
                        else:
                            current_paper["Authors"] = line

except FileNotFoundError:
    extraction_failed = True
    print(f"'{filename}' was not found.")
except Exception as e:
    # Best-effort: keep whatever was parsed so far, but remember the failure.
    extraction_failed = True
    print(f"An error occurred: {e}")

# Add the very last paper
if current_paper:
    papers_data.append(current_paper)

if extraction_failed:
    print(f"\nExtraction did not complete; {len(papers_data)} papers were collected before the failure.")
else:
    print(f"\nSuccessfully extracted {len(papers_data)} papers.")


Processing Page 25/102...
Processing Page 26/102...
Processing Page 27/102...
Processing Page 28/102...
Processing Page 29/102...
Processing Page 30/102...
Processing Page 31/102...
Processing Page 32/102...
Processing Page 33/102...
Processing Page 34/102...
Processing Page 35/102...
Processing Page 36/102...
Processing Page 37/102...
Processing Page 38/102...
Processing Page 39/102...
Processing Page 40/102...
Processing Page 41/102...
Processing Page 42/102...
Processing Page 43/102...
Processing Page 44/102...
Processing Page 45/102...
Processing Page 46/102...
Processing Page 47/102...
Processing Page 48/102...
Processing Page 49/102...
Processing Page 50/102...
Processing Page 51/102...
Processing Page 52/102...
Processing Page 53/102...
Processing Page 54/102...
Processing Page 55/102...
Processing Page 56/102...
Processing Page 57/102...
Processing Page 58/102...
Processing Page 59/102...
Processing Page 60/102...
Processing Page 61/102...
Processing Page 62/102...
Processing P

In [34]:
# Sanity check: show the first and last extracted records, then list every
# distinct non-empty room name that was picked up.
if not papers_data:
    print("No paper data was extracted.")
else:
    print(f"First paper: {papers_data[0]}")
    print(f"Last paper: {papers_data[-1]}")

# Records whose room is empty are left out of the room summary.
unique_rooms = {record["Room"] for record in papers_data if record["Room"]}
print(f"\nUnique rooms found: {len(unique_rooms)}")
for room in sorted(unique_rooms):
    print(f"- {room}")

First paper: {'Day': 'Monday, Nov 3', 'SessionCode': 'M1-RBA', 'SessionTitle': 'Computing Undergraduate 1: Introductory Levels', 'Room': 'Riverbed A', 'PaperTime': '8:00', 'PaperTitle': 'Scaffolding the Problem-Solving Process for Introductory Computing Students', 'Authors': 'Ashish Deepak Dandekar (National University of Singapore, Singapore); Nitya Lakshmanan (NUS, Singapore); Daren Ler, Adi Yoga Sidi Prabawa and Sanka Rasnayaka (National University of Singapore, Singapore)'}
Last paper: {'Day': 'Wednesday, Nov 5', 'SessionCode': 'W4-WPE', 'SessionTitle': 'Computer-Based Instruction 6: Immersive and Virtual Environments for Teaching and Learning', 'Room': 'Willow Pond E', 'PaperTime': '16:45', 'PaperTitle': 'Enhancing Computer Network Education Through Immersive Virtual Environments: a Study on NetVerse Edu', 'Authors': 'Erberson Evangelista Vieira (Federal Institute of Paraiba, Brazil); Francisco Petronio Alencar de Medeiros (Federal Institute of Paraiba & IFPB, Brazil); Paulo Ditar

In [35]:

# Persist the extracted records as CSV, one row per paper. The column order
# follows the key order of the first record.
if not papers_data:
    print("No paper data was extracted.")
else:
    try:
        # newline='' lets the csv module control line endings itself.
        with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(papers_data[0]))
            writer.writeheader()
            writer.writerows(papers_data)
        print(f"Data successfully written to '{csv_filename}'")
    except OSError:  # IOError is an alias of OSError in Python 3
        print(f"Error: Could not write to file '{csv_filename}'.")

Data successfully written to 'fie_2025_papers.csv'
