In [2]:
import camelot
import os
import pandas as pd
import glob
import re


In [43]:

def clean_page_1(df):
    """
    Builds a tidy table of the general gait parameters found on page 1.

    Column 1 of ``df`` is scanned row by row; the first number found in each
    cell is collected, and the collected numbers are paired, in order of
    appearance, with a fixed list of parameter names.

    Parameters:
    - df (pd.DataFrame): Raw table extracted from page 1. Column ``1`` must
      exist and hold the parameter text (e.g. 'Speed1.93').

    Returns:
    - df1 (pd.DataFrame): One row per parameter with the columns
      'Parameter Name (Unit)' and 'Value'. Values are the matched strings
      (not converted to numbers); a parameter with no extracted number
      gets None.
    """
    # Show the raw table so the extraction below can be checked by eye.
    print(df)

    # Step 1: Parameter names (with units), in the order their values are
    # expected to appear in column 1.
    parameters = [
        "Speed (m/s)",
        "Cadence (steps/min)",
        "Steps",
        "Duration (s)",
        "Distance (m)"
    ]

    # Step 2: Collect the first integer/decimal number of every cell in
    # column 1 that contains one, keeping the row order.
    values = []
    for _, row in df.iterrows():
        text = str(row[1]).strip()
        nums = re.findall(r'\d+\.\d+|\d+', text)
        if nums:
            values.append(nums[0])

    print(f"Extracted {len(values)} value(s): {values}")

    # Step 3: pd.DataFrame raises ValueError when its columns differ in
    # length, so reconcile the extracted values with the parameter list
    # instead of failing on reports that yield fewer (or more) numbers.
    if len(values) != len(parameters):
        print(
            f"Warning: expected {len(parameters)} values but extracted {len(values)}; "
            "missing values are left empty and extra values are ignored."
        )
    values = (values + [None] * len(parameters))[:len(parameters)]

    # Step 4: Create the new DataFrame from the 'parameters' and 'values' lists.
    df1 = pd.DataFrame({
        'Parameter Name (Unit)': parameters,
        'Value': values
    })

    print("\nPage 1 cleaning is complete")
    print(df1)
    return df1

    

def clean_page_3(df):
    """
    Extracts the left/right step length (value and standard deviation) from page 3.

    A row is used only when column ``1`` mentions 'Step Length' and both
    column ``2`` (left) and column ``3`` (right) contain a 'value ± sd' pair.

    Parameters:
    - df (pd.DataFrame): Raw table extracted from page 3. Columns ``1``,
      ``2`` and ``3`` must exist.

    Returns:
    - df3 (pd.DataFrame): Four rows per matching source row (left value,
      left SD, right value, right SD) with the columns 'Parameter', 'Value'
      and 'SD'. Empty when no row matches.
    """
    new_rows = []

    for _, row in df.iterrows():
        # str() keeps the membership tests below from raising TypeError on
        # non-string cells such as NaN.
        label, left, right = str(row[1]), str(row[2]), str(row[3])

        # Only Step Length rows with a '±' pair on both sides are usable.
        if 'Step Length' not in label or '±' not in left or '±' not in right:
            continue

        # Split on the first '±' only, so a stray second sign cannot break
        # the two-way unpacking.
        left_value, left_sd = left.split('±', 1)
        right_value, right_sd = right.split('±', 1)

        # Append rows in a fixed order to maintain alignment.
        new_rows.append(['Step Length (Left)', left_value.strip(), 'Value'])
        new_rows.append(['Step Length (Left)', left_sd.strip(), 'SD'])
        new_rows.append(['Step Length (Right)', right_value.strip(), 'Value'])
        new_rows.append(['Step Length (Right)', right_sd.strip(), 'SD'])

    # NOTE(review): the third column is named 'SD' but holds the markers
    # 'Value' / 'SD' that say what the number in 'Value' is — confirm whether
    # it should be renamed; the schema is kept as-is here.
    df3 = pd.DataFrame(new_rows, columns=['Parameter', 'Value', 'SD'])

    # Hand the cleaned table back to the caller (previously nothing was returned).
    return df3






# Dispatch table: page number of an extracted table -> function that cleans
# the table found on that page. Pages without an entry have no cleaner.
page_cleaning_switch = dict(
    [
        (1, clean_page_1),
        (3, clean_page_3),
    ]
)

def get_csv_name(pdf_filename):
    """
    Builds the CSV file name for a PDF report.

    Uses the part of the PDF file name before the first '-' and appends '.csv'.
    When the name contains no '-', the file extension is removed instead, so
    it does not end up inside the CSV name.
    Examples: 'report1-2023.pdf' -> 'report1.csv'
              'report1.pdf'      -> 'report1.csv'
    """
    base_name = os.path.basename(pdf_filename)
    name_part, dash, _ = base_name.partition('-')
    if not dash:
        # No '-' to cut at: drop the extension ('report1.pdf' -> 'report1').
        name_part = os.path.splitext(base_name)[0]
    return f"{name_part}.csv"

# -----------------------------
# 2. Main Extraction Function
# -----------------------------

def extract_data_from_pdfs(pdf_dir, pages, output_dir):
    """
    Extracts data from specified pages in all gait reports within the given directory.

    Every table camelot finds is passed to the cleaning function registered
    for its page in ``page_cleaning_switch``; tables from pages without a
    cleaning function are skipped. A failure while processing one PDF is
    reported and the remaining PDFs are still processed.

    Parameters:
    - pdf_dir (str): Directory containing PDF files (searched non-recursively for '*.pdf').
    - pages (str): Pages to extract data from (e.g., '1,2,3'); passed to camelot.read_pdf.
    - output_dir (str): Currently unused. NOTE(review): presumably the folder
      the cleaned tables should be written to — confirm the intended layout.

    Returns:
    - data_frames (list): List of cleaned DataFrames from all PDFs, in
      processing order. Empty when no PDF is found or nothing was cleaned.
    """
    data_frames = []

    # Find all PDF files in the directory; sorted so they are processed in a
    # reproducible order (glob returns them in arbitrary order).
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, '*.pdf')))

    if not pdf_files:
        print("No PDF files found in the specified directory.")
        return data_frames

    # Process each PDF file
    for pdf_file in pdf_files:
        print(f"\nProcessing file: {pdf_file}")

        try:
            # Extract tables from specified pages
            tables = camelot.read_pdf(
                pdf_file,
                pages=pages,
                flavor='stream',  # Use 'stream' or 'lattice' depending on your PDFs
                strip_text='\n',  # Remove line breaks within cells
                edge_tol=500,     # Tolerance for table edge detection; adjust as needed
                row_tol=10,       # Tolerance for row detection; adjust as needed
            )

            if tables.n == 0:
                print(f"    No tables found in {pdf_file} on pages {pages}.")
                continue

            print(f"  Found {tables.n} tables in {pdf_file} on pages {pages}.")

            # Iterate over each extracted table
            for table in tables:
                # Drop rows and columns in which every element is NaN.
                raw_table = table.df.dropna(how='all').dropna(axis=1, how='all')

                # Pick the cleaning function registered for this table's page.
                page_number = table.page
                clean_function = page_cleaning_switch.get(page_number)
                if clean_function is None:
                    print(f"No cleaning function defined for page {page_number}")
                    continue

                cleaned = clean_function(raw_table)
                # Guard against a cleaner that produced nothing.
                if cleaned is not None:
                    data_frames.append(cleaned)

        except Exception as e:
            # Best effort: report the failure and carry on with the next PDF.
            print(f"  An error occurred while processing {pdf_file}: {e}")

    return data_frames

    

# -----------------------------
# 3. Example Usage
# -----------------------------

# Folder that is searched (non-recursively) for '*.pdf' report files.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
pdf_dir = r"C:\Users\yangk\OneDrive\Documents\HATCH\Programming Learning\MotionCloud Reports"
# Page selection; forwarded to camelot.read_pdf as its `pages` argument.
pages_to_extract = '1'
# NOTE(review): passed as `output_dir`, which extract_data_from_pdfs does not read — confirm intended use.
output_dir = r'C:\Users\yangk\OneDrive\Documents\HATCH\Programming Learning\Extracted Data from MotionCloud Report'

# Run the extraction over every PDF found in pdf_dir and keep the function's result.
extracted_data = extract_data_from_pdfs(pdf_dir, pages_to_extract, output_dir)




            


Processing file: C:\Users\yangk\OneDrive\Documents\HATCH\Programming Learning\MotionCloud Reports\fast_mid1-gait report.pdf
  Found 1 tables in C:\Users\yangk\OneDrive\Documents\HATCH\Programming Learning\MotionCloud Reports\fast_mid1-gait report.pdf on pages 1.
                                             0                   1
0                           1. Gait Parameters                    
1                    fast_mid1-trial 5-005.mvn                    
2       All parameters shown in the report are                    
3   evaluated only for the portions of the  le                    
4        where the subject was walking, except  General parameters
5                      for the total distance.                    
6                                                        Speed1.93
7                                                            (m/s)
8                      Xsens Gait Report (XGR)             Cadence
9                                                (steps/min)136.50

In [9]:
df

NameError: name 'df' is not defined