In [24]:
import camelot
import os
import pandas as pd
import glob
import re


In [53]:


def clean_page_1(df):
    """
    Build the summary table for page 1 of the PDF.

    Every row of ``df`` is visited in order. The cell labelled ``1`` is turned
    into text, and the first number found in it (decimal or plain integer) is
    kept as a string. Rows whose cell contains no number are skipped. The
    collected numbers are then paired, in order, with five fixed parameter names.

    Parameters:
        df (pd.DataFrame): The DataFrame extracted from the PDF table on page 1.

    Returns:
        pd.DataFrame: Columns 'Parameter', 'Foot', 'Standard Deviation' and 'Mean'.
        'Foot' is always 'Both', 'Standard Deviation' is always the string 'NaN',
        and 'Mean' holds the extracted numbers as strings.

    Raises:
        ValueError: Raised by pandas when the number of extracted values is not
        five, because the columns would have different lengths.
    """
    # A decimal number is tried first so "1.93" is not cut down to "1".
    number_pattern = re.compile(r'\d+\.\d+|\d+')

    labels = [
        "Speed (m/s)",
        "Cadence (steps/min)",
        "Steps",
        "Duration (s)",
        "Distance (m)",
    ]

    # First number of each row's value cell, in table order.
    first_numbers = []
    for _, record in df.iterrows():
        found = number_pattern.search(str(record[1]).strip())
        if found is not None:
            first_numbers.append(found.group(0))

    print(f"Total number of Extracted Values: {first_numbers}")

    # Assemble the standardized frame; page 1 has no per-foot split and no SD.
    summary = pd.DataFrame({
        'Parameter': labels,
        'Foot': ['Both'] * len(labels),
        'Standard Deviation': ['NaN'] * len(labels),
        'Mean': first_numbers,
    })

    print(summary)
    print("\nPage 1 cleaning is complete")

    return summary


def clean_page_3(df):
    """
    Extract step length, step width and stride length from the page 3 table.

    The left-foot values are read from positional row 2 and the right-foot
    values from positional row 3. The three parameters sit in positional
    columns 1, 3 and 5. Each cell is expected to look like 'mean ± sd' and is
    split on ' ± '.

    Parameters:
        df (pd.DataFrame): The DataFrame extracted from the PDF table on page 3.

    Returns:
        pd.DataFrame: Six rows (Left/Right for each parameter) with columns
        'Parameter', 'Foot', 'Standard Deviation' and 'Mean'; values are strings.

    Raises:
        ValueError: If a cell does not split into exactly two parts on ' ± '.
    """
    measures = ("Step Length (cm)", "Step Width (cm)", "Stride Length (cm)")
    value_columns = (1, 3, 5)   # one column per measure, every second column
    left_row, right_row = 2, 3

    records = []
    for i, (measure, j) in enumerate(zip(measures, value_columns)):
        print(f'in this iteration, j ={j} and i ={i}')

        # Each cell holds "mean ± sd"; separate the two parts.
        left_mean, left_dev = df.iloc[left_row, j].split(' ± ')
        right_mean, right_dev = df.iloc[right_row, j].split(' ± ')

        records.append((measure, "Left", left_dev, left_mean))
        records.append((measure, "Right", right_dev, right_mean))

    # Final frame in the shared Parameter / Foot / SD / Mean layout.
    page_frame = pd.DataFrame(
        records,
        columns=['Parameter', 'Foot', 'Standard Deviation', 'Mean'],
    )

    print("\nPage 3 cleaning is complete")
    print(page_frame)
    return page_frame
    
    
# Page-specific cleaning function (PSCF) for page 4
def clean_page_4(df):
    """
    Extract gait cycle, swing phase and stance phase from the page 4 table.

    Four table layouts are handled. They are told apart by whether the cells at
    positional (row 0, column 0) and (row 2, column 1) are empty strings:

    * (0, 0) empty  -> every value sits one row higher than otherwise.
    * (2, 1) empty  -> the gait-cycle values are in column 2 instead of column 1.

    With no shift, the left/right rows are 2/3 for the gait cycle and 7/8 for
    the swing phase (column 1) and stance phase (column 4). Each cell is
    expected to look like 'mean ± sd' and is split on ' ± '.

    Parameters:
        df (pd.DataFrame): The DataFrame extracted from the PDF table on page 4.

    Returns:
        pd.DataFrame: Six rows (Left/Right for each phase) with columns
        'Parameter', 'Foot', 'Standard Deviation' and 'Mean'; values are strings.

    Raises:
        ValueError: If a cell does not split into exactly two parts on ' ± '.
        IndexError: If the table is smaller than the positions being read.
    """
    print(df)

    phases = ("Gait Cycle (s)", "Swing Phase (s)", "Stance Phase (s)")
    base_col = 1            # column of the gait-cycle / swing-phase values
    stance_col = 4          # column of the stance-phase values

    records = []
    for phase_idx, phase in enumerate(phases):
        print(f'in this iteration, j ={base_col} and i ={phase_idx}')

        top_left_blank = df.iloc[0, 0] == ''
        gait_cell_blank = df.iloc[2, 1] == ''

        # Report which of the four layouts was detected.
        if top_left_blank and gait_cell_blank:
            print('both empty')
        elif gait_cell_blank:
            print('r2c1 empty')
        elif top_left_blank:
            print('r0c0 empty')
        else:
            print('both not empty')

        # An empty top-left cell means all data rows are one position higher.
        row_shift = -1 if top_left_blank else 0

        if phase_idx == 0:
            left_row = 2 + row_shift
            col = base_col + 1 if gait_cell_blank else base_col
        elif phase_idx == 1:
            left_row = 7 + row_shift
            col = base_col
        else:
            left_row = 7 + row_shift
            col = stance_col
        right_row = left_row + 1

        if phase_idx == 0:
            print('left : ' + df.iloc[left_row, col])
            print('right : ' + df.iloc[right_row, col])

        left_mean, left_dev = df.iloc[left_row, col].split(' ± ')
        right_mean, right_dev = df.iloc[right_row, col].split(' ± ')

        records.append((phase, "Left", left_dev, left_mean))
        records.append((phase, "Right", right_dev, right_mean))

    page_frame = pd.DataFrame(
        records,
        columns=['Parameter', 'Foot', 'Standard Deviation', 'Mean'],
    )

    print(page_frame)
    print("\nPage 4 cleaning is complete")

    return page_frame
    
def clean_page_18(df):
    """
    Extract the left and right "Ankle IC Angle (deg)" values from page 18.

    The left-foot cell is read from positional (row 4, column 7) and the
    right-foot cell from positional (row 5, column 7). Each cell is expected to
    look like 'mean ± sd' and is split on ' ± '.

    Parameters:
        df (pd.DataFrame): The DataFrame extracted from the PDF table on page 18.

    Returns:
        pd.DataFrame: Two rows (Left, Right) with columns 'Parameter', 'Foot',
        'Standard Deviation' and 'Mean'; values are strings.

    Raises:
        ValueError: If a cell does not split into exactly two parts on ' ± '.
        IndexError: If the table is smaller than the positions being read.
    """
    parameter = "Ankle IC Angle (deg)"

    # Each cell is read exactly once; there is a single parameter on this page,
    # so no loop over parameters is needed.
    left_value, left_sd = df.iloc[4, 7].split(' ± ')
    right_value, right_sd = df.iloc[5, 7].split(' ± ')

    # Final frame in the shared Parameter / Foot / SD / Mean layout.
    df18 = pd.DataFrame(
        [
            (parameter, "Left", left_sd, left_value),
            (parameter, "Right", right_sd, right_value),
        ],
        columns=['Parameter', 'Foot', 'Standard Deviation', 'Mean'],
    )

    print("\nPage 18 cleaning is complete")
    print(df18)
    return df18
        
        
     

            

# Dispatch table: maps a PDF page number to the function that cleans the table
# extracted from that page. extract_data_from_pdfs looks pages up here and
# skips (with a message) any page that has no entry.
page_cleaning_switch = {
    1: clean_page_1,
    3: clean_page_3,
    4: clean_page_4,
    18: clean_page_18
}


def extract_data_from_pdfs(pdf_dir, pages, output_dir):
    """
    Extract gait data from every PDF in a folder and write one combined CSV.

    For each '*.pdf' file in ``pdf_dir`` the requested pages are read with
    camelot. Every extracted table is passed to the cleaning function
    registered for its page in ``page_cleaning_switch``, and the cleaned tables
    of one PDF are stacked vertically. The 'Standard Deviation' and 'Mean'
    columns are renamed to '<name>_SD' and '<name>_Mean', where <name> is the
    file name up to the first '.' and then up to the first '-'. Finally the
    per-PDF frames are joined side by side on ('Parameter', 'Foot') and written
    to a CSV once, after all PDFs have been processed.

    Parameters:
        pdf_dir (str): Folder that is searched for '*.pdf' files.
        pages (str): Page selection string passed to ``camelot.read_pdf``,
            e.g. '1,3,4,18'.
        output_dir (str): Despite its name, the full path of the CSV file to
            write (it is passed straight to ``DataFrame.to_csv``).

    Returns:
        pd.DataFrame | None: The combined table that was written to the CSV, or
        None when no PDF was found or no table could be cleaned (in which case
        no CSV is written).

    Raises:
        OSError: Propagated from ``to_csv``, e.g. PermissionError when the
        target file cannot be written.
    """
    # Sorted so that the column order of the CSV does not depend on the order
    # in which the file system happens to list the files.
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, '*.pdf')))

    if not pdf_files:
        print("No PDF files found in the specified directory.")
        return None

    # One prepared (indexed, renamed) DataFrame per successfully processed PDF.
    combined_trajectory_dfs = []

    for pdf_file in pdf_files:
        Trajec_name = os.path.basename(pdf_file).split('.')[0].split('-')[0]
        dfs_to_combine = []

        print(f"\nProcessing file: {pdf_file}")

        # Extract tables from specified pages
        tables = camelot.read_pdf(
            pdf_file,
            pages=pages,
            flavor='stream',  # Use 'stream' or 'lattice' depending on your PDFs
            strip_text='\n',  # Remove line breaks within cells
            edge_tol=500,     # Tolerance for table edge detection; adjust as needed
            row_tol=20,       # Tolerance for row detection; adjust as needed
        )

        if tables.n == 0:
            print(f"    No tables found in {pdf_file} on pages {pages}.")
            continue

        print(f"  Found {tables.n} tables in {pdf_file} on pages {pages}.")

        for table in tables:
            df = table.df
            df = df.dropna(how='all')           # Drop rows where all elements are NaN
            df = df.dropna(axis=1, how='all')   # Drop columns where all elements are NaN

            page_number = table.page

            # Apply the page-specific cleaning function, if one is registered.
            clean_function = page_cleaning_switch.get(page_number)
            if clean_function:
                dfs_to_combine.append(clean_function(df))
            else:
                print(f"No cleaning function defined for page {page_number}")

        if not dfs_to_combine:
            continue

        # Stack this PDF's cleaned tables and tag the metric columns with the
        # trajectory name so they stay distinguishable after the final join.
        df_per_trajec = pd.concat(dfs_to_combine, ignore_index=True)
        df_per_trajec = df_per_trajec.rename(columns={
            'Standard Deviation': f'{Trajec_name}_SD',
            'Mean': f'{Trajec_name}_Mean'
        })

        # Index on ('Parameter', 'Foot') so the horizontal join aligns rows.
        df_per_trajec = df_per_trajec.set_index(['Parameter', 'Foot'])
        combined_trajectory_dfs.append(df_per_trajec)

    # Nothing usable was extracted from any PDF: do not attempt to concatenate
    # an empty list (pandas would raise) and do not write a CSV.
    if not combined_trajectory_dfs:
        print("No data could be extracted from any PDF; no CSV was written.")
        return None

    # Join all trajectories side by side, then turn 'Parameter' and 'Foot' back
    # into ordinary columns. The index is reset exactly once so no extra
    # 'index' column ends up in the CSV.
    final_df = pd.concat(combined_trajectory_dfs, axis=1).reset_index()

    # Written once, after every PDF has been processed.
    final_df.to_csv(output_dir, index=False)

    return final_df
            
            

                
                    






               
            
                
            

    

# -----------------------------
# 3. Example Usage
# -----------------------------

# Folder that is searched for '*.pdf' report files.
pdf_dir = r"C:\Users\yangk\OneDrive - National University of Singapore\P013 SmartSole\60. Verification testing\TEST-003B\Results\MotionCloud Reports"
# Comma-separated page numbers to extract; each page needs an entry in
# page_cleaning_switch to be cleaned.
pages_to_extract = '1,3,4,18'
# NOTE: despite its name this is the full path of the CSV file to write, not a directory.
# NOTE(review): the recorded run ended with PermissionError for this path —
# presumably the CSV was open in another program at the time; confirm.
output_dir = r'C:\Users\yangk\OneDrive\Documents\HATCH\Programming Learning\Gait Report PDF Generator\Extracted Data from MotionCloud Report\extracted_data.csv'

# Call the extraction function
extracted_data = extract_data_from_pdfs(pdf_dir, pages_to_extract, output_dir)






Processing file: C:\Users\yangk\OneDrive - National University of Singapore\P013 SmartSole\60. Verification testing\TEST-003B\Results\MotionCloud Reports\fast_mid1-gait report.pdf
  Found 4 tables in C:\Users\yangk\OneDrive - National University of Singapore\P013 SmartSole\60. Verification testing\TEST-003B\Results\MotionCloud Reports\fast_mid1-gait report.pdf on pages 1,3,4,18.
Total number of Extracted Values: ['1.93', '136.50', '45', '19.78', '38.56']
             Parameter  Foot Standard Deviation    Mean
0          Speed (m/s)  Both                NaN    1.93
1  Cadence (steps/min)  Both                NaN  136.50
2                Steps  Both                NaN      45
3         Duration (s)  Both                NaN   19.78
4         Distance (m)  Both                NaN   38.56

Page 1 cleaning is complete
in this iteration, j =1 and i =0
in this iteration, j =3 and i =1
in this iteration, j =5 and i =2

Page 3 cleaning is complete
            Parameter   Foot Standard Deviation

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\yangk\\OneDrive\\Documents\\HATCH\\Programming Learning\\Gait Report PDF Generator\\Extracted Data from MotionCloud Report\\extracted_data.csv'