In [3]:
import camelot
import os
import pandas as pd
import glob
import re


In [12]:


def clean_page_1(df):
    '''
    Cleans and extracts the summary values from page 1 of the PDF.

    The first number found in the column-1 cell of each row is collected, in
    row order, and paired with the fixed parameter list below. Rows whose
    column-1 cell contains no number are skipped.

    Inputs:
        df: DataFrame with the raw data extracted from page 1 of the PDF
    Returns:
        df1: cleaned DataFrame with the columns 'Parameter', 'Foot',
             'Standard Deviation' and 'Mean' (one row per parameter)
    Raises:
        ValueError: if the number of extracted values differs from the
                    number of expected parameters

    '''
    print(df)

    # Parameters to extract, in the order their values are collected
    parameters = [
        "Speed (m/s)",
        "Cadence (steps/min)",
        "Steps",
        "Duration (s)",
        "Distance (m)"
    ]

    # Every page 1 row gets the foot label 'Both' and the placeholder
    # string 'NaN' as its standard deviation
    foot = ['Both'] * len(parameters)
    sd = ['NaN'] * len(parameters)

    # Matches a decimal number or a plain integer; compiled once for the loop
    number_pattern = re.compile(r'\d+\.\d+|\d+')

    # Keep the first number of each column-1 cell, preserving row order
    values = []
    for _, row in df.iterrows():
        match = number_pattern.search(str(row[1]).strip())
        if match:
            values.append(match.group())

    print(f"Total number of Extracted Values: {values}")

    # Fail with a descriptive message instead of pandas' generic
    # "arrays must be of the same length" error
    if len(values) != len(parameters):
        raise ValueError(
            f"Expected {len(parameters)} values on page 1 "
            f"but extracted {len(values)}: {values}"
        )

    # Creates the new dataframe from the parameter names and extracted values
    df1 = pd.DataFrame({
        'Parameter': parameters,
        'Foot': foot,
        'Standard Deviation': sd,
        'Mean': values
    })

    print(df1)
    print("\nPage 1 cleaning is complete")

    return df1


def clean_page_3(df):

    '''
    Cleans and extracts data from page 3 of the PDF.

    Reads the 'mean ± sd' cells in rows 2 (left foot) and 3 (right foot) of
    columns 1, 3 and 5 and splits each one into its mean and standard
    deviation.

    Inputs:
        df: DataFrame with the raw data extracted from page 3 of the PDF
    Returns:
        df3: cleaned DataFrame of page 3 with the columns 'Parameter',
             'Foot', 'Standard Deviation' and 'Mean'

    '''

    metric_names = ("Step Length (cm)", "Step Width (cm)", "Stride Length (cm)")
    feet = ("Left", "Right")
    foot_rows = (2, 3)  # row positions of the left and right foot values

    # One output row per (metric, foot) pair, left foot first
    parameter_column = [name for name in metric_names for _ in feet]
    foot_column = list(feet) * len(metric_names)

    means = []
    deviations = []

    # The value columns sit at positions 1, 3, 5 (every second column)
    value_columns = range(1, 2 * len(metric_names), 2)
    for step, column in enumerate(value_columns):
        print(f'in this iteration, j ={column} and i ={step}')

        # Split both cells at the '±' sign before storing anything
        pairs = [df.iloc[row, column].split(' ± ') for row in foot_rows]
        for mean_text, sd_text in pairs:
            means.append(mean_text)
            deviations.append(sd_text)

    # Build the cleaned DataFrame from the collected columns
    df3 = pd.DataFrame({
        'Parameter': parameter_column,
        'Foot': foot_column,
        'Standard Deviation': deviations,
        'Mean': means,
    })

    print("\nPage 3 cleaning is complete")
    print(df3)
    return df3
    
    
def clean_page_4(df):

    '''
    Cleans and extracts data from page 4 of the PDF.

    The extracted table can arrive in four layouts, distinguished by whether
    the cells at (row 0, col 0) and (row 2, col 1) are empty strings:
        * (row 0, col 0) empty  -> the value rows start one row higher
        * (row 2, col 1) empty  -> the Gait Cycle values sit in column 2
                                   instead of column 1
    Swing Phase is always read from column 1 and Stance Phase from column 4;
    the right foot value is always in the row directly below the left one.

    Inputs:
        df: DataFrame with the raw data extracted from page 4 of the PDF
    Returns:
        df4: cleaned DataFrame of page 4 with the columns 'Parameter',
             'Foot', 'Standard Deviation' and 'Mean'

    '''

    print(df)

    # Define parameters and foot information (left foot first per parameter)
    parameters_4 = [
        "Gait Cycle (s)",
        "Gait Cycle (s)",
        "Swing Phase (s)",
        "Swing Phase (s)",
        "Stance Phase (s)",
        "Stance Phase (s)"
    ]
    foot_4 = ["Left", "Right"] * 3

    # Layout detection does not depend on the parameter, so do it once
    r0c0_empty = bool(df.iloc[0, 0] == '')
    r2c1_empty = bool(df.iloc[2, 1] == '')

    layout_label = {
        (True, True): 'both empty',
        (False, True): 'r2c1 empty',
        (True, False): 'r0c0 empty',
        (False, False): 'both not empty',
    }[(r0c0_empty, r2c1_empty)]

    first_row = 1 if r0c0_empty else 2   # row of the left Gait Cycle value
    gait_col = 2 if r2c1_empty else 1    # column of the Gait Cycle values

    # (row, column) of the left foot cell for each parameter
    left_cells = [
        (first_row, gait_col),   # Gait Cycle
        (first_row + 5, 1),      # Swing Phase
        (first_row + 5, 4),      # Stance Phase
    ]

    value_4 = []
    sd_4 = []

    for i, (left_row, col) in enumerate(left_cells):
        print(f'in this iteration, j ={1} and i ={i}')
        print(layout_label)

        left_cell = df.iloc[left_row, col]
        right_cell = df.iloc[left_row + 1, col]

        if i == 0:
            print('left : ' + left_cell)
            print('right : ' + right_cell)

        # Split at the '±' sign into mean value and standard deviation
        left_value, left_sd = left_cell.split(' ± ')
        right_value, right_sd = right_cell.split(' ± ')

        value_4.extend([left_value, right_value])
        sd_4.extend([left_sd, right_sd])

    # Creating a clean DataFrame with the extracted data
    df4 = pd.DataFrame({
        'Parameter': parameters_4,
        'Foot': foot_4,
        'Standard Deviation': sd_4,
        'Mean': value_4,
    })

    print(df4)
    print("\nPage 4 cleaning is complete")

    return df4
    
def clean_page_18(df):

    '''
        Cleans and extracts data from page 18 of the PDF.

        Reads the 'mean ± sd' cells at (row 4, col 7) for the left foot and
        (row 5, col 7) for the right foot and splits each one into its mean
        and standard deviation.

        Inputs:
            df: DataFrame with the raw data extracted from page 18 of the PDF
        Returns:
            df18: cleaned DataFrame of page 18 with the columns 'Parameter',
                  'Foot', 'Standard Deviation' and 'Mean'

    '''

    # Predefined parameter and foot labels of the final DataFrame
    parameters_18 = ["Ankle IC Angle (deg)"] * 2
    foot_18 = ["Left", "Right"]

    # Split each cell at the '±' sign into mean value and standard deviation
    left_value, left_sd = df.iloc[4, 7].split(' ± ')
    right_value, right_sd = df.iloc[5, 7].split(' ± ')

    # Create a new DataFrame with the extracted data
    df18 = pd.DataFrame({
        'Parameter': parameters_18,
        'Foot': foot_18,
        'Standard Deviation': [left_sd, right_sd],
        'Mean': [left_value, right_value],
    })

    # The message previously reported "Page 3" (copy-paste slip)
    print("\nPage 18 cleaning is complete")
    return df18
        
        
     

            
# Dispatch table mapping a PDF page number to the function that cleans the
# table extracted from that page. extract_data_from_pdfs looks up each
# table's page number here; pages without an entry are reported and skipped.
page_cleaning_switch = {
    1: clean_page_1,
    3: clean_page_3,
    4: clean_page_4,
    18: clean_page_18
}


def extract_data_from_pdfs(pdf_dir, pages, output_dir):
    '''
    Extracts and cleans data from the specified pages of every PDF in a
    directory and combines the results into a single CSV.

    Each PDF contributes two columns, '<name>_SD' and '<name>_Mean', where
    <name> is the file name up to its first '.' and then up to its first '-'.
    Rows are aligned across PDFs on ('Parameter', 'Foot').

    Parameters:
        pdf_dir (str): Directory containing PDF files.
        pages (str): Pages to extract, passed unchanged to camelot.read_pdf
            (e.g. '4,18'). Only pages with an entry in page_cleaning_switch
            (currently 1, 3, 4 and 18) are cleaned; others are skipped.
        output_dir (str): Path of the output CSV file. Despite the name,
            this is handed to DataFrame.to_csv as the file path.
    Returns:
        The combined DataFrame that was written to the CSV, or None when no
        PDF files were found or no data could be extracted.
    '''

    # Find all PDF files in the directory (sorted so the column order of the
    # output does not depend on the file system's listing order)
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, '*.pdf')))

    if not pdf_files:
        print("No PDF files found in the specified directory.")
        return None

    # One prepared DataFrame per PDF, indexed by ('Parameter', 'Foot')
    combined_trajectory_dfs = []

    # Process each PDF file
    for pdf_file in pdf_files:
        trajec_name = os.path.basename(pdf_file).split('.')[0].split('-')[0]
        dfs_to_combine = []

        print(f"\nProcessing file: {pdf_file}")

        # Extract tables from specified pages
        tables = camelot.read_pdf(
            pdf_file,
            pages=pages,
            flavor='stream',  # Use 'stream' or 'lattice' depending on your PDFs
            strip_text='\n',  # Remove line breaks within cells
            edge_tol=500,     # Tolerance for table edge detection
            row_tol=20,       # Tolerance for row detection
        )

        if tables.n == 0:
            print(f"    No tables found in {pdf_file} on pages {pages}.")
            continue

        print(f"  Found {tables.n} tables in {pdf_file} on pages {pages}.")

        # Clean each extracted table with the function registered for its page
        for table in tables:
            df = table.df.dropna(how='all')     # Drop empty rows
            df = df.dropna(axis=1, how='all')   # Drop empty columns

            page_number = table.page
            clean_function = page_cleaning_switch.get(page_number)
            if clean_function is None:
                print(f"No cleaning function defined for page {page_number}")
                continue

            dfs_to_combine.append(clean_function(df))

        # Nothing usable in this PDF: move on instead of concatenating nothing
        if not dfs_to_combine:
            continue

        # Combine all cleaned DataFrames vertically for the current PDF
        df_per_trajec = pd.concat(dfs_to_combine, ignore_index=False)
        print(df_per_trajec)

        # Rename the metric columns to include the trajectory name
        df_per_trajec = df_per_trajec.rename(columns={
            'Standard Deviation': f'{trajec_name}_SD',
            'Mean': f'{trajec_name}_Mean'
        })

        # Set 'Parameter' and 'Foot' as index for alignment across PDFs
        df_per_trajec = df_per_trajec.set_index(['Parameter', 'Foot'])
        combined_trajectory_dfs.append(df_per_trajec)

    # pd.concat raises on an empty list, so stop here if nothing was cleaned
    if not combined_trajectory_dfs:
        print("No data could be extracted from the PDF files.")
        return None

    # Combine all trajectories horizontally, once, after every PDF has been
    # processed, and turn 'Parameter' and 'Foot' back into columns
    final_df = pd.concat(combined_trajectory_dfs, axis=1).reset_index()
    print(final_df.T)

    final_df.to_csv(output_dir, index=False)
    return final_df
            
            

                
                    






               
            
                
            

    

# -----------------------------
# 3. Example Usage
# -----------------------------

# Run the example only when this file is executed directly (script or
# notebook cell), not when it is imported, because the paths below are
# specific to one machine.
if __name__ == "__main__":
    # Directory that holds the PDF reports to process
    pdf_dir = r"C:\Users\yangk\OneDrive - National University of Singapore\P013 SmartSole\60. Verification testing\TEST-003B\Results\MotionCloud Reports"
    # Page string handed to camelot via extract_data_from_pdfs
    pages_to_extract = '4,18'
    # Despite the name, this is the path of the CSV file that gets written
    output_dir = r'C:\Users\yangk\OneDrive\Documents\HATCH\Programming Learning\Gait Report PDF Generator\Extracted Data from MotionCloud Report\extracted_data.csv'

    # Call the extraction function and keep whatever it returns
    extracted_data = extract_data_from_pdfs(pdf_dir, pages_to_extract, output_dir)






Processing file: C:\Users\yangk\OneDrive - National University of Singapore\P013 SmartSole\60. Verification testing\TEST-003B\Results\MotionCloud Reports\fast_mid1-gait report.pdf
  Found 2 tables in C:\Users\yangk\OneDrive - National University of Singapore\P013 SmartSole\60. Verification testing\TEST-003B\Results\MotionCloud Reports\fast_mid1-gait report.pdf on pages 4,18.
                       0             1               2             3  \
0                                         Duration (s)                 
1                   Left                   0.88 ± 0.01          Left   
2                  Right                   0.88 ± 0.01         Right   
3             Difference          0.00                    Difference   
4            Swing Phase                                Stance Phase   
5                         Duration (s)  Gait Cycle (%)                 
6                   Left   0.39 ± 0.01    43.81 ± 0.99          Left   
7                  Right   0.39 ± 0.01    44.