In [10]:
from tabula import read_pdf
from tabulate import tabulate
import re
import numpy as np
import pandas as pd
import io, os
# NOTE(review): this changes into the directory that is already current, so the
# working directory is left unchanged; relative paths below resolve against it.
os.chdir(os.getcwd())

In [11]:
def colTofloat(df_col):
    """
    Convert a column of strings such as "47.95 (0.27)" into floats, separating
    the leading value from the number given in parentheses.

    Every input element yields exactly one entry in each returned list, so the
    two lists always have the same length as the input.

    Parameters:
    -------
    df_col: :class: `pandas.Series`: dataframe column data (any iterable works)

    Return:
    -------
    col_array: :class: `list`: converted values; an element that cannot be
        converted is kept unchanged
    std_array: :class: `list`: converted stds; NaN when no std is given or
        when the element could not be converted
    """
    # Group 1: text before the first "(" (trailing whitespace dropped);
    # group 2: text between that "(" and the next ")".
    value_with_std = re.compile(r"(.*?)\s*\((.*?)\)")
    col_array = []  # collect exp data
    std_array = []  # collect exp std
    for ele in df_col:
        # Defaults used whenever parsing fails: raw element and a missing std.
        value, std = ele, np.nan
        if "(" in str(ele):
            # Only real strings can be matched; anything else stays raw.
            found = value_with_std.match(ele) if isinstance(ele, str) else None
            if found is not None:
                try:
                    # Both conversions are evaluated before either name is
                    # rebound, so a bad std can never leave a half-parsed row
                    # (this previously appended the value twice).
                    value, std = float(found.group(1)), float(found.group(2))
                except ValueError:
                    pass  # e.g. "ol (12)" or "1.5 (abc)": keep the raw text
        else:
            try:
                value = float(ele)
            except (TypeError, ValueError, OverflowError):
                pass  # not numeric: keep the element as it is
        col_array.append(value)
        std_array.append(std)
    return col_array, std_array

In [12]:
# Read the tables found on the desired page of the PDF.
target_page = 4
pdf_name = 'Zhang+Lithos.pdf'
file = read_pdf(pdf_name, pages=target_page,
                multiple_tables=True, stream=True)
# Render the extraction as plain text so it can be re-read as fixed-width columns.
table = tabulate(file)

df_table_data = pd.read_fwf(io.StringIO(table))
df_table_std = df_table_data.copy()  # keep the dataframe for std having same dimension as exp data
# Split every cell on its first run of spaces and keep the second part
# (presumably this drops a leading row label -- TODO confirm against the
# tabulate output), then convert the remainder to floats for exp data and std.
for col in df_table_data.columns:
    split_col = df_table_data[col].str.split(" +", n=1, expand=True)[1]
    # Parse the column once; colTofloat returns (values, stds) together.
    df_table_data[col], df_table_std[col] = colTofloat(split_col)

In [13]:
# Preview the first 10 rows of the parsed values.
df_table_data.head(10)

Unnamed: 0,-----------------------------------,-------------------------------,-------------------------,--------------------------------,-------------------------.1,--------------------------,------------------------,------------------------.1,------------------------.2,------------------------.3,-------------------------.2,------------------------.4,-------------------------.3,------------------------.5,--------------------------.1,---------------------------,-------------------------------.1,------------------------.6
0,cNAB-13 melt (20),b,47.95,,5.39,8.29,12.69,0.17,13.35,9.0,1.12,0.56,0.32,0.03,0.11,98.98,65.0,
1,NAB-11 melt (20),,47.53,,5.5,8.26,12.66,0.18,12.29,9.14,1.7,0.74,0.56,0.03,0.12,98.72,64.0,
2,ol (12),,39.96,,0.09,0.05,12.62,0.15,45.83,0.28,0.01,,0.05,0.33,0.1,99.47,87.0,0.27
3,NAB-10 melt (20),,49.45,,5.71,9.21,13.0,0.18,10.9,9.54,1.82,0.78,0.63,0.03,0.12,101.39,60.0,
4,ol (9),,38.31,,0.13,0.09,14.67,0.21,42.68,0.35,0.02,,0.06,0.32,0.15,96.99,84.0,0.31
5,NAB-9 melt (20),,48.29,,5.81,8.93,12.93,0.2,10.3,9.75,1.71,0.81,0.64,0.02,0.13,99.53,59.0,
6,ol (9),,40.29,,0.08,0.06,14.46,0.21,45.18,0.3,0.03,,0.04,0.4,0.1,101.14,85.0,0.26
7,NAB-8 melt (20),,48.33,,5.89,9.25,12.6,0.17,8.45,10.25,1.86,0.87,0.69,0.03,0.08,98.44,55.0,
8,ol (10),,39.07,,0.12,0.08,16.18,0.2,43.15,0.3,0.02,,0.04,0.29,0.22,99.68,83.0,0.26
9,sp (3),,0.0,,11.59,7.52,32.39,0.26,11.28,,,,,0.32,32.56,95.92,,


In [14]:
# Preview the first 10 rows of the parsed stds (NaN where none was parsed).
df_table_std.head(10)

Unnamed: 0,-----------------------------------,-------------------------------,-------------------------,--------------------------------,-------------------------.1,--------------------------,------------------------,------------------------.1,------------------------.2,------------------------.3,-------------------------.2,------------------------.4,-------------------------.3,------------------------.5,--------------------------.1,---------------------------,-------------------------------.1,------------------------.6
0,,,0.27,,0.17,0.09,0.26,0.06,0.14,0.16,0.08,0.03,0.04,0.03,0.06,,,
1,,,0.48,,0.17,0.19,0.32,0.05,0.24,0.2,0.08,0.04,0.06,0.03,0.05,,,
2,,,0.42,,0.03,0.05,0.22,0.05,0.47,0.03,0.01,,0.05,0.11,0.14,,,
3,,,0.23,,0.17,0.11,0.31,0.04,0.16,0.16,0.07,0.05,0.04,0.03,0.06,,,
4,,,0.32,,0.13,0.15,0.17,0.04,0.56,0.21,0.03,,0.03,0.12,0.17,,,
5,,,0.24,,0.19,0.11,0.33,0.07,0.14,0.12,0.08,0.04,0.04,0.03,0.06,,,
6,,,0.44,,0.04,0.06,0.29,0.07,0.41,0.04,0.05,,0.03,0.14,0.15,,,
7,,,0.22,,0.15,0.11,0.25,0.06,0.15,0.17,0.08,0.05,0.04,0.03,0.03,,,
8,,,0.31,,0.09,0.08,0.33,0.06,0.37,0.05,0.03,,0.04,0.09,0.3,,,
9,,,0.0,,1.49,0.36,0.99,0.06,0.45,,,,,0.16,1.96,,,


In [15]:
# Export the parsed values and their stds to separate Excel workbooks,
# written relative to the current working directory.
for workbook_name, frame in (
    ("table_data.xlsx", df_table_data),
    ("table_std.xlsx", df_table_std),
):
    frame.to_excel(workbook_name)