In [1]:
import pandas as pd
import camelot

In [2]:
filename = 'data/canopy_technical_test_input.pdf'

In [3]:
tables = camelot.read_pdf(filename, flavor='stream', pages='all')

In [4]:
df = tables[0].df
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,31.03.2018 - 30.04.2018,,,,,,
1,Account SG1234567-01-01-JPY01 in JPY,,,,,,
2,Booking Details,,,,,,
3,Booking Date,Txn Date,Booking Text,Value Date,Debit,Credit,Balance
4,31.03.2018,31.03.2018,Initial Balance,,,,0.00


### Cleaning
Remove the first few rows that are not part of the table.  
Assuming `Booking Date` will always the 1st column header

In [5]:
def find_header_row(df):
    for i,row in df.iterrows():
        if row[0] == 'Booking Date': return i
h_row = find_header_row(df)
h_row

3

In [6]:
headers = list(df.iloc[h_row,:])
headers

['Booking Date',
 'Txn Date',
 'Booking Text',
 'Value Date',
 'Debit',
 'Credit',
 'Balance']

In [7]:
# strip the top part
df = df.iloc[h_row+1:,:]
# set the header
df.columns = headers

In [8]:
df.head()

Unnamed: 0,Booking Date,Txn Date,Booking Text,Value Date,Debit,Credit,Balance
4,31.03.2018,31.03.2018,Initial Balance,,,,0.0
5,01.04.2018,01.04.2018,VALUE DATED BALANCE BROUGHT FORWARD,01.04.2018,,180431640.0,180431640.0
6,06.04.2018,06.04.2018,INTEREST-FIXED TERM LOAN,06.04.2018,472500.0,,179959140.0
7,,,Contract No: 3001-AA18091ZN72C|Interest rate:,,,,
8,,,"0.810000%|Capital: 1,000,000,000.00|Period: 16...",,,,


### Merge the rows above that don't have balance entry
This perticular column was chosen since the other columns can be empty while the balance entry always exists

In [9]:
# index if each valid entry starting row
starts = list(df.index[df['Balance'].astype(bool)])
starts

[4, 5, 6, 10, 14, 16, 18, 22, 24, 28, 30]

In [10]:
for i,j in zip(starts[:-1],starts[1:]):
    df['Booking Text'][i] = "\n".join(df['Booking Text'].loc[i:j-1])
df = df[df['Balance'].astype(bool)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
df = df.reset_index(drop=True)

In [12]:
df

Unnamed: 0,Booking Date,Txn Date,Booking Text,Value Date,Debit,Credit,Balance
0,31.03.2018,31.03.2018,Initial Balance,,,,0.0
1,01.04.2018,01.04.2018,VALUE DATED BALANCE BROUGHT FORWARD,01.04.2018,,180431640.0,180431640.0
2,06.04.2018,06.04.2018,INTEREST-FIXED TERM LOAN\nContract No: 3001-AA...,06.04.2018,472500.0,,179959140.0
3,06.04.2018,06.04.2018,INTEREST-FIXED TERM LOAN\nContract No: 3001-AA...,06.04.2018,315000.0,,179644140.0
4,06.04.2018,06.04.2018,FOREX SPOT\nEUR/JPY 130.7271,06.04.2018,,472500.0,180116640.0
5,06.04.2018,06.04.2018,FOREX SPOT\nEUR/JPY 130.7021,06.04.2018,,315000.0,180431640.0
6,09.04.2018,09.04.2018,INTEREST-FIXED TERM LOAN\nContract No: 3001-AA...,09.04.2018,157500.0,,180274140.0
7,09.04.2018,09.04.2018,FOREX SPOT\nEUR/JPY 131.1407,09.04.2018,,157500.0,180431640.0
8,10.04.2018,10.04.2018,INTEREST-FIXED TERM LOAN\n\n\nContract No: 300...,10.04.2018,157500.0,,180274140.0
9,10.04.2018,10.04.2018,FOREX SPOT\nEUR/JPY 131.1953,10.04.2018,,157500.0,180431640.0
