In [3]:
import requests
import json
import os
from instabase.notebook.conf import ibconfig

# URI
INSTABASE_URI = 'https://dogfood.instabase.com'
RUN_DIFF_API = INSTABASE_URI + '/api/v1/diff/compute'
GET_FLOW_STATUS_API = INSTABASE_URI + '/api/v1/jobs/status'

# Set your folder paths
# ---------------------

PATH_TO_INPUT_FILES_A = u'/sudeep/instabase-tutorials/fs/Instabase Drive/files/doc_A/input'  # fill this
PATH_TO_INPUT_FILES_B = u'/sudeep/instabase-tutorials/fs/Instabase Drive/files/doc_B/input'  # fill this
PATH_TO_OUTPUT_FOLDER = u'/sudeep/instabase-tutorials/fs/Instabase Drive/files/out'  # fill this

# Set difference parameters
# -------------------------

SENSITIVITY = 0.8 # Number between 0 and 1
IGNORE_DOTS_SMALLER_THAN_WIDTH_PCT = 0.009
IGNORE_DOTS_SMALLER_THAN_HEIGHT_PCT = 0.008
IGNORE_MARGIN_X_PERCENT = 0.009
IGNORE_MARGIN_Y_PERCENT = 0.009
IGNORE_LINES_LONGER_THAN_WIDTH_PCT = 0.2
IGNORE_LINES_LONGER_THAN_HEIGHT_PCT = 0.01
SIGNATURE_KEYWORDS = 'Yours Faithfully,Sincerely,For and on behalf of'

# Request header
MY_API_TOKEN = 'zr3NkK0axjXUrPEOAXB8k7TuNd79OV'                                     # Fill this out: Put your API token here.
HEADERS = {
  'Authorization': 'Bearer {}'.format(MY_API_TOKEN),
}

# Process Files
# -------------
RUN_FLOW_API = INSTABASE_URI + '/api/v1/flow/run_flow_async'

def process_files(input_dir_path):
  data = json.dumps({
    'input_dir': input_dir_path,
    'ibflow_path': 'sudeep/instabase-tutorials/fs/Instabase Drive/helper_flows/process_files_flow.ibflow',
  })
  r = requests.post(RUN_FLOW_API, headers=HEADERS, data=data)
  return r.content

import StringIO
import time

def get_status(job_id):
    r = requests.get(
        GET_FLOW_STATUS_API + '?job_id=' + job_id, headers=HEADERS, cookies=ibconfig.cookies)
    print(r.content)
    return json.loads(r.content)

def get_flow_output_folder(flow_resp):
    
    job_id = flow_resp['data']['job_id']
    
    print('Showing flow results for job: ' + job_id)
    done = False
    resp = {}
    
    while not done:
        resp = get_status(job_id)
        print(resp)
        if resp['status'] != 'OK':
            print(resp['msg'])
            return

        if resp['state'] != 'DONE':
            cur_status = json.loads(resp.get('cur_status', '{}'))
            step_name = cur_status.get('stepName')
            msg = cur_status.get('curMsg')
            print('Running: ' + step_name + ', ' + msg + '...')
            time.sleep(4)
            continue
        
        done = True
        break
    
    output_folder = resp['results'][0]['output_folder']
    return output_folder


resp_A = json.loads(process_files(PATH_TO_INPUT_FILES_A))
resp_B = json.loads(process_files(PATH_TO_INPUT_FILES_B))

PATH_TO_OUTPUT_OF_PROCESS_FILES_A = get_flow_output_folder(resp_A)
PATH_TO_OUTPUT_OF_PROCESS_FILES_B = get_flow_output_folder(resp_B)

print(PATH_TO_OUTPUT_OF_PROCESS_FILES_A)
print(PATH_TO_OUTPUT_OF_PROCESS_FILES_B)


Showing flow results for job: 1dd5672c-49ad-478e-9111-182d26a8f14d
{"status": "OK", "cur_status": "{\"index\": 0, \"stepName\": \"Process Files\", \"curMsg\": \"Completed 0/1 files (0/0 pages)\", \"subindex\": 0, \"total\": 1, \"subtotal\": 0}", "state": "PENDING", "job_id": "1dd5672c-49ad-478e-9111-182d26a8f14d", "completed_count": 0, "results": [null]}
{u'status': u'OK', u'cur_status': u'{"index": 0, "stepName": "Process Files", "curMsg": "Completed 0/1 files (0/0 pages)", "subindex": 0, "total": 1, "subtotal": 0}', u'job_id': u'1dd5672c-49ad-478e-9111-182d26a8f14d', u'completed_count': 0, u'results': [None], u'state': u'PENDING'}
Running: Process Files, Completed 0/1 files (0/0 pages)...
{"status": "OK", "cur_status": "{\"finish_timestamp\": 1532545629.0, \"index\": 1, \"stepName\": \"Process Files\", \"curMsg\": \"Completed 1/1 files (2/2 pages)\", \"subindex\": 2, \"total\": 1, \"subtotal\": 2}", "state": "DONE", "job_id": "1dd5672c-49ad-478e-9111-182d26a8f14d", "completed_count":

In [4]:
def _pretty_print(d):
  try:
    d = json.loads(d)
  except:
    pass
  return json.dumps(d, sort_keys=True, indent=4, separators=(',', ': '))

def run_diff():
    data = json.dumps({
      "diffType": "diff:strikethrough",
      "folderA": PATH_TO_OUTPUT_OF_PROCESS_FILES_A,
      "folderB": PATH_TO_OUTPUT_OF_PROCESS_FILES_B,
      "analysisFolder": PATH_TO_OUTPUT_FOLDER,
      "globalParameterOverrides": {
        "global_sensitivity": SENSITIVITY,
        "ignore_dots_smaller_than_width_percent": IGNORE_DOTS_SMALLER_THAN_WIDTH_PCT,
        "ignore_dots_smaller_than_height_percent": IGNORE_DOTS_SMALLER_THAN_HEIGHT_PCT,
        "ignore_margin_x_percent": IGNORE_MARGIN_X_PERCENT,
        "ignore_margin_y_percent": IGNORE_MARGIN_Y_PERCENT,
        "ignore_lines_longer_than_width_percent": IGNORE_LINES_LONGER_THAN_WIDTH_PCT,
        "ignore_lines_shorter_than_height_percent": IGNORE_LINES_LONGER_THAN_HEIGHT_PCT,
        "extract_location_keywords": SIGNATURE_KEYWORDS
      }
    })
    r = requests.post(RUN_DIFF_API, headers=HEADERS, data=data)
    return r.content


diff_resp = run_diff()
_pretty_print(diff_resp)
print(diff_resp_dict)

{"analysisFolder": "/sudeep/instabase-tutorials/fs/Instabase Drive/files/out", "folderB": "/sudeep/instabase-tutorials/fs/Instabase Drive/files/doc_B/out/process_files_helper", "folderA": "/sudeep/instabase-tutorials/fs/Instabase Drive/files/doc_A/out/process_files_helper", "diffType": "diff:strikethrough", "globalParameterOverrides": {"ignore_dots_smaller_than_height_percent": 0.008, "ignore_lines_shorter_than_height_percent": 0.01, "extract_location_keywords": "Yours Faithfully,Sincerely,For and on behalf of", "ignore_margin_y_percent": 0.009, "ignore_margin_x_percent": 0.009, "global_sensitivity": 0.8, "ignore_lines_longer_than_width_percent": 0.2, "ignore_dots_smaller_than_width_percent": 0.009}}
{'Authorization': 'Bearer zr3NkK0axjXUrPEOAXB8k7TuNd79OV'}
https://dogfood.instabase.com/api/v1/diff/compute
{"status": "OK", "job_id": "bb50ebc7-bcdc-40f4-8a21-84af4ba0975a"}
{u'status': u'OK', u'job_id': u'bb50ebc7-bcdc-40f4-8a21-84af4ba0975a'}


In [5]:
PATH_TO_OUTPUT_OF_DIFF = get_flow_output_folder(diff_resp_dict)
print(PATH_TO_OUTPUT_OF_DIFF)

KeyError: 'data'

In [None]:
import StringIO
import time

def _display_file(full_path):
    
    _, ext = os.path.splitext(full_path)
    
    with ib.open('/' + full_path.lstrip('/'), 'r') as f:
        content = f.read()
    
    if ext == '.csv':
        display(pd.read_csv(StringIO.StringIO(content)))
    else:
        print(content)
    
def print_diff_result(flow_resp):
    
    job_id = None                                              # Fill this out: Extract the job_id from the flow_resp
    
    print('Showing flow results for job: ' +job_id)
    done = False
    resp = {}
    
    while not done:
        resp = get_status(job_id)
        print(resp)
        if resp['status'] != 'OK':
            print(resp['msg'])
            return

        if resp['state'] != 'DONE':
            cur_status = json.loads(resp.get('cur_status', '{}'))
            step_name = cur_status.get('stepName')
            msg = cur_status.get('curMsg')
            print('Running: ' + step_name + ', ' + msg + '...')
            time.sleep(4)
            continue
        
        done = True
        break
    
    output_folder = resp['results'][0]['output_folder']
    
    list_resp, err = ib.list_dir(output_folder)
    if err:
        print(err)
        return
    
    full_path = ''
    
    for node in list_resp['nodes']:
        if node['type'] != 'file':
            continue
        
        full_path = node['full_path']
        break
    
    if not full_path:
        print('No files found in the output directory')
        return
    _display_file(full_path)
