In [None]:
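# Prerequisite: install the DataShop Java web-services client as described at
# https://pslcdatashop.web.cmu.edu/about/webservices.html#java-client
# The code below runs ./dist/datashop-webservices.jar relative to the working directory.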

import subprocess
import requests
import pandas as pd
from io import StringIO

def send_command(command):
    """Run ``command`` through the shell and return everything it printed.

    stderr is merged into stdout and the result is decoded to ``str``.
    A non-zero exit status does not raise: the output captured from the
    failed command is returned instead, so the return value alone does not
    tell the caller whether the command succeeded.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    # The exit status is intentionally not inspected; output is returned either way.
    return completed.stdout
    
def get_transaction_baseurl(dataset_id='5415', sample_id='7658'):
    """Return the DataShop services URL for one sample's transactions.

    The identifiers are interpolated into the path as-is; no query string
    is attached.
    """
    return (
        'https://pslcdatashop.web.cmu.edu/services/'
        f'datasets/{dataset_id}/samples/{sample_id}/transactions'
    )

def get_transaction_fullurl(dataset_id='5415', sample_id='7658', offset = '0'):
    """Return the transactions URL with its query string attached.

    The query carries ``cfs=all``, ``limit=5000`` and the given ``offset``
    (converted with ``str``, so ints and strings are both accepted).
    """
    base_url = get_transaction_baseurl(dataset_id=dataset_id, sample_id=sample_id)
    query = {'cfs': 'all', 'limit': '5000', 'offset': str(offset)}
    # A bare PreparedRequest is used only as a URL builder; nothing is sent.
    prepared = requests.models.PreparedRequest()
    prepared.prepare_url(base_url, query)
    return prepared.url

def parse_tab_delimited_output(output):
    """Parse tab-separated text into a pandas DataFrame.

    Leading and trailing whitespace is stripped from ``output`` before
    parsing; the remaining text is handed to ``pd.read_csv`` with a tab
    separator and otherwise default options.
    """
    buffer = StringIO(output.strip())
    return pd.read_csv(buffer, sep='\t')
    
def send_ds_command(url):
    """Invoke the DataShop web-services jar on ``url`` and return its output.

    The jar is looked up at ``./dist/datashop-webservices.jar`` relative to
    the current working directory, and the URL is passed double-quoted on
    the shell command line. Whatever ``send_command`` returns is passed
    through unchanged.
    """
    return send_command(f'java -jar ./dist/datashop-webservices.jar "{url}"')

def get_tx_data(dataset_id='5415', sample_id='7658', offset = '0'):
    """Fetch one page of transactions, starting at ``offset``, as a DataFrame.

    Builds the request URL, runs the DataShop client on it, and parses the
    client's tab-delimited output.
    """
    page_url = get_transaction_fullurl(
        dataset_id=dataset_id,
        sample_id=sample_id,
        offset=offset,
    )
    raw_text = send_ds_command(page_url)
    return parse_tab_delimited_output(raw_text)

def get_full_tx_records(dataset_id='5415', sample_id='7658'):
    """Download every transaction page for a sample and return one DataFrame.

    Pages are requested one after another, advancing the offset by the
    number of rows received, until a page comes back with a row count other
    than the page size. At least one request is always made.

    The combined frame gets a fresh 0..N-1 index. Without that, every page
    would contribute its own 0-based labels and the result would contain
    duplicate index values.

    Returns:
        A DataFrame holding the rows of all fetched pages, in fetch order.
    """
    # Must equal the 'limit' query parameter set in get_transaction_fullurl;
    # a full page is the only signal that more rows may follow.
    page_size = 5000
    pages = []
    offset = 0
    while True:
        print(f'Paginating transactions with offset {offset}')
        page = get_tx_data(dataset_id=dataset_id, sample_id=sample_id, offset=offset)
        pages.append(page)
        row_count = page.shape[0]
        offset += row_count
        if row_count != page_size:
            break
    return pd.concat(pages, ignore_index=True)
        
    

In [None]:
# Pull all transaction pages for dataset 5415 / sample 7658 into one DataFrame.
df = get_full_tx_records('5415', '7658')

In [None]:
# Write the downloaded transactions to tx.csv, leaving out the DataFrame index column.
df.to_csv(path_or_buf='tx.csv', index=False)