In [None]:
from transformers import AutoTokenizer

# Tokenizer used further down in this notebook to count tokens in JSON payloads.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
import os
import json
import markdown
from bs4 import BeautifulSoup
from tqdm import tqdm
from dataclasses import dataclass

@dataclass
class Sample:
  """One example response payload bundled with its API and endpoint metadata."""
  # API-level metadata; this notebook stores the keys
  # id, name, description, slug, category and score.
  api: dict
  # Endpoint-level metadata; this notebook stores the keys
  # id, route, method, name, description and params.
  endpoint: dict
  # Example response; this notebook stores the keys name, status and json.
  # The annotation also allows None.
  payload: dict | None

def remove_markdown(string: str) -> str:
  """Return the plain-text content of a Markdown string.

  The input is rendered to HTML with ``markdown.markdown`` and the text
  content of the parsed HTML is returned.
  """
  parsed = BeautifulSoup(markdown.markdown(string), features="html.parser")
  return parsed.get_text()

def load_apis(data_dir: str = './data') -> list:
  """Load every API document stored as a ``.json`` file below ``data_dir``.

  Args:
    data_dir: Directory that is walked recursively.

  Returns:
    The parsed JSON documents whose ``'version'`` entry is truthy, in a
    deterministic order (directories and file names are visited sorted).

  Raises:
    KeyError: If a document has no ``'version'`` key.
    json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
  """
  loaded = []
  for root, dirs, files in os.walk(data_dir):
    # Sorting `dirs` in place makes os.walk descend in a stable order, so
    # positional lookups into the resulting lists are reproducible across
    # machines and file systems.
    dirs.sort()
    for file in sorted(files):
      if not file.endswith('.json'):
        continue
      # Decode explicitly as UTF-8 instead of relying on the platform's
      # locale-dependent default encoding.
      with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
        api = json.load(f)
      # Documents without version information carry no endpoints to sample.
      if not api['version']:
        continue
      loaded.append(api)
  return loaded


apis = load_apis('./data')

# Build one Sample per (API, endpoint, example response) triple that passes
# the filters below.
samples = []
for api in tqdm(apis):
  for endpoint in api['version']['endpoints']:
    for payload in endpoint['responsePayloads']:
      # Only payloads that ship an example under the "Response" key are usable.
      example = payload['examples'].get('Response')
      if not example:
        continue
      payload_data = example['value']
      # Keep structured JSON only (objects and arrays); skip everything else.
      if not isinstance(payload_data, (dict, list)):
        continue

      # Join with a separator: without one, the end of one field and the
      # start of the next could accidentally spell "amazon" and drop a sample
      # that never mentions it.
      strings = '\n'.join([api['name'], api['description'], endpoint['name'], endpoint['description']])
      # Skip entries containing characters from the CJK Unified Ideographs
      # block (U+4E00..U+9FFF).
      if any('\u4e00' <= char <= '\u9fff' for char in strings):
        continue
      # Skip everything that mentions Amazon in a name or description.
      if 'amazon' in strings.lower():
        continue

      # `endpoint['params']` may be falsy, in which case there are no parameters.
      params = endpoint['params']['parameters'] if endpoint['params'] else []

      sample = Sample(
        api={
          'id': api['id'],
          'name': remove_markdown(api['name']),
          'description': remove_markdown(api['description']),
          'slug': api['slugifiedName'],
          'category': api['category'],
          'score': api['score']['popularityScore'] if api['score'] is not None else None,
        },
        endpoint={
          'id': endpoint['id'],
          'route': endpoint['route'],
          'method': endpoint['method'],
          'name': remove_markdown(endpoint['name']),
          'description': remove_markdown(endpoint['description']),
          'params': [{
            'name': p['name'],
            'type': p.get('paramType'),
            'description': p.get('description'),
            'condition': p['condition'],
          } for p in params],
        },
        payload={
          'name': payload['name'],
          'status': payload['statusCode'],
          'json': payload_data,
        },
      )
      samples.append(sample)

print(len(samples))

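# Filtering notes:
# - Exclude everything that contains "amazon" in the name (the loop above checks names and descriptions).
# - TODO: set a token limit for the JSON of e.g. 3k tokens to keep out e.g. base64 images,
#   unless the API score is > 9; then the limit can be 10k tokens.
# - TODO: require score > 0.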

In [None]:
from tqdm import tqdm
from datasets import Dataset, DatasetDict

# Size limits, in characters, applied before a sample is added to the dataset.
MAX_RESPONSE_CHARS = 50_000
MAX_DESCRIPTION_CHARS = 500

# Column-oriented buffers; converted into a `Dataset` once they are filled.
columns = {
  'api_name': [],
  'api_description': [],
  'endpoint_name': [],
  'endpoint_description': [],
  'response': [],
}

for sample in tqdm(samples):
  # Run the cheap length checks first so rejected samples are never serialized.
  if len(sample.api['description']) > MAX_DESCRIPTION_CHARS:
    continue
  if len(sample.endpoint['description']) > MAX_DESCRIPTION_CHARS:
    continue
  response_json = json.dumps(sample.payload['json'])
  if len(response_json) > MAX_RESPONSE_CHARS:
    continue

  columns['api_name'].append(sample.api['name'])
  columns['api_description'].append(sample.api['description'])
  columns['endpoint_name'].append(sample.endpoint['name'])
  columns['endpoint_description'].append(sample.endpoint['description'])
  columns['response'].append(response_json)

dataset = Dataset.from_dict(columns)
print(len(dataset))
# Side effect: uploads the dataset to the Hugging Face Hub repository below.
dataset.push_to_hub('davidfant/rapidapi-example-responses')


In [None]:
import json
from dataclasses import asdict
import json_document_splitter as jds

# Inspect how a single sample's JSON payload is split into clusters.
idx = 570
sample = samples[idx]
input_ids = tokenizer.encode(json.dumps(sample.payload['json']))

# Refuse very large payloads up front, with an error that says which sample
# failed and by how much.
if len(input_ids) > 10_000:
  raise ValueError(f'sample {idx} is too long: {len(input_ids)} tokens (limit is 10000)')


def calculate_weight(x):
  """Return the number of token ids produced for the JSON dump of ``x.value``."""
  return len(tokenizer.encode(json.dumps(x.value)))


graph = jds.create_graph(sample.payload['json'])
clusters = jds.sample_clusters(graph, max_weight=512, calculate_weight=calculate_weight)
jds.visualize(graph, clusters, calculate_weight=calculate_weight, label_objects_and_arrays=False)

print(len(clusters))
print(json.dumps({ 'tokens': len(input_ids), **asdict(sample) }, indent=2))