In [52]:
import requests
import random
import jsonschema
from jsonschema import Draft7Validator
import time
import json
from datasets import load_dataset
from sseclient import SSEClient
from bunkatopics import Bunka
from bunkatopics.datamodel import Document, Topic, BourdieuQuery

# --- API configuration ---
# Root URL of the hosted Bunka API; every endpoint path below is appended to it.
baseapi_url = "https://beta.bunkasearch.com/api"

# Endpoints that receive a processing request (POST).
topics_path = "/topics/"
bourdieu_path = "/bourdieu/"

# Endpoints polled for task progress as server-sent events.
sse_topics_path = "/tasks/topics/"
sse_bourdieu_path = "/tasks/bourdieu/"

# Headers: JSON bodies for task submission, event stream for progress.
headers_json = {"Content-Type": "application/json"}
headers_sse = {"Accept": "text/event-stream"}

# Local Bunka object that later cells fill with the docs/topics returned by the API.
bunka = Bunka()

In [35]:
# Step 1: Fetch the OpenAPI specification
openapi_url = f"{baseapi_url}/openapi.json"
# The timeout keeps the cell from hanging on an unreachable host, and
# raise_for_status() surfaces an HTTP error here instead of as a confusing
# JSON/KeyError when the spec is indexed below.
response = requests.get(openapi_url, timeout=30)
response.raise_for_status()
openapi_spec = response.json()


def _request_body_schema(spec, path):
    """Return the JSON schema declared for the POST request body of `path` in `spec`."""
    post_operation = spec["paths"][path]["post"]
    return post_operation["requestBody"]["content"]["application/json"]["schema"]


def _build_validator(schema, spec, base_uri):
    """Build a Draft 7 validator for `schema`, resolving `$ref`s against the whole `spec`."""
    # NOTE: jsonschema.RefResolver is deprecated in recent jsonschema releases in
    # favour of the `referencing` library; it still works but emits a
    # DeprecationWarning. It is kept here to avoid adding a new dependency.
    return Draft7Validator(
        schema,
        resolver=jsonschema.RefResolver(base_uri=base_uri, referrer=spec),
    )


# Step 2: Extract the request-body schema of each endpoint and build its validator.
# The endpoint paths come from the configuration cell so they stay in sync with
# the URLs used for the POST requests.
topics_schema = _request_body_schema(openapi_spec, topics_path)
validator = _build_validator(topics_schema, openapi_spec, openapi_url)

bourdieu_schema = _request_body_schema(openapi_spec, bourdieu_path)
bourdieu_validator = _build_validator(bourdieu_schema, openapi_spec, openapi_url)

  resolver=jsonschema.RefResolver(base_uri=openapi_url, referrer=openapi_spec),
  resolver=jsonschema.RefResolver(base_uri=openapi_url, referrer=openapi_spec),


In [28]:
# Scientific literature data: keep only the "title" column of the train split
arxiv_papers = load_dataset("CShorten/ML-ArXiv-Papers")
dataset = arxiv_papers["train"]["title"]

In [29]:
# Step 3: Your Data to Validate
N_DOCS = 500  # number of titles sent to the API
SAMPLE_SEED = 42  # fixed seed so Restart & Run All draws the same sample

# A dedicated Random instance makes the sample reproducible without touching
# the global RNG state; min() avoids a ValueError if the dataset is smaller
# than N_DOCS.
full_docs = random.Random(SAMPLE_SEED).sample(dataset, min(N_DOCS, len(dataset)))
data_to_validate = {
    # Fill this with the data you want to validate
    "full_docs": full_docs,
    "topic_param": {"n_clusters": 5, "clean_topics": False},
}
# Step 4: Validate the Data
# Nothing is printed when the payload conforms to the schema.
for error in sorted(validator.iter_errors(data_to_validate), key=str):
    print(error.message)

In [30]:
# Step 5: Send POST Request to '/topics/'
response_topics = requests.post(
    f"{baseapi_url}{topics_path}",
    headers=headers_json,
    json=data_to_validate,
    timeout=60,  # do not hang forever if the API is unreachable
)
# Fail loudly on an HTTP error (e.g. a rejected payload) instead of hitting a
# KeyError on "task_id" below.
response_topics.raise_for_status()
task_response = response_topics.json()
task_id = task_response["task_id"]
print(f"Response received, Topics processing task ID = {task_id}")

Response received, Topics processing task ID = 6317a0ad-8968-4263-8e79-616242f8b80f


In [31]:
# Step 6: Wait for the task to succeed and start the visualisation front-end (web app)
result_processing = None  # Variable to store the final result
url = f"{baseapi_url}{sse_topics_path}{task_id}/progress"
try:
    with requests.get(url, stream=True, headers=headers_sse) as response:
        # Surface HTTP errors (unknown task id, server error) before parsing events.
        response.raise_for_status()
        client = SSEClient(response)
        for event in client.events():
            result_topics = json.loads(event.data)
            state = result_topics.get("state")
            if state == "FAILURE":
                print("Desired state 'FAILURE' found")
                print(result_topics)
                break  # terminal state: stop listening
            if state == "SUCCESS":
                print("Desired state 'SUCCESS' found")
                result_processing = result_topics.get("result", None)
                if result_processing is None:
                    print("Task succeeded but returned no result payload:", result_topics)
                break  # terminal state: stop listening
            # Any other state is a progress update.
            print(result_topics)

except requests.RequestException as e:
    print("Error during connection:", e)

# Start the front-end only once the stream is closed and a result was received.
if result_processing is not None:
    bunka.docs = [Document(**row) for row in result_processing["docs"]]
    bunka.topics = [Topic(**row) for row in result_processing["topics"]]
    bunka.start_server()

{'state': 'STARTED', 'progress': 0}
{'state': 'STARTED', 'progress': 0}
{'state': 'STARTED', 'progress': 0}
{'state': 'STARTED', 'progress': 0}
{'state': 'STARTED', 'progress': 0}
{'state': 'STARTED', 'progress': 0}
{'state': 'STARTED', 'progress': 0}
{'state': 'PROCESSING', 'progress': 100.2}
Desired state 'SUCCESS' found
Server on port 3000 is already running. Killing it...
NPM server started.


In [45]:
# Step 7: Prepare and validate the bourdieu query
# Word lists for each end of the two axes, plus the radius setting.
bourdieu_axes = {
    "x_left_words": ["left"],
    "x_right_words": ["right"],
    "y_top_words": ["top"],
    "y_bottom_words": ["bottom"],
    "radius_size": 0.6,
}
# Request body: same documents and topic parameters as the topics request,
# with the axis query added.
data_to_validate = {
    "full_docs": full_docs,
    "query": bourdieu_axes,
    "topic_param": {"n_clusters": 5, "clean_topics": False},
}
# Validate the data; nothing is printed when the payload conforms to the schema.
schema_problems = sorted(bourdieu_validator.iter_errors(data_to_validate), key=str)
for problem in schema_problems:
    print(problem.message)

In [46]:
# Step 8: Send POST Request to '/bourdieu/'
# NOTE: the variable keeps its original name `response_topics` even though it
# holds the Bourdieu response.
response_topics = requests.post(
    f"{baseapi_url}{bourdieu_path}",
    headers=headers_json,
    json=data_to_validate,
    timeout=60,  # do not hang forever if the API is unreachable
)
# Fail loudly on an HTTP error (e.g. a rejected payload) instead of hitting a
# KeyError on "task_id" below.
response_topics.raise_for_status()
task_response = response_topics.json()
task_id = task_response["task_id"]
print(f"Response received, Bourdieu processing task ID = {task_id}")

Response received, Bourdieu processing task ID = 16034037-7814-411b-8cf7-7b4621a9f586


In [56]:
# Step 9: Wait for the task to succeed and start the visualisation front-end (web app)
result_processing = None  # Variable to store the final result
url = f"{baseapi_url}{sse_bourdieu_path}{task_id}/progress"
try:
    with requests.get(url, stream=True, headers=headers_sse) as response:
        # Surface HTTP errors (unknown task id, server error) before parsing events.
        response.raise_for_status()
        client = SSEClient(response)
        for event in client.events():
            result_topics = json.loads(event.data)
            state = result_topics.get("state")
            if state == "FAILURE":
                print("Desired state 'FAILURE' found")
                print(result_topics)
                break  # terminal state: stop listening
            if state == "SUCCESS":
                print("Desired state 'SUCCESS' found")
                result_processing = result_topics.get("result", None)
                if result_processing is None:
                    print("Task succeeded but returned no result payload:", result_topics)
                break  # terminal state: stop listening
            # Any other state is a progress update.
            print(result_topics)

except requests.RequestException as e:
    print("Error during connection:", e)

# Start the front-end only once the stream is closed and a result was received.
if result_processing is not None:
    bunka.bourdieu_docs = [Document(**row) for row in result_processing["docs"]]
    bunka.bourdieu_topics = [Topic(**row) for row in result_processing["topics"]]
    # Rebuild the query object from the same payload that was sent to the API.
    bunka.bourdieu_query = BourdieuQuery(**data_to_validate["query"])

    bunka.start_server_bourdieu()

Desired state 'SUCCESS' found
NPM server started.



> bunka-web@0.1.0 start
> react-scripts  --openssl-legacy-provider start





[36mStarting the development server...[39m
[36m[39m
[32mCompiled successfully![39m

You can now view [1mbunka-web[22m in the browser.

  [1mLocal:[22m            http://localhost:[1m3000[22m
  [1mOn Your Network:[22m  http://10.2.0.2:[1m3000[22m

Note that the development build is not optimized.
To create a production build, use [36mnpm run build[39m.

webpack compiled [1m[32msuccessfully[39m[22m
