# Generate Topics and launch the web app

In [1]:
import os
import subprocess
import webbrowser
import time
from pathlib import Path
import json
from pathlib import Path
from bunkatopics.functions.web import launch_web_app

2023-10-20 17:09:28.290814: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-20 17:09:28.290841: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-20 17:09:28.290872: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-20 17:09:28.299061: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-20 17:09:38.319676: E tensorflow/compiler/

# Load documents and topics

In [2]:
# Serialized BunkaTopics outputs. Each file holds a single JSON value that is
# loaded as-is. (For JSON Lines inputs such as 'json_examples/docs.jsonl' and
# 'json_examples/topics.jsonl', parse one JSON object per line instead.)
documents_path = Path('json_examples/bourdieu_docs.json')
topics_path = Path('json_examples/bourdieu_topics.json')

# JSON text is UTF-8; decode it explicitly rather than relying on the
# platform's default encoding, which can mis-decode non-ASCII content.
with open(topics_path, 'r', encoding='utf-8') as serialized_topics:
    topics = json.load(serialized_topics)

with open(documents_path, 'r', encoding='utf-8') as serialized_documents:
    documents = json.load(serialized_documents)

# Bundle both collections under the keys read by transform_and_write().
source_data = {
    'documents': documents,
    'topics': topics
}

In [14]:
def _document_to_web(doc):
    """Convert one BunkaTopics document record to the web front-end document format."""
    ranking = doc["topic_ranking"]
    # "bourdieu_dimensions" may be None or empty; both mean "no dimensions".
    dimensions = doc["bourdieu_dimensions"] or []
    return {
        "id": doc["doc_id"],
        "text": doc["content"],
        "source": None,  # not present in the source data
        "language": "en",  # not present in the source data
        "languages": ["en"],  # not present in the source data
        "created_at_timestamp_sec": None,  # not present in the source data
        "author": None,  # not present in the source data
        "embedding_light": [doc["x"], doc["y"]],
        # NOTE(review): "topic_ids" is filled from doc["term_id"], not
        # doc["topic_id"] - kept as in the original; confirm this is intended.
        "topic_ids": doc["term_id"],
        "rank": {
            "rank": ranking["rank"] if ranking else None,
            "rank_per_topic": {
                str(doc["topic_id"]): {
                    "rank": ranking["rank"],
                    "score": None,
                    "score_bin": None,
                    "count_specific_terms": None,
                    "specificity_bin": None,
                    "bunka_score": None
                }
            } if ranking else {}
        },
        # doc["embedding"] has no counterpart in the target format.
        "dimensions": [{
            "id": " / ".join([
                " ".join(dimension["continuum"]["left_words"]),
                " ".join(dimension["continuum"]["right_words"]),
            ]),
            "score": dimension["distance"]
        } for dimension in dimensions],
    }


def _topic_to_web(topic):
    """Convert one BunkaTopics topic record to the web front-end topic format."""
    topic_id = topic["topic_id"]
    return {
        "id": topic_id,
        "size": topic["size"],
        "percent": None,  # not present in the source data
        "parent_topic_id": None,  # not present in the source data
        "centroid": {
            "cluster_id": topic_id,
            "x": topic["x_centroid"],
            "y": topic["y_centroid"]
        },
        "convex_hull": {
            "cluster_id": topic_id,
            "x_coordinates": topic["convex_hull"]["x_coordinates"],
            "y_coordinates": topic["convex_hull"]["y_coordinates"]
        },
        "explanation": {
            "topic_id": topic_id,
            "name": topic["name"],
            "specific_terms": topic["term_id"],
            "top_terms": topic["top_term_id"],
            "top_entities": topic["top_doc_id"]
        }
    }


def _build_query(left_words, right_words):
    """Return the fixed "query" section of the output, filled with the collected continuum words."""
    return {
        "text": "Test",
        "top_k": 400,
        "min_doc_retrieved": 100,
        "max_toxicity": 0.8,
        "languages": None,
        "topics": {
            "shape": [
                6,
                2
            ],
            "convex_hull_interpolation": True,
            "min_doc_per_topic": 20,
            "ngrams": [
                1,
                2
            ],
            "min_count_term": 3,
            "top_terms_included": 20000,
            "text_type": "term_id",
            "n_terms_in_name": 5,
            "number_top_terms_returned": 20,
            "number_specific_terms_returned": 20,
            "top_n_specificity_fn": 200,
            "specificity_weight": 6,
            "popularity_weight": 3,
            "feature_binned_number": 10
        },
        "intensity_dimensions": [
            {
                "id": "arts",
                "kind": "intensity",
                "words": [
                    "arts",
                    "sculpture",
                    "architecture",
                    "painting",
                    "drawing",
                    "music",
                    "literature",
                    "poetry",
                    "theater",
                    "dance",
                    "movie",
                    "photography",
                    "cinema",
                    "cooking",
                    "fashion"
                ]
            },
        ],
        "continuum_dimensions": [
            {
                "id": "positive / negative",
                "kind": "continuum",
                "left_id": "positive",
                "right_id": "negative",
                "left_words": left_words,
                "right_words": right_words
            },
        ]
    }


def transform_and_write(source_data, output_file_path):
    """
    Transform BunkaTopics output to the web front-end format

    Args:
        source_data: mapping with a "documents" list and a "topics" list of
            BunkaTopics records (dicts).
        output_file_path: path of the JSON file to write; it is overwritten.

    Raises:
        KeyError: if a document or topic lacks one of the fields read here.

    Documents whose "bourdieu_dimensions" is None or empty get an empty
    "dimensions" list instead of raising a TypeError.
    """
    left_words = []
    right_words = []

    # NOTE(review): each dimension's word list is appended as one element, so
    # these end up as lists of lists with one entry per (document, dimension)
    # pair. Kept as in the original; confirm the front end expects this shape.
    for doc in source_data["documents"]:
        for dimension in doc["bourdieu_dimensions"] or []:
            left_words.append(dimension["continuum"]["left_words"])
            right_words.append(dimension["continuum"]["right_words"])

    payload = {
        "documents": [_document_to_web(doc) for doc in source_data["documents"]],
        "topics": [_topic_to_web(topic) for topic in source_data["topics"]],
        "query": _build_query(left_words, right_words),
        "nb_documents": len(source_data["documents"]),
    }

    # json.dump escapes non-ASCII characters by default; the explicit encoding
    # just keeps the written file independent of the platform locale.
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(payload, outfile, indent=4)


# Root of the BunkaTopics checkout. Set the BUNKATOPICS_ROOT environment
# variable to point at another checkout instead of editing this cell; the
# default keeps the original location.
bunkatopics_root = Path(os.environ.get('BUNKATOPICS_ROOT', '/home/elishowk/src/BunkaTopics'))
output_file_path = bunkatopics_root / 'web' / 'public' / 'localSearchResults.json'
print(output_file_path)

# Write the converted data to the JSON file under web/public.
transform_and_write(source_data, output_file_path)
#launch_web_app(source_data)


/home/elishowk/src/BunkaTopics/web/public/localSearchResults.json
