<a href="https://colab.research.google.com/github/dhruvtre/Lossfunk_Code/blob/main/Empirical_Observation_Extraction_and_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## PDF Functions

In [None]:
!pip install pyMuPDF
!pip install pymupdf4llm

In [2]:
import os
import requests
import pymupdf
import time

In [3]:
def prepare_urls_for_download(url_string: str) -> list:
    """
    Turn a comma-separated string of PDF URLs into download-ready records.

    Whitespace around each URL is trimmed and empty entries (for example from
    a trailing comma) are discarded. Titles are placeholders of the form
    ``paper_<n>``, numbered from 1 in the order the surviving URLs appear.

    Args:
        url_string: A string containing one or more PDF URLs separated by commas.

    Returns:
        A list of dictionaries, each with 'pdf_url' and 'paper_title'.
    """
    trimmed = (piece.strip() for piece in url_string.split(','))
    # filter(None, ...) drops the empty strings, so numbering only counts real URLs.
    return [
        {'pdf_url': link, 'paper_title': f"paper_{position}"}
        for position, link in enumerate(filter(None, trimmed), start=1)
    ]

# Example usage:
# url_input = "https://arxiv.org/pdf/2407.12345.pdf, https://proceedings.mlr.press/v1/paper_a.pdf"
# paper_list_for_download = prepare_urls_for_download(url_input)
# print(paper_list_for_download)

In [4]:
def download_pdf(pdf_url, paper_title, download_dir="Hypothesis_Generator_Explorer_Test"):
    """Fetch a PDF over HTTP and store it as ``<download_dir>/<paper_title>.pdf``.

    Args:
        pdf_url: Address of the PDF to fetch.
        paper_title: Used verbatim as the file name (without the extension).
        download_dir: Target folder; created when it does not exist yet.

    Returns:
        The path of the saved file, or None when the request or the write
        failed (the error is printed rather than raised).
    """
    # The target folder has to exist before the file can be opened for writing.
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    destination = os.path.join(download_dir, f"{paper_title}.pdf")

    try:
        reply = requests.get(pdf_url, timeout=30)
        # Turn HTTP error statuses into exceptions so an error page is never saved as a PDF.
        reply.raise_for_status()

        with open(destination, 'wb') as pdf_file:
            pdf_file.write(reply.content)
    except Exception as e:
        print(f"Error downloading {pdf_url}: {e}")
        return None

    return destination

# # Quick Test to Check Download
# download_test_pdf_url = first_paper_sample[0]["pdf_url"]
# download_test_paper_title = first_paper_sample[0]["title"]
# file_path = download_pdf(pdf_url=download_test_pdf_url, paper_title=download_test_paper_title)

In [5]:
def extract_pdf_text_md(pdf_path, paper_title, download=False):
    """Convert a local PDF to markdown text using pymupdf4llm.

    Args:
        pdf_path: Location of the PDF on disk.
        paper_title: Used in log output and as the markdown file name.
        download: When True, also write ``<paper_title>.md`` into the PDF's folder.

    Returns:
        - None when the path is empty, does not exist, or conversion fails.
        - The markdown string when ``download`` is False.
        - A ``(markdown, md_file_path)`` tuple when ``download`` is True.
    """
    print("Checking for pdf path before markdown extraction.")
    pdf_available = bool(pdf_path) and os.path.exists(pdf_path)
    if not pdf_available:
        print("No PDF path found for markdown extraction.")
        return None

    try:
        print("Getting PDF directory name.")
        target_dir = os.path.dirname(pdf_path)

        # Imported inside the try so a missing package is reported like any
        # other extraction error instead of propagating.
        import pymupdf4llm
        print(f"Getting markdown from {paper_title}.")
        markdown = pymupdf4llm.to_markdown(pdf_path)

        if not download:
            return markdown

        # The markdown file sits next to the PDF and is named after the paper.
        md_file_path = os.path.join(target_dir, f"{paper_title}.md")

        # Bytes are written explicitly; str.encode() defaults to UTF-8.
        with open(md_file_path, 'wb') as md_file:
            md_file.write(markdown.encode())

        return markdown, md_file_path

    except Exception as e:
        print(f"Error extracting markdown from {pdf_path}: {e}")
        return None

# # Quick Test for Extract Markdown Function
# file_path="/content/downloaded_pdfs_sample_1/TimeSuite: Improving MLLMs for Long Video Understanding via Grounded Tuning.pdf"
# download_test_content = extract_pdf_text_md(file_path, download_test_paper_title)

In [6]:
import os
import pickle

def cache_paper_text(paper_id, paper_title, text_data, cache_dir="paper_text_cache"):
    """
    Pickle a paper's text data into the cache directory.

    This function only saves. When ``text_data`` is None it does not read or
    write the cache file (the cache directory is still created).

    Args:
        paper_id: Unique paper ID; the cache file is ``<paper_id>.pkl``
        paper_title: Paper title (used for logging only)
        text_data: Object to pickle; None means "nothing to cache"
        cache_dir: Directory to store cached files

    Returns:
        tuple: (success: bool, data: any)
        - (True, text_data) when the data was written
        - (False, None) when text_data is None or the write failed
    """
    os.makedirs(cache_dir, exist_ok=True)

    # Keyed by paper_id rather than title to avoid special characters in file names.
    target = os.path.join(cache_dir, f"{paper_id}.pkl")

    if text_data is None:
        print("Text data is empty. Skipping caching.")
        return False, None

    try:
        with open(target, 'wb') as handle:
            pickle.dump(text_data, handle)
        print(f"‚úÖ Cached: {paper_title[:50]}...")
    except Exception as e:
        print(f"‚ùå Cache save failed for {paper_title}: {e}")
        return False, None

    return True, text_data

## LLM Utils

In [46]:
# Getting OpenRouter API Key
# The key is fetched through google.colab's userdata helper under the name
# 'OpenRouter_dhruv_key' instead of being hard-coded in the notebook.
# send_ai_request() reads this module-level variable for its Authorization header.
from google.colab import userdata
openrouter_api_key = userdata.get('OpenRouter_dhruv_key')

import requests
import json
from typing import Optional
import re
import pickle

In [8]:
def send_ai_request(user_message, system_prompt=None, model="google/gemini-2.5-pro-preview-03-25", file=None, file_data=None, file_name=None, temperature=0.7):
    """Send a chat-completion request to the OpenRouter API and return the decoded reply.

    Args:
        user_message: Prompt text for the user turn.
        system_prompt: Optional system instruction, sent as the first message.
        model: OpenRouter model identifier.
        file: Truthy to attach a file part built from ``file_name``/``file_data``.
        file_data: Payload of the attachment, forwarded unchanged.
        file_name: File name reported for the attachment.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The response body decoded from JSON. The HTTP status is not checked
        here, so an error payload is returned as-is for the caller to inspect.
    """
    messages_array = []
    if system_prompt:
        messages_array.append({"role": "system", "content": system_prompt})

    if file:
        # Multi-part content: the text prompt plus the attachment. The list is
        # sent as structured JSON; stringifying it would deliver a Python repr
        # to the model and the file would never be treated as a file.
        content = [
            {"type": "text", "text": user_message},
            {
                "type": "file",
                "file": {
                    "filename": file_name,
                    "file_data": file_data,
                },
            },
        ]
    else:
        # Text-only turn: send the prompt itself as the message content.
        content = user_message

    messages_array.append({"role": "user", "content": content})

    payload = {
        "model": model,
        "messages": messages_array,
        "temperature": temperature,
        "transforms": ["middle-out"],
    }

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {openrouter_api_key}",
            # The body is a JSON string, so declare it as such.
            "Content-Type": "application/json",
        },
        data=json.dumps(payload),
    )
    return response.json()

In [9]:
def parse_ai_response(response, reasoning=False):
    """Extract the content and usage metrics from an API response.

    Args:
        response: Decoded JSON body of a chat-completion call.
        reasoning: When truthy, the message's 'reasoning' field is required
            and included in the result. Any falsy value means "not requested".

    Returns:
        A dict. On success it holds 'content', 'usage' and 'success': True,
        plus 'reasoning' when it was requested. On failure (missing keys,
        empty choices, or a response that is not a nested mapping) it holds
        'content'/'usage'/'reasoning' set to None, 'success': False, the
        error text under 'error' and the original object under 'response'.
    """
    try:
        message = response['choices'][0]['message']
        result = {
            'content': message['content'],
            'usage': response['usage'],
        }
        if reasoning:
            result['reasoning'] = message['reasoning']
        result['success'] = True
        return result
    # TypeError covers a None response or None/str values where a dict or
    # list was expected, so callers always get the failure dict back.
    except (KeyError, IndexError, TypeError) as e:
        return {
            'content': None,
            'usage': None,
            'success': False,
            'reasoning': None,
            'error': str(e),
            'response': response
        }

In [10]:
def extract_json_between_markers(llm_output: str) -> dict | None:
    # Regular expression pattern to find JSON content between ```json and ```
    json_pattern = r"```json(.*?)```"
    matches = re.findall(json_pattern, llm_output, re.DOTALL)

    if not matches:
        # Fallback: Try to find any JSON-like content in the output
        json_pattern = r"\{.*?\}"
        matches = re.findall(json_pattern, llm_output, re.DOTALL)

    for json_string in matches:
        json_string = json_string.strip()
        try:
            parsed_json = json.loads(json_string)
            return parsed_json
        except json.JSONDecodeError:
            # Attempt to fix common JSON issues
            try:
                # Remove invalid control characters
                json_string_clean = re.sub(r"[\x00-\x1F\x7F]", "", json_string)
                parsed_json = json.loads(json_string_clean)
                return parsed_json
            except json.JSONDecodeError:
                continue  # Try next match

    return None  # No valid JSON found

## Paper Loading and Chunking

In [182]:
# Function for Loading Paper Text from Cache
def load_cached_paper_text(paper_id, cache_dir="paper_text_cache"):
    """
    Read back the pickled text data stored for a paper ID.

    Args:
        paper_id: Unique paper ID; the cache file is ``<paper_id>.pkl``
        cache_dir: Directory where cached files are stored

    Returns:
        tuple: (success: bool, text_data: any)
        - (True, loaded_text_data) when the file exists and unpickles
        - (False, None) when the file is missing or cannot be loaded
    """
    cache_file = os.path.join(cache_dir, f"{paper_id}.pkl")

    if os.path.exists(cache_file):
        try:
            with open(cache_file, 'rb') as handle:
                cached = pickle.load(handle)
            print(f"‚úÖ Loaded cached text for paper ID: {paper_id}")
            return True, cached
        except Exception as e:
            # A corrupt or unreadable cache entry is reported, not raised.
            print(f"‚ùå Error loading cache for paper ID {paper_id}: {e}")
            return False, None

    print(f"‚ùå No cached text found for paper ID: {paper_id}")
    return False, None


# Function for Chunking by Paragraph
def chunk_by_paragraph(text, min_length=1000):
    """
    Split text into paragraph chunks, folding short paragraphs into the chunk before them.

    Paragraphs are separated by blank lines ('\\n\\n'); each is stripped and
    empty ones are ignored. A paragraph shorter than ``min_length`` is appended
    to the previous chunk (joined with a blank line). The very first paragraph
    always starts a chunk, however short it is. The length test looks at the
    paragraph alone, not at the chunk built so far.

    Args:
        text: The full paper text
        min_length: Minimum character length for a paragraph to start its own chunk

    Returns:
        List of paragraph chunks
    """
    groups = []
    for piece in text.split('\n\n'):
        paragraph = piece.strip()
        if not paragraph:
            continue

        if groups and len(paragraph) < min_length:
            # Too short to stand alone: ride along with the previous chunk.
            groups[-1].append(paragraph)
        else:
            groups.append([paragraph])

    return ["\n\n".join(group) for group in groups]

## Prompt and Chunk Labelling

In [27]:
def property_labelling_prompt(property, property_description, paper_title, chunk, paper_abstract=None):
    """Build the labelling prompt for one chunk of a paper.

    Args:
        property: Name of the property to extract (interpolated into the task sentences).
        property_description: Definition of the property, inserted verbatim.
        paper_title: Title given to the model as paper context.
        chunk: The text chunk to analyze.
        paper_abstract: Accepted but not used in the prompt.

    Returns:
        The prompt string, which asks for <thinking>, <label>,
        <extracted_content> and <citations> sections in the reply.
    """
    return f'''You are a scientist and expert information extractor following philosophy of science principles.

    Your task is to identify and extract {property} from research text.

    {property_description}

    Your final output should be in the following format:
    <thinking>
    [Your reasoning about whether the property is present]
    </thinking>
    <label>
    [PROPERTY_NAME if property present, NOT otherwise]
    </label>
    <extracted_content>
    [If property present: State the extracted content as a complete, self-contained claim. If NOT: Write "NONE"]
    </extracted_content>
    <citations>
    [If property present: Include citation markers from the chunk. If NOT or no citations: Write "NONE"]
    </citations>

    Chunk to analyze:
    {chunk}

    Paper context:
    Title: {paper_title}

    Use paper context to understand the chunk, but only extract {property} directly stated IN the chunk.

    '''

In [57]:
LABELLING_MODEL = "openai/gpt-5"


# Single paper chunk labelling
def label_all_chunks_per_paper(paper_dict, property_name, property_description, model=LABELLING_MODEL, max_chunks=None):
    """Label every chunk of one paper with the LLM and store the results on the paper dict.

    Reads 'paper_title', 'pdf_url' and 'chunks' from ``paper_dict``. For each
    chunk a labelling prompt is built and sent; replies that parse
    successfully are collected as dicts with 'chunk_n' (0-based position),
    'chunk_text' and 'label_output'. Chunks whose reply cannot be parsed are
    reported and left out of the result.

    Args:
        paper_dict: Paper record; modified in place.
        property_name: Name of the property to label.
        property_description: Definition of the property for the prompt.
        model: Model identifier passed to send_ai_request.
        max_chunks: When truthy, only the first ``max_chunks`` chunks are
            labelled and results go under 'labeled_chunks_<max_chunks>';
            otherwise all chunks are labelled into 'labeled_chunks_all'.

    Returns:
        True once the results have been stored.
    """
    paper_title = paper_dict['paper_title']
    pdf_url = paper_dict['pdf_url']
    paper_chunks = paper_dict['chunks']
    print(f"Labelling chunks for paper: {paper_title}, {pdf_url}")
    print(f"Total chunks: {len(paper_chunks)}")
    print(f"{'='*60}\n")

    if max_chunks:
        paper_chunks = paper_chunks[:max_chunks]
    total = len(paper_chunks)

    print(f"Starting labelling for: {total}")
    labeled_chunks = []
    failed = 0
    for i, chunk in enumerate(paper_chunks):
        position = f"{i+1}/{total}"
        print(f"Labelling chunk {position}")
        prompt = property_labelling_prompt(property_name, property_description, paper_title, chunk)
        response = send_ai_request(prompt, model=model)
        print(f"AI request for chunk {position} completed.")
        parsed_response = parse_ai_response(response)

        if not parsed_response['success']:
            # Make dropped chunks visible instead of skipping them silently.
            failed += 1
            print(f"AI response for chunk {position} could not be parsed; skipping. Error: {parsed_response.get('error')}")
            continue

        print(f"AI response for chunk {position} parsed.")
        labeled_chunks.append({
            'chunk_n': i,
            'chunk_text': chunk,
            'label_output': parsed_response
        })
        print(f"AI request for chunk {position} appended.")

    # Add to paper dict
    result_key = f'labeled_chunks_{max_chunks}' if max_chunks else 'labeled_chunks_all'
    paper_dict[result_key] = labeled_chunks
    # Report the number actually processed (max_chunks may be unset or exceed the chunk count).
    print(f"AI request for labelling {total} chunks complete. Appended.")
    if failed:
        print(f"{failed} of {total} chunks had no usable response and were not stored.")
    return True

## Embedding Preparation

In [60]:
from sentence_transformers import SentenceTransformer

In [15]:
def parse_property_output(content):
    """
    Pull the <thinking>, <label>, <extracted_content> and <citations> sections out of an LLM reply.

    Each section is the stripped text between the first matching pair of
    tags. A section that is absent keeps its default value. If parsing
    raises (for instance because ``content`` is not a string) the error is
    printed and whatever was collected so far is returned.

    Args:
        content: The raw LLM response string

    Returns:
        dict with keys: thinking, label, extracted_content, citations
        (defaults: '', 'NOT', 'NONE', 'NONE')
    """
    import re

    fields = {
        'thinking': '',
        'label': 'NOT',
        'extracted_content': 'NONE',
        'citations': 'NONE',
    }

    try:
        # Same pattern for every section: non-greedy, spanning newlines.
        for tag in ('thinking', 'label', 'extracted_content', 'citations'):
            found = re.search(rf'<{tag}>(.*?)</{tag}>', content, re.DOTALL)
            if found is not None:
                fields[tag] = found.group(1).strip()
    except Exception as e:
        print(f"Error parsing observation output: {e}")

    return fields

In [70]:
def prepare_chunks_for_embedding(papers, property_label, labeled_chunks_key='labeled_chunks_20'):
    """
    Collect the extractions whose parsed label equals ``property_label``.

    Every labeled chunk is parsed with parse_property_output and the parsed
    dict is stored back on the chunk under 'parsed' (the input is modified
    in place). Papers without ``labeled_chunks_key`` are skipped. Chunks with
    a different label are reported on stdout.

    Args:
        papers: List of paper dictionaries with labeled chunks
        property_label: Label to filter for (e.g., 'OBS', 'METHODOLOGY', 'LIMITATION'); compared exactly
        labeled_chunks_key: Key in paper dict for labeled chunks (e.g., 'labeled_chunks_20', 'labeled_chunks_all')

    Returns:
        List of dictionaries with paper_url, paper_title, extracted_content, has_citations, citations
    """
    selected = []

    for paper in papers:
        if labeled_chunks_key in paper:
            for chunk in paper[labeled_chunks_key]:
                fields = parse_property_output(chunk['label_output']['content'])
                chunk['parsed'] = fields

                if fields['label'] != property_label:
                    print(f"{chunk['parsed']['label']} is not the same as {property_label}")
                    continue

                citations = fields['citations']
                selected.append({
                    'paper_url': paper['pdf_url'],
                    'paper_title': paper['paper_title'],
                    'extracted_content': fields['extracted_content'],
                    # 'NONE' is the placeholder for "no citations".
                    'has_citations': citations != 'NONE',
                    'citations': citations
                })

    return selected

In [73]:
def create_embeddings(extracted_content_for_embedding, model_name='allenai/scibert_scivocab_uncased', batch_size=16, save_path=None):
    """
    Embed the 'extracted_content' text of each item with a SentenceTransformer model.

    Args:
        extracted_content_for_embedding: List of dicts with 'extracted_content' key
        model_name: Name of the model handed to SentenceTransformer
        batch_size: Number of texts passed to each encode() call
        save_path: Optional path; when given, a pickle holding both the
            embeddings and the input list is written there.

    Returns:
        numpy array of embeddings, same order as input list
    """
    print(f"Loading embedding model: {model_name}")
    encoder = SentenceTransformer(model_name)

    texts = [entry['extracted_content'] for entry in extracted_content_for_embedding]

    print(f"Embedding {len(texts)} extractions...")
    print(f"Model embedding dimension: {encoder.get_sentence_embedding_dimension()}")

    # One encode() call per slice of batch_size texts; results are flattened
    # back into a single list in input order.
    batch_starts = range(0, len(texts), batch_size)
    vectors = [
        vector
        for start in batch_starts
        for vector in encoder.encode(texts[start:start + batch_size], show_progress_bar=True)
    ]

    embeddings_array = np.array(vectors)
    print(f"‚úÖ Created embeddings matrix: {embeddings_array.shape}")

    if save_path:
        bundle = {
            'embeddings': embeddings_array,
            'extractions': extracted_content_for_embedding
        }
        with open(save_path, 'wb') as out_file:
            pickle.dump(bundle, out_file)

    return embeddings_array

## Embedding Clustering

In [169]:
def cluster_extractions_kmeans(embeddings, k=4):
    """
    Cluster extractions using KMeans.

    The embeddings are clustered as given (no normalisation is applied) with a
    fixed random_state of 42, and a per-cluster size summary is printed.

    Args:
        embeddings: numpy array of embeddings (n_samples, n_features)
        k: Number of clusters (default: 4)

    Returns:
        cluster_labels: Array of cluster assignments
        clusterer: Fitted KMeans object
    """
    from sklearn.cluster import KMeans

    print(f"Clustering {embeddings.shape[0]} extractions with KMeans (k={k})...")

    clusterer = KMeans(n_clusters=k, random_state=42)
    cluster_labels = clusterer.fit_predict(embeddings)

    print(f"\nüìä Clustering Results:")
    print(f"   Clusters found: {k}")
    for i in range(k):
        size = sum(1 for assigned in cluster_labels if assigned == i)
        print(f"   Cluster {i}: {size} extractions")

    return cluster_labels, clusterer

In [74]:
import hdbscan
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

In [124]:
def cluster_extractions(embeddings, min_cluster_size=5, min_samples=3):
    """
    Cluster extractions using HDBSCAN on high-dimensional embeddings.

    A pairwise cosine-distance matrix is computed from the embeddings (cast
    to float64) and handed to HDBSCAN as a precomputed metric. A summary of
    cluster count, noise share and cluster sizes is printed.

    Args:
        embeddings: numpy array of embeddings, one row per extraction
        min_cluster_size: Forwarded to HDBSCAN
        min_samples: Forwarded to HDBSCAN

    Returns:
        cluster_labels: Array of cluster assignments (-1 is counted as noise)
        clusterer: Fitted HDBSCAN object
    """
    print(f"Clustering {embeddings.shape[0]} extractions...")
    print(f"Parameters: min_cluster_size={min_cluster_size}, min_samples={min_samples}")

    # Cast to float64 before building the distance matrix.
    distance_matrix = cosine_distances(embeddings.astype(np.float64))

    hdbscan_options = dict(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='precomputed',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    clusterer = hdbscan.HDBSCAN(**hdbscan_options)
    cluster_labels = clusterer.fit_predict(distance_matrix)

    assigned = list(cluster_labels)
    noise_adjustment = 1 if -1 in cluster_labels else 0
    n_clusters = len(set(cluster_labels)) - noise_adjustment
    n_noise = assigned.count(-1)

    print(f"\nüìä Clustering Results:")
    print(f"   Clusters found: {n_clusters}")
    print(f"   Noise points: {n_noise} ({n_noise/len(cluster_labels)*100:.1f}%)")

    if n_clusters > 0:
        print("\n   Cluster sizes:")
        # Cluster ids are taken to be 0..n_clusters-1.
        for i in range(n_clusters):
            size = assigned.count(i)
            print(f"   Cluster {i}: {size} extractions")

    return cluster_labels, clusterer

## Summarising Clusters

In [187]:
def summarize_cluster_extractions(cluster_extractions, property_name, property_description, cluster_id, model="openai/gpt-5"):
    """
    Ask the LLM for a short label and description of one cluster.

    Args:
        cluster_extractions: List of extraction dictionaries in the cluster
            (each needs an 'extracted_content' key)
        property_name: Name of the property being summarized
        property_description: Description of the property being summarized
        cluster_id: Cluster identifier
        model: LLM model to use

    Returns:
        The model's reply text, stripped (the prompt asks for "Label:" and
        "Description:" lines). If the response cannot be parsed, the
        fallback string "Cluster <cluster_id>" is returned.
    """
    # Only the first 10 extractions are shown to the model.
    sample_size = min(10, len(cluster_extractions))
    sampled_extractions = cluster_extractions[:sample_size]

    # Create extraction list for prompt
    extraction_list = "\n".join(
        f"{i+1}. {ext_cont['extracted_content']}" for i, ext_cont in enumerate(sampled_extractions)
    )
    print(extraction_list)

    prompt = f"""You are analyzing clusters of {property_name} from research papers.

    {property_name} is defined as:
    {property_description}

    Review these {len(sampled_extractions)} extractions from Cluster {cluster_id} and provide a SHORT label (4-5 words maximum) that captures their common theme or phenomenon.

    You should also output a brief description of the label covering any specific details showcasing what the label encapsulates.

    Your final output should only include:

    Label:
    Description:

    Output only the cluster label and description, nothing else. Keep them as specific as possible.

    Extractions:
    {extraction_list}
    """

    response = send_ai_request(prompt, model=model)
    print(response)
    parsed = parse_ai_response(response)

    if parsed['success']:
        return parsed['content'].strip()
    return f"Cluster {cluster_id}"


def generate_cluster_summaries(cluster_labels, extracted_content_for_embedding, property_name, property_description, model="openai/gpt-5"):
    """
    Summarize every non-empty cluster and return the summaries keyed by cluster id.

    This loop previously sat after the return statements of
    summarize_cluster_extractions, where it could never run.

    Args:
        cluster_labels: Cluster assignment per extraction (noise label -1 is skipped)
        extracted_content_for_embedding: Extraction dicts, aligned with cluster_labels
        property_name: Name of the property being summarized
        property_description: Description of the property being summarized
        model: LLM model to use

    Returns:
        Dict mapping cluster_id to the summary string for that cluster
    """
    cluster_summaries = {}
    print("Generating cluster summaries...\n")

    # Cluster ids run from 0 to the largest label; an empty input yields no clusters.
    max_cluster_id = int(max(cluster_labels)) if len(cluster_labels) > 0 else -1

    for cluster_id in range(max_cluster_id + 1):
        members = [
            ext for i, ext in enumerate(extracted_content_for_embedding)
            if cluster_labels[i] == cluster_id
        ]

        if members:
            summary = summarize_cluster_extractions(members, property_name, property_description, cluster_id, model=model)
            cluster_summaries[cluster_id] = summary
            print(f"Cluster {cluster_id}: {summary} ({len(members)} extractions)")

    print("\n‚úÖ Cluster summaries generated!")
    return cluster_summaries

## UMAP Visualisation

In [174]:
def create_visualization_data(embeddings, cluster_labels, extracted_content_list,
                              umap_neighbors=15, umap_min_dist=0.1,
                              random_state=42, save_path=None):
    """
    Project embeddings to 2D with UMAP and bundle the result with the given cluster labels.

    Args:
        embeddings: numpy array of high-dimensional embeddings (n_samples, n_features)
        cluster_labels: Cluster assignments, one per sample; -1 is not counted as a cluster
        extracted_content_list: List of dictionaries with extracted content (passed through as-is)
        umap_neighbors: Number of neighbors for UMAP (default: 15)
        umap_min_dist: Minimum distance for UMAP (default: 0.1)
        random_state: Random seed handed to UMAP (default: 42)
        save_path: Optional path to save visualization data as pickle. If None, don't save.

    Returns:
        dict with keys:
            - 'embeddings_2d': output of UMAP fit_transform on the embeddings
            - 'cluster_labels': The input cluster_labels (unchanged)
            - 'extractions': The input extracted_content_list (unchanged)
    """
    import umap
    import numpy as np  # NOTE(review): not referenced in this function

    print("Projecting to 2D with UMAP...")
    projector = umap.UMAP(
        n_neighbors=umap_neighbors,
        min_dist=umap_min_dist,
        metric='cosine',
        random_state=random_state
    )
    embeddings_2d = projector.fit_transform(embeddings)

    # The -1 label is treated as noise and left out of the cluster count.
    distinct_labels = len(set(cluster_labels))
    n_clusters = distinct_labels - (1 if -1 in cluster_labels else 0)
    print(f"‚úÖ UMAP complete! Shape: {embeddings_2d.shape}")
    print(f"   Visualizing {n_clusters} clusters")

    visualization_data = dict(
        embeddings_2d=embeddings_2d,
        cluster_labels=cluster_labels,
        extractions=extracted_content_list,
    )

    if save_path:
        import pickle
        with open(save_path, 'wb') as out_file:
            pickle.dump(visualization_data, out_file)
        print(f"‚úÖ Saved visualization data to {save_path}")

    return visualization_data

In [176]:
def visualize_clusters_with_summaries(embeddings_2d, cluster_labels,
                                      extracted_content_list, cluster_summaries,
                                      property_name="Extractions", title_suffix="",
                                      width=1200, height=800, show=True):
    """
    Draw a scatter plot of the clusters, annotated with their summary labels.

    One trace is added per cluster (label -1 is left out). A cluster that has
    an entry in ``cluster_summaries`` uses it as its legend name and also gets
    a text annotation at the mean position of its points.

    Args:
        embeddings_2d: 2D UMAP projections (n_samples, 2)
        cluster_labels: Cluster assignments (n_samples,); compared element-wise, so an array is expected
        extracted_content_list: List of extraction dicts with 'extracted_content', 'paper_url', 'has_citations'
        cluster_summaries: Dict mapping cluster_id to summary string
        property_name: Name of property being visualized (for title)
        title_suffix: Optional suffix for title
        width: Plot width in pixels
        height: Plot height in pixels
        show: Whether to display the plot immediately

    Returns:
        plotly.graph_objects.Figure object
    """
    import plotly.graph_objects as go
    import plotly.express as px
    import numpy as np  # NOTE(review): not referenced in this function

    # Every distinct label except the noise label -1, in ascending order.
    unique_clusters = sorted(c for c in set(cluster_labels) if c != -1)
    n_clusters = len(unique_clusters)

    # Set3 has 12 colours; with more clusters the palette is cycled below.
    palette = px.colors.qualitative.Set3
    colors = palette[:n_clusters] if n_clusters <= 12 else palette

    fig = go.Figure()

    # First pass: one scatter trace per cluster.
    for idx, cluster_id in enumerate(unique_clusters):
        mask = cluster_labels == cluster_id
        cluster_points = embeddings_2d[mask]

        members = [ext for i, ext in enumerate(extracted_content_list) if cluster_labels[i] == cluster_id]

        hover_texts = [
            f"<b>{ext['paper_url']}...</b><br><br>{ext['extracted_content'][:200]}...<br><br>Citations: {ext['has_citations']}"
            for ext in members
        ]

        # Legend entry: the summary when there is one, the bare id otherwise.
        if cluster_id in cluster_summaries:
            cluster_name = f"{cluster_summaries[cluster_id]} (n={len(members)})"
        else:
            cluster_name = f"Cluster {cluster_id} (n={len(members)})"

        marker_style = dict(
            size=12,
            color=colors[idx % len(colors)],
            line=dict(width=1, color='white')
        )
        fig.add_trace(go.Scatter(
            x=cluster_points[:, 0],
            y=cluster_points[:, 1],
            mode='markers',
            name=cluster_name,
            marker=marker_style,
            text=hover_texts,
            hoverinfo='text'
        ))

    # Second pass: summary annotations at each cluster's mean position.
    for idx, cluster_id in enumerate(unique_clusters):
        if cluster_id not in cluster_summaries:
            continue

        mask = cluster_labels == cluster_id
        if not mask.any():
            continue

        cluster_points = embeddings_2d[mask]
        fig.add_annotation(
            x=cluster_points[:, 0].mean(),
            y=cluster_points[:, 1].mean(),
            text=cluster_summaries[cluster_id],
            showarrow=False,
            font=dict(size=14, color='black'),
            bgcolor='rgba(255,255,255,0.8)',
            # Border uses the same colour as the cluster's markers.
            bordercolor=colors[idx % len(colors)],
            borderwidth=2
        )

    title = f"{property_name} by Cluster Type" + (f": {title_suffix}" if title_suffix else "")

    fig.update_layout(
        title=title,
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
        width=width,
        height=height,
        hovermode='closest',
        font=dict(size=12)
    )

    if show:
        fig.show()

    return fig

In [23]:
def visualize_clusters(embeddings_2d, cluster_labels, extracted_content_list,
                      property_name="Extractions", title_suffix="",
                      width=1000, height=700, show=True):
    """
    Create a basic scatter plot visualization of clusters.

    One trace is drawn per cluster label. Points labelled -1 (treated as
    noise) are not plotted. Each point's hover text shows the first 50
    characters of the paper title, the first 200 characters of the
    extracted content (both always followed by "...") and the citation flag.

    Args:
        embeddings_2d: 2D projections, array-like of shape (n_samples, 2)
        cluster_labels: Cluster assignments, array-like of length n_samples
            (a plain list is accepted as well as a numpy array)
        extracted_content_list: List of extraction dicts with 'extracted_content', 'paper_title', 'has_citations',
            in the same order as cluster_labels
        property_name: Name of property being visualized (for title)
        title_suffix: Optional suffix for title
        width: Plot width in pixels
        height: Plot height in pixels
        show: Whether to display the plot immediately

    Returns:
        plotly.graph_objects.Figure object
    """
    import plotly.graph_objects as go
    import plotly.express as px
    import numpy as np

    # Coerce to arrays: `labels == cluster_id` only yields an element-wise
    # mask for ndarrays; with a plain list it collapses to a single bool.
    embeddings_2d = np.asarray(embeddings_2d)
    cluster_labels = np.asarray(cluster_labels)

    # Determine number of clusters dynamically (exclude noise points with -1)
    unique_clusters = sorted(c for c in set(cluster_labels.tolist()) if c != -1)
    n_clusters = len(unique_clusters)

    # Set3 has 12 colours; take a prefix when it is enough, else cycle below
    palette = px.colors.qualitative.Set3
    colors = palette[:n_clusters] if n_clusters <= 12 else palette

    fig = go.Figure()

    # Add points for each cluster
    for idx, cluster_id in enumerate(unique_clusters):
        mask = cluster_labels == cluster_id
        cluster_points = embeddings_2d[mask]

        # Reuse the mask so hover texts stay aligned with the plotted points
        cluster_extractions = [
            ext for ext, in_cluster in zip(extracted_content_list, mask) if in_cluster
        ]

        hover_texts = [
            f"<b>{ext['paper_title'][:50]}...</b><br><br>{ext['extracted_content'][:200]}...<br><br>Citations: {ext['has_citations']}"
            for ext in cluster_extractions
        ]

        # Use color from palette (cycle if more clusters than colors)
        color = colors[idx % len(colors)]

        fig.add_trace(go.Scatter(
            x=cluster_points[:, 0],
            y=cluster_points[:, 1],
            mode='markers',
            name=f'Cluster {cluster_id}',
            marker=dict(
                size=10,
                color=color,
                line=dict(width=1, color='white')
            ),
            text=hover_texts,
            hoverinfo='text'
        ))

    # Build title
    title = f"{property_name} Cluster Map"
    if title_suffix:
        title += f": {title_suffix}"

    fig.update_layout(
        title=title,
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
        width=width,
        height=height,
        hovermode='closest'
    )

    if show:
        fig.show()

    return fig

## Run Code

In [189]:
# Properties that can be extracted from paper chunks.
# Each entry maps a property key to:
#   "title"       - human-readable name of the property
#   "description" - definition text handed to the labelling and
#                   cluster-summarisation helpers as the property description
property_dictionary = {
    "empirical_observations" : {
        "title" : "Empirical Observations",
        # A stray double quote before the closing ''' was removed so it no
        # longer ends up inside the description text.
        "description" : '''An empirical observation is:
                            - Information gathered through direct or indirect observation/experimentation
                            - Evidence that could confirm, disconfirm, or arbitrate between scientific hypotheses
                            - Includes: measured results, experimental findings, observed phenomena, detected patterns, or statistical outcomes
                            - Can be from this paper's work OR cited from other studies (both are scientifically valuable)'''
    },
    "observational_statements" : {
        "title" : "Observational Statements",
        "description" : '''An observation statement is a sentence that:
                        Is about publicly observable, intersubjective features of the world.
                        Uses only: observational terms, logical terms (and, or, not, etc.),and math.
                        Is directly testable by suitable observation by (in principle) any competent observer.'''
    },
    "theoretical_statements" : {
        "title" : "Theoretical Statements",
        # The em dash below was mis-encoded and has been restored.
        "description" : '''A theoretical statement is a sentence that:
                          Contains at least one theoretical term—i.e. terms introduced by a scientific theory, typically referring to unobservable or highly theory-laden entities/structures.
                          Gets its empirical bite only indirectly: via laws, correspondence rules, bridge principles, models that connect it to observation statements.'''
    }
}

In [183]:
# End-to-end run for the "empirical_observations" property:
# download PDFs -> extract markdown -> cache -> chunk -> label -> embed ->
# inspect similarities -> KMeans -> summarise clusters -> UMAP plot.

# Taking List of URLs as Input
url_input = input("Enter a list of arxiv pdf urls separated with a comma.")

# Making List Object
paper_list_for_download = prepare_urls_for_download(url_input)
print(f"Length of paper list for download: {len(paper_list_for_download)}")

# Taking Download Directory Name
download_directory_name = input("Enter the name of the directory you want to download to.")
print(f"Downloading to directory: {download_directory_name}")

# Downloading Each URL Content
for item in paper_list_for_download:
    print(f"Downloading {item['paper_title']}...")
    file_path = download_pdf(pdf_url=item["pdf_url"], paper_title=item["paper_title"], download_dir=download_directory_name)
    if file_path:
        print(f"Downloaded {item['paper_title']} to {file_path}")

        # If Downloaded, Extracting Text PDF to MD
        download_text = extract_pdf_text_md(file_path, item["paper_title"], download=False)
        if download_text:
            print(f"Extracted text for {item['paper_title']}")
            print("Caching the text data.")

            # If Extracted, Saving to Cache (the paper title doubles as the paper id)
            status, text_data = cache_paper_text(paper_id=item['paper_title'], paper_title=item['paper_title'], text_data=download_text, cache_dir=download_directory_name)
            if status:
                print(f"Cached text for {item['paper_title']}")
    else:
        print(f"Download failed for {item['paper_title']}")

    # Pause one second between papers whether or not the download succeeded
    time.sleep(1)


# Chunking MD Text
for item in paper_list_for_download:
    print(f"Loading cached paper text for {item['paper_title']} on {item['pdf_url']}.")
    status, text_data = load_cached_paper_text(paper_id=item['paper_title'], cache_dir=download_directory_name)
    if status:
        item['chunks'] = chunk_by_paragraph(text_data)
    else:
        print(f"❌ No cached text found for paper ID: {item['paper_title']}")
        item['chunks'] = []


# Labelling Each Chunk
for item in paper_list_for_download:
    status = label_all_chunks_per_paper(item, "empirical_observations", property_dictionary['empirical_observations']['description'])
    if status:
        print(f"✅ Paper {item['paper_title']} labelled.")


# Checking Labelling Output
for item in paper_list_for_download:
    print(item.keys())
    # A paper whose download failed has no chunks, so guard the [0] access
    labeled_chunks = item.get('labeled_chunks_all') or []
    if labeled_chunks:
        print(labeled_chunks[0].keys())
        print(labeled_chunks[0]['label_output'])
    else:
        print(f"No labelled chunks found for {item['paper_title']}")

# Preparing Output for Embedding
content_for_embedding = prepare_chunks_for_embedding(paper_list_for_download, property_label="empirical_observations", labeled_chunks_key='labeled_chunks_all')
print(len(content_for_embedding))
print(type(content_for_embedding[0]))
print(content_for_embedding[0].keys())

# Checking Content for Embedding
for item in content_for_embedding:
    print(item['paper_url'])
    print(item['extracted_content'])

# Creating Embeddings
embeddings = create_embeddings(content_for_embedding, model_name='allenai/scibert_scivocab_uncased', save_path='label_all_embeddings')

# First, let's check what we're working with
print("Debug info:")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimensions: {embeddings.shape[1]}")

# Let's look at the similarity distribution
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Calculate similarities
similarities = cosine_similarity(embeddings)
# Select each unordered pair once by index (strictly above the diagonal).
# Filtering the triangle with `> 0` would also discard genuine pairs whose
# similarity is zero or negative and bias the statistics upwards.
pair_rows, pair_cols = np.triu_indices_from(similarities, k=1)
flat_sims = similarities[pair_rows, pair_cols]

print("\nSimilarity stats:")
print(f"Mean similarity: {flat_sims.mean():.3f}")
print(f"Std similarity: {flat_sims.std():.3f}")
print(f"Min similarity: {flat_sims.min():.3f}")
print(f"Max similarity: {flat_sims.max():.3f}")

# Try simpler clustering - KMeans first to see if there ARE patterns
from sklearn.cluster import KMeans

# Try different k values; a separate loop name keeps the final `k` below
# independent of whichever candidate happened to be tried last
for candidate_k in [3, 4, 5, 8, 6]:
    kmeans = KMeans(n_clusters=candidate_k, random_state=42)
    kmeans_labels = kmeans.fit_predict(embeddings)

    print(f"\nKMeans with k={candidate_k}:")
    cluster_sizes = np.bincount(kmeans_labels, minlength=candidate_k)
    for cluster_index, count in enumerate(cluster_sizes):
        print(f"  Cluster {cluster_index}: {count} observations")


# Doing final clustering with k=4
k = 4
k_means_clustering_labels, clusterer = cluster_extractions_kmeans(embeddings, k=k)

# Attach each extraction's cluster label (labels follow content_for_embedding order)
for extraction, label in zip(content_for_embedding, k_means_clustering_labels):
    extraction['cluster_id'] = label

# Generating cluster summary

cluster_summaries = {}
for cluster_id in range(k):
    # Get all extractions for this cluster
    cluster_extractions = [ext for ext in content_for_embedding if ext['cluster_id'] == cluster_id]
    if not cluster_extractions:
        continue

    summary = summarize_cluster_extractions(
        cluster_extractions,
        property_name="empirical_observations",
        property_description=property_dictionary['empirical_observations']['description'],
        cluster_id=cluster_id
    )
    cluster_summaries[cluster_id] = summary
    print(f"Cluster {cluster_id}: {summary} ({len(cluster_extractions)} extractions)")

# Report once, after every cluster has been summarised
print("\n✅ Cluster summaries generated!")
print("\nFinal summaries:", cluster_summaries)


# Generating 2d visualisation data using UMAP

visualisation_data = create_visualization_data(embeddings, k_means_clustering_labels, content_for_embedding)

# Generating visualisation with summary

visualize_clusters_with_summaries(visualisation_data['embeddings_2d'], visualisation_data['cluster_labels'], visualisation_data['extractions'], cluster_summaries=cluster_summaries, property_name="Empirical Observations")

Enter a list of arxiv pdf urls separated with a comma.https://arxiv.org/pdf/2510.09901, https://arxiv.org/pdf/2410.07076, https://arxiv.org/pdf/2307.10635, https://arxiv.org/pdf/2505.04651, https://arxiv.org/pdf/2505.04651, https://arxiv.org/pdf/2503.24047
Length of paper list for download: 6
Enter the name of the directory you want to download to.Hypothesis_generator_test
Downloading to directory: Hypothesis_generator_test
Downloading paper_1...
Downloaded paper_1 to Hypothesis_generator_test/paper_1.pdf
Checking for pdf path before markdown extraction.
Getting PDF directory name.
Getting markdown from paper_1.
Extracted text for paper_1
Caching the text data.
✅ Cached: paper_1...
Cached text for paper_1
Downloading paper_2...
Downloaded paper_2 to Hypothesis_generator_test/paper_2.pdf
Checking for pdf path before markdown extraction.
Getting PDF directory name.
Getting markdown from paper_2.
Extracted text for paper_2
Caching the text data.
✅ Cached: paper_2...
Cached text for pa



Embedding 94 extractions...
Model embedding dimension: 768


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings matrix: (94, 768)
Debug info:
Embeddings shape: (94, 768)
Embedding dimensions: 768

Similarity stats:
Mean similarity: 0.846
Std similarity: 0.063
Min similarity: 0.551
Max similarity: 0.985

KMeans with k=3:
  Cluster 0: 10 observations
  Cluster 1: 48 observations
  Cluster 2: 36 observations

KMeans with k=4:
  Cluster 0: 9 observations
  Cluster 1: 32 observations
  Cluster 2: 18 observations
  Cluster 3: 35 observations

KMeans with k=5:
  Cluster 0: 9 observations
  Cluster 1: 32 observations
  Cluster 2: 16 observations
  Cluster 3: 36 observations
  Cluster 4: 1 observations

KMeans with k=8:
  Cluster 0: 6 observations
  Cluster 1: 20 observations
  Cluster 2: 19 observations
  Cluster 3: 1 observations
  Cluster 4: 16 observations
  Cluster 5: 18 observations
  Cluster 6: 9 observations
  Cluster 7: 5 observations

KMeans with k=6:
  Cluster 0: 7 observations
  Cluster 1: 8 observations
  Cluster 2: 14 observations
  Cluster 3: 29 observations
  Cluste

KeyError: 'cluster_id'

In [186]:
# Re-run of the post-clustering steps: attach cluster labels to the
# extractions, summarise each cluster, then project and plot.
# Relies on `content_for_embedding`, `k_means_clustering_labels`, `k`
# and `embeddings` already existing from an earlier cell.
for extraction, label in zip(content_for_embedding, k_means_clustering_labels):
    extraction['cluster_id'] = label

# Generating cluster summary

cluster_summaries = {}
for cluster_id in range(k):
    # Get all extractions for this cluster
    cluster_extractions = [ext for ext in content_for_embedding if ext['cluster_id'] == cluster_id]
    if not cluster_extractions:
        continue

    summary = summarize_cluster_extractions(
        cluster_extractions,
        property_name="empirical_observations",
        property_description=property_dictionary['empirical_observations']['description'],
        cluster_id=cluster_id
    )
    cluster_summaries[cluster_id] = summary
    print(f"Cluster {cluster_id}: {summary} ({len(cluster_extractions)} extractions)")

# Report once, after every cluster has been summarised
print("\n✅ Cluster summaries generated!")
print("\nFinal summaries:", cluster_summaries)


# Generating 2d visualisation data using UMAP

visualisation_data = create_visualization_data(embeddings, k_means_clustering_labels, content_for_embedding)

# Generating visualisation with summary

visualize_clusters_with_summaries(visualisation_data['embeddings_2d'], visualisation_data['cluster_labels'], visualisation_data['extractions'], cluster_summaries=cluster_summaries, property_name="Empirical Observations")

1. - The TOMATO-Chem benchmark contains 51 chemistry/materials papers: Polymer Chemistry 21, Organic Chemistry 22, Inorganic Chemistry 3, Analytical Chemistry 5; publication venues include Nature/Science 27, Nature subjournals 20, and other top journals 4. [Table 1; Table 2]
- In inspiration retrieval (Q1), GPT-4o achieves high hit ratios even with aggressive down-selection: for corpus sizes 150/300/1000/3000, hit ratios are 92.8/76.8/61.4/NA% at top 20%, 96.7/83.7/60.8/NA% at top 4%, 96.4/88.9/69.0/46.7% at top 0.8%, and 95.8/86.9/70.6/52.0% at top 0.016%. [Table 3]
- Smaller screening windows improve retrieval: with a screen window size of 60 selecting 3 in one round (5% of corpus) the hit ratio is 71.6%, whereas a window of 15 selecting 3 for two rounds (4% of corpus) yields 83.7%. [Table 4]
- Model comparison for inspiration retrieval (corpus size 300) shows GPT-4o outperforming Llama-3.1 models: GPT-4o achieves 96.7% (top 20%), 83.7% (top 4%), 60.8% (top 0.8%); Llama-3.1-405B: 95.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



✅ UMAP complete! Shape: (94, 2)
   Visualizing 4 clusters


In [None]:
# Label -> embed -> cluster -> summarise -> plot for the
# "observational_statements" property. Relies on `paper_list_for_download`
# (with chunks) and `property_dictionary` from earlier cells.

# Labelling Each Chunk
for item in paper_list_for_download:
    status = label_all_chunks_per_paper(item, "observational_statements", property_dictionary['observational_statements']['description'])
    if status:
        print(f"✅ Paper {item['paper_title']} labelled.")


# Checking Labelling Output
# NOTE(review): this check reads 'labeled_chunks_all' while the embedding step
# below reads 'labeled_chunks_all_observational' — confirm which key
# label_all_chunks_per_paper actually writes for this property.
for item in paper_list_for_download:
    print(item.keys())
    # Papers without chunks would make a bare [0] access fail
    labeled_chunks = item.get('labeled_chunks_all') or []
    if labeled_chunks:
        print(labeled_chunks[0].keys())
        print(labeled_chunks[0]['label_output'])
    else:
        print(f"No labelled chunks found for {item['paper_title']}")

# Preparing Output for Embedding
content_for_embedding = prepare_chunks_for_embedding(paper_list_for_download, property_label="observational_statements", labeled_chunks_key='labeled_chunks_all_observational')
print(len(content_for_embedding))
print(type(content_for_embedding[0]))
print(content_for_embedding[0].keys())

# Checking Content for Embedding
for item in content_for_embedding:
    print(item['paper_url'])
    print(item['extracted_content'])

# Creating Embeddings
embeddings = create_embeddings(content_for_embedding, model_name='allenai/scibert_scivocab_uncased', save_path='label_all_embeddings_observational')

# First, let's check what we're working with
print("Debug info:")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimensions: {embeddings.shape[1]}")

# Let's look at the similarity distribution
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Calculate similarities
similarities = cosine_similarity(embeddings)
# Select each unordered pair once by index (strictly above the diagonal).
# Filtering the triangle with `> 0` would also discard genuine pairs whose
# similarity is zero or negative and bias the statistics upwards.
pair_rows, pair_cols = np.triu_indices_from(similarities, k=1)
flat_sims = similarities[pair_rows, pair_cols]

print("\nSimilarity stats:")
print(f"Mean similarity: {flat_sims.mean():.3f}")
print(f"Std similarity: {flat_sims.std():.3f}")
print(f"Min similarity: {flat_sims.min():.3f}")
print(f"Max similarity: {flat_sims.max():.3f}")

# Try simpler clustering - KMeans first to see if there ARE patterns
from sklearn.cluster import KMeans

# Try different k values; a separate loop name keeps the final `k` below
# independent of whichever candidate happened to be tried last
for candidate_k in [3, 4, 5, 8, 6]:
    kmeans = KMeans(n_clusters=candidate_k, random_state=42)
    kmeans_labels = kmeans.fit_predict(embeddings)

    print(f"\nKMeans with k={candidate_k}:")
    cluster_sizes = np.bincount(kmeans_labels, minlength=candidate_k)
    for cluster_index, count in enumerate(cluster_sizes):
        print(f"  Cluster {cluster_index}: {count} observations")

# Doing final clustering with k=4
k = 4
k_means_clustering_labels, clusterer = cluster_extractions_kmeans(embeddings, k=k)

# Attach labels only AFTER clustering this property's embeddings; doing it
# earlier would reuse labels left over from a previous property's run.
for extraction, label in zip(content_for_embedding, k_means_clustering_labels):
    extraction['cluster_id'] = label


# Generating cluster summary

cluster_summaries = {}
for cluster_id in range(k):
    # Get all extractions for this cluster
    cluster_extractions = [ext for ext in content_for_embedding if ext['cluster_id'] == cluster_id]
    if not cluster_extractions:
        continue

    summary = summarize_cluster_extractions(
        cluster_extractions,
        property_name="observational_statements",
        property_description=property_dictionary['observational_statements']['description'],
        cluster_id=cluster_id
    )
    cluster_summaries[cluster_id] = summary
    print(f"Cluster {cluster_id}: {summary} ({len(cluster_extractions)} extractions)")

# Report once, after every cluster has been summarised
print("\n✅ Cluster summaries generated!")
print("\nFinal summaries:", cluster_summaries)


# Generating 2d visualisation data using UMAP

visualisation_data = create_visualization_data(embeddings, k_means_clustering_labels, content_for_embedding)

# Generating visualisation with summary

visualize_clusters_with_summaries(visualisation_data['embeddings_2d'], visualisation_data['cluster_labels'], visualisation_data['extractions'], cluster_summaries=cluster_summaries, property_name="observational_statements")

Labelling chunks for paper: paper_1, https://arxiv.org/pdf/2510.09901
Total chunks: 36

Starting labelling for: 36
Labelling chunk 1/36
AI request for chunk 1/36 completed.
AI response for chunk 1/36 parsed.
AI request for chunk 1/36 appended.
Labelling chunk 2/36
AI request for chunk 2/36 completed.
AI response for chunk 2/36 parsed.
AI request for chunk 2/36 appended.
Labelling chunk 3/36
AI request for chunk 3/36 completed.
AI response for chunk 3/36 parsed.
AI request for chunk 3/36 appended.
Labelling chunk 4/36
AI request for chunk 4/36 completed.
AI response for chunk 4/36 parsed.
AI request for chunk 4/36 appended.
Labelling chunk 5/36
AI request for chunk 5/36 completed.
AI response for chunk 5/36 parsed.
AI request for chunk 5/36 appended.
Labelling chunk 6/36
AI request for chunk 6/36 completed.
AI response for chunk 6/36 parsed.
AI request for chunk 6/36 appended.
Labelling chunk 7/36
AI request for chunk 7/36 completed.
AI response for chunk 7/36 parsed.
AI request for chu

In [None]:
# Label -> embed -> cluster -> summarise -> plot for the
# "theoretical_statements" property. Relies on `paper_list_for_download`
# (with chunks) and `property_dictionary` from earlier cells.

# Labelling Each Chunk
for item in paper_list_for_download:
    status = label_all_chunks_per_paper(item, "theoretical_statements", property_dictionary['theoretical_statements']['description'])
    if status:
        print(f"✅ Paper {item['paper_title']} labelled.")


# Checking Labelling Output
# NOTE(review): this check reads 'labeled_chunks_all' while the embedding step
# below reads 'labeled_chunks_all_theoretical' — confirm which key
# label_all_chunks_per_paper actually writes for this property.
for item in paper_list_for_download:
    print(item.keys())
    # Papers without chunks would make a bare [0] access fail
    labeled_chunks = item.get('labeled_chunks_all') or []
    if labeled_chunks:
        print(labeled_chunks[0].keys())
        print(labeled_chunks[0]['label_output'])
    else:
        print(f"No labelled chunks found for {item['paper_title']}")

# Preparing Output for Embedding
content_for_embedding = prepare_chunks_for_embedding(paper_list_for_download, property_label="theoretical_statements", labeled_chunks_key='labeled_chunks_all_theoretical')
print(len(content_for_embedding))
print(type(content_for_embedding[0]))
print(content_for_embedding[0].keys())

# Checking Content for Embedding
for item in content_for_embedding:
    print(item['paper_url'])
    print(item['extracted_content'])

# Creating Embeddings
embeddings = create_embeddings(content_for_embedding, model_name='allenai/scibert_scivocab_uncased', save_path='label_all_embeddings_theoretical')

# First, let's check what we're working with
print("Debug info:")
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimensions: {embeddings.shape[1]}")

# Let's look at the similarity distribution
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Calculate similarities
similarities = cosine_similarity(embeddings)
# Select each unordered pair once by index (strictly above the diagonal).
# Filtering the triangle with `> 0` would also discard genuine pairs whose
# similarity is zero or negative and bias the statistics upwards.
pair_rows, pair_cols = np.triu_indices_from(similarities, k=1)
flat_sims = similarities[pair_rows, pair_cols]

print("\nSimilarity stats:")
print(f"Mean similarity: {flat_sims.mean():.3f}")
print(f"Std similarity: {flat_sims.std():.3f}")
print(f"Min similarity: {flat_sims.min():.3f}")
print(f"Max similarity: {flat_sims.max():.3f}")

# Try simpler clustering - KMeans first to see if there ARE patterns
from sklearn.cluster import KMeans

# Try different k values; a separate loop name keeps the final `k` below
# independent of whichever candidate happened to be tried last
for candidate_k in [3, 4, 5, 8, 6]:
    kmeans = KMeans(n_clusters=candidate_k, random_state=42)
    kmeans_labels = kmeans.fit_predict(embeddings)

    print(f"\nKMeans with k={candidate_k}:")
    cluster_sizes = np.bincount(kmeans_labels, minlength=candidate_k)
    for cluster_index, count in enumerate(cluster_sizes):
        print(f"  Cluster {cluster_index}: {count} observations")


# Doing final clustering with k=4
k = 4
k_means_clustering_labels, clusterer = cluster_extractions_kmeans(embeddings, k=k)

# Attach each extraction's cluster label. content_for_embedding was rebuilt
# above, so without this step the summary loop's ext['cluster_id'] lookup
# raises KeyError.
for extraction, label in zip(content_for_embedding, k_means_clustering_labels):
    extraction['cluster_id'] = label


# Generating cluster summary

cluster_summaries = {}
for cluster_id in range(k):
    # Get all extractions for this cluster
    cluster_extractions = [ext for ext in content_for_embedding if ext['cluster_id'] == cluster_id]
    if not cluster_extractions:
        continue

    summary = summarize_cluster_extractions(
        cluster_extractions,
        property_name="theoretical_statements",
        property_description=property_dictionary['theoretical_statements']['description'],
        cluster_id=cluster_id
    )
    cluster_summaries[cluster_id] = summary
    print(f"Cluster {cluster_id}: {summary} ({len(cluster_extractions)} extractions)")

# Report once, after every cluster has been summarised
print("\n✅ Cluster summaries generated!")
print("\nFinal summaries:", cluster_summaries)


# Generating 2d visualisation data using UMAP

visualisation_data = create_visualization_data(embeddings, k_means_clustering_labels, content_for_embedding)

# Generating visualisation with summary

visualize_clusters_with_summaries(visualisation_data['embeddings_2d'], visualisation_data['cluster_labels'], visualisation_data['extractions'], cluster_summaries=cluster_summaries, property_name="theoretical_statements")