### required installations

In [None]:
pip install langchain langchain-community langchain-experimental langchain-google-genai us pyvis



### import required libraries

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
import pandas as pd
from IPython.display import display, Markdown
from langchain_google_genai import ChatGoogleGenerativeAI
import random
import getpass
import os
import io
import json
import re

### load the different dvl llm chains into memory

#### User Insight Type

In [None]:

def _parse_insight_output(insight_text):
    """
    Parse the labelled free-text reply of the insight chain into a dictionary.

    Fields are located by their labels ("Insight Need Type", "Identified Variables"
    or "Key Variables", "Reasoning") rather than by line position, so preamble
    lines, blank lines, Markdown bullets and wrapped text do not shift the fields.
    If no label is found at all, the first three non-empty lines are used in order.

    Args:
        insight_text (str): Raw text returned by the LLM.

    Returns:
        dict: Keys "insight_need_type" (str), "key_variables" (list of str) and
              "reasoning" (str). Fields that cannot be found are returned empty
              instead of raising.
    """
    label_to_key = {
        "insight need type": "insight_need_type",
        "identified variables": "key_variables",
        "key variables": "key_variables",
        "reasoning": "reasoning",
    }
    # Accepts both "**Label**: value" and "**Label:** value", optionally preceded
    # by list/heading markers. The lookahead only swallows a "**" that closes the
    # bold label, never one that opens a bold value.
    label_pattern = re.compile(
        r"^[\s*#>\-\d.)]*(insight need type|identified variables|key variables|reasoning)"
        r"\s*\**\s*:\s*(?:\*\*(?=\s|$))?\s*(.*)$",
        re.IGNORECASE,
    )

    sections = {}
    unlabelled_lines = []
    current_key = None
    for raw_line in insight_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        match = label_pattern.match(line)
        if match:
            current_key = label_to_key[match.group(1).lower()]
            sections.setdefault(current_key, [])
            value = match.group(2).strip()
            if value:
                sections[current_key].append(value)
        elif current_key is not None:
            # Continuation of the previous field (wrapped text or bullet items).
            sections[current_key].append(line)
        else:
            unlabelled_lines.append(line)

    if sections:
        insight_need_type = " ".join(sections.get("insight_need_type", []))
        variables_text = ", ".join(sections.get("key_variables", []))
        reasoning = " ".join(sections.get("reasoning", []))
    else:
        # No recognisable labels: fall back to positional parsing.
        padded = unlabelled_lines + ["", "", ""]
        insight_need_type, variables_text, reasoning = padded[0], padded[1], padded[2]

    # Drop list brackets, quotes and bullet markers that models tend to add
    # around individual variable names.
    wrapper_chars = "[]`'\"*•- \t"
    key_variables = []
    for item in variables_text.split(","):
        cleaned_item = item.strip(wrapper_chars)
        if cleaned_item:
            key_variables.append(cleaned_item)

    return {
        "insight_need_type": insight_need_type.strip(),
        "key_variables": key_variables,
        "reasoning": reasoning.strip()
    }


def identify_insight(user_request, llm):
    """
    Identifies the type of insight needed based on a user's visualization request.

    Builds a classification prompt, runs it through an LLMChain, parses the
    labelled reply and renders both the request and the parsed result with
    IPython Markdown display calls.

    Args:
        user_request (str): The user's request for a visualization.
        llm (LLM): The Language Model to process the request.

    Returns:
        dict: A structured dictionary containing:
              - "insight_need_type": The classified insight type.
              - "key_variables": A list of identified variables.
              - "reasoning": Explanation of the classification.
    """

    # Define Prompt Template
    identify_insight_prompt = PromptTemplate(
        input_variables=["user_request"],
        template="""
        You are an expert in data visualization needs analysis. The user has provided the following visualization request: "{user_request}".

        Below is a structured reference on different **Insight Need Types** that users may require when requesting data visualizations. Classify the user request into one of these **eight main categories** and determine if there is a **subtype**.

        --- **Insight Need Types** ---

        **1. Categorization and Clustering**
           - Assigns data into distinct **categories or clusters**.
           - Example: "Group employees by department."

        **2. Ordering, Ranking, and Sorting**
           - Arranges objects based on ranking, value, or order.
           - Example: "Show the top 10 highest-paying jobs."

        **3. Distribution (Outliers and Gaps)**
           - Displays how objects are **spread across a dataset**, detecting **gaps or outliers**.
           - Example: "Show a histogram of employee salaries."

        **4. Trends (Process and Time-Based Analysis)**
           - Tracks **gradual changes over time** (short-term, long-term trends).
           - Example: "Plot revenue growth over the past five years."

        **5. Comparison (Similarities and Differences)**
           - Examines multiple objects to **highlight differences and similarities**.
           - Example: "Compare average salaries of men and women in the company."

        **6. Geospatial Location (Mapping Data to Physical Spaces)**
           - Assigns data to a **geographical location**.
           - Example: "Show a heatmap of customer purchases by region."

        **7. Composition (Part-to-Whole Relationships and Text Structuring)**
           - **Part-to-whole relationships** (e.g., pie charts, hierarchical structures).
           - Example: "Show the percentage breakdown of expenses by category."

        **8. Correlations and Relationships (Finding Patterns Between Variables)**
           - Identifies **connections between multiple variables** (one-to-one, many-to-many).
           - Example: "Analyze the relationship between years of experience and salary."

        --- **Your Task** ---
        - **Step 1:** Analyze the user request carefully.
        - **Step 2:** Classify it into the **most relevant insight need type** from the list above.
        - **Step 3:** If applicable, specify a **subtype** (e.g., "Comparison → Side-by-side bar chart").
        - **Step 4:** Extract the **key variables** that should be used in the analysis (e.g., occupation, job_family).
        - **Step 5:** Provide a brief **reasoning** explaining why the chosen insight need type and visualization subtype is appropriate.

        --- **Expected Output Format** ---
        Return the response in the following structured format:

        **Insight Need Type**: [Selected Type] → [Subtype if applicable]
        **Identified Variables**: [List of variables]
        **Reasoning**: [Brief explanation on why this classification was chosen]
        """
    )

    # Create the Chain
    identify_insight_chain = LLMChain(
        llm=llm,
        prompt=identify_insight_prompt,
        output_key="insight_analysis"
    )

    # Display the user request
    display(Markdown("### "+ "The user request is:"))
    display(Markdown(user_request))

    # Run the Chain; the reply text is stored under the chain's output_key
    result = identify_insight_chain.invoke({"user_request": user_request})
    insight_analysis = result["insight_analysis"]

    # Convert the labelled free-text reply into a structured dictionary
    insight_json = _parse_insight_output(insight_analysis)

    # Display the final output
    display(Markdown("### Identified Insight Needs: "))
    display(Markdown("Insight Need Type is "+ insight_json["insight_need_type"]))
    display(Markdown("Key Variables to use : " + str(insight_json["key_variables"])))
    display(Markdown("Reasoning : " + insight_json["reasoning"]))

    return insight_json

#### identify data scale

In [None]:

def identify_data_scale(df, insight_need_type, key_variables, llm):
    """
    Ask the LLM to classify the key variables of a dataset by data scale type.

    The prompt is filled with the column dtypes, the first rows and the
    describe() statistics of `df`, together with the insight need type and the
    key variables. The model reply is rendered as Markdown and returned as-is.

    Args:
        df (pd.DataFrame): The dataset to analyze.
        insight_need_type (str): The identified insight type.
        key_variables (list): List of key variables relevant to the analysis.
        llm (LLM): The Language Model to process the request.

    Returns:
        str: The raw text produced by the chain. The prompt requests JSON with
             "missing_variables" and "categorized_variables", but the text is
             not parsed or validated here.
    """

    # Textual snapshots of the dataset that get embedded into the prompt
    dtype_listing = df.dtypes.to_string()
    sample_rows_json = df.head().to_json(orient="records", indent=2)
    summary_stats_json = df.describe().to_json(indent=2)

    # Prompt describing the four data scales and the expected JSON reply
    scale_prompt = PromptTemplate(
        input_variables=["df_info", "df_head", "df_summary", "insight_need_type", "key_variables"],
        template="""
        You are an expert in data analysis and need to classify dataset variables based on their **data scale types**.

        --- **Data Scale Types** ---
        1️⃣ **Nominal Scale (Categorical Data)**
          - **Definition**: Qualitative scale with no inherent order.
          - **Examples**: Gender (Male/Female), Job Titles, Countries.

        2️⃣ **Ordinal Scale (Ranked Data)**
          - **Definition**: Categories have a specific order, but differences are not measurable.
          - **Examples**: Satisfaction Ratings (Strongly Agree → Agree → Neutral → Disagree).

        3️⃣ **Interval Scale (Numeric Data Without Absolute Zero)**
          - **Definition**: Numeric values with meaningful differences, but zero is arbitrary.
          - **Examples**: Temperature in Celsius/Fahrenheit, IQ Scores.

        4️⃣ **Ratio Scale (Numeric Data With Absolute Zero)**
          - **Definition**: Numeric values where zero represents total absence.
          - **Examples**: Population counts, Weight, Height, Income, Distance.

        --- **Dataset Information** ---
        **Insight Need Type**: {insight_need_type}
        **Key Variables**: {key_variables}

        --- **Dataset Overview** ---
        **df.info() Output:**
        {df_info}

        **df.head() Output (Sample Rows):**
        {df_head}

        **df.describe() Output (Summary Statistics):**
        {df_summary}

        --- **Your Task** ---
        - **Step 1:** Identify the actual dataset column names that match the **Key Variables**.
          - If a key variable is missing, output: `"missing_variables": [list of missing variables]`.
          - If a key variable exists in the dataset, return a **dictionary** mapping the **key variable** to the actual column name.
          - Example:
            ```json
            {{
                "variable_mappings": {{
                    "Income": "annual_salary",
                    "Age": "customer_age"
                }}
            }}
            ```
          - **IMPORTANT:** Use these mapped dataset column names in all further steps.

        - **Step 2:** **Only classify the found variables (from Step 1) into one of the four Data Scales**:
          - **Nominal Scale** → Categorical with no ordering.
          - **Ordinal Scale** → Ordered categories without fixed differences.
          - **Interval Scale** → Numeric with equal intervals, but no absolute zero.
          - **Ratio Scale** → Numeric with equal intervals and absolute zero.

        - **Step 3:** Determine the appropriate **Python data type** for each found variable.
          - If the current type is incorrect, **generate a JSON output** specifying how to convert the variable.

        --- **IMPORTANT: JSON OUTPUT ONLY** ---
        Return ONLY valid **JSON**.
        **DO NOT classify variables that are not listed in the "Key Variables".**
        **DO NOT return Python code** or explanations.
        **DO NOT enclose JSON in Python code blocks**.

        --- **Expected Output Format (Strict JSON for Key Variables Only)** ---
        ```json
        {{
            "missing_variables": [list of missing variables, if any],
            "categorized_variables": {{
                "found_variable": {{
                    "chosen_scale": "Nominal/Ordinal/Interval/Ratio",
                    "suggested_python_dtype": "int/float/str/category",
                    "conversion_needed": true/false,
                    "conversion_code": "Python conversion script if needed"
                }}
            }}
        }}
        ```
        """
    )

    # Section heading shown before the model is called
    display(Markdown("### Data Scale Types"))

    # Wire the prompt to the model
    scale_chain = LLMChain(
        llm=llm,
        prompt=scale_prompt,
        output_key="data_scale_analysis"
    )

    # Values substituted into the prompt placeholders
    prompt_inputs = {
        "df_info": dtype_listing,
        "df_head": sample_rows_json,
        "df_summary": summary_stats_json,
        "insight_need_type": insight_need_type,
        "key_variables": key_variables
    }
    chain_response = scale_chain.invoke(prompt_inputs)

    # Show the reply and hand it back unparsed
    raw_scale_output = chain_response["data_scale_analysis"]
    display(Markdown(raw_scale_output))
    return raw_scale_output

#### post-process data if needed

In [None]:

def post_process_data(df, chain2_output_str):
    """
    Performs post-processing on the dataset by parsing the LLM output and executing its conversion scripts.

    Args:
        df (pd.DataFrame): The original dataset. Conversion scripts that assign into `df`
            (e.g. ``df["col"] = ...``) modify this object in place.
        chain2_output_str (str): JSON string output from the LLM, containing variable classifications
            and conversions. It may be wrapped in a Markdown code fence or surrounded by prose.

    Returns:
        tuple: (df_filtered, column_data_types)
              - df_filtered: DataFrame containing only key variables after conversions.
              - column_data_types: Dictionary mapping variables to their suggested Python data types.

    Raises:
        json.JSONDecodeError: If no valid JSON object can be parsed from the LLM output.
        KeyError: If "categorized_variables" or a "suggested_python_dtype" entry is missing,
            or if a categorized variable is not a column of the DataFrame.
    """

    # Step 1: Show the original DataFrame types before processing
    display(Markdown("### "+ "Data Pre-Processing:"))
    display(Markdown("Data before processing: \n "))
    display(Markdown("```\n" + df.dtypes.to_string() + "\n```"))

    # Step 2: Isolate the outermost {...} object. Deleting every "json" / "```"
    # substring instead would also alter column names or conversion code that
    # contain them (e.g. "pd.read_json").
    json_start = chain2_output_str.find("{")
    json_end = chain2_output_str.rfind("}")
    if json_start != -1 and json_end > json_start:
        cleaned_output = chain2_output_str[json_start:json_end + 1]
    else:
        cleaned_output = chain2_output_str

    data_scale_analysis = json.loads(cleaned_output)
    categorized_variables = data_scale_analysis["categorized_variables"]

    # Step 3: Apply the conversion scripts.
    # SECURITY: this executes code written by the LLM without any sandboxing.
    # Only run it in a trusted, disposable environment and review the scripts
    # when the dataset or the request comes from an untrusted source.
    # A single explicit namespace (a copy of the module globals plus `df`) is
    # used so that scripts can rebind `df` and the result can be read back;
    # exec() with implicit function locals discards such rebindings.
    exec_namespace = dict(globals())
    exec_namespace["df"] = df
    for var, details in categorized_variables.items():
        conversion_needed = details.get("conversion_needed")
        if isinstance(conversion_needed, str):
            # Models sometimes quote booleans; "false" must not count as truthy.
            conversion_needed = conversion_needed.strip().lower() == "true"
        conversion_code = details.get("conversion_code")
        if conversion_needed and conversion_code:
            exec(conversion_code, exec_namespace)  # Executes the conversion script

    converted_df = exec_namespace.get("df")
    if isinstance(converted_df, pd.DataFrame):
        df = converted_df

    # Step 4: Extract key variables from the LLM output
    key_variables = list(categorized_variables)

    # Step 5: Filter the DataFrame to only include key variables
    df_filtered = df[key_variables]

    # Step 6: Show the resulting types so the conversions can be verified
    display(Markdown("Data after processing: \n "))
    display(Markdown("```\n" + df_filtered.dtypes.to_string() + "\n```"))

    # Create a dictionary mapping variables to their suggested Python data types
    column_data_types = {
        column: details["suggested_python_dtype"]
        for column, details in categorized_variables.items()
    }

    return df_filtered, column_data_types

#### choose best analysis type

In [None]:

def select_best_analysis_type(user_request, insight_json, df_filtered, column_data_types, llm):
    """
    Determines the best study type (Temporal, Geospatial, Topical, or Network) based on the dataset and visualization request.

    The prompt is filled with the request, the insight need type, the dtypes and
    first rows of `df_filtered`, and the key-variable types. The JSON reply is
    parsed and its content rendered with IPython Markdown display calls.

    Args:
        user_request (str): The user's request for a visualization.
        insight_json (dict): The dictionary containing insight need type
            (its "insight_need_type" entry is read).
        df_filtered (pd.DataFrame): The processed dataset containing only key variables.
        column_data_types (dict): Dictionary mapping key variables to their data types.
        llm (LLM): The Language Model to process the request.

    Returns:
        dict: A structured dictionary containing:
              - "chosen_study_type": The best study type (Temporal, Geospatial, Topical, or Network).
              - "reasoning": Explanation of why this study type was chosen.

    Raises:
        json.JSONDecodeError: If no valid JSON object can be parsed from the LLM output.
        KeyError: If the parsed object lacks "chosen_study_type" or "reasoning".
    """

    # Define the Prompt Template
    select_study_type_prompt = PromptTemplate(
        input_variables=[
            "user_request",
            "insight_need_type",
            "df_info",
            "df_head",
            "key_variable_types"
        ],
        template="""You are an expert in data-driven analysis and visualization. Your task is to determine the **most appropriate type of study** (Temporal, Geospatial, Topical, or Network) based on the given dataset, user request and insight need type.

    ---
    ### **Study Types Overview**

    #### **1. Temporal Study ("When")**
    - **Purpose:** Answers questions related to time-based trends, events, bursts, or seasonality.
    - **Typical Data:** Time-series data, timestamped records.
    - **Key Visualizations:** Line graphs, stacked bar graphs, time histograms, flow maps.
    - **Common Applications:** Stock market trends, sales over time, publication frequencies.

    #### **2. Geospatial Study ("Where")**
    - **Purpose:** Analyzes spatial distributions and relationships based on location data.
    - **Typical Data:** Latitude/longitude, address-based data, regional statistics.
    - **Key Visualizations:** Choropleth maps, proportional symbol maps, cartograms.
    - **Common Applications:** Disease spread, migration patterns, resource allocation.

    #### **3. Topical Study ("What")**
    - **Purpose:** Analyzes text data to identify key topics, term frequencies, or content trends.
    - **Typical Data:** Large text corpora (books, articles, papers), word frequency distributions.
    - **Key Visualizations:** Word clouds, stream graphs, self-organizing maps.
    - **Common Applications:** Sentiment analysis, keyword tracking, academic research trends.

    #### **4. Network Study ("With Whom")**
    - **Purpose:** Examines relationships, collaborations, and interactions between entities.
    - **Typical Data:** Node-edge datasets, social networks, organizational structures.
    - **Key Visualizations:** Node-link diagrams, Sankey diagrams, adjacency matrices.
    - **Common Applications:** Social network analysis, knowledge graphs, citation networks.

    ---

    ### **User Request**
    - **Requested Analysis:** "{user_request}"

    ### **Insight Need Type**
    - **Insight Needed:** {insight_need_type}

    ### **Dataset Overview**
    - **DataFrame Info:**
      {df_info}
    - **Sample Data:**
      {df_head}

    ### **Key Variables and Their Types**
    - {key_variable_types}

    ---

    ### **Your Task**
    1 **Analyze** the user request, insight need type and dataset structure.
    2 **Determine** which of the four study types (Temporal, Geospatial, Topical, or Network) best fits the given data and analysis requirements.
    3 **Return** the study type and provide a brief justification.

    ---

    **Expected Output Format (JSON)**
    ```json
    {{
        "chosen_study_type": "Temporal / Geospatial / Topical / Network",
        "reasoning": "Brief explanation of why this study type was chosen based on the dataset and user request."
    }}
    ```
    """

    )

    # Create the Chain
    select_study_type_chain = LLMChain(
        llm=llm,
        prompt=select_study_type_prompt,
        output_key="study_type_selection"
    )

    # Run the Chain
    study_type_result = select_study_type_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_json["insight_need_type"],
        "df_info": df_filtered.dtypes.to_string(),  # Column dtypes as text
        "df_head": str(df_filtered.head()),  # First rows as text
        "key_variable_types": column_data_types
    })

    # Extract the raw reply
    study_type_choice = study_type_result["study_type_selection"]

    # Isolate the outermost {...} object. Deleting every "json" / "```"
    # substring instead would also alter values that contain them (for example
    # "geojson" inside the reasoning text) and still fail on surrounding prose.
    json_start = study_type_choice.find("{")
    json_end = study_type_choice.rfind("}")
    if json_start != -1 and json_end > json_start:
        cleaned_output = study_type_choice[json_start:json_end + 1]
    else:
        cleaned_output = study_type_choice

    # Convert cleaned JSON string into a dictionary
    study_type_analysis = json.loads(cleaned_output)

    # Extract and display the chosen study type
    chosen_study_type = study_type_analysis["chosen_study_type"]
    display(Markdown("### " + "Chosen Analysis Type"))
    display(Markdown(chosen_study_type))
    display(Markdown(study_type_analysis["reasoning"]))

    return study_type_analysis

#### choose best visualization type

In [None]:

def choose_best_visualization(user_request, insight_json, df_filtered, analysis_type, llm):
    """
    Determines the best visualization type for a given dataset and user request.

    The prompt is filled with the request, the insight need type, the dtypes and
    first rows of `df_filtered`, and the chosen analysis type. The JSON reply is
    parsed and its content rendered with IPython Markdown display calls.

    Args:
        user_request (str): The user's request for a visualization.
        insight_json (dict): The dictionary containing insight need type
            (its "insight_need_type" entry is read).
        df_filtered (pd.DataFrame): The processed dataset containing only key variables.
        analysis_type (str): The chosen analysis type (Temporal, Geospatial, Topical, or Network).
        llm (LLM): The Language Model to process the request.

    Returns:
        dict: A structured dictionary containing:
              - "chosen_visualization": The best visualization type.
              - "reasoning": Explanation of why this visualization was chosen.

    Raises:
        json.JSONDecodeError: If no valid JSON object can be parsed from the LLM output.
        KeyError: If the parsed object lacks "chosen_visualization" or "reasoning".
    """

    # Define the Prompt Template
    choose_visualization_prompt = PromptTemplate(
        input_variables=["user_request", "insight_need_type", "df_info", "df_head", "analysis_type"],
        template="""
        You are an expert in data visualization. Your task is to analyze the given data, user request, insight need type, the analysis type and select the most appropriate visualization type.

        Below is a structured reference for different **visualization types** and their appropriate use cases:
        ---
        **Visualization Types and Their Purpose**

        **Tables**
        Tables are structured grids consisting of rows and columns where each cell contains a data value.
        They are commonly used for structured data representation and can include frequency distributions,
        percentages, and summary statistics. Tables support sorting, grouping, filtering, and interactive selection,
        making them effective for displaying raw or aggregated numerical and categorical data.

        **Charts**
        Charts provide a visual representation of data without a strict reference system.
        They are widely used to depict quantitative and qualitative relationships in an easily interpretable format.
        Charts can be **comparative (bar charts), proportional (pie charts), or relational (bubble charts)**.
        They can also use **size, color, and position** to encode additional data properties.

        **Graphs**
        Graphs use a coordinate system to represent relationships between variables.
        They provide a structured way to analyze **trends, correlations, and distributions**
        by mapping data points to a well-defined axis.
        Graphs allow for advanced binning, interpolation, and smoothing to improve data interpretation.
        They are particularly useful for **time-series data, correlations, and multivariate analysis**.

        **Maps**
        Maps link data to **geographical locations** and are used to show spatial distributions of data.
        They encode additional data variables using **color, size, and patterns**,
        making them effective for **geospatial analysis, density estimation, and regional comparisons**.

        **Network Layouts**
        Network visualizations are used to represent **relationships between data entities**.
        They consist of **nodes (objects) and edges (connections)**, which may be directed, weighted, or hierarchical.
        These layouts are useful for understanding **complex interactions, dependencies, and clustering in datasets**.
        ---

        ### **User Request**
        The user wants to visualize: "{user_request}"

        ### **Insight Need Type**
        - {insight_need_type}

        ### **Dataset Overview**
        The dataset after processing the key variables is summarized as follows:

        **DataFrame Info:**
        {df_info}

        **DataFrame Sample Rows:**
        {df_head}

        ### **Chosen Analysis Type**
        The analysis will use:
        {analysis_type}

        ### **Your Task**
        - **Step 1:** Analyze the **user request**, the **insight need type** and the **analysis type**.
        - **Step 2:** Review the **data structure** from the `df_info()` and `df_head()` output.
        - **Step 3:** Choose the **best visualization type** from the provided reference based on:
            - The **type of data** (numerical, categorical, temporal, geospatial, relational).
            - The **insight need type** (e.g., trends, distributions, correlations, comparisons).
            - The **user’s intended goal** (e.g., exploratory analysis, detailed reporting, summarization).
            - The **analysis type** (e.g., temporal, geospatial, topical and network).

        **Expected Output Format (JSON)**
        ```json
        {{
            "chosen_visualization": "Selected Visualization Type",
            "reasoning": "Brief explanation of why this visualization was chosen based on the data and insight need."
        }}
        ```
        """
    )

    # Create the Chain
    choose_visualization_chain = LLMChain(
        llm=llm,
        prompt=choose_visualization_prompt,
        output_key="visualization_choice"
    )

    # Run the Chain
    visualization_result = choose_visualization_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_json["insight_need_type"],
        "df_info": df_filtered.dtypes.to_string(),  # Column dtypes as text
        "df_head": str(df_filtered.head()),   # First rows as text
        "analysis_type": analysis_type
    })

    # Extract the raw reply
    visualization_choice = visualization_result["visualization_choice"]

    # Isolate the outermost {...} object. Deleting every "json" / "```"
    # substring instead would also alter values that contain them (for example
    # "geojson" inside the reasoning text) and still fail on surrounding prose.
    json_start = visualization_choice.find("{")
    json_end = visualization_choice.rfind("}")
    if json_start != -1 and json_end > json_start:
        cleaned_output = visualization_choice[json_start:json_end + 1]
    else:
        cleaned_output = visualization_choice

    # Convert cleaned JSON string into a dictionary
    visualization_data = json.loads(cleaned_output)

    # Extract and display the chosen visualization type
    chosen_visualization_type = visualization_data["chosen_visualization"]
    print("\n\n")
    display(Markdown("### "+ "Chosen Visualization Type"))
    display(Markdown(chosen_visualization_type))
    display(Markdown(visualization_data['reasoning']))

    return visualization_data

#### choose graphic symbols

In [None]:

def choose_best_graphic_symbol(user_request, insight_json, df_filtered, column_data_types, chosen_visualization_type, analysis_type, llm):
    """
    Determines the best graphic symbol type for a given dataset and user request.

    The prompt is filled with the request, the insight need type, the dtypes and
    first rows of `df_filtered`, the key-variable types and the previously chosen
    visualization and analysis types. The JSON reply is parsed and its content
    rendered with IPython Markdown display calls.

    Args:
        user_request (str): The user's request for a visualization.
        insight_json (dict): The dictionary containing insight need type
            (its "insight_need_type" entry is read).
        df_filtered (pd.DataFrame): The processed dataset containing only key variables.
        column_data_types (dict): Dictionary mapping key variables to their data types.
        chosen_visualization_type (str): The chosen visualization type from the previous step.
        analysis_type (str): The chosen analysis type (Temporal, Geospatial, Topical, or Network).
        llm (LLM): The Language Model to process the request.

    Returns:
        dict: A structured dictionary containing:
              - "chosen_graphic_symbol": The best graphic symbol type.
              - "reasoning": Explanation of why this symbol was chosen.

    Raises:
        json.JSONDecodeError: If no valid JSON object can be parsed from the LLM output.
        KeyError: If the parsed object lacks "chosen_graphic_symbol" or "reasoning".
    """

    # Define the Prompt Template
    choose_graphic_symbol_prompt = PromptTemplate(
        input_variables=[
            "user_request", "insight_need_type",
            "df_info", "df_head", "key_variable_types", "chosen_visualization_type", "analysis_type"
        ],
        template="""
    You are an expert in data visualization and graphic design. Your task is to analyze the dataset, user request, insight need type, analysis type and visualization type to determine the **most appropriate graphic symbol** to represent the data.

    Below is a structured reference on **graphic symbol types** and their appropriate use cases:
    ---
    Graphic symbols are geometric elements used to represent data records in visualizations. They encode different data variables using graphic variable types such as position, size, color, and shape. These symbols help identify, differentiate, and structure information effectively.

    Types of Graphic Symbols

1. Geometric Symbols
Geometric symbols are defined by their dimensionality and include:
	•	Points: Represent discrete locations on a map or scatter plot.
	•	Lines: Connect points and are used to denote relationships, movement, or trends (e.g., line graphs, network visualizations).
	•	Areas: Enclose a region to show boundaries, density, or proportions (e.g., choropleth maps).
	•	Surfaces: Represent continuous data fields over a 2D space (e.g., heatmaps, topographic maps).
	•	Volumes: Three-dimensional representations for depth-related data (e.g., 3D bar charts, terrain models).

Use Cases:
	•	Points: Scatter plots, maps, proportional symbol maps.
	•	Lines: Line graphs, network diagrams, flow maps.
	•	Areas: Choropleth maps, stacked area charts.
	•	Surfaces: Heatmaps, 3D terrain maps.
	•	Volumes: 3D bar charts, volumetric visualizations.

2. Linguistic Symbols
Linguistic symbols use letters, numbers, and punctuation to represent data. These include:
	•	Labels & Text: Used for axis labels, map annotations, or word clouds.
	•	Numeric Values: Data encoding through formatted numbers.
	•	Words & Sentences: Text-based data representations in infographics.

Use Cases:
	•	Text labels: Titles, axis labels, data callouts.
	•	Words & Numbers: Word clouds, tag clouds, typographic maps.
	•	Proportional Typeface: Font size encoding (e.g., larger text for higher values).

3. Pictorial Symbols
Pictorial symbols are visual representations of objects used to improve interpretation:
	•	Icons & Images: Simple visual markers (e.g., airplane icon for airports).
	•	Silhouettes & Profiles: Abstract depictions of categories (e.g., gender icons).
	•	Proportional Pictograms: Icon sizes represent numerical values.

Use Cases:
	•	Icons: Maps, dashboards, infographics.
	•	Silhouettes: Demographic visuals, symbolic representations.
	•	Proportional Pictograms: Population pyramids, gender-based visualizations.

4. Statistical Glyphs
Statistical glyphs are compact visual representations of data, often using:
	•	Chernoff Faces: Encodes multiple data variables into facial expressions.
	•	Small Multiples: Repeated small visualizations for comparison.
	•	Data Dots & Bar Glyphs: Encodes quantitative values using length or density.

Use Cases:
	•	Chernoff Faces: Multivariate human perception studies.
	•	Small Multiples: Comparative analysis (e.g., crime rates by region).
	•	Bar Glyphs: Encoded bar-based data for visual quick reading.

Combinations of Graphic Symbols
Graphic symbols can be combined to enhance visualization:
	•	Geometric + Linguistic Symbols: Word clouds overlaid on maps.
	•	Pictorial + Statistical Glyphs: Infographics with pictogram-based charts.
	•	Multiple Graphic Variables: Encoding size, color, and shape simultaneously.

How to Choose a Graphic Symbol for Data Variables
	•	If the data is spatial/geographical, use Points, Lines, or Areas.
	•	If the data involves text-based representation, use Linguistic Symbols.
	•	If the visualization requires real-world object representation, use Pictorial Symbols.
	•	If the data needs compact statistical representation, use Statistical Glyphs.
	•	If multiple dimensions need to be encoded visually, combine symbols.

    ---

    ### **User Request**
    The user wants to visualize: "{user_request}"

    ### **Insight Need Type**
    - {insight_need_type}

    ### **Analysis Type**
    - {analysis_type}

    ### **Chosen Visualization Type**
    The visualization will use:
    {chosen_visualization_type}

    ### **Dataset Overview**
    The dataset after processing the key variables is summarized as follows:

    **DataFrame Info:**
    {df_info}

    **DataFrame Sample Rows:**
    {df_head}

    ### **Key Variable Data Types**
    The data types of key variables after processing:
    {key_variable_types}

    ### **Your Task**
    - **Step 1:** Analyze the **user request**, the **insight need type** the **chosen analysis type** and the **chosen visualization type**.
    - **Step 2:** Review the **data structure** from the `df_info()` and `df_head()` output.
    - **Step 3:** Examine the **data types of key variables** to determine the best graphic symbol type.
    - **Step 4:** Choose the **best graphic symbol** from the provided reference based on:
        - The **type of data** (categorical, numerical, ordinal, geospatial, network).
        - The **insight need type** (e.g., trends, distributions, correlations, comparisons).
        - The **user’s intended goal** (e.g., exploratory analysis, detailed reporting, summarization).
        - The **analysis type** (e.g., temporal, geospatial, topical and network).
        - The **chosen visualization type** (e.g., scatter plot, choropleth map).

    **Expected Output Format (JSON)**
    ```json
    {{
        "chosen_graphic_symbol": "Selected Graphic Symbol",
        "reasoning": "Brief explanation of why this symbol was chosen based on the data, insight need, and variable types."
    }}
    ```
    """
    )

    # Create the Chain
    choose_graphic_symbol_chain = LLMChain(
        llm=llm,
        prompt=choose_graphic_symbol_prompt,
        output_key="graphic_symbol_choice"
    )

    # Run the Chain
    graphic_symbol_result = choose_graphic_symbol_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_json["insight_need_type"],
        "df_info": df_filtered.dtypes.to_string(),  # Column dtypes as text
        "df_head": str(df_filtered.head()),  # First rows as text
        "key_variable_types": column_data_types,  # Variable -> suggested dtype mapping
        "chosen_visualization_type": chosen_visualization_type,  # Output from visualization choice
        "analysis_type": analysis_type  # Output from analysis choice
    })

    # Extract the raw reply
    graphic_symbol_choice = graphic_symbol_result["graphic_symbol_choice"]

    # Isolate the outermost {...} object. Deleting every "json" / "```"
    # substring instead would also alter values that contain them (for example
    # "geojson" inside the reasoning text) and still fail on surrounding prose.
    json_start = graphic_symbol_choice.find("{")
    json_end = graphic_symbol_choice.rfind("}")
    if json_start != -1 and json_end > json_start:
        cleaned_output = graphic_symbol_choice[json_start:json_end + 1]
    else:
        cleaned_output = graphic_symbol_choice

    # Convert cleaned JSON string into a dictionary
    graphic_symbol_data = json.loads(cleaned_output)

    # Extract and display the chosen graphic symbol type
    chosen_graphic_symbol = graphic_symbol_data["chosen_graphic_symbol"]
    graphic_symbol_explanation = graphic_symbol_data["reasoning"]
    display(Markdown("### " + "Chosen Graphic Symbol"))
    display(Markdown(chosen_graphic_symbol))
    display(Markdown(graphic_symbol_explanation))

    return graphic_symbol_data

#### choose graphic variable data

In [None]:
def choose_best_graphic_variable(user_request, insight_json, df_filtered, column_data_types, chosen_graphic_symbol, chosen_visualization_type, chosen_analysis_type, llm):
    """
    Determines the best graphic variable type for a given dataset and user request.

    Builds a prompt from the user request, the dataset's dtypes and first rows, and
    the choices made in earlier steps, runs it through an LLMChain, parses the JSON
    reply and displays the chosen variable and its reasoning as Markdown.

    Args:
        user_request (str): The user's request for a visualization.
        insight_json (dict): The dictionary containing insight need type
            (its "insight_need_type" entry is read).
        df_filtered (pd.DataFrame): The processed dataset containing only key variables.
        column_data_types (dict): Dictionary mapping key variables to their data types.
        chosen_graphic_symbol (str): The chosen graphic symbol from the previous step.
        chosen_visualization_type (str): The chosen visualization type from the previous step.
        chosen_analysis_type (str): The chosen analysis type (Temporal, Geospatial, Topical, or Network).
        llm (LLM): The Language Model to process the request.

    Returns:
        dict: A structured dictionary containing:
              - "chosen_graphic_variable": The best graphic variable(s).
              - "reasoning": Explanation of why this variable was chosen.

    Raises:
        json.JSONDecodeError: If the LLM reply does not contain valid JSON.
        KeyError: If the parsed reply lacks "chosen_graphic_variable" or "reasoning".
    """

    # Define the Prompt Template
    choose_graphic_variable_prompt = PromptTemplate(
        input_variables=[
            "user_request", "insight_need_type",
            "df_info", "df_head", "key_variable_types", "chosen_graphic_symbol", "chosen_visualization_type", "chosen_analysis_type"
        ],
        template="""
    You are an expert in data visualization and perceptual design.
    Your task is to analyze the dataset, user request, insight need type, chosen analysis type, chosen visualization type and chosen graphic symbol to determine the **most effective graphic variable(s)** to visually encode the data.

    Below is a structured reference on **graphic variable types** and their appropriate use cases:
    ---
    Graphic variable types are visual properties used to encode data values in a visualization. These variables modify graphic symbols (such as points, lines, areas, pictorial symbols, or statistical glyphs) to enhance their expressiveness. They help define relationships, differentiate between elements, and improve interpretability.

    Types of Graphic Variables and Their Functions

1. Spatial Variables (Position & Location)
	•	Definition: Specifies where an element is placed in 2D or 3D space.
	•	Usage: Most effective for encoding quantitative data since human perception excels at detecting relative positioning.
	•	Examples: Scatter plots, maps, network layouts.

2. Retinal Variables (Used for Non-Spatial Encodings)
These variables modify appearance properties of symbols without changing their spatial position.

2.1 Size
	•	Definition: Controls relative magnitude of a symbol (small vs. large).
	•	Usage: Best suited for quantitative comparisons (e.g., bubble charts, proportional symbol maps).
	•	Limitations: Size perception is non-linear; small differences are harder to distinguish.

2.2 Shape
	•	Definition: Differentiates categories using distinct geometric forms (e.g., circles, triangles, squares).
	•	Usage: Best for categorical (nominal) data, where distinct symbols aid identification.
	•	Examples: Scatter plots with different markers for categories.

2.3 Orientation
	•	Definition: Adjusts rotation angle of a symbol (e.g., tilted lines).
	•	Usage: Occasionally used for ordinal or directional data (e.g., wind direction in weather maps).
	•	Limitations: Humans struggle to distinguish small rotation differences.

2.4 Color (Hue & Value)
	•	Hue (Categorical Encoding):
	•	Encodes qualitative (nominal) categories using distinct colors (e.g., red = emergency, blue = calm).
	•	Best used for: Grouping elements without implying order.
	•	Value (Lightness/Darkness for Quantitative Encoding):
	•	Encodes sequential or ordinal values (lighter = lower value, darker = higher value).
	•	Best used for: Choropleth maps, heatmaps, density plots.
	•	Saturation (Intensity):
	•	Indicates importance or uncertainty (e.g., faded color suggests missing or unreliable data).

2.5 Texture & Pattern
	•	Definition: Uses repeating marks, dots, or grid patterns to encode information.
	•	Usage: Best for distinguishing overlapping elements (e.g., land use maps with different textures).

2.6 Transparency
	•	Definition: Alters opacity to indicate uncertainty, density, or layering.
	•	Usage: Frequently used in scatterplots with high overlap and uncertainty visualizations.

2.7 Depth (3D Perspective)
	•	Definition: Simulates perspective depth in 3D visualizations.
	•	Usage: Used in 3D bar charts, terrain models, and volumetric visualizations.
	•	Limitations: Can cause occlusion issues (one element hiding another).

2.8 Blur & Optics
	•	Definition: Adds blur or focus to control emphasis.
	•	Usage: Highlights important elements while fading others (e.g., blurred background in network layouts).

2.9 Motion & Animation
	•	Definition: Uses movement speed, direction, and rhythm to encode information.
	•	Usage: Best for time-series visualizations, transitions, and attention guidance (e.g., animated flow maps).
    ---

    ### **User Request**
    The user wants to visualize: "{user_request}"

    ### **Insight Need Type**
    - {insight_need_type}

    ### **Analysis Type**
    - {chosen_analysis_type}

    ### **Chosen Visualization Type**
    The visualization will use:
    {chosen_visualization_type}

    ### **Dataset Overview**
    The dataset after processing the key variables is summarized as follows:

    **DataFrame Info:**
    {df_info}

    **DataFrame Sample Rows:**
    {df_head}

    ### **Key Variable Data Types**
    The data types of key variables after processing:
    {key_variable_types}

    ### **Chosen Graphic Symbol**
    The visualization will use: **{chosen_graphic_symbol}**

    ### **Your Task**
    - **Step 1:** Analyze the **user request** and the **insight need type**, the **chosen analysis type**, the **chosen visualization type** and the **chosen graphic symbol**.
    - **Step 2:** Review the **data structure** from the `df_info()` and `df_head()` output.
    - **Step 3:** Examine the **data types of key variables** to determine the best graphic variable.
    - **Step 4:** Choose the **best graphic variable(s)** from the provided reference based on:
        - The **type of data** (categorical, numerical, ordinal, spatial, network).
        - The **insight need type** (e.g., trends, distributions, correlations, comparisons).
        - The **chosen graphic symbol** and how best to visually encode it.
    - **Step 5:** If needed, suggest **multiple graphic variables** (e.g., **color + size** for dual encoding).

    **Expected Output Format (JSON)**
    ```json
    {{
        "chosen_graphic_variable": "Selected Graphic Variable(s)",
        "reasoning": "Brief explanation of why this variable was chosen based on the data, visualization type, and perceptual principles."
    }}
    ```
    """
    )

    # Create the Chain
    choose_graphic_variable_chain = LLMChain(
        llm=llm,
        prompt=choose_graphic_variable_prompt,
        output_key="graphic_variable_choice"
    )

    # Run the Chain
    graphic_variable_result = choose_graphic_variable_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_json["insight_need_type"],
        "df_info": df_filtered.dtypes.to_string(),  # Column dtypes rendered as text
        "df_head": str(df_filtered.head()),  # First rows rendered as text
        "key_variable_types": column_data_types,  # JSON output from post-processing
        "chosen_graphic_symbol": chosen_graphic_symbol,  # Output from previous chain
        "chosen_visualization_type": chosen_visualization_type,  # Output from visualization choice
        "chosen_analysis_type": chosen_analysis_type  # Output from analysis choice
    })

    # Extract the raw LLM reply
    graphic_variable_choice = graphic_variable_result["graphic_variable_choice"]

    # Remove only the Markdown code-fence markers (``` or ```json). Deleting every
    # "json" substring would also corrupt values such as "geojson" inside the reply.
    cleaned_output = re.sub(r"```(?:json)?", "", graphic_variable_choice, flags=re.IGNORECASE)

    # Keep just the outermost {...} span so a stray language label or prose
    # around the object does not break parsing.
    object_start = cleaned_output.find("{")
    object_end = cleaned_output.rfind("}")
    if object_start != -1 and object_end > object_start:
        cleaned_output = cleaned_output[object_start:object_end + 1]

    # Convert cleaned JSON string into a dictionary
    graphic_variable_data = json.loads(cleaned_output)

    # Extract and display the chosen graphic variable type
    chosen_graphic_variable = graphic_variable_data["chosen_graphic_variable"]
    graphic_variable_explanation = graphic_variable_data["reasoning"]
    display(Markdown("### " + "Chosen Graphic Variable"))
    display(Markdown(chosen_graphic_variable))
    display(Markdown(graphic_variable_explanation))

    return graphic_variable_data

#### execute python code generated from llm

In [None]:

def execute_visualization_from_json(visualization_json):
    """
    Executes the generated Python visualization code from the parsed JSON output.

    Every entry is run in turn; a failure in one entry is reported (message and
    traceback) and does not stop the remaining entries from running.

    Parameters:
        visualization_json (dict): JSON dictionary containing visualization details,
                                   including 'generated_code' for each visualization.
                                   An entry without a 'name' is reported under its
                                   dictionary key.

    Returns:
        None: The function runs the visualization code dynamically.
    """
    # Imported here because the error report below needs it and it is not among
    # the imports visible at the top of the notebook.
    import traceback

    for viz_key, viz_details in visualization_json.items():
        # Fall back to the dictionary key so a missing 'name' cannot abort the run.
        viz_name = viz_details.get("name", viz_key)
        print(f"\n🔹 Running: {viz_name} Visualization")

        try:
            # Extract the Python code
            viz_code = viz_details["generated_code"]
            print("🔻 Python Code:\n", viz_code)

            # NOTE(security): exec() runs LLM-generated code with full access to this
            # module's globals. Only use it with output you are prepared to trust;
            # the code is printed above so it can be inspected.
            exec(viz_code, globals())

            print(f"✅ Successfully executed: {viz_name}\n")

        except Exception as e:
            # Broad catch is deliberate: generated code can fail in arbitrary ways and
            # one broken visualization should not prevent the others from running.
            print(f"❌ Error while executing {viz_name} Visualization")
            print("🔻 Exception Message:", str(e))
            print("🔻 Traceback:\n", traceback.format_exc())

#### temporal analysis

In [None]:

def run_temporal_analysis_chain(
    llm,
    user_request,
    insight_need_type,
    chosen_visualization_type,
    df_filtered,
    chosen_graphic_symbol,
    graphic_symbol_explanation,
    chosen_graphic_variable,
    graphic_variable_explanation
):
    """
    Executes the temporal analysis chain and returns the parsed JSON output.

    The prompt is filled with the request, the earlier design choices and three
    text views of ``df_filtered`` (dtypes, head, ``describe(include='all')``).
    A Markdown summary of every returned visualization is displayed.

    Parameters:
        llm: LangChain LLM instance
        user_request (str): The user's request for visualization.
        insight_need_type (str): The insight requirement output from Chain 1.
        chosen_visualization_type (str): The visualization type determined from previous chains.
        df_filtered (pd.DataFrame): The processed dataset.
        chosen_graphic_symbol (str): Selected graphic symbol for visualization.
        graphic_symbol_explanation (str): Explanation for choosing the graphic symbol.
        chosen_graphic_variable (str): Selected graphic variable for visualization.
        graphic_variable_explanation (str): Explanation for choosing the graphic variable.

    Returns:
        dict: Parsed JSON output containing visualization recommendations.

    Raises:
        ValueError: If the LLM reply cannot be parsed as JSON.
    """

    # Define the Prompt Template
    temporal_analysis_prompt = PromptTemplate(
        input_variables=[
            "user_request", "insight_need_type", "chosen_visualization_type", "df_info", "df_head", "df_summary",
            "chosen_graphic_symbol", "graphic_symbol_explanation",
            "chosen_graphic_variable", "graphic_variable_explanation"
        ],
        template="""
        You are a highly skilled data visualization expert specializing in temporal analysis.
        Your goal is to analyze the provided dataset and generate the best possible **time-based visualizations**.

        ---
        ### **Temporal Studies Context**
        Temporal Studies and Visualization Types

Overview
Temporal analysis and visualization techniques are used to answer "When" questions. These techniques help identify trends, bursts, seasonality, and patterns over time. Different approaches are required to manage various time-related challenges such as aggregation, time zones, outliers, and time slicing.

Data Preprocessing
A time series consists of events or observations ordered in one dimension: time. It may be:
- Continuous: Observations recorded at equal intervals (e.g., every second).
- Discrete: Observations recorded irregularly (e.g., event-driven logs).

Temporal information can be static (fixed dates) or dynamic (real-time data streams).

Resolution and Aggregation
- Time can be expressed in milliseconds, seconds, minutes, hours, days, weeks, months, years, decades, centuries.
- Aggregation can be based on astronomical time (e.g., years, seasons) or cultural time (e.g., fiscal years, business quarters).

Time Zones
Different regions use different time zones. Aligning data across time zones ensures correct comparisons.

Outliers
- Definition: Extreme values that deviate significantly from normal patterns.
- Example: A sudden traffic spike on a website due to viral content.
- Outliers can be filtered out or highlighted for deeper investigation.

Time Slicing
- Disjoint Slicing: Separates time intervals completely.
- Overlapping Slicing: Allows segments to share part of the time range.
- Cumulative Slicing: Accumulates data across periods (e.g., rolling averages).

Temporal Trends and Patterns
A time series can be broken down into:
1. General Trends: Long-term progression.
2. Cyclical Components: Recurring changes (e.g., business cycles).
3. Seasonal Components: Repeating yearly patterns (e.g., winter sales).
4. Random Components: Unpredictable variations.

To analyze trends, smoothing techniques can be applied (e.g., moving averages).

Bursts
- Definition: A sudden increase in activity for a short period.
- Example: An unexpected surge in search engine queries.
- Burst detection algorithms identify and rank significant events.

Temporal Visualization Types

1. Trends and Distributions
- Timeline/Chronological Graphs: Show event occurrences over time.
- Bar Graphs: Used for discrete time segments.
- Line Graphs: Highlight trends in continuous time series.
- Histograms: Show frequency distributions.

Example: Stock price trends over time.

2. Time-Based Comparisons
- Stacked Bar Graphs: Compare multiple datasets over time.
- 100% Stacked Line Graphs: Show relative proportions.
- Circular Time Graphs: Represent periodic patterns (e.g., hourly energy usage).

3. Flows Over Time
- Flow Maps: Show movement and volume of entities over time (e.g., migration patterns).
- Space-Time Cube Maps: Represent data in three dimensions (X, Y for space, Z for time).

Example: Hurricane track visualization.

4. Derivatives and Change Detection
- Velocity & Acceleration Graphs: Analyze change rates (e.g., stock market fluctuations).
- Animated Sequences: Display data evolution over time (e.g., animated COVID-19 case maps).

Application of Temporal Studies
1. Finance: Stock price trends, inflation changes.
2. Healthcare: Epidemic progression, patient recovery rates.
3. Climate Science: Temperature shifts, seasonal rainfall.
4. Social Media & Search Trends: Burst analysis, trending topics.

        ---
        ### **User Request**
        The user wants to visualize: "{user_request}"

        ### **Insight Need Type**
        - {insight_need_type}

        ### **Chosen Visualization Type**
        The visualization will use:
        {chosen_visualization_type}

        ---
        ### **Dataset Overview**
        **DataFrame Info:**
        {df_info}

        **DataFrame Sample Rows:**
        {df_head}

        **DataFrame Summary Statistics:**
        {df_summary}

        ---
        ### **Chosen Graphic Symbol**
        **Symbol:** {chosen_graphic_symbol}
        **Reasoning:** {graphic_symbol_explanation}

        ### **Chosen Graphic Variable**
        **Variable:** {chosen_graphic_variable}
        **Reasoning:** {graphic_variable_explanation}

        ---
        ### **Your Tasks**
        - **Step 1:** Based on the dataset structure, insight need and chosen_visualization_type, select the **best** temporal visualization.
        - **Step 2:** For the chosen visualization, list available **software tools** that can generate it (Power BI, Tableau, Python, etc.).
        - **Step 3:** Choose the most **suitable Python libraries** (preferably interactive ones like Plotly, Bokeh, Altair, etc.).
        - **Step 4:** Identify any **data pre-processing** steps necessary before visualization with respect to the current dataset structure.
        - **Step 5:** Write **Python code** that:
            - **Uses the existing dataset (df_filtered) in memory** without generating a new sample dataset.
            - **DO NOT create a new DataFrame**. Use `df_filtered` directly.
            - **Processes the dataset** correctly.
            - **Generates a high-quality visualization** with:
              - Proper **titles, labels, legends, and data labels** for readability.
              - Interactive features if applicable.
            - **Outputs the visualization.**

        ---
        ### **Expected JSON Output Format**
        ```json
        {{
            "visualization_1": {{
                "name": "Visualization Type 1",
                "recommended_software": ["Software 1", "Software 2", "Software 3"],
                "selected_libraries": ["Library 1", "Library 2"],
                "preprocessing_steps": "Necessary preprocessing before visualization.",
                "generated_code": "Python code for Visualization 1."
            }}
        }}
        ```
        """
    )

    # Create the Chain
    temporal_analysis_chain = LLMChain(
        llm=llm,
        prompt=temporal_analysis_prompt,
        output_key="temporal_analysis_output"
    )

    # Execute the Chain
    temporal_analysis_result = temporal_analysis_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_need_type,
        "chosen_visualization_type": chosen_visualization_type,
        "df_info": df_filtered.dtypes.to_string(),
        "df_head": str(df_filtered.head()),
        "df_summary": str(df_filtered.describe(include='all')),
        "chosen_graphic_symbol": chosen_graphic_symbol,
        "graphic_symbol_explanation": graphic_symbol_explanation,
        "chosen_graphic_variable": chosen_graphic_variable,
        "graphic_variable_explanation": graphic_variable_explanation
    })

    # Extract the reply and strip the Markdown code-fence markers (``` / ```json)
    temporal_analysis_output = temporal_analysis_result["temporal_analysis_output"]
    cleaned_output = re.sub(r"```(?:json)?", "", temporal_analysis_output)

    # Convert cleaned JSON string into a dictionary
    try:
        parsed_json = json.loads(cleaned_output)
    except json.JSONDecodeError as err:
        # Chain the decode error so the failing position stays visible in the traceback.
        raise ValueError("Invalid JSON response from LLM. Please check the output formatting.") from err

    for visualization in parsed_json.values():
        # Lines are assembled without leading whitespace: text indented by four or
        # more spaces is rendered by Markdown as a code block, not as formatting.
        summary_lines = [
            "**Visualization Name**",
            "",
            f"**{visualization.get('name', 'N/A')}**",
            "",
            "**Recommended Software**",
            f"- {', '.join(visualization.get('recommended_software', []))}",
            "",
            "**Selected Libraries**",
            f"- {', '.join(visualization.get('selected_libraries', []))}",
            "",
            "**Preprocessing Steps**",
            "",
            f"{visualization.get('preprocessing_steps', 'N/A')}",
        ]
        # Display dynamically formatted Markdown
        display(Markdown("\n".join(summary_lines)))

    return parsed_json

#### geospatial analysis

In [None]:

def run_geospatial_analysis_chain(
    llm,
    user_request,
    insight_need_type,
    chosen_visualization_type,
    df_filtered,
    chosen_graphic_symbol,
    graphic_symbol_explanation,
    chosen_graphic_variable,
    graphic_variable_explanation
):
    """
    Executes the geospatial analysis chain and returns the parsed JSON output.

    The prompt is filled with the request, the earlier design choices and three
    text views of ``df_filtered`` (dtypes, head, ``describe(include='all')``).
    A Markdown summary of every returned visualization is displayed.

    Parameters:
        llm: LangChain LLM instance
        user_request (str): The user's request for geospatial visualization.
        insight_need_type (str): The insight requirement output from previous chains.
        chosen_visualization_type (str): The visualization type determined from previous chains.
        df_filtered (pd.DataFrame): The processed dataset.
        chosen_graphic_symbol (str): Selected graphic symbol for visualization.
        graphic_symbol_explanation (str): Explanation for choosing the graphic symbol.
        chosen_graphic_variable (str): Selected graphic variable for visualization.
        graphic_variable_explanation (str): Explanation for choosing the graphic variable.

    Returns:
        dict: Parsed JSON output containing visualization recommendations.

    Raises:
        ValueError: If the LLM reply cannot be parsed as JSON.
    """

    # Define the Prompt Template
    geospatial_analysis_prompt = PromptTemplate(
        input_variables=[
            "user_request", "insight_need_type", "chosen_visualization_type", "df_info", "df_head", "df_summary",
            "chosen_graphic_symbol", "graphic_symbol_explanation",
            "chosen_graphic_variable", "graphic_variable_explanation"
        ],
        template="""
        You are a highly skilled data visualization expert specializing in geospatial analysis.
        Your goal is to analyze the provided dataset and generate the best possible **geospatial visualizations**.

        ---
        ### **Geospatial Studies Context**
        Geospatial Studies – "Where"
        Geospatial analysis (or geostatistical analysis) helps answer "Where" questions by using statistical methods and spatial data visualization techniques. It focuses on understanding the location of objects, relationships between places, movement patterns, and spatial clustering. This is useful in fields such as geography, urban planning, epidemiology, and logistics.

        Data Preprocessing
        Before visualization, geospatial data needs geocoding (assigning latitude/longitude) and georeferencing (linking data to a map coordinate system).

        Geocoding
        - Converts addresses, postal codes, or geographic coordinates into mappable locations.
        - Uses geographic gazetteers or GPS satellite triangulation to pinpoint positions.

        Distance Calculation
        - Measures shortest paths between locations using Great Circle Distance (Earth’s curvature).
        - Used in routing, logistics, and disaster response planning.

        Diffusion Matrices
        - Used to model movement of tangible objects (e.g., vehicles, goods, money) and intangible elements (e.g., ideas, rumors, reputation).

        Clustering
        - Identifies patterns in geospatial data.
        - Common methods: K-means clustering, density-based clustering, or predefined geographic classifications (e.g., census tracts).

        Visual Generalization
        - Large-scale maps simplify features (e.g., aggregation of cities into regions).
        - Important for preserving map readability while retaining spatial accuracy.

        Geospatial Visualization Types

        1. Discrete Space Mapping
        - Represents distinct data points with specific locations.
        - Examples: Dot Density Maps (e.g., crime reports, disease cases).

        2. Continuous Space Mapping
        - Uses gradients to visualize non-discrete variables over an area.
        - Examples:
          - Elevation Maps (altitude visualization).
          - Isarithmic Maps (temperature, pollution, population density).

        3. Thematic Maps for Categorization
        - Choropleth Maps: Color-coded regions based on data values (e.g., population density).
        - Proportional Symbol Maps: Symbol sizes represent magnitudes (e.g., city population).

        4. Cartograms (Distorted Maps)
        - Adjusts area sizes based on data values.
        - Example: A cartogram of world population enlarges countries like India and China.

        5. Temporal-Spatial Representations
        - Space-Time Cube Maps: Uses 3D coordinates (X, Y for space, Z for time).
        - Flow Maps: Show directional movements (e.g., migration, trade routes).
        - Strip Maps: Emphasize road networks and travel routes.

        6. Specialized Maps for Movement Analysis
        - Vector Fields: Depicts wind currents, ocean tides, or pressure systems.
        - Isochrone Maps: Show travel time zones (e.g., 30-minute commute areas).
        - Subway/Route Maps: Optimize transit systems with clean visual layouts.

        Applications of Geospatial Analysis
        1. Urban Planning – Identifying traffic congestion, housing patterns.
        2. Disaster Response – Mapping flood-prone zones, evacuation routes.
        3. Epidemiology – Tracking disease outbreaks.
        4. Business Intelligence – Customer density heatmaps for market targeting.
        5. Climate Science – Visualizing temperature shifts, deforestation.

       ---
        ### **User Request**
        The user wants to visualize: "{user_request}"

        ### **Insight Need Type**
        - {insight_need_type}

        ### **Chosen Visualization Type**
        The visualization will use:
        {chosen_visualization_type}

        ---
        ### **Dataset Overview**
        **DataFrame Info:**
        {df_info}

        **DataFrame Sample Rows:**
        {df_head}

        **DataFrame Summary Statistics:**
        {df_summary}

        ---
        ### **Chosen Graphic Symbol**
        **Symbol:** {chosen_graphic_symbol}
        **Reasoning:** {graphic_symbol_explanation}

        ### **Chosen Graphic Variable**
        **Variable:** {chosen_graphic_variable}
        **Reasoning:** {graphic_variable_explanation}

        ---
        ### **Your Tasks**
        - **Step 1:** Based on the dataset structure, insight need and chosen_visualization_type, select the **best** geospatial visualizations.
        - **Step 2:** For the chosen visualization, list available **software tools** that can generate it (QGIS, Power BI, Tableau, Python, etc.).
        - **Step 3:** Choose the most **suitable Python libraries** (preferably interactive ones like GeoPandas, Plotly, etc.). **DO NOT USE FOLIUM** .
        - **Step 4:** **DO NOT INCLUDE ANY PIP INSTALLS** in the code.
        - **Step 5:** Identify any **data pre-processing** steps necessary before visualization.
        - **Step 6:** Write **Python code** that:
            - **Uses the existing dataset (df_filtered) in memory** without generating a new sample dataset.
            - **DO NOT create a new DataFrame**. Use `df_filtered` directly.
            - **Processes the dataset** correctly for the selected geospatial visualization.
            - **Generates a high-quality map-based visualization** with:
              - Proper **titles, data labels, legends, and color gradients** for readability.
              - Interactive features if applicable.
            - The script **must be fully executable** without requiring users to modify any placeholders.
            - **Dynamically fetch** required datasets such as **GeoJSON files** or external geographic datasets.
            - **DO NOT use placeholders** like `geojson_data = {{"type": "FeatureCollection", "features": []}}`.
            - Include **error handling** for missing data or API failures.
            - Ensure the **selected libraries match the visualization type**:
              - If using **Plotly**, ensure the JSON structure matches its requirements and **ensure the map is correctly rendered by handling missing or mismatched location data appropriately to prevent empty maps.**
            - **The code must execute successfully without any user modifications.**
            - **Outputs the visualization.**
        ---
         ---
        ### **Execution Environment: Google Colab**
        This code will be executed in a **Google Colab environment**. Ensure that:
        - **All visualizations are adapted for Colab compatibility**.
        - **Ensure Google Colab compatibility** by adapting visualization display:
        - **For Pyvis**, generate an HTML file and display using:
          ```python
          net.show("network_graph.html")
          from IPython.core.display import display, HTML
          display(HTML("network_graph.html"))
          ```

        - **For Plotly**,
          - **ensure the map is correctly rendered by handling missing or mismatched location data appropriately to prevent empty maps.**

        ---
        ### **Expected JSON Output Format**
        ```json
        {{
            "visualization_1": {{
                "name": "Geospatial Visualization Type 1",
                "recommended_software": ["Software 1", "Software 2", "Software 3"],
                "selected_libraries": ["Library 1", "Library 2"],
                "preprocessing_steps": "Necessary preprocessing before visualization.",
                "generated_code": "Python code for Geospatial Visualization 1 using df_filtered."
            }}
        }}
        ```
        """
    )

    # Create the Chain
    geospatial_analysis_chain = LLMChain(
        llm=llm,
        prompt=geospatial_analysis_prompt,
        output_key="geospatial_analysis_output"
    )

    # Execute the Chain
    geospatial_analysis_result = geospatial_analysis_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_need_type,
        "chosen_visualization_type": chosen_visualization_type,
        "df_info": df_filtered.dtypes.to_string(),
        "df_head": str(df_filtered.head()),
        "df_summary": str(df_filtered.describe(include='all')),
        "chosen_graphic_symbol": chosen_graphic_symbol,
        "graphic_symbol_explanation": graphic_symbol_explanation,
        "chosen_graphic_variable": chosen_graphic_variable,
        "graphic_variable_explanation": graphic_variable_explanation
    })

    # Extract the reply and strip the Markdown code-fence markers (``` / ```json)
    geospatial_analysis_output = geospatial_analysis_result["geospatial_analysis_output"]
    cleaned_output = re.sub(r"```(?:json)?", "", geospatial_analysis_output)

    # Convert cleaned JSON string into a dictionary
    try:
        parsed_json = json.loads(cleaned_output)
    except json.JSONDecodeError as err:
        # Chain the decode error so the failing position stays visible in the traceback.
        raise ValueError("Invalid JSON response from LLM. Please check the output formatting.") from err

    for visualization in parsed_json.values():
        # Lines are assembled without leading whitespace: text indented by four or
        # more spaces is rendered by Markdown as a code block, not as formatting.
        summary_lines = [
            "**Visualization Name**",
            "",
            f"**{visualization.get('name', 'N/A')}**",
            "",
            "**Recommended Software**",
            f"- {', '.join(visualization.get('recommended_software', []))}",
            "",
            "**Selected Libraries**",
            f"- {', '.join(visualization.get('selected_libraries', []))}",
            "",
            "**Preprocessing Steps**",
            "",
            f"{visualization.get('preprocessing_steps', 'N/A')}",
        ]
        # Display dynamically formatted Markdown
        display(Markdown("\n".join(summary_lines)))

    return parsed_json

#### topical analysis

In [None]:
def run_topical_analysis_chain(
    llm,
    user_request,
    insight_need_type,
    chosen_visualization_type,
    df_filtered,
    chosen_graphic_symbol,
    graphic_symbol_explanation,
    chosen_graphic_variable,
    graphic_variable_explanation
):
    """
    Runs the topical analysis chain, displays a Markdown summary of each
    recommended visualization, and returns the parsed JSON output.

    Parameters:
        llm: LangChain LLM instance
        user_request (str): The user's request for textual or topical visualization.
        insight_need_type (str): The insight requirement output from Chain 1.
        chosen_visualization_type (str): The visualization type determined from previous chains.
        df_filtered (pd.DataFrame): The processed dataset. Only its dtypes, head()
            and describe(include='all') are sent to the LLM.
        chosen_graphic_symbol (str): Selected graphic symbol for visualization.
        graphic_symbol_explanation (str): Explanation for choosing the graphic symbol.
        chosen_graphic_variable (str): Selected graphic variable for visualization.
        graphic_variable_explanation (str): Explanation for choosing the graphic variable.

    Returns:
        dict: Parsed JSON output containing visualization recommendations.

    Raises:
        ValueError: If the LLM response does not contain a valid JSON object.
    """

    # Define the Prompt Template
    topical_analysis_prompt = PromptTemplate(
        input_variables=[
            "user_request", "insight_need_type", "chosen_visualization_type",
            "df_info", "df_head", "df_summary",
            "chosen_graphic_symbol", "graphic_symbol_explanation",
            "chosen_graphic_variable", "graphic_variable_explanation"
        ],
        template="""
        You are a highly skilled data visualization expert specializing in **textual and topical analysis**.
        Your goal is to analyze the provided dataset and generate the best possible **topical visualizations**.

        ---
        ### **Topical Studies Context**
        ### **Topical Studies—"What"**
        Topical analysis is commonly applied to answer **“what”** questions by analyzing large-scale text corpora (e.g., articles, patents, grants, job applications, emails). These studies identify **term frequency distributions**, **topic composition**, and **textual trends** over time.

        ---
        ### **Data Preprocessing**
        To extract meaningful insights from text, several preprocessing techniques are applied:

        - **Fielding:** Identifies key sections of text (e.g., title, author name, address, abstract).
        - **Text Selection:** Filters relevant text portions for analysis.
        - **Stemming & Stopword Removal:** Reduces words to their base form and removes common words like "the" or "of."
        - **Tokenization:** Splits text into individual words, phrases, or **n-grams** (sequences of words).
        - **Normalization:** Standardizes text by converting words into a consistent format for comparison.
        - **Descriptive Term Identification:** Uses **TF-IDF (Term Frequency-Inverse Document Frequency)** to rank words based on their importance.
        - **Tagging:** Assigns grammatical labels (e.g., noun, verb) to words for linguistic analysis.

        ---
        ### **Distributions in Text Analysis**
        - **Term Frequency & Distributions:** Measures how often a word appears to determine importance.
        - **Temporal Dynamics:** Tracks changes in word frequency over time (e.g., tracking the rise of certain terms in historical text).

        ---
        ## **Topical Visualization Types**
        To visually represent textual data, various visualization techniques are used:

        ### **Composition and Frequency**
        - **Lists:** Represents term frequencies in ranked order.
        - **Tag Clouds:** Words are displayed in various sizes based on their frequency.

        ### **Graphs and Structure**
        - **Topic Graphs:** Visualizes relationships between words or entities.
        - **Circular Graphs:** Organizes words in a structured format for hierarchical representation.

        ### **Crossmaps**
        - **Scholarly Crossmaps:** Depict connections between authors, papers, and research topics.

        ### **Trends and Evolution**
        - **History Flow:** Displays **revision history** of documents using stacked lines.
        - **Alluvial Graphs:** Tracks how topics **merge and split over time**.
        - **Stream Graphs:** Represents thematic flows in text (metaphor of a river showing text evolution).
        - **Arc Graphs:** Uses links to represent text relationships.

        ### **Network Visualizations**
        - **Text Networks:** Displays **associations and dependencies** between text entities (words, topics, references).

        ---
        ### **Execution Environment: Google Colab**
        This code will be executed in a **Google Colab environment**. Ensure that:
        - **All visualizations are adapted for Colab compatibility**.
        - **Ensure Google Colab compatibility** by adapting visualization display:
        - **For Pyvis**, generate an HTML file and display using:
          ```python
          net.show("network_graph.html")
          from IPython.core.display import display, HTML
          display(HTML("network_graph.html"))
          ```

        ---
        ### **User Request**
        The user wants to visualize: "{user_request}"

        ### **Insight Need Type**
        - {insight_need_type}

        ### **Chosen Visualization Type**
        The visualization will use:
        {chosen_visualization_type}

        ---
        ### **Dataset Overview**
        **DataFrame Info:**
        {df_info}

        **DataFrame Sample Rows:**
        {df_head}

        **DataFrame Summary Statistics:**
        {df_summary}

        ---
        ### **Chosen Graphic Symbol**
        **Symbol:** {chosen_graphic_symbol}
        **Reasoning:** {graphic_symbol_explanation}

        ### **Chosen Graphic Variable**
        **Variable:** {chosen_graphic_variable}
        **Reasoning:** {graphic_variable_explanation}

        ---
        ### **Your Tasks**
        - **Step 1:** Based on the dataset structure, insight need, and chosen visualization type, select the **best** topical visualizations.
        - **Step 2:** For the chosen visualization, list available **software tools** that can generate it (Power BI, Tableau, Python, etc.).
        - **Step 3:** Choose the most **suitable Python libraries** (preferably interactive ones like WordCloud, Spacy, NLTK, Gensim, Plotly, etc.).
        - **Step 4:** Identify any **data pre-processing** steps necessary before visualization.
        - **Step 5:** Write **Python code** that:
            - **Uses the existing dataset (`df_filtered`) in memory** without generating a new sample dataset.
            - **DO NOT create a new DataFrame**. Use `df_filtered` directly.
            - **Processes the dataset** correctly for the selected topical visualization.
            - **Generates a high-quality textual visualization** with:
              - Proper **titles, labels, font sizes, and color schemes** for readability.
              - Interactive features if applicable.
            - **Outputs the visualization.**

        ---
        ### **Expected JSON Output Format**
        ```json
        {{
            "visualization_1": {{
                "name": "Topical Visualization Type 1",
                "recommended_software": ["Software 1", "Software 2", "Software 3", "Software 4", "Software 5"],
                "selected_libraries": ["Library 1", "Library 2"],
                "preprocessing_steps": "Necessary preprocessing before visualization.",
                "generated_code": "Python code for Topical Visualization 1 using df_filtered."
            }}
        }}
        ```
        """
    )

    # Create the Chain
    topical_analysis_chain = LLMChain(
        llm=llm,
        prompt=topical_analysis_prompt,
        output_key="topical_analysis_output"
    )

    # Execute the Chain
    topical_analysis_result = topical_analysis_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_need_type,
        "chosen_visualization_type": chosen_visualization_type,
        "df_info": df_filtered.dtypes.to_string(),
        "df_head": str(df_filtered.head()),
        "df_summary": str(df_filtered.describe(include='all')),
        "chosen_graphic_symbol": chosen_graphic_symbol,
        "graphic_symbol_explanation": graphic_symbol_explanation,
        "chosen_graphic_variable": chosen_graphic_variable,
        "graphic_variable_explanation": graphic_variable_explanation
    })

    # Extract the raw model text
    topical_analysis_output = topical_analysis_result["topical_analysis_output"]

    # Keep only the outermost JSON object. This drops the surrounding Markdown
    # fence (and any stray prose around it) without touching backticks that may
    # legitimately appear inside the "generated_code" string.
    cleaned_output = topical_analysis_output.strip()
    json_start = cleaned_output.find("{")
    json_end = cleaned_output.rfind("}")
    if json_start != -1 and json_end > json_start:
        cleaned_output = cleaned_output[json_start:json_end + 1]

    # Convert cleaned JSON string into a dictionary; only the parse is guarded
    # so unrelated errors are not mislabelled as JSON problems.
    try:
        parsed_json = json.loads(cleaned_output)
    except json.JSONDecodeError as err:
        raise ValueError("Invalid JSON response from LLM. Please check the output formatting.") from err

    if not isinstance(parsed_json, dict):
        raise ValueError("Invalid JSON response from LLM. Please check the output formatting.")

    for visualization in parsed_json.values():
        # The model occasionally returns a bare string or null instead of a list;
        # normalise so join() does not split a string into characters or crash.
        listed = {}
        for field in ('recommended_software', 'selected_libraries'):
            items = visualization.get(field) or []
            if isinstance(items, str):
                items = [items]
            listed[field] = ', '.join(str(item) for item in items)

        # Built without leading indentation: Markdown treats lines indented by
        # four or more spaces as a code block, which would show the raw asterisks.
        markdown_output = "\n".join([
            "**Visualization Name**",
            "",
            f"**{visualization.get('name', 'N/A')}**",
            "",
            "**Recommended Software**",
            f"- {listed['recommended_software']}",
            "",
            "**Selected Libraries**",
            f"- {listed['selected_libraries']}",
            "",
            "**Preprocessing Steps**",
            "",
            f"{visualization.get('preprocessing_steps', 'N/A')}",
        ])

        # Display dynamically formatted Markdown
        display(Markdown(markdown_output))

    return parsed_json

#### network analysis

In [None]:
def run_network_analysis_chain(
    llm,
    user_request,
    insight_need_type,
    chosen_visualization_type,
    df_filtered,
    chosen_graphic_symbol,
    graphic_symbol_explanation,
    chosen_graphic_variable,
    graphic_variable_explanation
):
    """
    Runs the network analysis chain, displays a Markdown summary of each
    recommended visualization, and returns the parsed JSON output.

    Parameters:
        llm: LangChain LLM instance
        user_request (str): The user's request for network visualization.
        insight_need_type (str): The insight requirement output from previous chains.
        chosen_visualization_type (str): The visualization type determined from previous chains.
        df_filtered (pd.DataFrame): The processed dataset. Only its dtypes, head()
            and describe(include='all') are sent to the LLM.
        chosen_graphic_symbol (str): Selected graphic symbol for visualization.
        graphic_symbol_explanation (str): Explanation for choosing the graphic symbol.
        chosen_graphic_variable (str): Selected graphic variable for visualization.
        graphic_variable_explanation (str): Explanation for choosing the graphic variable.

    Returns:
        dict: Parsed JSON output containing visualization recommendations.

    Raises:
        ValueError: If the LLM response does not contain a valid JSON object.
    """

    # Define the Prompt Template
    network_analysis_prompt = PromptTemplate(
        input_variables=[
            "user_request", "insight_need_type", "chosen_visualization_type",
            "df_info", "df_head", "df_summary",
            "chosen_graphic_symbol", "graphic_symbol_explanation",
            "chosen_graphic_variable", "graphic_variable_explanation"
        ],
        template="""
        You are a highly skilled data visualization expert specializing in **network analysis**.
        Your goal is to analyze the provided dataset and generate the best possible **network visualizations**.

        ---
        ### **Network Studies Context**
        **Network Studies – "With Whom"**
        Network analysis helps answer **"Who interacts with whom?"** by examining relationships between entities (nodes) and their connections (edges). It is widely used in:
        - **Social Network Analysis (SNA)**
        - **Citation & Collaboration Networks**
        - **Supply Chain & Logistics**
        - **Telecommunication Networks**
        - **Traffic & Mobility Networks**

        **Key Network Concepts**
        - **Nodes**: Entities (e.g., people, companies, cities, publications).
        - **Edges**: Relationships between nodes (e.g., friendships, citations, transactions).
        - **Edge Weight**: Strength of connection (e.g., frequency of interactions).
        - **Directed vs. Undirected Graphs**: Relationships may have directionality (e.g., follower/following relationships).
        - **Centrality Measures**: Used to determine node importance.
          - **Degree Centrality**: Number of direct connections.
          - **Betweenness Centrality**: Measures node influence in connecting clusters.
          - **Closeness Centrality**: Measures how quickly information spreads.

        **Network Visualization Types**
        1. **Node-Link Graphs**
           - **Simple Network Graphs** – Shows basic node-to-node relationships.
           - **Force-Directed Graphs** – Uses force simulation to improve node positioning.
           - **Radial Network Graphs** – Displays hierarchical relationships (e.g., organizational charts).

        2. **Hierarchical & Tree Structures**
           - **Dendrograms** – Represents hierarchical clustering.
           - **Tree Layouts** – Displays organizational or decision-making structures.

        3. **Cluster & Community Detection**
           - **Modular Networks** – Shows tightly connected subgroups.
           - **Graph Partitioning** – Identifies distinct groups based on connectivity.

        4. **Flow & Process Networks**
           - **Sankey Diagrams** – Represents flow of resources, transactions, or influence.
           - **Bipartite Graphs** – Shows relationships between two distinct node groups (e.g., buyers & sellers).

        5. **Geospatial Network Analysis**
           - **Route Maps** – Optimizes travel paths.
           - **Supply Chain Networks** – Maps logistics & distribution.

        ---
        ### **User Request**
        The user wants to visualize: "{user_request}"

        ### **Insight Need Type**
        - {insight_need_type}

        ### **Chosen Visualization Type**
        The visualization will use:
        {chosen_visualization_type}

        ---
        ### **Dataset Overview**
        **DataFrame Info:**
        {df_info}

        **DataFrame Sample Rows:**
        {df_head}

        **DataFrame Summary Statistics:**
        {df_summary}

        ---
        ### **Chosen Graphic Symbol**
        **Symbol:** {chosen_graphic_symbol}
        **Reasoning:** {graphic_symbol_explanation}

        ### **Chosen Graphic Variable**
        **Variable:** {chosen_graphic_variable}
        **Reasoning:** {graphic_variable_explanation}

        ---
        ### **Your Tasks**
        - **Step 1:** Based on the dataset structure, insight need, and chosen visualization type, select the **best** network visualizations.
        - **Step 2:** For the chosen visualization, list available **software tools** that can generate it (Gephi, Power BI, Cytoscape, Python, etc.).
        - **Step 3:** Choose the most **suitable Python libraries** (preferably interactive ones like NetworkX, Plotly etc.).
        - **Step 4:** Identify any **data pre-processing** steps necessary before visualization.
        - **Step 5:** Write **Python code** that:
            - **Uses the existing dataset (`df_filtered`) in memory** without generating a new sample dataset.
            - **DO NOT create a new DataFrame**. Use `df_filtered` directly.
            - **Processes the dataset** correctly for the selected network visualization.
            - **Generates a high-quality network visualization** with:
              - Proper **titles, labels, node colors, and edge weights** for readability.
              - Interactive features if applicable.
            - **Properly encodes the visualizations with graphic symbols and variables**.
            - **Outputs the visualization.**

        ---
        ### **Expected JSON Output Format**
        ```json
        {{
            "visualization_1": {{
                "name": "Network Visualization Type 1",
                "recommended_software": ["Software 1", "Software 2", "Software 3"],
                "selected_libraries": ["Library 1", "Library 2"],
                "preprocessing_steps": "Necessary preprocessing before visualization.",
                "generated_code": "Python code for Network Visualization 1 using df_filtered."
            }}
        }}
        ```
        """
    )

    # Create the Chain
    network_analysis_chain = LLMChain(
        llm=llm,
        prompt=network_analysis_prompt,
        output_key="network_analysis_output"
    )

    # Execute the Chain
    network_analysis_result = network_analysis_chain.invoke({
        "user_request": user_request,
        "insight_need_type": insight_need_type,
        "chosen_visualization_type": chosen_visualization_type,
        "df_info": df_filtered.dtypes.to_string(),
        "df_head": str(df_filtered.head()),
        "df_summary": str(df_filtered.describe(include='all')),
        "chosen_graphic_symbol": chosen_graphic_symbol,
        "graphic_symbol_explanation": graphic_symbol_explanation,
        "chosen_graphic_variable": chosen_graphic_variable,
        "graphic_variable_explanation": graphic_variable_explanation
    })

    # Extract the raw model text
    network_analysis_output = network_analysis_result["network_analysis_output"]

    # Keep only the outermost JSON object. This drops the surrounding Markdown
    # fence (and any stray prose around it) without touching backticks that may
    # legitimately appear inside the "generated_code" string.
    cleaned_output = network_analysis_output.strip()
    json_start = cleaned_output.find("{")
    json_end = cleaned_output.rfind("}")
    if json_start != -1 and json_end > json_start:
        cleaned_output = cleaned_output[json_start:json_end + 1]

    # Convert cleaned JSON string into a dictionary; only the parse is guarded
    # so unrelated errors are not mislabelled as JSON problems.
    try:
        parsed_json = json.loads(cleaned_output)
    except json.JSONDecodeError as err:
        raise ValueError("Invalid JSON response from LLM. Please check the output formatting.") from err

    if not isinstance(parsed_json, dict):
        raise ValueError("Invalid JSON response from LLM. Please check the output formatting.")

    for visualization in parsed_json.values():
        # The model occasionally returns a bare string or null instead of a list;
        # normalise so join() does not split a string into characters or crash.
        listed = {}
        for field in ('recommended_software', 'selected_libraries'):
            items = visualization.get(field) or []
            if isinstance(items, str):
                items = [items]
            listed[field] = ', '.join(str(item) for item in items)

        # Built without leading indentation: Markdown treats lines indented by
        # four or more spaces as a code block, which would show the raw asterisks.
        markdown_output = "\n".join([
            "**Visualization Name**",
            "",
            f"**{visualization.get('name', 'N/A')}**",
            "",
            "**Recommended Software**",
            f"- {listed['recommended_software']}",
            "",
            "**Selected Libraries**",
            f"- {listed['selected_libraries']}",
            "",
            "**Preprocessing Steps**",
            "",
            f"{visualization.get('preprocessing_steps', 'N/A')}",
        ])

        # Display dynamically formatted Markdown
        display(Markdown(markdown_output))

    return parsed_json

### Main Function

#### enter your google api key

In [None]:
# Ask for the Google AI API key interactively only when the environment
# does not already define GOOGLE_API_KEY; the input is not echoed.
if os.environ.get("GOOGLE_API_KEY") is None:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass(
        "Enter your Google AI API key: "
    )

Enter your Google AI API key: ··········


#### load the data file

In [None]:
# Read the CSV and keep only the rows that have no missing values
df = pd.read_csv('/content/state_M2023_dl+careerpathways.csv').dropna()

# Print the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31855 entries, 0 to 35808
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   state                 31855 non-null  object 
 1   state_abbreviation    31855 non-null  object 
 2   occupation_title      31855 non-null  object 
 3   code                  31855 non-null  object 
 4   total_employment      31855 non-null  float64
 5   hourly_wage_mean      31855 non-null  float64
 6   annual_salary_mean    31855 non-null  object 
 7   hourly_wage_median    31855 non-null  float64
 8   annual_salary_median  31855 non-null  object 
 9   career_pathway        31855 non-null  object 
 10  career_cluster        31855 non-null  object 
dtypes: float64(3), object(8)
memory usage: 2.9+ MB


#### instantiate the llm

In [None]:
# Initialize the Gemini chat model client that is passed to every chain
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_retries=2,
    # No explicit output-token cap or request timeout is supplied
    max_tokens=None,
    timeout=None,
)

#### dvl llms

In [None]:
# Example user request: choropleth map of average salary over the 48 contiguous states.
# NOTE(review): "choroplath" in the variable name is a misspelling of "choropleth";
# rename only after checking for references in other cells.
example_choroplath_prompt = """Show a choropleth map of how the average salary is distributed across the United States.
Only include the **48 contiguous states**, **FILTER** Alaska, Hawaii, and U.S. territories (e.g., Puerto Rico, Guam) from the map.
Ensure that the map is **accurately projected** and does not distort the continental U.S.
"""

# Example user request: Sankey diagram of employment across career cluster, career
# pathway and occupation, restricted to the top-20 occupations by average salary,
# followed by colour and styling requirements.
# NOTE(review): this text appears to be the same as the `user_request` literal in the
# pipeline cell further down; confirm before changing only one of the two.
example_sankey_prompt = """Show how employment is distributed across career cluster, career pathway
 and specific occupations through a sankey diagram. Limit job roles only to top-20 occupations with the highest average salary (AVERAGE annual_salary_mean).

- Use a **gradient-based color scheme** where career clusters, career pathways, and occupations have **distinct but related color intensities** to indicate hierarchy.
- Use **lighter shades for higher-level categories (career clusters)** and **progressively darker shades** for more specific categories (career pathways and occupations).
- Ensure **high contrast between adjacent nodes** while maintaining a visually cohesive color theme.
- Use **semi-transparent colors for links** to emphasize flow without overwhelming the visualization.
- Avoid overly saturated colors that may obscure readability.
- Use **tooltips or annotations** where necessary to highlight insights.

Choose color codings that enhance clarity and interpretation while being visually appealing.
"""

In [44]:
user_request = """Show how employment is distributed across career cluster, career pathway
 and specific occupations through a sankey diagram. Limit job roles only to top-20 occupations with the highest average salary (AVERAGE annual_salary_mean).

- Use a **gradient-based color scheme** where career clusters, career pathways, and occupations have **distinct but related color intensities** to indicate hierarchy.
- Use **lighter shades for higher-level categories (career clusters)** and **progressively darker shades** for more specific categories (career pathways and occupations).
- Ensure **high contrast between adjacent nodes** while maintaining a visually cohesive color theme.
- Use **semi-transparent colors for links** to emphasize flow without overwhelming the visualization.
- Avoid overly saturated colors that may obscure readability.
- Use **tooltips or annotations** where necessary to highlight insights.

Choose color codings that enhance clarity and interpretation while being visually appealing.
"""

# Classify the request into an insight need type and its key variables
insight_json = identify_insight(user_request, llm)

# Determine the data scale of the key variables
identified_data_scale = identify_data_scale(
    df=df,
    insight_need_type=insight_json["insight_need_type"],
    key_variables=insight_json["key_variables"],
    llm=llm
)

# Produce the filtered DataFrame and its column data types
df_filtered, column_data_types = post_process_data(df, identified_data_scale)

# Pick the study type (Temporal / Geospatial / Topical / Network)
study_type_result = select_best_analysis_type(
    user_request=user_request,
    insight_json=insight_json,
    df_filtered=df_filtered,
    column_data_types=column_data_types,
    llm=llm
)
chosen_study_type = study_type_result['chosen_study_type']

# Pick the visualization type, then the graphic symbol and graphic variable
chosen_visualization_type = choose_best_visualization(
    user_request, insight_json, df_filtered, chosen_study_type, llm
)

chosen_graphic_symbol = choose_best_graphic_symbol(
    user_request, insight_json, df_filtered, column_data_types,
    chosen_visualization_type, chosen_study_type, llm
)

chosen_graphic_variable = choose_best_graphic_variable(
    user_request=user_request,
    insight_json=insight_json,
    df_filtered=df_filtered,
    column_data_types=column_data_types,
    chosen_graphic_symbol=chosen_graphic_symbol,
    chosen_visualization_type=chosen_visualization_type,
    chosen_analysis_type=chosen_study_type,
    llm=llm
)

# All four study-specific chains take the same keyword arguments, so build them once
analysis_kwargs = dict(
    llm=llm,
    user_request=user_request,
    insight_need_type=insight_json["insight_need_type"],
    chosen_visualization_type=chosen_visualization_type,
    df_filtered=df_filtered,
    chosen_graphic_symbol=chosen_graphic_symbol['chosen_graphic_symbol'],
    graphic_symbol_explanation=chosen_graphic_symbol['reasoning'],
    chosen_graphic_variable=chosen_graphic_variable['chosen_graphic_variable'],
    graphic_variable_explanation=chosen_graphic_variable['reasoning']
)

if chosen_study_type == "Temporal":
    result = run_temporal_analysis_chain(**analysis_kwargs)
elif chosen_study_type == "Geospatial":
    result = run_geospatial_analysis_chain(**analysis_kwargs)
elif chosen_study_type == "Topical":
    result = run_topical_analysis_chain(**analysis_kwargs)
elif chosen_study_type == "Network":
    result = run_network_analysis_chain(**analysis_kwargs)
else:
    # Fail loudly: without this branch `result` would be undefined here, or,
    # when the cell is re-run, a stale `result` from an earlier run would be
    # executed silently.
    raise ValueError(
        f"Unsupported study type {chosen_study_type!r}; "
        "expected one of: Temporal, Geospatial, Topical, Network."
    )

execute_visualization_from_json(result)

### The user request is:

Show how employment is distributed across career cluster, career pathway
 and specific occupations through a sankey diagram. Limit job roles only to top-20 occupations with the highest average salary (AVERAGE annual_salary_mean).

- Use a **gradient-based color scheme** where career clusters, career pathways, and occupations have **distinct but related color intensities** to indicate hierarchy.
- Use **lighter shades for higher-level categories (career clusters)** and **progressively darker shades** for more specific categories (career pathways and occupations).
- Ensure **high contrast between adjacent nodes** while maintaining a visually cohesive color theme.
- Use **semi-transparent colors for links** to emphasize flow without overwhelming the visualization.
- Avoid overly saturated colors that may obscure readability.
- Use **tooltips or annotations** where necessary to highlight insights.

Choose color codings that enhance clarity and interpretation while being visually appealing.


### Identified Insight Needs: 

Insight Need Type is Composition → Hierarchical Structure

Key Variables to use : ['career cluster', 'career pathway', 'occupation', 'AVERAGE annual_salary_mean']

Reasoning : The user wants to visualize the hierarchical relationship between career clusters, career pathways, and specific occupations, showing how employment is distributed across these levels. The Sankey diagram is explicitly requested, which is a suitable visualization for representing flow and proportions within a hierarchical structure. The request to limit occupations to the top 20 based on average salary adds a ranking element, but the primary goal is to understand the composition and flow of employment across the career hierarchy.

### Data Scale Types

```json
{
  "variable_mappings": {
    "career cluster": "career_cluster",
    "career pathway": "career_pathway",
    "occupation": "occupation_title",
    "AVERAGE annual_salary_mean": "annual_salary_mean"
  },
  "categorized_variables": {
    "career_cluster": {
      "chosen_scale": "Nominal",
      "suggested_python_dtype": "category",
      "conversion_needed": false,
      "conversion_code": null
    },
    "career_pathway": {
      "chosen_scale": "Nominal",
      "suggested_python_dtype": "category",
      "conversion_needed": false,
      "conversion_code": null
    },
    "occupation_title": {
      "chosen_scale": "Nominal",
      "suggested_python_dtype": "str",
      "conversion_needed": false,
      "conversion_code": null
    },
    "annual_salary_mean": {
      "chosen_scale": "Ratio",
      "suggested_python_dtype": "float",
      "conversion_needed": false,
      "conversion_code": null
    }
  }
}
```

### Data Pre-Processing:

Data before processing: 
 

```
state                   category
state_abbreviation        object
occupation_title          object
code                      object
total_employment         float64
hourly_wage_mean         float64
annual_salary_mean       float64
hourly_wage_median       float64
annual_salary_median      object
career_pathway            object
career_cluster            object
```

Data after processing: 
 

```
career_cluster         object
career_pathway         object
occupation_title       object
annual_salary_mean    float64```

### Chosen Analysis Type

Network

The user request focuses on visualizing the hierarchical relationships and flow between career clusters, career pathways, and specific occupations. This aligns perfectly with the purpose of a network study, which examines relationships and interactions between entities. The Sankey diagram, explicitly requested by the user, is a common visualization technique used in network analysis to represent flows and connections between different categories. The dataset contains categorical variables representing these entities and a numerical variable (annual_salary_mean) that can be used to filter and prioritize the occupations displayed in the network.






### Chosen Visualization Type

Sankey Diagram

The user explicitly requested a Sankey diagram to visualize the hierarchical flow of employment from career clusters to career pathways to specific occupations. This visualization is ideal for showing the distribution and relationships between these categorical variables, fulfilling the 'Composition -> Hierarchical Structure' insight need. The data structure, consisting of career_cluster, career_pathway, occupation_title (all categorical) and annual_salary_mean (numerical), is suitable for a Sankey diagram, especially after filtering for the top 20 occupations by salary. The user's specifications for color gradients, transparency, and annotations further support the choice of a Sankey diagram to effectively communicate the hierarchical relationships and flow of employment across different categories.

### Chosen Graphic Symbol

Lines

Given the user's request for a Sankey diagram to visualize the flow of employment across career clusters, career pathways, and occupations, 'Lines' are the most appropriate graphic symbol. Sankey diagrams inherently use lines (or flows/edges) to connect nodes representing different categories and show the magnitude of flow between them. The width of the lines typically represents the quantity or proportion of the flow. In this case, the lines will represent the number of employees transitioning from one career cluster to a specific career pathway and then to a particular occupation. The user's specifications for semi-transparent colors for the links further emphasize the importance of lines in conveying the flow of information without overwhelming the visualization. The categorical nature of career_cluster, career_pathway, and occupation_title, combined with the need to show hierarchical relationships and flow (Network analysis type, Composition -> Hierarchical Structure insight need), makes lines the ideal choice.

### Chosen Graphic Variable

Color (Hue & Value) + Transparency

Given the user's request for a Sankey diagram to visualize the hierarchical flow of employment across career clusters, career pathways, and occupations, and the use of 'Lines' as the graphic symbol, 'Color (Hue & Value)' and 'Transparency' are the most effective graphic variables. 

Color (Hue & Value): Hue will be used to differentiate the main categories (career clusters), providing distinct visual identities. Value (lightness/darkness) will then be used within each cluster to represent the hierarchy, with lighter shades for higher-level categories (career clusters) and progressively darker shades for more specific categories (career pathways and occupations). This aligns perfectly with the user's request for a gradient-based color scheme to indicate hierarchy. The categorical nature of career_cluster, career_pathway, and occupation_title makes hue an appropriate choice for initial differentiation, while the ordinal nature of the hierarchy within each cluster makes value suitable for representing the levels.

Transparency: Transparency will be applied to the lines (flows/edges) connecting the nodes. This addresses the user's requirement to emphasize flow without overwhelming the visualization. By making the lines semi-transparent, we can reduce visual clutter and allow the nodes to remain prominent. This is particularly important in a Sankey diagram where many lines may overlap. The degree of transparency can also be adjusted to reflect the magnitude of the flow, with more transparent lines representing smaller flows and less transparent lines representing larger flows, although this is a secondary consideration.

Combining Color and Transparency allows us to effectively encode both the categorical and hierarchical aspects of the data, as well as the magnitude of the flow between categories, while adhering to the user's specific design requirements for clarity and visual appeal.


        **Visualization Name**
        **Sankey Diagram of Employment Distribution**

        **Recommended Software**
        - Tableau, Power BI, Python (Plotly), R (networkD3)

        **Selected Libraries**
        - Plotly

        **Preprocessing Steps**
        1. Filter the DataFrame to include only the top 20 occupations with the highest average salary.
2. Aggregate the data to count the number of employees in each career cluster, career pathway, and occupation combination.
3. Create the necessary data structure (lists of labels, sources, targets, and values) for the Sankey diagram.
        


🔹 Running: Sankey Diagram of Employment Distribution Visualization
🔻 Python Code:
 import pandas as pd
import plotly.graph_objects as go

# Assuming df_filtered is already in memory and contains the data
# Filter for top 20 occupations by average salary
top_20_occupations = df_filtered.groupby('occupation_title')['annual_salary_mean'].mean().nlargest(20).index
df_top20 = df_filtered[df_filtered['occupation_title'].isin(top_20_occupations)].copy()

# Aggregate data for Sankey diagram
sankey_data = df_top20.groupby(['career_cluster', 'career_pathway', 'occupation_title']).size().reset_index(name='count')

# Create labels for all unique categories
labels = list(sankey_data['career_cluster'].unique()) + list(sankey_data['career_pathway'].unique()) + list(sankey_data['occupation_title'].unique())

# Create mappings from category to index
label_map = {label: i for i, label in enumerate(labels)}

# Create source, target, and value lists
sources = []
targets = []
values = []

# Add links from

✅ Successfully executed: Sankey Diagram of Employment Distribution

