In [2]:
# -*- coding: utf-8 -*-
"""io.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1DM5YyLav0ZE0ob1NTEkbkV6EMthzrohF
"""

# -*- coding: utf-8 -*-
"""Enhanced Anxiety Intervention Analysis with Causal Discovery

This notebook adapts the MoE framework to incorporate causal discovery algorithms,
specifically from causallearn (FCI), to identify potential causal relationships between
intervention strategies, pre-intervention anxiety, and post-intervention outcomes.
This enhancement aims to provide a deeper understanding of intervention effectiveness.

Workflow:
1. Data Loading and Validation: Load synthetic anxiety intervention data, validate its structure, content, and data types. Handle potential errors gracefully.
2. Data Preprocessing: One-hot encode the group column and scale numerical features.
3. Causal Structure Discovery: Apply FCI algorithm from causallearn to infer causal graph, handling potential errors and providing informative output.
4. SHAP Value Analysis: Quantify feature importance in predicting post-intervention anxiety.
5. Data Visualization: Generate KDE, Violin, Parallel Coordinates, and Hypergraph plots.
6. Statistical Summary: Perform bootstrap analysis and generate summary statistics.
7. LLM Insights Report: Synthesize findings using Grok, Claude, and Grok-Enhanced for explainability, handling potential LLM API errors (simulated here).

Keywords: Causal Discovery, FCI, causallearn, Anxiety Intervention, Explainability, LLMs, SHAP, Data Visualization
"""

# Suppress warnings (with caution - better to handle specific warnings)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="plotly")

# Import libraries
!pip install causal-learn shap transformers plotly
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import shap
import os
from sklearn.preprocessing import MinMaxScaler, LabelEncoder # Added LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
from scipy.stats import bootstrap
import numpy as np
from causallearn.search.ConstraintBased.FCI import fci  # Corrected import for FCI
from causallearn.utils.GraphUtils import GraphUtils
from io import StringIO
# from transformers import AutoModelForCausalLM, AutoTokenizer  # Import for LLMs - Removed, as we are simulating

# Google Colab environment check
try:
    from google.colab import drive
    drive.mount("/content/drive")
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False
    print("Not running in Google Colab environment.")

# Constants
OUTPUT_PATH = "./output_anxiety_causal_discovery/" if not COLAB_ENV else "/content/drive/MyDrive/output_anxiety_causal_discovery/"
PARTICIPANT_ID_COLUMN = "participant_id"
GROUP_COLUMN = "group"  # Original group column *before* one-hot encoding
ANXIETY_PRE_COLUMN = "anxiety_pre"
ANXIETY_POST_COLUMN = "anxiety_post"
MODEL_GROK_NAME = "grok-base"
MODEL_CLAUDE_NAME = "claude-3.7-sonnet"
MODEL_GROK_ENHANCED_NAME = "grok-enhanced"
LINE_WIDTH = 2.5
BOOTSTRAP_RESAMPLES = 500

# Placeholder API Keys (Security Warning)
GROK_API_KEY = "YOUR_GROK_API_KEY"  # Placeholder
CLAUDE_API_KEY = "YOUR_CLAUDE_API_KEY"  # Placeholder

# --- Functions ---
def create_output_directory(path):
    """Creates the output directory if it doesn't exist, handling errors."""
    try:
        os.makedirs(path, exist_ok=True)
        return True
    except OSError as e:
        print(f"Error creating output directory: {e}")
        return False

def load_data_from_synthetic_string(csv_string):
    """Loads data from a CSV string, handling errors."""
    try:
        csv_file = StringIO(csv_string)
        return pd.read_csv(csv_file)
    except pd.errors.ParserError as e:
        print(f"Error parsing CSV data: {e}")
        return None
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def validate_dataframe(df, required_columns):
    """Validates the DataFrame: checks for missing columns, non-numeric data,
    duplicate participant IDs, valid group labels, and plausible anxiety ranges.
    Returns True if valid, False otherwise.
    """
    if df is None:
        print("Error: DataFrame is None. Cannot validate.")
        return False

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Missing columns: {missing_columns}")
        return False

    for col in required_columns:
        if col != PARTICIPANT_ID_COLUMN and col != GROUP_COLUMN:
            if not pd.api.types.is_numeric_dtype(df[col]):
                print(f"Error: Non-numeric values found in column: {col}")
                return False

    if df[PARTICIPANT_ID_COLUMN].duplicated().any():
        print("Error: Duplicate participant IDs found.")
        return False

    valid_groups = ["Group A", "Group B", "Control"]  # Define valid group names
    invalid_groups = df[~df[GROUP_COLUMN].isin(valid_groups)][GROUP_COLUMN].unique()
    if invalid_groups.size > 0:
        print(f"Error: Invalid group labels found: {invalid_groups}")
        return False

    for col in [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN]:
        if df[col].min() < 0 or df[col].max() > 10:
            print(f"Error: Anxiety scores in column '{col}' are out of range (0-10).")
            return False

    return True

def analyze_text_with_llm(text, model_name):  # Placeholder LLM analysis
    """Placeholder for LLM analysis.  Replace with actual API calls."""
    text_lower = text.lower()
    if model_name == MODEL_GROK_NAME:
        if "causal graph" in text_lower: return "Grok-base: Causal graph suggests intervention impact and potential confounding factors.  Further analysis is needed to confirm the direction and strength of these relationships."
        elif "shap summary" in text_lower: return "Grok-base: SHAP values highlight feature importance, indicating which variables contribute most to predicting post-intervention anxiety."
        else: return f"Grok-base: General analysis on '{text}'."
    elif model_name == MODEL_CLAUDE_NAME:
        if "causal graph" in text_lower: return "Claude 3.7: Causal graph shows dependencies and potential causal pathways between group assignment, pre-intervention anxiety, and post-intervention anxiety."
        elif "shap summary" in text_lower: return "Claude 3.7: SHAP values explain feature contributions to the prediction, revealing the relative importance of each variable in determining post-intervention anxiety."
        else: return f"Claude 3.7: General analysis on '{text}'."
    elif model_name == MODEL_GROK_ENHANCED_NAME:
        if "causal graph" in text_lower: return "Grok-Enhanced: Causal graph reveals nuanced connections and potential causal mechanisms, including direct and indirect effects, between the intervention, pre-existing anxiety, and post-intervention outcomes."
        elif "shap summary" in text_lower: return "Grok-Enhanced: SHAP values show detailed feature effects, quantifying the contribution of each variable and highlighting any non-linear relationships or interactions."
        else: return f"Grok-Enhanced: Enhanced analysis on '{text}'."
    return f"Model '{model_name}' not supported."

def scale_data(df, columns):
    """Scales specified columns using MinMaxScaler, handling errors."""
    try:
        scaler = MinMaxScaler()
        df[columns] = scaler.fit_transform(df[columns])
        return df
    except ValueError as e:
        print(f"Error during data scaling: {e}")
        return None  # Or raise the exception
    except Exception as e:
        print(f"An unexpected error occurred during scaling: {e}")
        return None

def discover_causal_structure(df, variables, output_path):
    """
    Discover causal structure using the FCI algorithm from causallearn, handling errors.
    """
    try:
        # Convert DataFrame to NumPy array
        # *Crucially*, we need to encode any categorical variables *before* causal discovery.
        df_encoded = df.copy()
        for col in df_encoded.select_dtypes(include='object').columns:
            le = LabelEncoder()
            df_encoded[col] = le.fit_transform(df_encoded[col])

        data_fci = df_encoded[variables].to_numpy()

        # Run FCI algorithm
        G, edges = fci(data_fci)  # Corrected usage of FCI

        # Convert graph to PyDot format and save as PNG
        pdy = GraphUtils.to_pydot(G)
        pdy.write_png(os.path.join(output_path, 'causal_graph_fci.png'))

        # Extract edge information
        edges_info = [
            f"  Node {variables[i]} -> Node {variables[j]}: Edge Type {edge_type}"
            for i, adj in enumerate(edges)
            for j, edge_type in enumerate(adj) if edge_type != 0
        ]
        edge_info_str = "\n".join(edges_info)
        print("Edges from Causal Graph (FCI):\n" + edge_info_str)
        return edge_info_str

    except Exception as e:
        print(f"Error during causal structure discovery: {e}")
        return None

def calculate_shap_values(df, feature_columns, target_column, output_path):
    """Calculates and visualizes SHAP values, handling errors and one-hot encoding."""
    try:
        # Encode categorical features for SHAP
        df_encoded = df.copy()
        for col in feature_columns:
            if df_encoded[col].dtype == "object":
                le = LabelEncoder()
                df_encoded[col] = le.fit_transform(df_encoded[col])

        model_rf = RandomForestRegressor(random_state=42).fit(  # Added random_state
            df_encoded[feature_columns], df_encoded[target_column]
        )
        explainer = shap.TreeExplainer(model_rf)
        shap_values = explainer.shap_values(df_encoded[feature_columns])

        plt.figure(figsize=(10, 8))
        plt.style.use('dark_background')
        shap.summary_plot(
            shap_values, df_encoded[feature_columns], show=False, color_bar=True
        )  # show color bar
        plt.tight_layout()
        plt.savefig(f"{output_path}shap_summary.png")
        plt.close()
        return f"SHAP summary for features {feature_columns} predicting {target_column}"
    except Exception as e:
        print(f"Error during SHAP value calculation: {e}")
        return "Error: SHAP value calculation failed."

def create_kde_plot(df, column1, column2, output_path, colors):
    """Creates a KDE plot, handling errors."""
    try:
        plt.figure(figsize=(10, 6))
        plt.style.use('dark_background')
        sns.kdeplot(data=df[column1], color=colors[0], label=column1.capitalize(), linewidth=LINE_WIDTH)
        sns.kdeplot(data=df[column2], color=colors[1], label=column2.capitalize(), linewidth=LINE_WIDTH)
        plt.title('KDE Plot of Anxiety Levels', color='white')
        plt.legend(facecolor='black', edgecolor='white', labelcolor='white')
        plt.savefig(os.path.join(output_path, 'kde_plot.png'))
        plt.close()
        return f"KDE plot visualizing distributions of {column1} and {column2}"
    except KeyError as e:
        print(f"Error generating KDE plot: Column not found: {e}")
        return "Error: KDE plot generation failed.  Missing column."
    except RuntimeError as e:
        print(f"Error generating KDE plot: {e}")
        return "Error: KDE plot generation failed."
    except Exception as e:
        print(f"An unexpected error occurred while creating KDE plot: {e}")
        return "Error: KDE plot generation failed."

def create_violin_plot(df, group_column, y_column, output_path, colors):
    """Creates a violin plot, handling errors and one-hot encoded groups."""
    try:
        plt.figure(figsize=(10, 6))
        plt.style.use('dark_background')

        # Handling group column when already one-hot encoded
        encoded_group_cols = [col for col in df.columns if col.startswith(f"{group_column}_")]

        if len(encoded_group_cols) > 0:
            # Create a temporary column for group membership
            df['temp_group'] = np.nan
            for col in encoded_group_cols:
                group_name = col.split('_', 1)[1]  # Extract group name
                df.loc[df[col] == 1, 'temp_group'] = group_name

            # Create violin plot with temporary group column
            sns.violinplot(data=df, x='temp_group', y=y_column, palette=colors[:len(encoded_group_cols)], linewidth=LINE_WIDTH)
            df.drop('temp_group', axis=1, inplace=True) # Remove temp column
        else:
            # Original group column is present
            sns.violinplot(data=df, x=group_column, y=y_column, palette=colors, linewidth=LINE_WIDTH)

        plt.title('Violin Plot of Anxiety Distribution by Group', color='white')
        plt.savefig(os.path.join(output_path, 'violin_plot.png'))
        plt.close()
        return f"Violin plot showing {y_column} across groups"

    except KeyError as e:
        print(f"Error generating violin plot: Column not found: {e}")
        return "Error: Violin plot generation failed. Missing column."
    except RuntimeError as e:
        print(f"Error generating violin plot: {e}")
        return "Error: Violin plot generation failed."
    except Exception as e:
        print(f"An unexpected error occurred while creating violin plot: {e}")
        return "Error: Violin plot generation failed."

def create_parallel_coordinates_plot(df, group_column, anxiety_pre_column, anxiety_post_column, output_path, colors):
    """Creates a parallel coordinates plot, handling one-hot encoded groups and errors."""
    try:
        # Prepare data: Need original group names, not one-hot encoded.
        plot_df = df[[group_column, anxiety_pre_column, anxiety_post_column]].copy()

        # Create a color map for groups
        unique_groups = plot_df[group_column].unique()
        group_color_map = {group: colors[i % len(colors)] for i, group in enumerate(unique_groups)}

        # Map group names to colors
        plot_df['color'] = plot_df[group_column].map(group_color_map)

        # Create the parallel coordinates plot
        fig = px.parallel_coordinates(
            plot_df,
            color='color',  # Use the new 'color' column
            dimensions=[anxiety_pre_column, anxiety_post_column],
            title="Anxiety Levels: Pre- vs Post-Intervention by Group",
            color_continuous_scale=px.colors.sequential.Viridis, # Using Viridis
        )

        # Customize appearance
        fig.update_layout(
            plot_bgcolor='black',
            paper_bgcolor='black',
            font_color='white',
            title_font_size=16,
        )

        fig.write_image(os.path.join(output_path, 'parallel_coordinates_plot.png'))
        return f"Parallel coordinates plot of anxiety pre vs post intervention by group"

    except KeyError as e:
        print(f"Error generating parallel coordinates plot: Column not found: {e}")
        return "Error: Parallel coordinates plot generation failed. Missing column."
    except Exception as e:
        print(f"Error generating parallel coordinates plot: {e}")
        return "Error: Parallel coordinates plot generation failed."

def visualize_hypergraph(df, anxiety_pre_column, anxiety_post_column, output_path, colors):
    """Visualizes a hypergraph, handling errors."""
    try:
        G = nx.Graph()
        participant_ids = df[PARTICIPANT_ID_COLUMN].tolist()
        G.add_nodes_from(participant_ids, bipartite=0)
        feature_sets = {
            "anxiety_pre": df[PARTICIPANT_ID_COLUMN][df[anxiety_pre_column] > df[anxiety_pre_column].mean()].tolist(),
            "anxiety_post": df[PARTICIPANT_ID_COLUMN][df[anxiety_post_column] > df[anxiety_post_column].mean()].tolist()
        }
        feature_nodes = list(feature_sets.keys())
        G.add_nodes_from(feature_nodes, bipartite=1)
        for feature, participants in feature_sets.items():
            for participant in participants:
                G.add_edge(participant, feature)
        pos = nx.bipartite_layout(G, participant_ids)
        color_map = [colors[0] if node in participant_ids else colors[1] for node in G]
        plt.figure(figsize=(12, 10))
        plt.style.use('dark_background')
        nx.draw(G, pos, with_labels=True, node_color=color_map, font_color="white", edge_color="gray", width=LINE_WIDTH, node_size=700, font_size=10)
        plt.title("Hypergraph Representation of Anxiety Patterns", color="white")
        plt.savefig(os.path.join(output_path, "hypergraph.png"))
        plt.close()
        return "Hypergraph visualizing participant relationships based on anxiety pre and post intervention"
    except KeyError as e:
        print(f"Error generating hypergraph: Column not found: {e}")
        return "Error: Hypergraph generation failed. Missing column."
    except Exception as e:
        print(f"Error creating hypergraph: {e}")
        return "Error creating hypergraph."

def perform_bootstrap(data, statistic, n_resamples=BOOTSTRAP_RESAMPLES):
    """Performs bootstrap analysis, handling errors."""
    try:
        bootstrap_result = bootstrap((data,), statistic, n_resamples=n_resamples, method='percentile', random_state=42) # Added random_state
        return bootstrap_result.confidence_interval
    except Exception as e:
        print(f"Error during bootstrap analysis: {e}")
        return (None, None)

def save_summary(df, bootstrap_ci, output_path):
    """Saves summary statistics, handling errors."""
    try:
        summary_text = df.describe().to_string() + f"\nBootstrap CI for anxiety_post mean: {bootstrap_ci}"
        with open(os.path.join(output_path, 'summary.txt'), 'w') as f:
            f.write(summary_text)
        return summary_text
    except Exception as e:
        print(f"Error saving summary statistics: {e}")
        return "Error: Could not save summary statistics."

def generate_insights_report(summary_stats_text, causal_edges_info, shap_analysis_info, kde_plot_desc, violin_plot_desc, parallel_coords_desc, hypergraph_desc, output_path):
    """Generates an insights report, handling errors."""
    try:
        grok_insights = (
            analyze_text_with_llm(f"Analyze summary statistics:\n{summary_stats_text}", MODEL_GROK_NAME) + "\n\n" +
            (analyze_text_with_llm(f"Interpret causal graph edges (FCI):\n{causal_edges_info}", MODEL_GROK_NAME) + "\n\n" if causal_edges_info else "")
        )
        claude_insights = (
            analyze_text_with_llm(f"Interpret KDE plot: {kde_plot_desc}", MODEL_CLAUDE_NAME) + "\n\n" +
            analyze_text_with_llm(f"Interpret Violin plot: {violin_plot_desc}", MODEL_CLAUDE_NAME) + "\n\n" +
            analyze_text_with_llm(f"Interpret Parallel Coordinates: {parallel_coords_desc}", MODEL_CLAUDE_NAME) + "\n\n" +
            analyze_text_with_llm(f"Interpret Hypergraph: {hypergraph_desc}", MODEL_CLAUDE_NAME) + "\n\n" +
            analyze_text_with_llm(f"Explain SHAP summary: {shap_analysis_info}", MODEL_CLAUDE_NAME) + "\n\n"
        )
        grok_enhanced_insights = analyze_text_with_llm(f"Provide enhanced insights on anxiety intervention effectiveness based on causal graph and SHAP analysis.", MODEL_GROK_ENHANCED_NAME)

        combined_insights = f"""
    Combined Insights Report: Anxiety Intervention Causal Analysis

    Grok-base Analysis:
    {grok_insights}

    Claude 3.7 Sonnet Analysis:
    {claude_insights}

    Grok-Enhanced Analysis:
    {grok_enhanced_insights}

    Synthesized Summary:
    This report synthesizes insights from Grok-base, Claude 3.7 Sonnet, and Grok-Enhanced, focusing on the causal analysis of anxiety intervention effectiveness. Grok-base provides a statistical overview and initial causal interpretations from FCI, suggesting potential intervention impacts and confounding factors. Claude 3.7 Sonnet details visual patterns and feature importance, highlighting dependencies and causal pathways. Grok-Enhanced offers a high-level synthesis, emphasizing nuanced causal relationships and actionable recommendations, revealing subtle connections and potential mechanisms. The combined analyses provide a comprehensive understanding of intervention impacts, highlighting key factors influencing post-intervention anxiety and suggesting areas for targeted improvements in intervention strategies. The use of FCI enhances the robustness of causal inference, providing a more reliable foundation for understanding intervention effectiveness.
    """
        with open(os.path.join(output_path, 'insights.txt'), 'w') as f:
            f.write(combined_insights)
        print(f"Insights saved to: {os.path.join(output_path, 'insights.txt')}")
        return "Insights report generated successfully."

    except Exception as e:
        print(f"Error generating insights report: {e}")
        return "Error generating insights report."

# --- Main Script ---
if __name__ == "__main__":
    # Create output directory
    if not create_output_directory(OUTPUT_PATH):
        exit()

    # Synthetic dataset (small, embedded in code)
    synthetic_dataset = """
participant_id,group,anxiety_pre,anxiety_post
P001,Group A,4,2
P002,Group A,3,1
P003,Group A,5,3
P004,Group B,6,5
P005,Group B,5,4
P006,Group B,7,6
P007,Control,3,3
P008,Control,4,4
P009,Control,2,2
P010,Control,5,5
"""
    # Load and validate data
    df = load_data_from_synthetic_string(synthetic_dataset)
    if df is None:
        exit()

    required_columns = [PARTICIPANT_ID_COLUMN, GROUP_COLUMN, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN]
    if not validate_dataframe(df, required_columns):
        exit()

    # Keep original 'group' column for visualizations
    df_viz = df.copy()

    # One-hot encode and scale
    df = pd.get_dummies(df, columns=[GROUP_COLUMN], prefix=GROUP_COLUMN, dtype=int)  # One-hot encode group, specify dtype
    encoded_group_cols = [col for col in df.columns if col.startswith(f"{GROUP_COLUMN}_")]
    df = scale_data(df, [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN] + encoded_group_cols)  # Scale data
    if df is None:
        exit()

    # Causal Discovery with FCI
    causal_variables = encoded_group_cols + [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN]
    causal_edges_info = discover_causal_structure(df, causal_variables, OUTPUT_PATH)

    # SHAP Analysis
    shap_feature_columns = encoded_group_cols + [ANXIETY_PRE_COLUMN]
    shap_analysis_info = calculate_shap_values(df.copy(), shap_feature_columns, ANXIETY_POST_COLUMN, OUTPUT_PATH)

    # Visualization colors
    neon_colors = ["#FF00FF", "#00FFFF", "#FFFF00", "#00FF00"]

    # Create visualizations (using the original dataframe for plots that need group labels)
    kde_plot_desc = create_kde_plot(df_viz, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors[:2])
    violin_plot_desc = create_violin_plot(df_viz, GROUP_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors)
    parallel_coords_desc = create_parallel_coordinates_plot(df_viz, GROUP_COLUMN, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors)
    hypergraph_desc = visualize_hypergraph(df_viz, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, neon_colors[:2])

    # Bootstrap analysis
    bootstrap_ci = perform_bootstrap(df_viz[ANXIETY_POST_COLUMN], np.mean)

    # Save summary statistics
    summary_stats_text = save_summary(df_viz, bootstrap_ci, OUTPUT_PATH)

    # Generate insights report
    generate_insights_report(summary_stats_text, causal_edges_info, shap_analysis_info, kde_plot_desc, violin_plot_desc, parallel_coords_desc, hypergraph_desc, OUTPUT_PATH)

    print("Execution completed successfully - Causal Discovery Enhanced Notebook.")

Mounted at /content/drive


  0%|          | 0/5 [00:00<?, ?it/s]

Error during causal structure discovery: 'Edge' object is not iterable


  sns.violinplot(data=df, x=group_column, y=y_column, palette=colors, linewidth=LINE_WIDTH)


Error generating parallel coordinates plot: 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Insights saved to: /content/drive/MyDrive/output_anxiety_causal_discovery/insights.txt
Execution completed successfully - Causal Discovery Enhanced Notebook.
