In [3]:
# Install required packages
%pip install openai pandas matplotlib seaborn --quiet
print("Packages installed successfully!")

StatementMeta(, fb2683e7-6b16-4aa6-87c7-0baa65e90922, 10, Finished, Available, Finished)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Packages installed successfully!



In [4]:
from notebookutils import mssparkutils
import pandas as pd
import os

# Azure OpenAI Configuration
key_vault_name = "your-key-vault-name"              # Update with your Key Vault name
openai_secret_name = "azure-openai-api-key"        # Update with your secret name
openai_endpoint = "your-resource.openai.azure.com" # Update with your endpoint
openai_version = "2023-05-15"
openai_model = "gpt-4"

# File paths
data_path = "/lakehouse/default/Files/PDC_biospecimen_manifest_03272025_214257.csv"
report_path = "/lakehouse/default/Files/biospecimen_analysis_report.html"

StatementMeta(, fb2683e7-6b16-4aa6-87c7-0baa65e90922, 12, Finished, Available, Finished)

In [5]:
from openai import AzureOpenAI
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from notebookutils import mssparkutils
import os
from IPython.display import display, HTML

def generate_biospecimen_report():
    """
    Generate comprehensive biospecimen report with statistics, visualizations,
    and AI-powered insights in Microsoft Fabric.
    """
    try:
        # 1. Load data from Fabric Lakehouse
        print("⏳ Loading data from Lakehouse...")
        df = pd.read_csv(data_path)
        print(f"✅ Successfully loaded data with shape: {df.shape}")
        
        # 2. Generate basic statistics
        print("\n📊 Generating statistics...")
        numerical_summary = df.describe(include=['number']).to_string()
        categorical_summary = ""
        
        for col in df.select_dtypes(include=['object']).columns:
            categorical_summary += f"\n\n=== {col} ===\n{df[col].value_counts(dropna=False).to_string()}"
        
        # 3. Create visualizations
        print("\n🎨 Creating visualizations...")
        os.makedirs('temp_visualizations', exist_ok=True)
        
        # Visualization 1: Sample Type Distribution
        plt.figure(figsize=(12, 6))
        df['Sample Type'].value_counts().plot(kind='bar', color='skyblue')
        plt.title('Sample Type Distribution', fontsize=14)
        plt.xlabel('Sample Type', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        sample_type_path = 'temp_visualizations/sample_type_distribution.png'
        plt.savefig(sample_type_path, dpi=300)
        plt.close()
        
        # Visualization 2: Disease Type Distribution (Top 15)
        plt.figure(figsize=(12, 6))
        df['Disease Type'].value_counts().nlargest(15).plot(kind='barh', color='lightgreen')
        plt.title('Top 15 Disease Types', fontsize=14)
        plt.xlabel('Count', fontsize=12)
        plt.ylabel('Disease Type', fontsize=12)
        plt.tight_layout()
        disease_type_path = 'temp_visualizations/disease_type_distribution.png'
        plt.savefig(disease_type_path, dpi=300)
        plt.close()
        
        # 4. Generate AI Insights
        print("\n🧠 Generating AI insights...")
        client = AzureOpenAI(
            api_key=mssparkutils.credentials.getSecret(key_vault_name, openai_secret_name),
            api_version=openai_version,
            azure_endpoint=f"https://{openai_endpoint}"
        )
        
        prompt = f"""You are a biomedical data analyst. Analyze this biospecimen dataset:

        NUMERICAL SUMMARY:
        {numerical_summary}

        CATEGORICAL DISTRIBUTIONS:
        {categorical_summary}

        Provide:
        1. 3 key observations about the data composition
        2. 2 potential data quality issues to investigate
        3. 3 recommendations for further analysis
        4. 1 interesting pattern worth exploring
        
        Format the response with clear headings and bullet points."""
        
        response = client.chat.completions.create(
            model=openai_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=1000
        )
        insights = response.choices[0].message.content
        
        # 5. Compile and Save Report
        print("\n📝 Compiling final report...")
        
        # Create HTML content with proper escaping
        insights_html = insights.replace('\n', '<br>')
        html_content = f"""
<html>
<head>
    <title>Biospecimen Analysis Report</title>
    <style>
        body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
        h1 {{ color: #2e6c80; }}
        h2 {{ color: #3e7c90; margin-top: 30px; }}
        img {{ max-width: 100%; height: auto; margin: 20px 0; border: 1px solid #ddd; }}
        .insights {{ background-color: #f5f9fa; padding: 15px; border-radius: 5px; }}
        pre {{ white-space: pre-wrap; background-color: #f5f5f5; padding: 10px; border-radius: 5px; }}
    </style>
</head>
<body>
    <h1>Biospecimen Analysis Report</h1>
    <p>Generated on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}</p>
    
    <h2>1. Data Overview</h2>
    <p>Total records: {len(df):,}</p>
    <p>Total columns: {len(df.columns)}</p>
    
    <h2>2. Sample Type Distribution</h2>
    <img src="sample_type_distribution.png" alt="Sample Type Distribution">
    
    <h2>3. Disease Type Distribution (Top 15)</h2>
    <img src="disease_type_distribution.png" alt="Disease Type Distribution">
    
    <h2>4. Statistical Summary</h2>
    <pre>{numerical_summary}</pre>
    
    <h2>5. AI-Generated Insights</h2>
    <div class="insights">
        {insights_html}
    </div>
</body>
</html>
        """
        
        # Save HTML report
        with open(report_path, 'w') as f:
            f.write(html_content)
        
        # Save visualizations to Lakehouse
        mssparkutils.fs.cp(f"file:{sample_type_path}", "/lakehouse/default/Files/sample_type_distribution.png")
        mssparkutils.fs.cp(f"file:{disease_type_path}", "/lakehouse/default/Files/disease_type_distribution.png")
        
        print(f"\n🎉 Report successfully generated and saved to:")
        print(f"- HTML Report: {report_path}")
        print(f"- Visualization 1: /lakehouse/default/Files/sample_type_distribution.png")
        print(f"- Visualization 2: /lakehouse/default/Files/disease_type_distribution.png")
        
        # Display report preview
        display(HTML(f"<a href='{report_path}' target='_blank'>Open Full Report</a>"))
        
    except Exception as e:
        print(f"\n❌ Error generating report: {str(e)}")
        print("\n🛠️ Troubleshooting steps:")
        print("1. Verify packages are installed (%pip install openai pandas matplotlib seaborn)")
        print("2. Check the data file exists in your Lakehouse Files")
        print("3. Verify Azure OpenAI credentials in Key Vault")
        print("4. Ensure your Fabric capacity has network access to Azure OpenAI")

# Execute the report generation
generate_biospecimen_report()

StatementMeta(, fb2683e7-6b16-4aa6-87c7-0baa65e90922, 13, Finished, Available, Finished)

⏳ Loading data from Lakehouse...
✅ Successfully loaded data with shape: (452, 45)

📊 Generating statistics...

🎨 Creating visualizations...

🧠 Generating AI insights...

❌ Error generating report: An error occurred while calling z:mssparkutils.credentials.getSecret.
: com.microsoft.azure.trident.tokenlibrary.util.AkvHttpClientException: Invalid vault uri. Uri should match azure key vault URI like https://<keyVaultName>.vault.azure.net/
	at com.microsoft.azure.trident.tokenlibrary.util.AkvBasedSecretProviderClientImpl.invokeGetTarget(AkvBasedSecretProviderClient.scala:122)
	at com.microsoft.azure.trident.tokenlibrary.util.AkvBasedSecretProviderClientImpl.getAkvSecretWithAccessToken(AkvBasedSecretProviderClient.scala:153)
	at com.microsoft.azure.trident.tokenlibrary.TokenLibrary.getSecretWithToken(TokenLibrary.scala:806)
	at com.microsoft.azure.trident.tokenlibrary.TokenLibrary$.getSecretWithToken(TokenLibrary.scala:1359)
	at mssparkutils.credentials$.getSecret(credentials.scala:166)
	at