# BioPilot Example Workflow

This notebook demonstrates a typical bioinformatics analysis workflow using BioPilot.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from biopilot.src.fetcher import DatasetFetcher
from biopilot.src.annotation import AnnotationDB, Sample
from biopilot.src.analyzer import DataAnalyzer
from biopilot.src.reproducibility import ReproducibilityLogger

## 1. Setup

In [None]:
# Create the four BioPilot helper objects that every later cell relies on.
# All paths are relative ('../...'), so they resolve against the kernel's
# current working directory rather than a fixed absolute location.
fetcher = DatasetFetcher(data_dir='../data')
db = AnnotationDB(db_path='../data/metadata/annotations.db')
# The analyzer and the logger are both pointed at the same '../logs' directory.
analyzer = DataAnalyzer(results_dir='../results', logs_dir='../logs')
logger = ReproducibilityLogger(logs_dir='../logs', env_dir='../env')

## 2. Search for Datasets

In [None]:
# Run the dataset search through the fetcher, capped at five hits.
results = fetcher.search_geo('RNA-seq human cancer', max_results=5)
hit_count = len(results)
print(f'Found {hit_count} datasets')

# One line per hit: the accession followed by the first 50 characters of the title.
for dataset in results:
    accession = dataset.accession
    title_preview = dataset.title[:50]
    print(f'{accession}: {title_preview}...')

## 3. Register Samples

In [None]:
# Fields that are identical for all four example samples.
shared_fields = dict(species='Homo sapiens', tissue_type='liver', sequencing_type='RNA-seq')

# (sample_id, accession, condition) for each example sample.
sample_specs = [
    ('treated_1', 'EXP001', 'treated'),
    ('treated_2', 'EXP002', 'treated'),
    ('control_1', 'EXP003', 'control'),
    ('control_2', 'EXP004', 'control'),
]

samples = [
    Sample(sample_id=sid, accession=acc, **shared_fields, condition=cond)
    for sid, acc, cond in sample_specs
]

# Register every sample with the annotation database, in list order.
for sample in samples:
    db.add_sample(sample)

stats = db.get_statistics()
print(f'Total samples: {stats["total_samples"]}')

## 4. Generate Example Expression Data

In [None]:
# Seed NumPy's global RNG so the simulated counts are the same on every run.
np.random.seed(42)

n_genes = 1000
# Column labels of the count matrix; the treated replicates are listed separately
# so the spike-in below and the matrix header cannot fall out of sync.
treated_ids = ['treated_1', 'treated_2']
control_ids = ['control_1', 'control_2']
sample_ids = treated_ids + control_ids
# Derived from the labels so the matrix width always matches the number of columns.
n_samples = len(sample_ids)

# Simulated count matrix (genes x samples) drawn from a negative binomial distribution.
data = pd.DataFrame(
    np.random.negative_binomial(20, 0.5, size=(n_genes, n_samples)),
    columns=sample_ids,
    index=[f'gene_{i}' for i in range(n_genes)],
)

# Spike in a known signal: triple the treated counts of the first 50 genes.
# Taking the labels from the index keeps this valid even if n_genes is lowered below 50.
de_genes = data.index[:50].tolist()
# One vectorized assignment replaces the original per-gene Python loop.
data.loc[de_genes, treated_ids] *= 3

data.head(10)

## 5. Normalization

In [None]:
# Normalize the simulated counts with method='cpm' and log_transform=True.
# normalize() is unpacked into two values: the normalized table and a params
# mapping, whose 'method' entry is echoed below.
normalized, params = analyzer.normalize(data, method='cpm', log_transform=True)
print(f"Normalization method: {params['method']}")
# Last expression of the cell: show the first rows of the normalized table.
normalized.head()

## 6. PCA Analysis

In [None]:
# Project the normalized data onto its first two principal components.
pca_result, pca_params = analyzer.pca(normalized, n_components=2)

# Report the share of variance carried by each component as a percentage.
explained = pca_params['explained_variance']
print(f"PC1: {explained[0]*100:.1f}% variance")
print(f"PC2: {explained[1]*100:.1f}% variance")

pca_result

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

colors = {'treated': 'red', 'control': 'blue'}

# Derive each sample's condition from its name: any name that does not
# contain 'treated' is treated as a control.
sample_conditions = {
    sample: 'treated' if 'treated' in sample else 'control'
    for sample in pca_result.index
}

# Draw one scatter per condition instead of one per sample. Passing label= on
# every per-sample scatter call made ax.legend() list each condition once per
# sample (treated, treated, control, control).
for condition, color in colors.items():
    members = [name for name, cond in sample_conditions.items() if cond == condition]
    if not members:
        # Skip conditions with no samples so the legend has no empty entry.
        continue
    ax.scatter(pca_result.loc[members, 'PC1'], pca_result.loc[members, 'PC2'],
               color=color, s=100, label=condition)

ax.set_xlabel(f"PC1 ({pca_params['explained_variance'][0]*100:.1f}%)")
ax.set_ylabel(f"PC2 ({pca_params['explained_variance'][1]*100:.1f}%)")
ax.legend()
ax.set_title('PCA of Samples')
plt.tight_layout()
# Save before plt.show() so the written file contains the rendered figure.
plt.savefig('../results/figures/pca_notebook.png', dpi=150)
plt.show()

## 7. Differential Expression

In [None]:
# Map each condition name to the sample columns that belong to it.
condition_groups = {
    'treated': ['treated_1', 'treated_2'],
    'control': ['control_1', 'control_2'],
}

# Compare the two groups on the normalized data using the thresholds below.
de_result, de_params = analyzer.differential_expression(
    normalized,
    condition_groups,
    fold_change_threshold=2.0,
    pvalue_threshold=0.05,
)

significant_count = de_params['significant_genes']
print(f"Significant genes: {significant_count}")
de_result.head(10)

In [None]:
# Draw a volcano plot from the differential-expression table produced by
# analyzer.differential_expression().
analyzer.plot_volcano(de_result)

## 8. Heatmap of Top Variable Genes

In [None]:
# Heatmap of the normalized data, limited to 50 genes via top_genes=50
# (the section heading describes these as the top variable genes).
analyzer.plot_heatmap(normalized, top_genes=50)

## 9. Save Results

In [None]:
# Save the differential-expression table under the name 'differential_expression',
# passing de_params along with it. save_results() is unpacked into the output
# location and an analysis record that exposes an analysis_id attribute.
output_file, analysis = analyzer.save_results(de_result, 'differential_expression', de_params)
print(f"Results saved to: {output_file}")
print(f"Analysis ID: {analysis.analysis_id}")

## 10. Capture Environment for Reproducibility

In [None]:
# Capture an environment snapshot through the reproducibility logger and print
# the three fields read from it here: hash, python_version and the package count.
snapshot = logger.capture_environment()
print(f"Environment hash: {snapshot.hash}")
print(f"Python version: {snapshot.python_version}")
print(f"Total packages: {len(snapshot.packages)}")

In [None]:
# Record an example command in the session log. The command string, exit_code
# and duration are literal values supplied here; nothing in this notebook runs
# or times that command.
logger.log_command('biopilot analyze de -i expression.tsv', exit_code=0, duration=2.5)
# Export the log to a JSON file next to the other logs.
logger.export_log('../logs/notebook_session.json')

## Summary

This notebook demonstrated:
1. Dataset searching with BioPilot fetcher
2. Sample registration in a SQLite annotation database
3. Expression data normalization (CPM + log2)
4. PCA analysis and visualization
5. Differential expression analysis
6. Volcano plot and heatmap generation
7. Environment capture for reproducibility