In [3]:
import os

folders = [
    "brca-pathway-classifier/data/raw",
    "brca-pathway-classifier/data/processed",
    "brca-pathway-classifier/notebooks",
    "brca-pathway-classifier/src",
    "brca-pathway-classifier/models",
    "brca-pathway-classifier/results/figures",
    "brca-pathway-classifier/results/tables",
    "brca-pathway-classifier/web_app"
]

# Create folders
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Create placeholder files
with open("brca-pathway-classifier/README.md", "w") as f:
    f.write("# BRCA Pathway-Environmental Classifier\n\nSee `notebooks/01_feature_engineering.ipynb` to begin.")

with open("brca-pathway-classifier/requirements.txt", "w") as f:
    f.write("xgboost\nshap\npandas\nnumpy\nscikit-learn\nbiopython\nrequests\nmatplotlib\nseaborn\n")

with open("brca-pathway-classifier/.gitignore", "w") as f:
    f.write("*.pyc\n__pycache__/\ndata/raw/\nmodels/\n")

print("✅ Project folder scaffolded.")

✅ Project folder scaffolded.


In [None]:
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define your project base path
project_path = "/content/drive/My Drive/BRCA-pathway-classifier"

# Define the path to the new, larger VEP-filtered file
vep_path = f"{project_path}/data/processed/brca_vep_output_all_filtered.tsv"

# Define common subfolders
data_raw = f"{project_path}/data/raw"
data_processed = f"{project_path}/data/processed"
data_external = f"{project_path}/data/external"
models_path = f"{project_path}/models"
results_path = f"{project_path}/results"
scripts_path = f"{project_path}/scripts"

In [12]:
import pandas as pd

# ✅ Define the path to your filtered VEP output
vep_path = "/content/drive/MyDrive/BRCA-pathway-classifier/data/processed/brca_vep_output_all_filtered.tsv"

# ✅ Load the VEP file, skipping the first 106 lines (comment/header lines)
vep_df = pd.read_csv(vep_path, sep="\t", skiprows=106)

# ✅ Print shape and columns to confirm it's loaded properly
print("✅ Loaded VEP shape:", vep_df.shape)
print("✅ Columns:", vep_df.columns.tolist())

# ✅ Define the columns we want to extract
selected_cols = [
    "Consequence", "IMPACT", "SYMBOL", "BIOTYPE", "CANONICAL",
    "SIFT", "PolyPhen", "AF", "gnomADg_AF", "CLIN_SIG"
]

# ✅ Filter only those columns
features_df = vep_df[selected_cols].copy()

# ✅ Clean SIFT and PolyPhen — extract numeric scores from strings like "tolerated(0.35)"
def extract_score(val):
    if pd.isna(val): return None
    if "(" in val: return float(val.split("(")[-1].strip(")"))
    try: return float(val)
    except: return None

for col in ["SIFT", "PolyPhen"]:
    features_df[col] = features_df[col].apply(extract_score)

# ✅ Drop rows missing key values
features_df = features_df.dropna(subset=["Consequence", "IMPACT", "CLIN_SIG"])

# ✅ One-hot encode categorical variables
features_encoded = pd.get_dummies(features_df)

# ✅ Save to processed folder
output_path = "/content/drive/MyDrive/BRCA-pathway-classifier/data/processed/brca_features_enriched.csv"
features_encoded.to_csv(output_path)

print("✅ Feature file saved:", output_path)
print("✅ Final shape:", features_encoded.shape)

✅ Loaded VEP shape: (35395, 82)
✅ Columns: ['#Uploaded_variation', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS', 'VARIANT_CLASS', 'SYMBOL', 'SYMBOL_SOURCE', 'HGNC_ID', 'BIOTYPE', 'CANONICAL', 'MANE', 'MANE_SELECT', 'MANE_PLUS_CLINICAL', 'TSL', 'APPRIS', 'CCDS', 'ENSP', 'SWISSPROT', 'TREMBL', 'UNIPARC', 'UNIPROT_ISOFORM', 'GENE_PHENO', 'SIFT', 'PolyPhen', 'EXON', 'INTRON', 'DOMAINS', 'miRNA', 'HGVSc', 'HGVSp', 'HGVS_OFFSET', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF', 'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_MID_AF', 'gnomADe_NFE_AF', 'gnomADe_REMAINING_AF', 'gnomADe_SAS_AF', 'gnomADg_AF', 'gnomADg_AFR_AF', 'gnomADg_AMI_AF', 'gnomADg_AMR_AF', 'gnomADg_ASJ_AF', 'gnomADg_EAS_AF', 'gnomADg_FIN_AF', 'gnomADg_MID_AF', 'gnomADg_NFE_AF', 'gnomADg_

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Load enriched features
df = pd.read_csv("/content/drive/MyDrive/BRCA-pathway-classifier/data/processed/brca_features_enriched.csv", index_col=0)

# Check label availability
if "CLIN_SIG_pathogenic" not in df.columns or "CLIN_SIG_benign" not in df.columns:
    raise ValueError("Label columns not found. Ensure one-hot encoding produced 'CLIN_SIG_pathogenic' and 'CLIN_SIG_benign'.")

# Create binary label: 1 = pathogenic, 0 = benign
df = df[(df["CLIN_SIG_pathogenic"] == 1) | (df["CLIN_SIG_benign"] == 1)].copy()
df["label"] = df["CLIN_SIG_pathogenic"]

# Drop label columns and split
X = df.drop(columns=["label", "CLIN_SIG_pathogenic", "CLIN_SIG_benign"])
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Train classifier
clf = RandomForestClassifier(n_estimators=200, max_depth=12, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("✅ Classification Report:\n", classification_report(y_test, y_pred))

# Save model
model_path = "/content/drive/MyDrive/BRCA-pathway-classifier/models/brca_sandbox_model.pkl"
joblib.dump(clf, model_path)
print("✅ Model saved to:", model_path)

✅ Classification Report:
               precision    recall  f1-score   support

       False       0.99      0.96      0.98       291
        True       0.99      1.00      1.00      1690

    accuracy                           0.99      1981
   macro avg       0.99      0.98      0.99      1981
weighted avg       0.99      0.99      0.99      1981

✅ Model saved to: /content/drive/MyDrive/BRCA-pathway-classifier/models/brca_sandbox_model.pkl


In [None]:
import shap
import pandas as pd
import joblib

# ✅ Reload model and features
model_path = "/content/drive/MyDrive/BRCA-pathway-classifier/models/brca_sandbox_model.pkl"
model = joblib.load(model_path)

features_path = "/content/drive/MyDrive/BRCA-pathway-classifier/data/processed/brca_features_enriched.csv"
df = pd.read_csv(features_path, index_col=0)

# ✅ Rebuild X (ensure it matches what model saw)
X = df.drop(columns=[col for col in df.columns if col.startswith("CLIN_SIG")])
X = X.select_dtypes(include=['number']).copy()  # Keep numeric only

print("✅ Final numeric features shape:", X.shape)

# ✅ SHAP Explainer
explainer = shap.Explainer(model, X)
shap_values = explainer(X)

# ✅ Plot top 15 features
shap.summary_plot(shap_values, X, max_display=15)

✅ Final numeric features shape: (35395, 2)


In [1]:
# ✅ Proceed to modeling
print("Next: Open `03_sandbox_model_brca1_vs_brca2.ipynb` to train classifier and analyze feature importance.")

Next: Open `03_sandbox_model_brca1_vs_brca2.ipynb` to train classifier and analyze feature importance.
