<a href="https://colab.research.google.com/github/enank07/DROIDSRIProject/blob/main/XGBoostenankhan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Hey, this is the XGBoost project workspace! All directory paths have been removed for privacy reasons, so please fill in your own (the `project_dir` values) before running. Have fun! - Enan

In [None]:
!pip install -qU \
    cellxgene-census[tiledbsoma] \
    scanpy anndata tiledbsoma \
    xgboost scikit-learn

!pip install -q scikit-misc
!pip install --user scikit-misc

In [None]:
#imports
from google.colab import drive
import pandas as pd
import numpy as np
import scanpy as sc
import anndata
import cellxgene_census as cxc
import xgboost as xgb
from sklearn.model_selection import GroupShuffleSplit
from pathlib import Path
import warnings

# Ignore FutureWarning messages whose originating module matches "anndata".
warnings.filterwarnings("ignore", category=FutureWarning, module="anndata")

In [None]:
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Project folder (the path is left blank here; fill in your own) and the
# location of the "covid_3kHVG.h5ad" file inside that folder.
project_dir = Path("")
project_dir.mkdir(exist_ok=True, parents=True)
adata_path = project_dir.joinpath("covid_3kHVG.h5ad")

In [None]:
# List the datasets that belong to the collection of interest.
import cellxgene_census as cxc
import pandas as pd

censusgarbage = "2025-01-30"
idcollection = "ddfad306-714d-4cc0-9985-d9072820c530"

with cxc.open_soma(census_version=censusgarbage) as census:
    # Read the census dataset table, then convert it to a pandas DataFrame
    # indexed by soma_joinid.
    datasets_table = census["census_info"]["datasets"].read().concat()
    ds = datasets_table.to_pandas().set_index("soma_joinid")

# Keep only the rows of the chosen collection and print the key columns.
my_ds = ds[ds["collection_id"] == idcollection]
summary_columns = ["dataset_id", "dataset_title", "dataset_total_cell_count"]
print(my_ds[summary_columns])

In [None]:
# Pull one dataset out of the census as an AnnData object.
import cellxgene_census as cxc
import scanpy as sc

censusgarbage = "2025-01-30"
DATASET_ID = "c7775e88-49bf-4ba2-a03b-93f00447c958"  # dataset to download

# Row filter: cells of this dataset that are flagged as primary data.
obs_filter = f'dataset_id == "{DATASET_ID}" and is_primary_data == True'

with cxc.open_soma(census_version=censusgarbage) as census:
    adata = cxc.get_anndata(
        census=census,
        organism="Homo sapiens",
        obs_value_filter=obs_filter,
    )

# Quick look at the object and at the first rows of the cell metadata.
print(adata)
print(adata.obs.head())

In [None]:
#data preparation
import scanpy as sc

# Keep the raw counts in their own layer before anything overwrites adata.X.
# If the layer already exists (this cell was run before), restore X from it so
# that re-running the cell does not normalise already-normalised data.
if "counts" in adata.layers:
    adata.X = adata.layers["counts"].copy()
else:
    adata.layers["counts"] = adata.X.copy()

# Library-size normalisation (10,000 counts per cell) followed by log1p, to
# remove sequencing-depth differences between cells.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.layers["log1p"] = adata.X.copy()   # keep a copy of the log-normalised matrix

# Highly variable gene selection (hyperparameter: 3000 genes).
# flavor="seurat_v3" expects raw counts, not log-normalised values, so it is
# pointed at the "counts" layer. subset=True keeps only the selected genes.
sc.pp.highly_variable_genes(
    adata, n_top_genes=3_000, layer="counts", flavor="seurat_v3", subset=True
)
print(adata)      # check data

In [None]:
#data splitting and label preparation
# Multi-class target: one integer code per cell type. Unused categories are
# dropped first so the codes are contiguous (0..n_cell_classes-1) and
# n_cell_classes counts only the cell types that actually occur. The pruned
# categorical is written back so codes and category names stay aligned.
cell_type_cat = adata.obs["cell_type"].astype("category").cat.remove_unused_categories()
adata.obs["cell_type"] = cell_type_cat
adata.obs["cell_type_code"] = cell_type_cat.cat.codes
y_cell = adata.obs["cell_type_code"].values
n_cell_classes = len(cell_type_cat.cat.categories)

#binary labels: 0 = "normal", 1 = any other disease value
adata.obs["disease_code"] = (adata.obs["disease"] != "normal").astype(int)
y_disease = adata.obs["disease_code"].values

# Get the feature matrix (gene expression) as a dense NumPy array.
# adata.X may be a sparse matrix or already dense; handle both.
X = adata.X.toarray() if hasattr(adata.X, "toarray") else np.asarray(adata.X)

#train + val + group shuffle split
# Splitting by donor keeps all cells of one donor on the same side of the split.
groups = adata.obs["donor_id"].values
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, val_idx = next(gss.split(X, groups=groups))

#create training and validation sets for features and both targets
X_train, X_val = X[train_idx], X[val_idx]
y_cell_train, y_cell_val = y_cell[train_idx], y_cell[val_idx]
y_disease_train, y_disease_val = y_disease[train_idx], y_disease[val_idx]

# A donor-level split can leave some cell types out of the training set
# entirely; surface that here instead of failing later inside model fitting.
missing_in_train = np.setdiff1d(np.unique(y_cell), np.unique(y_cell_train))
if missing_in_train.size:
    warnings.warn(
        f"{missing_in_train.size} cell type code(s) have no cells in the training "
        f"split: {missing_in_train.tolist()}. The cell-type classifier needs every "
        "class in its training labels; try another random_state or merge rare types."
    )

print(f"Data split into training ({len(train_idx)} cells) and validation ({len(val_idx)} cells).")

In [None]:
import torch
import xgboost as xgb
from pathlib import Path

#setup+path
# torch is used only to detect whether a CUDA GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

#define path in drive (left blank here; fill in your own project directory)
project_dir = Path("")
project_dir.mkdir(parents=True, exist_ok=True) #ensure directory exists
cell_model_path = project_dir / "xgb_cell_classifier_final.ubj"
print(f"Cell-type model will be saved to: {cell_model_path}")

#cell type classifer
print("\n--- Training Cell-Type Classifier ---")

#define classifier
# NOTE(review): early_stopping_rounds equals n_estimators, so early stopping
# can never end training before all 50 rounds; raise n_estimators if early
# stopping is meant to have an effect.
xgb_cell_classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=n_cell_classes,
    n_estimators=50,
    early_stopping_rounds=50,
    tree_method='hist',
    eval_metric=['mlogloss', 'merror'],
    device=device,
    # The original `random_state= #choose` placeholder was a SyntaxError.
    # 42 matches the seed used for the train/validation split; change as desired.
    random_state=42
)

#train and print the validation metrics after every boosting round
xgb_cell_classifier.fit(
    X_train,
    y_cell_train,
    eval_set=[(X_val, y_cell_val)],
    verbose=1
)

print("\nCell-type classifier training complete.")
print(f"Saving model to {cell_model_path}...")
xgb_cell_classifier.save_model(cell_model_path)
print("Model saved successfully.")

In [None]:
#xg boost disease classifier
import torch
import xgboost as xgb
from pathlib import Path

#path
# torch is used only to detect whether a CUDA GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

#defined (left blank here; fill in your own project directory)
project_dir = Path("")
project_dir.mkdir(parents=True, exist_ok=True) #directory exists
disease_model_path = project_dir / "xgb_disease_classifier_final.ubj"
print(f"Disease model will be saved to: {disease_model_path}")


#classifier
print("\n--- Training Disease Classifier ---")

#xgboost
# NOTE(review): early_stopping_rounds equals n_estimators, so early stopping
# can never end training before all 50 rounds; raise n_estimators if early
# stopping is meant to have an effect.
xgb_disease_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=50,
    early_stopping_rounds=50,
    tree_method='hist',
    eval_metric=['logloss', 'auc', 'error'],
    device=device,
    # The original `random_state= #choose` placeholder was a SyntaxError.
    # 42 matches the seed used for the train/validation split; change as desired.
    random_state=42
)

#train, print output every 1 loop
xgb_disease_classifier.fit(
    X_train,
    y_disease_train,
    eval_set=[(X_val, y_disease_val)],
    verbose=1
)

print("\nDisease classifier training complete.")
print(f"Saving model to {disease_model_path}...")
xgb_disease_classifier.save_model(disease_model_path)
print("Model saved successfully.")

TODO: do the confusion matrices as well.