<a href="https://colab.research.google.com/github/enank07/DROIDSRIProject/blob/main/Supportvectormachineenankhan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#dependencies
!pip install -qU \
    cellxgene-census[tiledbsoma] \
    scanpy anndata tiledbsoma torch torchvision

!pip install -q scikit-misc

#mounted drive
from google.colab import drive
drive.mount('/content/drive')
DATA_DIR = ""

In [None]:
import cellxgene_census as cxc
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os
from pathlib import Path

from sklearn.model_selection import GroupShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Folder that receives every artifact written by this notebook
# (preprocessed AnnData, classification report, confusion-matrix figure).
# An empty Path resolves to the current working directory.
OUTPUT_DIR = Path("")
# Create the folder together with any missing parents; an existing folder is fine.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
# Target file for the preprocessed AnnData object.
PREPROCESSED_ADATA_PATH = OUTPUT_DIR.joinpath("covid_3kHVG_svm.h5ad")

In [None]:
#check data
import cellxgene_census as cxc
import pandas as pd

# Census release to open, and the collection whose datasets should be listed.
censusgarbage= "2025-01-30"
idcollection = "ddfad306-714d-4cc0-9985-d9072820c530"

with cxc.open_soma(census_version=censusgarbage) as census:
    # Read the dataset catalogue while the census handle is still open ...
    datasets_table = census["census_info"]["datasets"].read().concat()
    # ... and convert it to a pandas frame indexed by soma_joinid.
    ds = datasets_table.to_pandas().set_index("soma_joinid")

# Keep only the rows that belong to the collection of interest.
my_ds = ds.loc[ds["collection_id"] == idcollection]
print(my_ds[["dataset_id", "dataset_title", "dataset_total_cell_count"]])

In [None]:
#data maker with AnnData
import cellxgene_census as cxc
import scanpy as sc

# Census release and the single dataset to download.
censusgarbage = "2025-01-30"
DATASET_ID     = "c7775e88-49bf-4ba2-a03b-93f00447c958"

# Row filter: cells of the chosen dataset that are flagged as primary data.
obs_filter = f'dataset_id == "{DATASET_ID}" and is_primary_data == True'

with cxc.open_soma(census_version=censusgarbage) as census:
    # Pull the matching human cells into an in-memory AnnData object.
    adata = cxc.get_anndata(
        census=census,
        organism="Homo sapiens",
        obs_value_filter=obs_filter,
    )

# Quick look at the object summary and the first rows of the cell metadata.
print(adata)
print(adata.obs.head())

In [None]:
#data preparation
import scanpy as sc

# Highly-variable-gene selection with flavor="seurat_v3" expects RAW COUNTS,
# so it runs before any normalisation. subset=False only annotates
# adata.var["highly_variable"]; the matrix itself is left untouched here.
# (hyperparameter: 3000 genes)
sc.pp.highly_variable_genes(
    adata, n_top_genes=3_000, flavor="seurat_v3", subset=False
)

# Normalise every cell to 10,000 total counts (computed over ALL genes) and
# log1p-transform, which removes library-size bias between cells.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep only the selected genes, then store a copy of the log-normalised matrix
# in a layer. Copying after the subset avoids duplicating the full matrix.
adata = adata[:, adata.var["highly_variable"]].copy()
adata.layers["log1p"] = adata.X.copy()   #keep copy
print(adata)      #check data

# Persist the preprocessed object to PREPROCESSED_ADATA_PATH (gzip-compressed h5ad).
adata.write_h5ad(PREPROCESSED_ADATA_PATH, compression="gzip")

In [None]:
#data preprocessing
# Dense feature matrix. adata.X may be a sparse matrix or already dense
# (e.g. after a reload), so only call .toarray() when it exists.
X = adata.X.toarray() if hasattr(adata.X, "toarray") else np.asarray(adata.X)

# Build the categorical once and drop categories with no cells, so that the
# integer codes and cell_type_labels describe exactly the same set of classes.
cell_type_cat = adata.obs["cell_type"].astype("category").cat.remove_unused_categories()
y_cell = cell_type_cat.cat.codes.values
cell_type_labels = cell_type_cat.cat.categories.tolist()

# Binary disease target: 0 where disease == "normal", 1 for everything else.
y_disease = (adata.obs["disease"] != "normal").astype(int).values
disease_labels = ["Normal", "COVID-19"]


# Split by donor so that all cells of one donor land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
groups = adata.obs["donor_id"].values
train_idx, test_idx = next(gss.split(X, y_cell, groups=groups))

# Sanity check: no donor may contribute cells to both train and test.
donor_ids = np.asarray(groups)
assert set(donor_ids[train_idx]).isdisjoint(donor_ids[test_idx]), \
    "donor leakage between train and test split"

X_train, X_test = X[train_idx], X[test_idx]
y_cell_train, y_cell_test = y_cell[train_idx], y_cell[test_idx]
y_disease_train, y_disease_test = y_disease[train_idx], y_disease[test_idx]
print(f"Training set size: {len(X_train)} cells | Test set size: {len(X_test)} cells")

In [None]:
print("\n" + "="*50)
print(" Model 1: Cell Type SVM (One-vs-Rest)")
print("="*50)
# random_state needs a concrete value; 42 matches the seed of the train/test split.
cell_type_svm = LinearSVC(random_state=42, dual=False, max_iter=1000)
print("Training Cell Type SVM...")
cell_type_svm.fit(X_train, y_cell_train)
print("Training complete. Evaluating...")
y_cell_pred = cell_type_svm.predict(X_test)
cell_type_accuracy = accuracy_score(y_cell_test, y_cell_pred)
print(f"\nOverall Cell Type Accuracy: {cell_type_accuracy:.4f}")

#generate + save classification report
report_cell_path = OUTPUT_DIR / "cell_type_svm_report.txt"
# Pass every class code explicitly: with a donor-based split some cell types may
# be missing from the test set, and target_names must line up with `labels`.
# zero_division=0 reports 0 for classes that have no predictions or no support.
all_cell_codes = list(range(len(cell_type_labels)))
report_cell_str = classification_report(
    y_cell_test,
    y_cell_pred,
    labels=all_cell_codes,
    target_names=cell_type_labels,
    digits=3,
    zero_division=0,
)
with open(report_cell_path, "w") as f:
    f.write(f"Overall Cell Type Accuracy: {cell_type_accuracy:.4f}\n\n")
    f.write(report_cell_str)
print(f"Full classification report saved to: {report_cell_path}")

In [None]:
print("\n" + "="*50)
print("  Model 2: Disease State SVM (Binary)")
print("="*50)
# random_state needs a concrete value; 42 matches the seed of the train/test split.
disease_svm = LinearSVC(random_state=42, dual=False, max_iter=1000)
print("Training Disease State SVM...")
# Fit on the binary disease target (0 = normal, 1 = any other disease value).
disease_svm.fit(X_train, y_disease_train)
print("Training complete. Evaluating...")
y_disease_pred = disease_svm.predict(X_test)
disease_accuracy = accuracy_score(y_disease_test, y_disease_pred)
print(f"\nOverall Disease State Accuracy: {disease_accuracy:.4f}")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Destination of the rendered figure inside OUTPUT_DIR.
plot_path = OUTPUT_DIR / "disease_svm_confusion_matrix.png"
print(f"Generating and saving confusion matrix to: {plot_path}")

# One square canvas with a single axis to draw on.
fig, ax = plt.subplots(figsize=(6, 6))

# Display options: human-readable class names, blue colour map, our own axis.
cm_options = dict(display_labels=disease_labels, cmap='Blues', ax=ax)

# Draw the confusion matrix of the disease SVM on the held-out test cells.
ConfusionMatrixDisplay.from_estimator(disease_svm, X_test, y_disease_test, **cm_options)

ax.set_title("Confusion Matrix - Disease State SVM")

# Write the figure to disk at 300 dpi with a tight bounding box, then show it.
fig.savefig(plot_path, dpi=300, bbox_inches='tight')
plt.show()