In [5]:
import pandas as pd
import glob
import os

input_folder = "/Users/sehriban/Desktop/Row_Data_Mouse/DE_results/Groups/normal_cell/*csv"
fdr_threshold = 0.05
top_n_genes = 10000

all_expr = []

for file in glob.glob(os.path.join(input_folder)):
    print(f"üìÇ Processing file: {os.path.basename(file)}")
    
    df = pd.read_csv(file)
    df.columns = df.columns.str.strip()

    if "gene_name" not in df.columns or "FDR" not in df.columns:
        print(f"‚ùå Skipping {os.path.basename(file)}: missing gene_name or FDR")
        continue

    sig = df[df["FDR"] < fdr_threshold]
    print(f"üîç {len(sig)} genes passed FDR < {fdr_threshold}")

    if sig.empty:
        print(f"‚ö†Ô∏è No significant genes in {os.path.basename(file)} ‚Äî skipping")
        continue

    if top_n_genes:
        sig = sig.sort_values("FDR").head(top_n_genes)

    sig = sig.drop_duplicates(subset="gene_name")

    sample_cols = [c for c in sig.columns if c.startswith("SRR")]
    if not sample_cols:
        print(f"‚ö†Ô∏è Skipping {os.path.basename(file)}: no SRR columns found")
        continue

    mat = sig[["gene_name"] + sample_cols]
    mat = mat.set_index("gene_name").transpose()
    mat = mat.reset_index().rename(columns={"index": "sample_id"})

    # Label extraction: expects .0 or .1 at end of sample_id
    mat["label"] = mat["sample_id"].str.extract(r"\.(0|1)$")[0]

    if mat["label"].isnull().any():
        print(f"‚ö†Ô∏è Some labels could not be parsed in {os.path.basename(file)}")

    all_expr.append(mat)

# Final check
if all_expr:
    combined = pd.concat(all_expr, ignore_index=True)
    combined.to_csv("/Users/sehriban/Desktop/Row_Data_Mouse/DE_results/Groups/normal_cell/ML_ready_expression_matrix_normal_cell.csv", index=False)
    print("‚úÖ Saved ML_ready_expression_matrix_mormal_cell.csv")
else:
    print("‚ùå No data to combine ‚Äî check logs above.")



üìÇ Processing file: data15_DE_results.csv
üîç 2082 genes passed FDR < 0.05
‚ö†Ô∏è Some labels could not be parsed in data15_DE_results.csv
üìÇ Processing file: data71_DE_results.csv
üîç 264 genes passed FDR < 0.05
‚ö†Ô∏è Some labels could not be parsed in data71_DE_results.csv
üìÇ Processing file: data64_DE_results.csv
üîç 11424 genes passed FDR < 0.05
‚ö†Ô∏è Some labels could not be parsed in data64_DE_results.csv
üìÇ Processing file: data79_DE_results.csv
üîç 8164 genes passed FDR < 0.05
‚ö†Ô∏è Some labels could not be parsed in data79_DE_results.csv
üìÇ Processing file: data89_DE_results.csv
üîç 3242 genes passed FDR < 0.05
‚ö†Ô∏è Some labels could not be parsed in data89_DE_results.csv
üìÇ Processing file: data63_DE_results.csv
üîç 404 genes passed FDR < 0.05
‚ö†Ô∏è Some labels could not be parsed in data63_DE_results.csv
üìÇ Processing file: data62_DE_results.csv
üîç 12314 genes passed FDR < 0.05
‚ö†Ô∏è Some labels could not be parsed in data62_DE_results.csv
üìÇ P

In [39]:
# 3. Train ML Models in Python
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1) Load data
df = pd.read_csv("ML_ready_expression_matrix.csv")
X = df.drop(columns=["sample_id", "label"])
y = df["label"]

print("Class counts:")
print(y.value_counts())

print(X.shape)
print(len(y))


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=y, random_state=40
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(y_train.value_counts())
print(y_test.value_counts())



# 3) Common scaler
scaler = StandardScaler()

# 4) Pipelines
pipe_knn = Pipeline([
    ("scaler", scaler),
    ("knn", KNeighborsClassifier())
])
pipe_rf  = Pipeline([
    ("scaler", scaler),
    ("rf", RandomForestClassifier(random_state=44))
])

# 5) Hyperparameter grids
param_grid_knn = {
    "knn__n_neighbors": [3, 5, 7, 9],
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2]  # 1 = Manhattan, 2 = Euclidean
}

param_grid_rf = {
    "rf__n_estimators": [100, 300, 500],
    "rf__max_depth": [None, 10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4]
}

# 6) Grid searches (5-fold CV, optimizing accuracy)
grid_knn = GridSearchCV(
    pipe_knn,
    param_grid_knn,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
    verbose=1
)
grid_rf = GridSearchCV(
    pipe_rf,
    param_grid_rf,
    cv=5,
    n_jobs=-1,
    scoring="accuracy",
    verbose=1
)

# 7) Fit and evaluate KNN
print("=== Tuning KNN ===")
grid_knn.fit(X_train, y_train)
print("Best KNN params:", grid_knn.best_params_)
print(f"Best CV accuracy: {grid_knn.best_score_:.3f}\n")

y_pred_knn = grid_knn.predict(X_test)
print("KNN Classification Report on Test Set:")
print(classification_report(y_test, y_pred_knn))

# 8) Fit and evaluate Random Forest
print("\n=== Tuning Random Forest ===")
grid_rf.fit(X_train, y_train)
print("Best RF params:", grid_rf.best_params_)
print(f"Best CV accuracy: {grid_rf.best_score_:.3f}\n")

y_pred_rf = grid_rf.predict(X_test)
print("Random Forest Classification Report on Test Set:")
print(classification_report(y_test, y_pred_rf))


Class counts:
Epi    86
Mes    86
Name: label, dtype: int64
(182, 11288)
182


ValueError: Input contains NaN

In [None]:
np.random.seed(42)
np.random.shuffle(X)
np.random.seed(42)
np.random.shuffle(Y)
print(np.unique(Y))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=2)