<a href="https://colab.research.google.com/github/gilsonauerswald/Bioinformatic_Projects/blob/main/Lesson_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from joblib import dump

# -------------------- Config --------------------
# Both tab-separated matrices are fetched from the same GitHub folder.
_DATA_ROOT = "https://raw.githubusercontent.com/Omicslogic-git/Project_3_data/refs/heads/main"
TRAIN_PATH = f"{_DATA_ROOT}/sel48_Risk_train.txt"
TEST_PATH = f"{_DATA_ROOT}/sel48_Risk_test.txt"

OUTDIR = "rf_outputs"   # folder that receives every file this script writes
RANDOM_SEED = 42        # forwarded to the forest's random_state
N_ESTIMATORS = 500      # number of trees in the forest

# Create the output folder up front; an already existing folder is fine.
os.makedirs(OUTDIR, exist_ok=True)

# -------------------- Load ----------------------
# Both files are tab-separated with the first column used as the row index
# (feature rows plus one `class` row); every remaining column is one sample.
train_raw = pd.read_csv(TRAIN_PATH, sep="\t", index_col=0)
test_raw  = pd.read_csv(TEST_PATH,  sep="\t", index_col=0)

# ---- Labels from the `class` row ----
# Raise explicitly instead of using `assert`, which is stripped under `python -O`.
if "class" not in train_raw.index:
    raise ValueError("Row 'class' not found in TRAIN matrix.")
y_train_tokens = train_raw.loc["class"].astype(str).tolist()

# The test matrix may legitimately come without labels (prediction-only mode).
test_has_labels = "class" in test_raw.index
y_test_tokens  = (test_raw.loc["class"].astype(str).tolist() if test_has_labels else None)

# ---- Drop label row from features ----
train_df = train_raw.drop(index="class")
test_df  = (test_raw.drop(index="class") if test_has_labels else test_raw)

# ---- Align genes between train and test ----
# Keep only the rows present in both matrices, in the same order, so the model
# sees identical feature columns at fit and predict time.
common_genes = train_df.index.intersection(test_df.index)
if common_genes.empty:
    raise ValueError("No shared feature rows between TRAIN and TEST matrices.")
train_df = train_df.loc[common_genes]
test_df  = test_df.loc[common_genes]

# ---- Samples as rows, genes as columns ----
# The `class` row shares its columns with the expression values, so pandas has
# most likely parsed those columns as generic objects/strings; cast to float
# explicitly rather than relying on the estimator to coerce them.
# `astype` returns a new frame, so the transposed views are not shared.
X_train = train_df.T.astype(float)
X_test  = test_df.T.astype(float)

# Encode labels
# The encoder is fitted on the training labels only; test labels are mapped
# through the same vocabulary so integer codes mean the same class everywhere.
le = LabelEncoder()
y_train = le.fit_transform(y_train_tokens)

# Always define y_test so later code can reference it even without test labels.
y_test = None
if test_has_labels:
    try:
        y_test = le.transform(y_test_tokens)
    except ValueError:
        # If test contains unseen labels, skip accuracy/report computation,
        # but say so instead of silently dropping the evaluation.
        unseen = sorted(set(y_test_tokens) - set(le.classes_.tolist()))
        print(f"[Warn] Test labels not seen in training: {unseen}; "
              "skipping test accuracy/report.")
        test_has_labels = False

print(f"[Info] Train: {X_train.shape[0]} samples x {X_train.shape[1]} genes")
print(f"[Info] Test :  {X_test.shape[0]} samples x {X_test.shape[1]} genes")
# `.tolist()` yields plain Python strings, avoiding `np.str_('...')` reprs.
print(f"[Info] Classes: {le.classes_.tolist()}")

# ---------------- Train RF on ALL genes ----------------
# Hyper-parameters are gathered in one mapping so they can be read at a glance.
rf_params = {
    "n_estimators": N_ESTIMATORS,
    "max_features": "sqrt",
    "class_weight": "balanced_subsample",
    "random_state": RANDOM_SEED,
    "n_jobs": -1,
}
# `fit` returns the fitted estimator itself, so construction and fitting chain.
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Persist the fitted model next to the other outputs.
model_path = os.path.join(OUTDIR, "rf_model.joblib")
dump(rf, model_path)

# ---------------- Training Results -----------------
# These are predictions on the data the forest was fitted on, so they describe
# the fit itself rather than performance on unseen samples.
y_pred_train = rf.predict(X_train)

# Integer codes 0..n_classes-1 in encoder order; passing them explicitly keeps
# matrix rows/columns and report rows aligned with `le.classes_`.
class_ids = np.arange(len(le.classes_))

cm = confusion_matrix(y_train, y_pred_train, labels=class_ids)
cm_df = pd.DataFrame(cm, index=[f"True_{c}" for c in le.classes_],
                        columns=[f"Pred_{c}" for c in le.classes_])
cm_df.to_csv(os.path.join(OUTDIR, "training_confusion_matrix.csv"))

# Plot (default colors)
fig, ax = plt.subplots(figsize=(6, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(ax=ax, values_format='d', cmap=None)
# Title previously contained a mis-decoded em dash ("â€”"); restored here.
# Using the explicit axes/figure objects avoids depending on pyplot's
# notion of the "current" axes.
ax.set_title("Random Forest — Training Confusion Matrix")
fig.tight_layout()
fig.savefig(os.path.join(OUTDIR, "training_confusion_matrix.png"), dpi=300)
plt.close(fig)

# Classification report (train)
report_train = classification_report(
    y_train,
    y_pred_train,
    labels=class_ids,
    target_names=le.classes_,
    output_dict=True,
)
pd.DataFrame(report_train).T.to_csv(os.path.join(OUTDIR, "training_classification_report.csv"))

# ---------------- Test Predictions -----------------
y_pred_test = rf.predict(X_test)
# Map integer codes back to the original class names for the output table.
pred_labels = le.inverse_transform(y_pred_test)

test_out = pd.DataFrame({"Sample": X_test.index, "Predicted_Class": pred_labels})

if test_has_labels:
    test_out["True_Class"] = y_test_tokens
    acc = (test_out["Predicted_Class"] == test_out["True_Class"]).mean()
    with open(os.path.join(OUTDIR, "test_accuracy.txt"), "w", encoding="utf-8") as f:
        f.write(f"Test Accuracy: {acc:.4f}\n")

    # Pass `labels` explicitly: without it, classification_report infers the
    # label set from y_test/y_pred_test and raises ValueError when that set is
    # smaller than `target_names` (i.e. a class is absent from the test split).
    # `zero_division=0` reports 0 for classes with no samples/predictions
    # without emitting a warning for each of them.
    report_test = classification_report(
        y_test,
        y_pred_test,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
        output_dict=True,
        zero_division=0,
    )
    pd.DataFrame(report_test).T.to_csv(os.path.join(OUTDIR, "test_classification_report.csv"))

# Always save predictions

test_out.to_csv(os.path.join(OUTDIR, "test_predictions.csv"), index=False)

print(
    f"[Done] Wrote outputs to `{OUTDIR}`:\n"
    "  - training_confusion_matrix.csv / .png\n"
    "  - training_classification_report.csv\n"
    "  - test_predictions.csv (+ test_accuracy.txt & test_classification_report.csv if labels present)\n"
    "  - rf_model.joblib\n"
)

[Info] Train: 50 samples x 48 genes
[Info] Test :  21 samples x 48 genes
[Info] Classes: [np.str_('No_Primary_factor'), np.str_('Not_available'), np.str_('alcohol_consumption'), np.str_('hemochromatosis'), np.str_('hepatitis_b'), np.str_('hepatitis_c'), np.str_('non-alcoholic_fatty_liver_disease'), np.str_('other')]
[Done] Wrote outputs to `rf_outputs`:
  - training_confusion_matrix.csv / .png
  - training_classification_report.csv
  - test_predictions.csv (+ test_accuracy.txt & test_classification_report.csv if labels present)
  - rf_model.joblib

