In [None]:
!pip -q install pandas numpy scikit-learn matplotlib


In [None]:
import os, textwrap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score

# Seed passed to the train/test split and to the logistic-regression models
# below so that reruns give the same results.
RANDOM_STATE = 42
# Show up to 120 characters of a cell's text before pandas truncates it.
pd.set_option("display.max_colwidth", 120)


In [None]:
# If your filename differs, change it here:
CSV_PATH = "clinical_data(labels).csv"

# Fail fast with an explicit exception instead of `assert`: assertions are
# removed when Python runs with -O, and os.path.exists() would also accept a
# directory, which pd.read_csv cannot read.
if not os.path.isfile(CSV_PATH):
    raise FileNotFoundError(f"File not found: {CSV_PATH}")

# Raw table as stored on disk; later cells copy it before cleaning.
df = pd.read_csv(CSV_PATH)
print("Shape:", df.shape)
df.head()


Shape: (1063, 26)


Unnamed: 0,bcr_patient_barcode,Time,age_at_initial_pathologic_diagnosis,lymph_node_examined_count,vital_status,tissue_prospective_collection_indicator_YES,radiation_therapy_NO,breast_carcinoma_surgical_procedure_name_Lumpectomy,breast_carcinoma_surgical_procedure_name_Other,breast_carcinoma_surgical_procedure_name_Simple Mastectomy,...,pathologic_N_N1,pathologic_N_N2,pathologic_N_N3,pathologic_N_NX,pathologic_M_M1,pathologic_M_MX,pathologic_stage_Stage I,pathologic_stage_Stage III,pathologic_stage_Stage IV,pathologic_stage_Stage X
0,TCGA-E2-A1BD,1133.0,53,1.0,1,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,TCGA-BH-A0AW,622.0,56,12.0,1,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
2,TCGA-AO-A0JB,1542.0,50,14.0,1,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False
3,TCGA-D8-A1JN,620.0,80,13.0,1,True,True,False,False,False,...,False,False,True,False,False,True,False,True,False,False
4,TCGA-EW-A1P8,239.0,58,15.0,2,False,True,True,False,False,...,False,False,True,False,False,False,False,True,False,False


In [None]:
def find_col(df, candidates, required=False, default=None):
    """Return the first column of ``df`` whose name matches one of ``candidates``.

    Matching ignores letter case and leading/trailing whitespace. Candidates
    are tried in the order given, so earlier entries take priority.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    candidates : iterable
        Names to look for, most preferred first.
    required : bool, default False
        If True, raise instead of returning ``default`` when nothing matches.
    default : object, default None
        Value returned when nothing matches and ``required`` is False.

    Returns
    -------
    The matching column label exactly as it appears in ``df.columns``,
    or ``default``.

    Raises
    ------
    ValueError
        If ``required`` is True and no candidate matches.
    """
    # Normalised name -> original label. str() keeps non-string labels (e.g. the
    # integer labels of a header-less frame) from raising AttributeError.
    lookup = {str(c).strip().lower(): c for c in df.columns}
    for cand in candidates:
        key = str(cand).strip().lower()
        if key in lookup:
            return lookup[key]
    if required:
        raise ValueError(f"Required column not found. Tried: {candidates}")
    return default

# Header names to try for each analysis role, most preferred first.
COLUMN_CANDIDATES = {
    "ethnicity_col": ["ethnicity","race","race_ethnicity","demographic_ethnicity","ethnicity_category"],
    "sex_col":       ["gender","sex","biological_sex"],
    "age_col":       ["age","age_at_initial_pathologic_diagnosis","age_at_diagnosis"],
    "cancer_col":    ["cancer_type","primary_diagnosis","project_id","disease_type"],
    "treat_col":     ["treatment_type","treatment","therapy_type","treatment_or_therapy","drug_name","treatment_regimen"],
    "outcome_col":   ["vital_status","overall_survival_status","os_status","five_year_survival","disease_status","outcome"],
}

# Resolve every role once; a role with no matching header resolves to None.
detected = {role: find_col(df, names) for role, names in COLUMN_CANDIDATES.items()}

ethnicity_col = detected["ethnicity_col"]
sex_col       = detected["sex_col"]
age_col       = detected["age_col"]
cancer_col    = detected["cancer_col"]
treat_col     = detected["treat_col"]
outcome_col   = detected["outcome_col"]

print("Detected columns:")
for role, col_name in detected.items():
    # Pad the role name to 13 characters so the colons line up.
    print(f" {role:<13}:", col_name)


Detected columns:
 ethnicity_col: None
 sex_col      : None
 age_col      : age_at_initial_pathologic_diagnosis
 cancer_col   : None
 treat_col    : None
 outcome_col  : vital_status


In [None]:
# Clean a copy so the frame loaded into `df` is not modified by the steps below.
work = df.copy()

def norm_text(x):
    """Normalise a categorical value to a trimmed, lower-case string.

    Missing values come back as NaN. The abbreviations "m"/"f" are expanded
    to "male"/"female" and "deceased" is folded into "dead"; any other value
    is returned as its trimmed, lower-cased string form.
    """
    if pd.isna(x):
        return np.nan
    token = str(x).strip().lower()
    if token in ("m", "male"):
        return "male"
    if token in ("f", "female"):
        return "female"
    if token in ("deceased", "dead"):
        return "dead"
    # "alive" and every unrecognised value pass through unchanged.
    return token

# Normalise only the text columns that were detected; roles that resolved to
# None are not in work.columns and are filtered out here.
detected_text_cols = [
    c for c in (ethnicity_col, sex_col, treat_col, outcome_col, cancer_col)
    if c in work.columns
]
for text_col in detected_text_cols:
    work[text_col] = work[text_col].apply(norm_text)

if age_col in work.columns:
    # Unparseable ages become NaN, then ages outside 0-120 are blanked as well.
    work[age_col] = pd.to_numeric(work[age_col], errors="coerce")
    implausible_age = (work[age_col] < 0) | (work[age_col] > 120)
    work.loc[implausible_age, age_col] = np.nan

print("Rows after cleaning:", len(work))
work.head(3)


Rows after cleaning: 1063


Unnamed: 0,bcr_patient_barcode,Time,age_at_initial_pathologic_diagnosis,lymph_node_examined_count,vital_status,tissue_prospective_collection_indicator_YES,radiation_therapy_NO,breast_carcinoma_surgical_procedure_name_Lumpectomy,breast_carcinoma_surgical_procedure_name_Other,breast_carcinoma_surgical_procedure_name_Simple Mastectomy,...,pathologic_N_N1,pathologic_N_N2,pathologic_N_N3,pathologic_N_NX,pathologic_M_M1,pathologic_M_MX,pathologic_stage_Stage I,pathologic_stage_Stage III,pathologic_stage_Stage IV,pathologic_stage_Stage X
0,TCGA-E2-A1BD,1133.0,53.0,1.0,1,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,TCGA-BH-A0AW,622.0,56.0,12.0,1,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
2,TCGA-AO-A0JB,1542.0,50.0,14.0,1,False,False,False,False,False,...,True,False,False,False,False,False,False,True,False,False


In [None]:
def dist_table(series, top_n=20):
    """Tabulate how often each value of ``series`` occurs.

    Missing values are counted as their own category. Returns a DataFrame
    with a "count" column and a "percent" column (share of all rows, rounded
    to two decimals), limited to the first ``top_n`` rows of the
    value-count ordering.
    """
    counts = series.value_counts(dropna=False)
    total = counts.sum()
    table = pd.DataFrame({"count": counts, "percent": (counts / total * 100).round(2)})
    return table.head(top_n)

# (column, heading, rows to show) for each breakdown, in display order.
for group_col, heading, n_rows in (
    (ethnicity_col, "Ethnicity / Race distribution:", 20),
    (sex_col, "\nSex / Gender distribution:", 20),
    (cancer_col, "\nCancer type distribution (top 15):", 15),
):
    if group_col:
        print(heading)
        display(dist_table(work[group_col], top_n=n_rows))

# Plot ethnicity counts if available
if ethnicity_col:
    dt = dist_table(work[ethnicity_col])
    fig, ax = plt.subplots()
    dt["count"].plot(kind="bar", ax=ax)
    ax.set_title("Ethnicity/Race Counts")
    ax.set_xlabel("Group")
    ax.set_ylabel("Count")
    # Slant the category labels so long group names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()
    plt.show()


In [None]:
def _row_normalized_mix(frame, group_col, value_col):
    """Share (in %) of each ``value_col`` level within every ``group_col`` group.

    Returns a Series indexed by (group_col, value_col) whose values sum to
    roughly 100 within each group (each entry is rounded to two decimals).
    """
    counts = frame.groupby([group_col, value_col]).size()
    # transform("sum") broadcasts each group's total back onto its own rows, so
    # the result keeps the original two-level index. groupby(...).apply(...)
    # would prepend the group key a second time on pandas >= 2.0.
    group_totals = counts.groupby(level=0).transform("sum")
    return (counts / group_totals * 100).round(2)


if treat_col and ethnicity_col:
    print("Treatment mix by Ethnicity (row-normalized %):")
    treat_by_eth = _row_normalized_mix(work, ethnicity_col, treat_col)
    display(treat_by_eth)

if treat_col and sex_col:
    print("\nTreatment mix by Sex (row-normalized %):")
    treat_by_sex = _row_normalized_mix(work, sex_col, treat_col)
    display(treat_by_sex)


In [None]:
def binarize_outcome(x, numeric_codes=None):
    """Map a raw outcome value to 1 (good outcome) or 0 (poor outcome).

    Parameters
    ----------
    x : object
        Raw outcome value. It is compared as a trimmed, lower-cased string.
    numeric_codes : dict, optional
        Explicit meaning of numeric codes, e.g. ``{1: 1, 2: 0}``. Keys are
        numbers, values are the binary outcome to return. When given, it
        decides every value that parses as a number; numeric values that are
        not listed map to NaN. Text labels are still handled by the built-in
        lists below.

    Returns
    -------
    1, 0, or NaN when the value is missing or not recognised.

    Notes
    -----
    Without ``numeric_codes`` a bare number is read as an event indicator:
    0 -> 1 (no event) and 1 -> 0 (event).
    NOTE(review): the run recorded in this notebook maps the outcome column to
    a single class, and the data preview shows the values 1 and 2 in
    ``vital_status``. That suggests a coding other than 0/1, so confirm what
    the codes mean before choosing ``numeric_codes``.
    """
    if pd.isna(x):
        return np.nan
    label = str(x).strip().lower()

    # Parse once. float() on a string can only fail with ValueError, so nothing
    # broader (e.g. KeyboardInterrupt) is swallowed here.
    try:
        code = float(label)
    except ValueError:
        code = None

    if numeric_codes is not None and code is not None:
        for raw_code, outcome in numeric_codes.items():
            if float(raw_code) == code:
                return outcome
        return np.nan

    if label in ("alive", "0", "censored", "no_event", "disease_free", "tumor free",
                 "not dead", "survived", "alive, disease free"):
        return 1
    if label in ("dead", "1", "event", "deceased", "died", "death", "deceased, disease"):
        return 0
    # Event indicators written as floats ("0.0" / "1.0"): 1 means the event
    # happened, so flip it to get 1 = good outcome.
    if code in (0.0, 1.0):
        return 1 - int(code)
    return np.nan

if not outcome_col:
    print("No outcome column detected.")
else:
    # Binary outcome used by the later cells: 1 = good outcome, 0 = poor outcome.
    work["_outcome_bin"] = work[outcome_col].apply(binarize_outcome)
    uniq = work["_outcome_bin"].dropna().unique()
    print("Outcome unique values after mapping:", uniq)

    if len(uniq) != 2:
        print("Binary mapping did not succeed; adjust binarize_outcome().")
    else:
        # Share of good outcomes per group, shown as a percentage string.
        for group_col, heading in (
            (ethnicity_col, "\nMean positive outcome by Ethnicity (1=good, 0=poor):"),
            (sex_col, "\nMean positive outcome by Sex:"),
        ):
            if group_col:
                print(heading)
                pct_good = (work.groupby(group_col)["_outcome_bin"].mean() * 100).round(2)
                display(pct_good.astype(str) + "%")


Outcome unique values after mapping: [0.]
Binary mapping did not succeed; adjust binarize_outcome().


In [None]:
# Minimum number of complete rows required before a model is fitted.
MIN_MODEL_ROWS = 200


def group_metrics(X_raw, y_true, y_hat, group_col):
    """Per-group prediction rate and true-positive rate.

    Parameters
    ----------
    X_raw : pandas.DataFrame
        Feature frame that contains ``group_col``.
    y_true, y_hat : numpy arrays
        True and predicted binary labels, positionally aligned with ``X_raw``.
    group_col : str or None
        Column whose values define the groups.

    Returns
    -------
    DataFrame with one row per group: the group value, the share of rows
    predicted 1, the recall on rows whose true label is 1 (NaN when the group
    has no such rows), and the group size. None when ``group_col`` is None or
    not a column of ``X_raw``.
    """
    if (group_col is None) or (group_col not in X_raw.columns):
        return None
    gvals = X_raw[group_col].astype(str).values
    rows = []
    for g in sorted(pd.Series(gvals).unique()):
        idx = (gvals == g)
        is_positive = (y_true[idx] == 1)
        dp = (y_hat[idx] == 1).mean()  # Demographic parity P(Ŷ=1|G=g)
        if is_positive.sum() > 0:
            tpr_g = ((y_hat[idx] == 1) & is_positive).sum() / is_positive.sum()
        else:
            tpr_g = np.nan
        rows.append([g, dp, tpr_g, int(idx.sum())])
    return pd.DataFrame(rows, columns=[group_col, "DemographicParity(Pred=1)", "TPR_on_Positive", "n"])


has_binary_outcome = ("_outcome_bin" in work.columns
                      and work["_outcome_bin"].dropna().nunique() == 2)
# Only the roles that were detected become model inputs.
features = [c for c in (age_col, sex_col, ethnicity_col, cancer_col, treat_col) if c]

if not has_binary_outcome:
    print("Binary outcome not available; skipping model step.")
elif not features:
    # Without this guard the pipeline would be fitted on a frame with no columns.
    print("No feature columns detected; skipping model step.")
else:
    model_df = work[features + ["_outcome_bin"]].dropna()
    print("Rows available for modeling:", model_df.shape[0])

    if model_df.shape[0] > MIN_MODEL_ROWS:
        X = model_df[features]
        y = model_df["_outcome_bin"].astype(int)

        numeric_features = [c for c in [age_col] if c in X.columns]
        categorical_features = [c for c in X.columns if c not in numeric_features]

        # Age is passed through unchanged; every other feature is one-hot
        # encoded, and categories unseen during fit are ignored at predict time.
        pre = ColumnTransformer([
            ("num", "passthrough", numeric_features),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
        ])

        clf = Pipeline([("pre", pre),
                        ("lr", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))])

        # Stratify so both splits keep the same outcome ratio.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        tpr = recall_score(y_test, y_pred)
        print(f"Test Accuracy: {acc:.3f} | Test TPR (Recall on positive): {tpr:.3f}")

        if sex_col:
            print("\nFairness by Sex:")
            display(group_metrics(X_test, y_test.values, y_pred, sex_col))
        if ethnicity_col:
            print("\nFairness by Ethnicity:")
            display(group_metrics(X_test, y_test.values, y_pred, ethnicity_col))
    else:
        print("Not enough rows for stable modeling; skipping model step.")


Binary outcome not available; skipping model step.


In [None]:
if "_outcome_bin" in work.columns and work["_outcome_bin"].dropna().nunique()==2 and ethnicity_col:
    # Separate name so the modeling frame from the previous cell is not overwritten.
    eth_df = work[[ethnicity_col, "_outcome_bin"]].dropna()
    counts = eth_df[ethnicity_col].value_counts()
    # Weight = total / (number of groups * group size): rarer groups weigh more.
    inv_freq = counts.sum() / (len(counts) * counts)
    print("Inverse-frequency weights by ethnicity:")
    display(inv_freq)

    # Only possible if the baseline-model cell above ran and created the split.
    if 'X_train' in locals():
        if ethnicity_col in X_train.columns:
            # Training rows whose group has no weight fall back to 1.0.
            sw = X_train[ethnicity_col].map(inv_freq).fillna(1.0).values
        else:
            sw = np.ones(len(X_train))

        # clone() yields an unfitted copy of the baseline preprocessor. Passing
        # the same instance would make fitting clf_w refit the step that `clf`
        # still uses.
        clf_w = Pipeline([("pre", clone(clf.named_steps["pre"])),
                          ("lr", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))])
        clf_w.fit(X_train, y_train, lr__sample_weight=sw)
        y_pred_w = clf_w.predict(X_test)

        acc_w = accuracy_score(y_test, y_pred_w)
        tpr_w = recall_score(y_test, y_pred_w)
        print(f"(Reweighted) Test Accuracy: {acc_w:.3f} | Test TPR: {tpr_w:.3f}")

        # group_metrics() comes from the baseline-model cell, which defines it
        # whenever it creates X_train; it is not re-defined here.
        if sex_col:
            print("\n(Reweighted) Fairness by Sex:")
            display(group_metrics(X_test, y_test.values, y_pred_w, sex_col))
        if ethnicity_col:
            print("\n(Reweighted) Fairness by Ethnicity:")
            display(group_metrics(X_test, y_test.values, y_pred_w, ethnicity_col))
    else:
        print("No baseline model in memory; reweighting demo skipped.")


In [None]:
# Folder that receives every artifact written by this cell.
REPORT_DIR = "/content/reports"
os.makedirs(REPORT_DIR, exist_ok=True)

# dist_table() is defined in the distribution cell earlier in the notebook;
# it is reused here rather than defined a second time.

txt_analysis = """(Paste the 300-word analysis from above here to save with your run.)"""
# Explicit encoding so the saved text does not depend on the platform default.
with open(os.path.join(REPORT_DIR, "task3_analysis.txt"), "w", encoding="utf-8") as f:
    f.write(txt_analysis)

# Distribution tables are written only for the roles that were detected.
if ethnicity_col:
    dist_table(work[ethnicity_col]).to_csv(os.path.join(REPORT_DIR, "ethnicity_distribution.csv"))
if sex_col:
    dist_table(work[sex_col]).to_csv(os.path.join(REPORT_DIR, "sex_distribution.csv"))
print(f"Saved to {REPORT_DIR}")


Saved to /content/reports
