In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the clinical table and discard the columns that are not used as features.

df = pd.read_csv('../data/gsea_clinical.tsv', sep='\t')
cols = df.columns.values
id_col = df["Sample ID"]  # taken before any column is removed from df

# 0-based positions (in the raw file's column order) of the columns to discard.
drop_positions = [
    0, 1, 2, 5, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19,
    21, 22, 23, 24, 25, 26, 27, 28, 33, 34, 36,
    40, 41, 42, 43, 44, 49, 52,
]
cols_drop = [cols[position] for position in drop_positions]
df.drop(columns=cols_drop, inplace=True)

df.head()

Unnamed: 0,Diagnosis Age,Neoplasm Disease Stage American Joint Committee on Cancer Code,Aneuploidy Score,Buffa Hypoxia Score,Ethnicity Category,Fraction Genome Altered,MSI MANTIS Score,MSIsensor Score,Mutation Count,New Neoplasm Event Post Initial Therapy Indicator,...,Tumor Break Load,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Tissue Source Site,Tissue Source Site Code,TMB (nonsynonymous),Tumor Disease Anatomic Site,Tumor Type,Patient Weight,Winter Hypoxia Score
0,70.0,STAGE IV,17.0,11.0,,0.4565,0.2886,0.03,192.0,,...,98.0,No,Yes,Indivumed,5,6.4,Lung,Lung Adenocarcinoma (NOS),,20.0
1,67.0,STAGE IB,24.0,-27.0,,0.2221,0.2807,0.07,300.0,No,...,29.0,No,Yes,Indivumed,5,10.0,Lung,Lung Adenocarcinoma (NOS),,-26.0
2,79.0,STAGE IIIA,17.0,29.0,,0.2362,0.3059,0.28,312.0,,...,81.0,No,Yes,Indivumed,5,10.5,Lung,Lung Adenocarcinoma (NOS),,32.0
3,68.0,STAGE IB,22.0,19.0,,0.0854,0.3193,0.05,1547.0,Yes,...,226.0,No,Yes,Indivumed,5,51.733333,Lung,"Lung Adenocarcinoma, Mixed Subtype",,34.0
4,66.0,STAGE IIIA,1.0,-37.0,,0.0661,0.285,0.0,117.0,Yes,...,6.0,No,Yes,Indivumed,5,3.966667,Lung,"Lung Adenocarcinoma, Mixed Subtype",,-24.0


In [4]:
# Missing-value audit: print the row count, then show the NaN total per column.
row_count = len(df)
print(f"number of rows: {row_count}")
df.isnull().sum()

number of rows: 566


Diagnosis Age                                                                 71
Neoplasm Disease Stage American Joint Committee on Cancer Code                54
Aneuploidy Score                                                              65
Buffa Hypoxia Score                                                           56
Ethnicity Category                                                           176
Fraction Genome Altered                                                       66
MSI MANTIS Score                                                               2
MSIsensor Score                                                                1
Mutation Count                                                                 5
New Neoplasm Event Post Initial Therapy Indicator                            117
Overall Survival Status                                                       52
American Joint Committee on Cancer Metastasis Stage Code                      56
Neoplasm Disease Lymph Node 

In [5]:
# Remove the columns with 100+ NaNs, then discard every row that still has a
# gap in any column other than the three TNM staging columns (those are kept
# even when missing so they can be handled separately).
sparse_cols = [
    "Patient Weight",
    "Race Category",
    "New Neoplasm Event Post Initial Therapy Indicator",
    "Ethnicity Category",
]
df.drop(columns=sparse_cols, inplace=True)

target_cols = [
    "American Joint Committee on Cancer Metastasis Stage Code",
    "Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code",
    "American Joint Committee on Cancer Tumor Stage Code",
]
cols_to_check = [name for name in df.columns if name not in target_cols]
df = df.dropna(subset=cols_to_check)

remaining_rows = len(df)
print(f"number of rows: {remaining_rows}")
df.isnull().sum()

number of rows: 410


Diagnosis Age                                                                0
Neoplasm Disease Stage American Joint Committee on Cancer Code               0
Aneuploidy Score                                                             0
Buffa Hypoxia Score                                                          0
Fraction Genome Altered                                                      0
MSI MANTIS Score                                                             0
MSIsensor Score                                                              0
Mutation Count                                                               0
Overall Survival Status                                                      0
American Joint Committee on Cancer Metastasis Stage Code                     2
Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code    0
American Joint Committee on Cancer Tumor Stage Code                          0
Radiation Therapy                                   

In [6]:
# looking for missing results (TX, NX, MX)
# Show the distinct codes of each TNM column before anything is changed.
for i in target_cols:
  print(df[i].unique())

# Treat the 'MX' / 'TX' / 'NX' codes as missing values.
# The replacement is limited to the TNM columns: a frame-wide replace would
# also blank any unrelated column that happens to hold the same string
# (e.g. a two-letter site code), and those rows would be lost to a later dropna.
df[target_cols] = df[target_cols].replace(['MX', 'TX', 'NX'], np.nan)
print(f"number of rows: {len(df)}")
df.isna().sum()

['M0' 'M1' nan 'MX' 'M1B' 'M1A']
['N0' 'N2' 'N3' 'NX' 'N1']
['T2' 'T1' 'T4' 'T3' 'T2B' 'T1B' 'T2A' 'T1A' 'TX']
number of rows: 410


Diagnosis Age                                                                  0
Neoplasm Disease Stage American Joint Committee on Cancer Code                 0
Aneuploidy Score                                                               0
Buffa Hypoxia Score                                                            0
Fraction Genome Altered                                                        0
MSI MANTIS Score                                                               0
MSIsensor Score                                                                0
Mutation Count                                                                 0
Overall Survival Status                                                        0
American Joint Committee on Cancer Metastasis Stage Code                     129
Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code      7
American Joint Committee on Cancer Tumor Stage Code                            2
Radiation Therapy           

In [7]:
# RFC to estimate missing values
#
# For each TNM staging column a random forest is trained on the rows where the
# stage is known and scored on a stratified 20% hold-out. The missing stages
# are filled with the model's predictions only when both accuracy and weighted
# F1 on the hold-out exceed 0.8; otherwise the column is left untouched.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas.api.types as ptypes

# Seed shared by the split and the forest so the accept/reject decision below
# is the same on every Restart & Run All.
RANDOM_STATE = 42

obj_cols = []

for x in df.columns.values:
  if ptypes.is_string_dtype(df[x]):
    obj_cols.append(x)

# One-hot encode the predictors ONCE over the whole frame. Encoding the known
# and missing subsets separately with drop_first=True can drop a different
# baseline level in each subset; after reindexing, such rows would silently be
# encoded as the wrong category. The TNM columns are never used as predictors.
feature_cols = [y for y in df.columns if y not in target_cols]
X_all = pd.get_dummies(df[feature_cols], drop_first=True)

for x in target_cols:
    print(f"evaluating model for column: {x}")

    known_mask = df[x].notna()
    missing_mask = df[x].isna()

    X_full = X_all.loc[known_mask]
    y_full = df.loc[known_mask, x]

    X_train, X_val, y_train, y_val = train_test_split(
        X_full, y_full, test_size=0.2, random_state=RANDOM_STATE, stratify=y_full
    )

    clf = RandomForestClassifier(random_state=RANDOM_STATE)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_val)

    report = classification_report(y_val, y_pred, zero_division=0, output_dict=True)
    f1_weighted = report['weighted avg']['f1-score']
    accuracy = accuracy_score(y_val, y_pred)

    print("Accuracy:", accuracy)
    print("F1 Weighted:", f1_weighted)

    if not missing_mask.any():
        # Nothing to impute; predicting on an empty frame would raise.
        print(f"no nan values to predict for {x}\n")
    elif (accuracy > 0.8) and (f1_weighted > 0.8):
        print(f"predicted nan values for {x}\n")
        X_test = X_all.loc[missing_mask]

        # Refit on every known row (train + validation) before imputing.
        clf.fit(X_full, y_full)
        predicted = clf.predict(X_test)
        # X_test rows are in the same order as the masked rows of df.
        df.loc[missing_mask, x] = predicted
    else:
        print(f"\ndid not predict nan values for {x}. Alternative models should be considered.")

evaluating model for column: American Joint Committee on Cancer Metastasis Stage Code
Accuracy: 0.9473684210526315
F1 Weighted: 0.9305488491871882
predicted nan values for American Joint Committee on Cancer Metastasis Stage Code

evaluating model for column: Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code
Accuracy: 0.8518518518518519
F1 Weighted: 0.8386978248089357
predicted nan values for Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code

evaluating model for column: American Joint Committee on Cancer Tumor Stage Code
Accuracy: 0.5487804878048781
F1 Weighted: 0.524626288830019

did not predict nan values for American Joint Committee on Cancer Tumor Stage Code. Alternative models should be considered.


In [8]:
# Drop the rows that still contain a NaN after the imputation step.
# The count and share quoted in the message are computed from the data rather
# than hard-coded, so the text stays correct if an upstream cell changes.
tumor_col = "American Joint Committee on Cancer Tumor Stage Code"
n_missing = int(df[tumor_col].isna().sum())
n_rows = len(df)
# Round after scaling to percent to avoid float artifacts such as 0.7000000000000001.
pct_missing = round(n_missing / n_rows * 100, 1) if n_rows else 0.0
print(f"We drop the {n_missing} missing values for tumor since it only takes {pct_missing}% of the samples.")
# NOTE: dropna() without `subset` removes a row when ANY column is NaN, not only the tumor stage.
df.dropna(inplace=True)

We drop the 2 missing values for tumor since it only takes 0.5% of the samples.


In [9]:
# List the distinct codes left in each TNM staging column.
for stage_col in target_cols:
    stage_values = df[stage_col].unique()
    print(stage_values)

['M0' 'M1' 'M1B' 'M1A']
['N0' 'N2' 'N3' 'N1']
['T2' 'T1' 'T4' 'T3' 'T2B' 'T1B' 'T2A' 'T1A']


In [10]:
# type checking
# Display the dtype of every remaining column of df.
df.dtypes

Diagnosis Age                                                                float64
Neoplasm Disease Stage American Joint Committee on Cancer Code                object
Aneuploidy Score                                                             float64
Buffa Hypoxia Score                                                          float64
Fraction Genome Altered                                                      float64
MSI MANTIS Score                                                             float64
MSIsensor Score                                                              float64
Mutation Count                                                               float64
Overall Survival Status                                                       object
American Joint Committee on Cancer Metastasis Stage Code                      object
Neoplasm Disease Lymph Node Stage American Joint Committee on Cancer Code     object
American Joint Committee on Cancer Tumor Stage Code              

In [11]:
# Preview the first rows of df after the cleaning steps above.
df.head()

Unnamed: 0,Diagnosis Age,Neoplasm Disease Stage American Joint Committee on Cancer Code,Aneuploidy Score,Buffa Hypoxia Score,Fraction Genome Altered,MSI MANTIS Score,MSIsensor Score,Mutation Count,Overall Survival Status,American Joint Committee on Cancer Metastasis Stage Code,...,Somatic Status,Tumor Break Load,Tissue Prospective Collection Indicator,Tissue Retrospective Collection Indicator,Tissue Source Site,Tissue Source Site Code,TMB (nonsynonymous),Tumor Disease Anatomic Site,Tumor Type,Winter Hypoxia Score
1,67.0,STAGE IB,24.0,-27.0,0.2221,0.2807,0.07,300.0,0:LIVING,M0,...,Matched,29.0,No,Yes,Indivumed,5,10.0,Lung,Lung Adenocarcinoma (NOS),-26.0
3,68.0,STAGE IB,22.0,19.0,0.0854,0.3193,0.05,1547.0,0:LIVING,M0,...,Matched,226.0,No,Yes,Indivumed,5,51.733333,Lung,"Lung Adenocarcinoma, Mixed Subtype",34.0
4,66.0,STAGE IIIA,1.0,-37.0,0.0661,0.285,0.0,117.0,0:LIVING,M0,...,Matched,6.0,No,Yes,Indivumed,5,3.966667,Lung,"Lung Adenocarcinoma, Mixed Subtype",-24.0
5,70.0,STAGE IA,20.0,13.0,0.4579,0.2907,0.05,211.0,0:LIVING,M0,...,Matched,155.0,No,Yes,Indivumed,5,7.033333,Lung,"Lung Adenocarcinoma, Mixed Subtype",14.0
6,58.0,STAGE IB,21.0,35.0,0.3056,0.2876,0.01,517.0,0:LIVING,M0,...,Matched,229.0,No,Yes,Indivumed,5,17.333333,Lung,"Lung Adenocarcinoma, Mixed Subtype",46.0
