In [1]:
import numpy as np
import pandas as pd 
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

In [2]:
# Load the labelled training data and the separate test data.
# Forward slashes keep these relative paths valid on Windows, macOS and Linux;
# the previous backslash form (".\NUS_IT\...") only resolves on Windows.
df = pd.read_csv("NUS_IT/Train_Set.csv")
test_df = pd.read_csv("NUS_IT/Test_Set.csv")
print("Dimensions of DataFrame:",df.shape)
df.head()

Dimensions of DataFrame: (2193, 351)


Unnamed: 0,length_51,length_52,length_53,length_54,length_55,length_56,length_57,length_58,length_59,length_60,...,length_392,length_393,length_394,length_395,length_396,length_397,length_398,length_399,length_400,class_label
0,0.000152,8.2e-05,8.7e-05,9.2e-05,9.9e-05,0.000125,0.000122,0.000115,0.000151,0.000167,...,0.007396,0.007193,0.006973,0.006481,0.006139,0.005958,0.005578,0.005327,0.005142,healthy
1,0.006256,0.006413,0.006512,0.006469,0.00681,0.00707,0.007748,0.008088,0.008671,0.008835,...,0.01783,0.017033,0.016373,0.015371,0.01434,0.013975,0.013243,0.012725,0.012267,healthy
2,0.003783,0.003886,0.004063,0.004237,0.004481,0.004832,0.00496,0.005605,0.005919,0.00648,...,0.010957,0.010481,0.009782,0.009211,0.0088,0.00832,0.007869,0.007286,0.00702,healthy
3,0.004635,0.004471,0.004383,0.004829,0.00492,0.005056,0.005475,0.006158,0.007174,0.007697,...,0.010032,0.008933,0.00897,0.008546,0.008125,0.007587,0.007501,0.006931,0.006663,healthy
4,0.011315,0.010045,0.009795,0.009906,0.01063,0.011467,0.013024,0.014853,0.016874,0.017501,...,0.009118,0.008535,0.008233,0.007754,0.007584,0.006933,0.00653,0.006378,0.005842,healthy


In [3]:
## List the columns that contain at least one missing value
df.columns[df.isna().any()]  # an empty Index means no column has missing values

Index([], dtype='object')

In [4]:
# Count columns per dtype: 350 float64 columns (the numeric features) and
# 1 object column (the non-numeric target).
df.dtypes.value_counts()

float64    350
object       1
dtype: int64

In [5]:
# Rows per class: the dataset is imbalanced (the healthy class is by far the
# smallest), so the models below are fitted with class weights.
df['class_label'].value_counts()

early stage cancer        781
screening stage cancer    490
mid stage cancer          453
late stage cancer         409
healthy                    60
Name: class_label, dtype: int64

In [6]:
# Integer code for each stage label; every stage gets its own distinct code.
dic = {
    "healthy": 0,
    "screening stage cancer": 1,
    "early stage cancer": 2,
    "mid stage cancer": 3,
    "late stage cancer": 4,
}
# Preview the encoding without modifying df: the per-code counts should match
# the per-label counts of the raw column.
encoded_preview = df['class_label'].map(dic)
encoded_preview.value_counts()

2    781
1    490
3    453
4    409
0     60
Name: class_label, dtype: int64

In [7]:
# Replace the string stage labels with their integer codes from `dic`, both in
# the training frame (df) and in the separate test frame (test_df).
#
# The dtype guard makes this cell safe to re-run: Series.map turns every value
# that is not a key of `dic` into NaN, so mapping an already-encoded (numeric)
# column a second time would wipe out all labels.
for labelled_frame in (df, test_df):
    if not pd.api.types.is_numeric_dtype(labelled_frame['class_label']):
        labelled_frame['class_label'] = labelled_frame['class_label'].map(dic)

In [8]:
#### Split the training CSV into train / validation parts first; the separate
#### test CSV is kept aside as unseen data.
r_state = 27            # shared seed for the split and the models
target = "class_label"  # label column, named once for easy reference
X_features = [col for col in df.columns if col != target]  # all independent features

val_p = 0.35  # fraction of rows held out for validation

X_train, X_test, y_train, y_test = train_test_split(
    df[X_features],
    df[target],
    test_size=val_p,
    stratify=df[target],  # stratify the split on the class label
    random_state=r_state,
)

# NOTE: `df` is rebound to the training part only, so re-running this cell
# splits the already-reduced frame again; reload the CSV before re-running.
df = pd.concat([X_train, y_train], axis=1)   # train dataset
dfv = pd.concat([X_test, y_test], axis=1)    # validation dataset

In [9]:
# Fit a class-weighted random forest on all predictors to rank them by importance.
model = RandomForestClassifier(class_weight='balanced', random_state=r_state)
model.fit(df[X_features], df[target])
sig_features = model.feature_importances_

# (column position, importance) pairs, highest importance first
feature_scores = list(enumerate(sig_features))
feature_scores.sort(key=lambda pair: pair[-1], reverse=True)

# Column names in the same descending-importance order. The positions index
# df.columns, which holds the predictors in X_features order followed by the
# target column.
ranked_positions = [position for position, _ in feature_scores]
forest_features = df.columns[ranked_positions]

In [10]:
# Also will take a while as model is being trained
def model_trainer(df,dfv,best_cols):
    """Fit a class-weighted random forest and report its validation scores.

    Reads the notebook-level names `r_state` (seed), `target` (label column
    name) and `dic` (stage label -> integer code).

    Parameters
    ----------
    df : DataFrame
        Training rows; must contain `best_cols` and the `target` column.
    dfv : DataFrame
        Validation rows with the same columns.
    best_cols : sequence of column labels
        Feature columns to train and predict on.

    Returns
    -------
    list of str
        One-element list holding the text of the classification report,
        which is also printed.
    """
    # random forest model used since it was also the best performing model on
    # multiple classes in the IT_Proj - Final notebook
    rf_clf = RandomForestClassifier(random_state=r_state,class_weight='balanced')
    rf_clf.fit(df[best_cols],df[target])

    rf_pred = rf_clf.predict(dfv[best_cols])

    # Stage names and their integer codes, both ordered by code.
    ordered_classes = sorted(dic.items(),key=lambda item:item[-1])
    reverse_lis = [name for name,_ in ordered_classes]
    class_codes = [code for _,code in ordered_classes]

    print("Random Forest Report:")
    # `labels` pins each name to its code explicitly; without it the names are
    # matched positionally to whichever classes happen to occur in the data,
    # which breaks as soon as one class is absent.
    rf_r = classification_report(dfv[target],rf_pred,labels = class_codes,target_names = reverse_lis)
    print(rf_r)
    return [rf_r]

In [26]:
# Pair every feature name with its importance, then order the pairs by the
# numeric suffix of the column name (e.g. "length_51" -> 51) instead of by score.
name_score_pairs = [(name, pair[-1]) for name, pair in zip(forest_features, feature_scores)]
scores_by_cols = sorted(name_score_pairs, key=lambda pair: int(pair[0].split('_')[-1]))

colnames = [name.split('_')[-1] for name, _ in scores_by_cols]  # numeric suffixes, kept as strings
scores = [score for _, score in scores_by_cols]                 # importances in the same order

In [12]:
# Summary statistics of the feature-importance scores.
scores = np.array(scores)
iqr = np.quantile(scores, (0.25, 0.75))  # [25th percentile, 75th percentile]

median_score = np.median(scores)
mean_score = np.mean(scores)
lower_quartile, upper_quartile = iqr[0], iqr[-1]

print(f"Median Score: {median_score}")
print(f"Mean Score  : {mean_score}")
print(f"Scores at 25th percentile: {lower_quartile:.6f} and\n\t  75th percentile: {upper_quartile:.6f}")

Median Score: 0.0020006031801589165
Mean Score  : 0.002857142857142857
Scores at 25th percentile: 0.001502 and
	  75th percentile: 0.002928


In [13]:
# Number of features whose importance is at or above each summary statistic.
above_25th = int(np.count_nonzero(scores >= iqr[0]))
above_median = int(np.count_nonzero(scores >= np.median(scores)))
above_mean = int(np.count_nonzero(scores >= np.mean(scores)))
above_75th = int(np.count_nonzero(scores >= iqr[1]))

above_p = [
    ('25th percentile', above_25th),
    ('median', above_median),
    ('mean', above_mean),
    ('75th percentile', above_75th),
]
summary_lines = [f'above the {label:<16}: ' + str(count) for label, count in above_p]
print("Number of Features with scores\n" + '\n'.join(summary_lines))
# Aim: look at the top n-ranked features, with n ranging from 90 to 275,
# as a basis of comparison against using all 350 features.

Number of Features with scores
above the 25th percentile : 262
above the median          : 175
above the mean            : 89
above the 75th percentile : 88


In [14]:
# Train on the top-n features for several n and collect the scores printed in
# each classification report.
# d maps "<n> - <model name>" to a list of floats: the last "x.xx" number found
# on every report line that contains one.
d={}
no_of_features = [90,100,125,150,175,225,275,350]
model_names = ['Random Forest']  # one name per report returned by model_trainer
for n_top in no_of_features:
    print(f"Top {n_top} features")
    best_cols = forest_features[:n_top]
    reports = model_trainer(df,dfv,best_cols)
    for model_name,report in zip(model_names,reports):
        # Raw string: "\d" inside a normal string literal is an invalid escape
        # sequence and triggers a warning on newer Python versions.
        matches_per_line = [re.findall(r"\d\.\d\d",line) for line in report.strip().split('\n')]
        d[f"{n_top} - {model_name}"] = [float(found[-1]) for found in matches_per_line if found]

Top 90 features


Random Forest Report:
                        precision    recall  f1-score   support

               healthy       0.00      0.00      0.00        21
screening stage cancer       0.76      0.91      0.83       172
    early stage cancer       0.84      0.83      0.84       273
      mid stage cancer       0.79      0.79      0.79       159
     late stage cancer       0.95      0.87      0.91       143

              accuracy                           0.82       768
             macro avg       0.67      0.68      0.67       768
          weighted avg       0.81      0.82      0.81       768

Top 100 features
Random Forest Report:
                        precision    recall  f1-score   support

               healthy       0.00      0.00      0.00        21
screening stage cancer       0.77      0.88      0.82       172
    early stage cancer       0.84      0.84      0.84       273
      mid stage cancer       0.78      0.79      0.78       159
     late stage cancer       0.95      

In [15]:
# Keep every column (in descending-importance order): per the author's
# comparison of the feature subsets, using all columns gave the highest score
# for each metric across all classes.
best_cols = forest_features

In [16]:
# Stage names ordered by their integer code, used as target_names in the reports.
reverse_lis = sorted(dic, key=dic.get)

In [17]:
# Narrowed-down candidate values for each hyper-parameter
# (2 * 4 * 3 * 3 * 2 * 3 = 432 combinations).
refined_params = {
    'bootstrap': [True, False],
    'max_depth': [20, 50, 100, None],
    'max_leaf_nodes': [2, 5, None],
    'min_samples_split': [2, 3, 5],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200],
}

# Shuffled, stratified 5-fold cross-validation with the shared seed.
grid_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=r_state)
grid_estimator = RandomForestClassifier(random_state=r_state, class_weight='balanced')

# Grid Search CV evaluates ALL combinations of the narrowed-down parameters
# to give an optimised result; n_jobs=-1 runs the fits in parallel.
rf_grid = GridSearchCV(
    estimator=grid_estimator,
    param_grid=refined_params,
    cv=grid_cv,
    n_jobs=-1,
)

In [18]:
# Baseline: class-weighted random forest with default hyper-parameters,
# trained on the selected columns and scored on the validation set.
rf_b = RandomForestClassifier(class_weight='balanced', random_state=r_state)
rf_b.fit(df[best_cols], df[target])
y_pred_base = rf_b.predict(dfv[best_cols])

baseline_report = classification_report(dfv[target], y_pred_base, target_names=reverse_lis)
print(baseline_report)
# F1-scores for early, mid and late stage cancer are comparable to the primary
# model in the final notebook, but the results for the screening stage and the
# healthy class are not as good; precision and recall for the healthy class are
# especially poor compared with the final model.

                        precision    recall  f1-score   support

               healthy       0.25      0.05      0.08        21
screening stage cancer       0.77      0.91      0.83       172
    early stage cancer       0.87      0.85      0.86       273
      mid stage cancer       0.82      0.82      0.82       159
     late stage cancer       0.95      0.87      0.91       143

              accuracy                           0.84       768
             macro avg       0.73      0.70      0.70       768
          weighted avg       0.83      0.84      0.83       768



In [19]:
# The exhaustive search takes about 30 min - 1 hour, so it is skipped by
# default. Set RUN_GRID_SEARCH to True to repeat it; do take note of the time
# required.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    rf_grid.fit(df[best_cols],df[target])
    print(rf_grid.best_params_)

### Observed Output ###
# {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_leaf_nodes': None, 'min_samples_split': 3, 'n_estimators': 200}

{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_leaf_nodes': None, 'min_samples_split': 3, 'n_estimators': 200}


In [20]:
# Random forest refitted with the best parameters reported by the grid search.
tuned_params = {
    'bootstrap': False,
    'criterion': 'entropy',
    'max_depth': 20,
    'max_leaf_nodes': None,
    'min_samples_split': 3,
    'n_estimators': 200,
}
rf_h = RandomForestClassifier(random_state=r_state, class_weight='balanced', **tuned_params)
rf_h.fit(df[best_cols], df[target])
y_pred_hyper = rf_h.predict(dfv[best_cols])

tuned_report = classification_report(dfv[target], y_pred_hyper, target_names=reverse_lis)
print(tuned_report)
# On the validation set the tuned model does worse than the baseline on
# precision, most visibly for the healthy class and the macro average.

                        precision    recall  f1-score   support

               healthy       0.12      0.05      0.07        21
screening stage cancer       0.78      0.90      0.84       172
    early stage cancer       0.88      0.88      0.88       273
      mid stage cancer       0.82      0.83      0.83       159
     late stage cancer       0.95      0.87      0.91       143

              accuracy                           0.85       768
             macro avg       0.71      0.70      0.70       768
          weighted avg       0.84      0.85      0.84       768

