In [239]:
import pickle
import pandas as pd
from sklearn.metrics import classification_report

In [240]:
with open(r".\CancerStagersPrimaryModel.pkl",'rb') as f:
    pri_model = pickle.load(f) # load the saved primary model
with open(r".\CancerStagersSecondaryModel.pkl",'rb') as f:
    sec_model = pickle.load(f) # load the saved secondary model

In [246]:
pri_model # view of the primary model (with hyperparameters)

In [247]:
sec_model # view of the secondary model (with hyperparameters)

In [241]:
def df_test(dataframe: pd.DataFrame): # takes in a pandas dataframe 
    if all(x in dataframe.columns for x in pri_model.feature_names_in_)==False:
        raise Exception(f"""Missing Mandatory Columns Detected. 
Please ensure your dataframe has the required columns:\n\n{list(pri_model.feature_names_in_)}""")
    # checks that the mandatory columns are present in the dataframe 
    # (both primary and secondary models use the same features)
    elif (any(dataframe[pri_model.feature_names_in_].isna().sum()>0)): 
        # checks if any of the mandatory columns (as found in model.feature_names_in_) has a missing value
        missing_cols = (list((dataframe[pri_model.feature_names_in_].isna().sum()>0).index))
        raise Exception(f"Missing Data Detected in Columns {missing_cols}")
    else:
        # if above checks are fulfilled, continue with code
        dataframe = dataframe.reset_index().drop(columns='index') 
        # reindex the dataframe for ease of reference for row indices
        y_pred_pri = list(pri_model.predict(dataframe[pri_model.feature_names_in_]))
        # primary model predicts into 4 classes > healthy/screening, early, mid, late
        y_1 = [i for i,e in enumerate(y_pred_pri) if e==1] # 1st class (healthy/screening) selected
        dataframe2 = dataframe.loc[dataframe.index.isin(y_1)].copy() 
        # indexes are used to lookup those rows in the dataset
        y_pred_sec = list(sec_model.predict(dataframe2[sec_model.feature_names_in_])) # secondary model used 
        # to reclassify those data points more specifically into healthy/ screening stage
        y_pred = []
        for e in y_pred_pri:
            if e!=1:
                y_pred.append(e)
            else:
                y_pred.append(y_pred_sec.pop(0)) # final results are added to y_pred list in order
        dic = {"healthy":0,"screening stage cancer":1,"early stage cancer":2,"mid stage cancer":3,"late stage cancer":4}
        reverse_lis = [k for k,_ in sorted(dic.items(),key=lambda x:x[-1])]
        r_dic = {k:v for k,v in enumerate(reverse_lis)}
        target = 'class_label'
        df=None
        if target in dataframe.columns:
        # if there is an actual class_label column present, can compare actual data with predicted data
            df = pd.DataFrame({"Actual":dataframe[target].values,
                    "Predicted":y_pred})
            for e in df.columns[1:]:
                df[e]=df[e].map(r_dic) 
                # numeric labels (0,1,2,3,4) converted to meaningful information
                # (healthy, screening stage , early stage, mid stage, late stage)
            df['Match']=df['Actual']==df['Predicted']
            print(f"Percentage of matches between prediction and actual case: {df['Match'].sum()/df['Match'].__len__()*100}")
            # expresses the number of matches as a percentage of total number of results (rough gauge of overall accuracy)
            print(df)
            # displays the information
        y_pred = pd.Series(list(map(lambda x:r_dic[x],y_pred)))
        y_pred.name = 'Predicted'
        return [y_pred,df]

In [242]:
test_path = r".\NUS_IT\Test_Set.csv" # default test path; can change to other csv files of a similar format
df = pd.read_csv(test_path)
df_sample = df.sample(100)

In [243]:
pd.set_option('expand_frame_repr', False)

res=df_test(df_sample)
if res[-1] is not None:
    pred,pf = res 
else:
    pred = res[0]

Percentage of matches between prediction and actual case: 71.0
                    Actual               Predicted  Match
0         mid stage cancer       late stage cancer  False
1        late stage cancer       late stage cancer   True
2       early stage cancer      early stage cancer   True
3         mid stage cancer        mid stage cancer   True
4       early stage cancer      early stage cancer   True
..                     ...                     ...    ...
95  screening stage cancer  screening stage cancer   True
96      early stage cancer       late stage cancer  False
97                 healthy  screening stage cancer  False
98        mid stage cancer       late stage cancer  False
99       late stage cancer        mid stage cancer  False

[100 rows x 3 columns]


In [244]:
print(pred)

0          late stage cancer
1          late stage cancer
2         early stage cancer
3           mid stage cancer
4         early stage cancer
               ...          
95    screening stage cancer
96         late stage cancer
97    screening stage cancer
98         late stage cancer
99          mid stage cancer
Name: Predicted, Length: 100, dtype: object


In [245]:
target = 'class_label'
if target in df.columns:
    print(classification_report(pf['Actual'],pred)) 
    # more tailored version of checking precision, recall and f1 of each category

                        precision    recall  f1-score   support

    early stage cancer       0.70      0.79      0.74        33
               healthy       1.00      0.25      0.40         4
     late stage cancer       0.75      0.86      0.80        21
      mid stage cancer       0.74      0.64      0.68        22
screening stage cancer       0.63      0.60      0.62        20

              accuracy                           0.71       100
             macro avg       0.76      0.63      0.65       100
          weighted avg       0.72      0.71      0.70       100

