In [None]:
""""
 test_synoptic_report_classifier

 By Chris Fong - MSKCC 2021


"""
import os
import sys
sys.path.insert(0, '/mind_data/fongc2/pathology_report_segmentation/')
import pandas as pd
import numpy as np
import constants_darwin_pathology as c_dar
from utils_pathology import set_debug_console
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from pathology_synoptic_logistic_model import SynopticReportClassifier
%config Completer.use_jedi = False

In [None]:
# Console settings
# set_debug_console is imported from utils_pathology; presumably it adjusts
# pandas/console display options for interactive work -- TODO confirm in that module.
set_debug_console()

In [None]:
# Output location for the synoptic classification table (names come from the
# project constants module).
pathname = c_dar.pathname
fname_save = c_dar.fname_path_synoptic
pathfilename_save = os.path.join(pathname, fname_save)

# Label column and the feature columns fed to the classifier.
col_syn = 'IS_SYNOPTIC'
cols_feat = [
    'FEATURE1',
    'FEATURE2',
    'FEATURE3',
]

In [None]:
## Load data
# Parsed specimen-level pathology table: one row per (accession, specimen).
fname = c_dar.fname_darwin_path_clean_parsed_specimen
pathfilename_data = os.path.join(pathname, fname)
df_path_long = pd.read_csv(
    pathfilename_data,
    sep=',',
    header=0,
    low_memory=False,
)

In [None]:
# Load the manually curated synoptic labels.
fname = c_dar.fname_path_synoptic_labels
pathfilename_labels = os.path.join(pathname, fname)

# Build the label table in one chained expression so no column is assigned on a
# filtered slice (which triggers SettingWithCopyWarning) and nothing is mutated
# in place:
#   * keep only rows that actually carry a label,
#   * store the label as an integer,
#   * drop the free-text description (it is re-joined from df_path_long later).
df_path_labels = (
    pd.read_csv(pathfilename_labels, header=0, low_memory=False, sep=',')
    .dropna(subset=[col_syn])
    .astype({col_syn: int})
    .drop(columns=['PATH_DX_SPEC_DESC'])
)

In [None]:
# Preview the first rows of the cleaned label table.
df_path_labels.head(5)

In [None]:
# ------------------------------------------------------------------------------------

# Create features
# Three simple text-shape features are derived from each specimen description:
#   FEATURE1 - number of '- ' occurrences
#   FEATURE2 - number of ':' occurrences
#   FEATURE3 - length of the description in characters
# The manual labels are then left-joined on (accession, specimen number), so
# unlabeled specimens keep a null label.
# NOTE: this rebinds df_path_long, so re-running the cell without reloading the
# data merges the labels a second time.
spec_desc = df_path_long['PATH_DX_SPEC_DESC']
df_path_long = df_path_long.assign(
    FEATURE1=spec_desc.str.count('- '),
    FEATURE2=spec_desc.str.count(':'),
    FEATURE3=spec_desc.str.len(),
).merge(
    right=df_path_labels,
    how='left',
    on=['ACCESSION_NUMBER', 'PATH_DX_SPEC_NUM'],
)

# Working frame: keys, label and features only.
df = df_path_long[['ACCESSION_NUMBER', 'PATH_DX_SPEC_NUM', col_syn] + cols_feat]

In [None]:
## Create df for training features and labels
# A row is usable only when every feature column is populated. `.all(axis=1)`
# stays correct if cols_feat grows or shrinks (the previous check compared the
# non-null count against a hard-coded 3).
logic_keep = df[cols_feat].notnull().all(axis=1)
logic_labeled = df[col_syn].notnull()

# Labeled rows with complete features; the index is reset to 0..n-1 so the
# positional indices produced by KFold line up with the row labels.
df_training = (
    df[logic_keep & logic_labeled]
    .reset_index(drop=True)
    .assign(IS_PREDICTION=False)
)
df_training_features = df_training[cols_feat]
df_training_labels = df_training[col_syn]

# Z-scored copy of the features. NOTE(review): nothing else in the visible
# notebook reads data_norm -- the models are fit on the raw features.
data_norm = (df_training_features - df_training_features.mean(axis=0))/df_training_features.std(axis=0)

In [None]:
# Perform cross validation by taking all feature data and separating into training and testing data
#
# For each of the `folds` KFold splits a logistic regression is fit on the
# training part and evaluated on the held-out part. Collected per fold:
#   scores[i]        - accuracy
#   precisions[i]    - support-weighted precision
#   precision_all[i] - per-class precision, ordered as class_ids
# Accumulated over all folds:
#   Cmat   - summed confusion matrix (rows = actual, columns = predicted)
#   Cknown - summed number of actual samples per class (column vector)
# After the loop, df_wrong_cases_f holds every misclassified specimen joined
# with its label-table row. X_test, y_test and logisticRegr keep the state of
# the LAST fold and are read by the ROC cell.

# Init cross validation variables
folds = 10
num_classes = 2
# Fixed class order so every fold yields a (num_classes x num_classes) matrix
# and a length-num_classes precision vector, even if a fold lacks one class.
class_ids = list(range(num_classes))
key_cols = ['ACCESSION_NUMBER', 'PATH_DX_SPEC_NUM']

logisticRegr = LogisticRegression(solver='lbfgs')
kf = KFold(n_splits=folds)

Cknown = np.zeros(shape=(num_classes, 1))
Cmat = np.zeros(shape=(num_classes, num_classes))

scores = [None] * folds
precisions = [None] * folds
precision_all = [None] * folds
fold_comparisons = []  # one actual-vs-predicted frame per fold

for i, (train_index, test_index) in enumerate(kf.split(df_training_features)):
    # KFold yields positional indices, so select rows by position.
    X_train, X_test = df_training_features.iloc[train_index], df_training_features.iloc[test_index]
    y_train, y_test = df_training_labels.iloc[train_index], df_training_labels.iloc[test_index]

    # Perform classification
    logisticRegr.fit(X_train, y_train)
    y_pred = logisticRegr.predict(X_test)
    y_actual = np.array(y_test)

    # Actual vs predicted label per held-out specimen, keyed by its identifiers.
    df_pred_comp = df_training.loc[X_test.index, key_cols].assign(Actual=y_actual, Predicted=y_pred)
    df_pred_comp = df_pred_comp.assign(Incorrect=df_pred_comp['Actual'] != df_pred_comp['Predicted'])
    fold_comparisons.append(df_pred_comp)

    C = confusion_matrix(y_actual, y_pred, labels=class_ids)

    # Accuracy: fraction of held-out samples whose predicted label matches.
    scores[i] = accuracy_score(y_true=y_test, y_pred=y_pred, normalize=True)
    # Precision is tp / (tp + fp); weighted = averaged over classes by support.
    precisions[i] = precision_score(y_actual, y_pred, average='weighted')
    precision_all[i] = precision_score(y_actual, y_pred, labels=class_ids, average=None)

    # Row sums of C are the number of actual samples per class in this fold.
    Cknown += np.sum(C, axis=1).reshape(num_classes, 1)
    Cmat = Cmat + C

# Gather the misclassified specimens once, after the loop. Concatenating all
# fold frames first means this also works when no fold has any error (the
# result is simply an empty frame instead of an undefined variable).
df_wrong_cases_f = pd.concat(fold_comparisons, axis=0)
df_wrong_cases_f = df_wrong_cases_f[df_wrong_cases_f['Incorrect']].drop_duplicates()
df_wrong_cases_f = df_path_labels.merge(right=df_wrong_cases_f, how='right', on=key_cols)
df_wrong_cases_f;

In [None]:
# Confusion matrix heatmap
# Normalizes the summed cross-validation confusion matrix by the number of
# actual samples per class (each row sums to 1) and plots it.
label_names = ['Synoptic', 'Not Synoptic']
# sklearn orders classes by sorted label value, i.e. 0 first and 1 second.
# The PCA cell maps 1 -> label_names[0] ('Synoptic') and 0 -> label_names[1]
# ('Not Synoptic'), so the names must be applied in reversed order here;
# using label_names directly would swap the row/column captions.
class_labels = [label_names[1], label_names[0]]

score_avg = np.mean(scores)
precision_avg = np.mean(precisions)
precision_organ = pd.DataFrame(precision_all, columns=class_labels)

# Broadcast the per-class totals across columns so each row is divided by its
# own number of actual samples.
Cknown1 = np.tile(Cknown, num_classes)
Cmat1 = np.divide(Cmat, Cknown1)
Cmat1_d = pd.DataFrame(
    Cmat1,
    columns=['Predicted ' + x for x in class_labels],
    index=['True ' + x for x in class_labels],
)

# create heatmap (seaborn sets the tick labels from the frame's index/columns)
fig, ax = plt.subplots()
sns.heatmap(Cmat1_d, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.tight_layout()

In [None]:
# Show both the summed confusion matrix and the per-class totals used to
# normalize it. A bare expression is only rendered when it is the last line of
# a cell, so both are passed to display() explicitly.
display(Cmat, Cknown1)

In [None]:
# ROC curve
# NOTE: X_test, y_test and logisticRegr are whatever the cross-validation loop
# left behind, i.e. this evaluates the model and hold-out set of the last fold
# only (when the notebook is run top to bottom).
#
# The area under the curve must be computed from the same continuous scores
# that draw the curve. Feeding hard 0/1 predictions to roc_auc_score collapses
# the curve to a single operating point and reports a value that does not
# match the plotted line.
y_score = logisticRegr.predict_proba(X_test)[:, 1]  # probability of class 1
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Compute PCA for visualization
# Project the training features onto their first two principal components and
# attach a readable class name (1 -> 'Synoptic', 0 -> 'Not Synoptic').
pc_names = ['principal component 1', 'principal component 2']
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(df_training_features)
principalDf = (
    pd.DataFrame(data=principalComponents, columns=pc_names)
    .assign(IS_SYNOPTIC=df_training_labels)
    .replace(to_replace={col_syn: {1: label_names[0], 0: label_names[1]}})
)

# Plot PCA results
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA for Synoptic Report Classification', fontsize=14)

# One scatter layer per class, in label_names order so the legend lines up.
colors = ['g', 'b']
for target, color in zip(label_names, colors):
    class_points = principalDf.loc[principalDf[col_syn] == target]
    ax.scatter(class_points[pc_names[0]], class_points[pc_names[1]], c=color, s=50)
ax.legend(label_names)
ax.grid()

---
## Create full model from all labels and predict on unlabeled data

In [None]:
# Build the prediction set: specimens with complete features but no manual label.
logic_not_labeled = df[col_syn].isna()
df_validation = df.loc[logic_keep & logic_not_labeled].copy()
x_validation = df_validation[cols_feat]

# Refit the logistic regression on every labeled specimen.
# All parameters not specified are left at their defaults.
logisticRegr = LogisticRegression(solver='lbfgs')
logisticRegr.fit(df_training_features, df_training_labels)

In [None]:
# Preview the training features next to the features of the unlabeled set.
# Only the last bare expression of a cell is rendered, so both previews are
# passed to display() explicitly.
display(df_training_features.head(), x_validation.head())

In [None]:
# Predict a synoptic label for every unlabeled specimen and flag those rows as
# model predictions (as opposed to manual labels).
predicted = logisticRegr.predict(x_validation)
df_validation = df_validation.assign(IS_SYNOPTIC=predicted, IS_PREDICTION=True)

pred_counts = df_validation['IS_SYNOPTIC'].value_counts()

# Fraction (0-1) of reports predicted to be synoptic. Computed as the mean of a
# boolean mask so it evaluates to 0.0 -- instead of raising IndexError -- when
# no report is predicted synoptic.
pct_pred_syn = df_validation['IS_SYNOPTIC'].eq(1).mean()
pct_pred_syn

In [None]:



# Columns of the combined label table. `col_keep` was not defined anywhere in
# this notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): the selection below (keys, label, prediction flag) is inferred
# from how df_label_synoptic is used in the following cells -- confirm it
# matches the intended output schema.
col_keep = ['ACCESSION_NUMBER', 'PATH_DX_SPEC_NUM', col_syn, 'IS_PREDICTION']

# Stack manually labeled rows (IS_PREDICTION=False) on top of model-predicted
# rows (IS_PREDICTION=True).
df_label_synoptic = pd.concat([df_training, df_validation], axis=0, sort=False)[col_keep].reset_index(drop=True)
df_label_synoptic

In [None]:
# Number of manually labeled rows (False) vs model-predicted rows (True) in the combined table.
df_label_synoptic['IS_PREDICTION'].value_counts()

In [None]:
# Class balance of the combined table (manual labels and predictions together).
df_label_synoptic['IS_SYNOPTIC'].value_counts()

In [None]:
# Run the packaged classifier on the same input files and summarize what it
# returns, for comparison with the step-by-step results above.
obj_syn = SynopticReportClassifier(
    fname_parsed_spec=pathfilename_data,
    fname_synoptic_labels=pathfilename_labels,
    fname_save=pathfilename_save,
)
df_results = obj_syn.return_synoptic()

# Class balance, then manual-vs-predicted split, of the returned table.
for summary_col in ('IS_SYNOPTIC', 'IS_PREDICTION'):
    print(df_results[summary_col].value_counts())
df_results.head()