In [72]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/cpan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
# Folder that holds the page-classification CSV splits.
# NOTE(review): this is an absolute, user-specific path; consider a configurable data directory.
input_dir = '/Users/cpan/OneDrive - Guardant Health/page_classification_data/'

# Read the train / validation / test document tables from input_dir.
train_data, valid_data, test_data = (
    pd.read_csv(input_dir + csv_name)
    for csv_name in (
        'train_docs_hir_filter.csv',
        'valid_docs_hir_filter.csv',
        'test_docs_hir_filter.csv',
    )
)

In [25]:
# Load the saved BERT feature arrays for the train / validation / test splits.
train_features, val_features, test_features = (
    np.load(input_dir + npy_name)
    for npy_name in (
        'bert_train_features.npy',
        'bert_val_features.npy',
        'bert_test_features.npy',
    )
)

In [26]:
# Pair each feature matrix with the 'label' column of its matching table.
X_train, y_train = train_features, train_data['label']
X_val, y_val = val_features, valid_data['label']

In [27]:
# Test-split features and the 'label' column of the test table.
X_test, y_test = test_features, test_data['label']

In [28]:
# Oversample the minority class of the training split with SMOTE.
# random_state is pinned so the synthetic samples (and everything trained on
# them) are reproducible across kernel restarts.
# NOTE(review): resampling happens before the cross-validation further down, so
# synthetic points interpolated from a sample can land in a different fold than
# that sample; wrapping SMOTE and the classifier in an imblearn Pipeline would
# confine resampling to each training fold.
smote = SMOTE(sampling_strategy=0.5, k_neighbors=5, random_state=42)  # Adjust k_neighbors as needed
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [49]:
# Standardize features: statistics are learned from the resampled training
# matrix only, then the same fitted scaler is applied to the other splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled, X_test_scaled = (scaler.transform(split) for split in (X_val, X_test))

In [30]:
# Domain-specific terms dropped in addition to the NLTK English stop words.
_CUSTOM_STOP_WORDS = ("redacted", "redactedredacted", 'is', 'to', 'of', 'with', 'in', 'no')

# Combined stop-word set; built on first use and reused by later calls.
_STOP_WORDS = None


def _get_stop_words():
    """Return the NLTK English stop words plus the custom terms, building the set once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english')) | frozenset(_CUSTOM_STOP_WORDS)
    return _STOP_WORDS


def clean_text(text):
    """Normalize a page of text into a space-separated string of content words.

    Steps: lowercase, replace runs of non-word characters and runs of digits
    with a space, then keep only tokens that are longer than two characters
    and are not stop words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or a NaN from an empty CSV
        field) is treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; '' if nothing survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Replace special characters and digits with spaces.
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    stop_words = _get_stop_words()

    # split() already discards leading/trailing/repeated whitespace, so no
    # separate whitespace-collapsing pass is needed.
    # Drop tokens of length 2 or less as well as stop words.
    return ' '.join(
        word for word in text.split()
        if len(word) > 2 and word not in stop_words
    )

In [58]:
# # grid search on svm
# def grid_searchcv(classifier, parameters, X_train, y_train):
#     clf = GridSearchCV(classifier, parameters, scoring='f1_micro', cv = 5)
#     clf.fit(X_train, y_train)
#     print("best parameters:", clf.best_params_)
#     print("best score:", clf.best_score_)
#     return clf
  
# def f1_score_each_category(x_test, y_test, model):
#     pred = model.predict(x_test)
#     print(classification_report(y_test, pred))

# def train_model( model, parameters, x_train, y_train):
#     best_model  = grid_searchcv(model, parameters, x_train, y_train)
#     return best_model

# def train_svm(x_train, y_train,parameters):
#     best_svm = train_model(SVC(), parameters, x_train, y_train)
#     return best_svm

In [52]:
# svm_parameters = [{'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}]
# best_svm = train_svm(X_train_scaled, y_train_resampled, svm_parameters)
# f1_score_each_category(X_val_scaled, y_val, best_svm)

In [59]:
# nested cross validation to reduce overfitting 
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score, make_scorer

# Scorer wrapping f1_score with micro averaging, used for model selection below.
f1_micro = make_scorer(f1_score, average='micro')

# Both cross-validation levels use the same stratified, shuffled 5-fold setup.
_cv_settings = {'n_splits': 5, 'shuffle': True, 'random_state': 42}

outer_cv = StratifiedKFold(**_cv_settings)  # folds for the final evaluation
inner_cv = StratifiedKFold(**_cv_settings)  # folds for hyperparameter tuning

def nested_cross_val(classifier, parameters, X, y, outer_cv, inner_cv):
    """Estimate generalization with nested cross-validation, then pick hyperparameters.

    The grid search (tuned on `inner_cv`) is itself the estimator handed to
    `cross_val_score`, so hyperparameters are re-selected inside every outer
    training fold and each outer test fold is never seen during tuning.
    Scoring the already-tuned `best_estimator_` on the same data instead would
    let the tuning step see every outer test fold and inflate the scores.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator to tune.
    parameters : dict or list of dict
        Parameter grid passed to GridSearchCV.
    X, y : array-like
        Features and labels used for both evaluation and the final search.
    outer_cv, inner_cv : cross-validation splitter
        Splitters for the evaluation loop and the tuning loop respectively.

    Returns
    -------
    dict
        `best_params_` of a grid search fitted on all of (X, y).

    Both loops score with the module-level `f1_micro` scorer; the per-fold
    outer scores and their mean are printed.
    """
    clf = GridSearchCV(classifier, parameters, scoring=f1_micro, cv=inner_cv)

    # Outer loop: cross_val_score clones and refits the whole grid search per fold.
    nested_scores = cross_val_score(clf, X, y, cv=outer_cv, scoring=f1_micro)
    print("Nested Cross-Validation F1 Scores:", nested_scores)
    print("Mean F1 Score:", nested_scores.mean())

    # Final hyperparameter selection uses all the data passed in.
    clf.fit(X, y)
    return clf.best_params_

# Hyperparameters to search
svm_parameters = [{'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}]

# Cross-validate and select hyperparameters on the scaled, resampled training data
best_params = nested_cross_val(SVC(), svm_parameters, X_train_scaled, y_train_resampled, outer_cv, inner_cv)

# Refit an SVM with the selected hyperparameters on the entire training data
best_svm = SVC(**best_params)
best_svm.fit(X_train_scaled, y_train_resampled)

# Per-class precision / recall / F1 on the validation set.
# classification_report is called directly because the f1_score_each_category
# helper only exists in a commented-out cell and is undefined on a fresh kernel.
print(classification_report(y_val, best_svm.predict(X_val_scaled)))

Nested Cross-Validation F1 Scores: [0.99826389 0.99652778 0.99826087 0.99304348 0.99826087]
Mean F1 Score: 0.9968713768115942
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       411
           1       0.00      0.00      0.00         8

    accuracy                           0.98       419
   macro avg       0.49      0.50      0.49       419
weighted avg       0.96      0.98      0.97       419



In [60]:
# Show the hyperparameters returned by nested_cross_val.
print(best_params)

{'C': 10, 'kernel': 'rbf'}


In [61]:
# Rebind the test variables for evaluation: X_test now refers to the
# standardized test features and y_test to the 'label' column of test_data.
# NOTE(review): this overwrites the X_test that was assigned from test_features,
# so re-running the scaling cell after this one would apply scaler.transform to
# already-scaled data.
X_test = X_test_scaled
y_test = test_data['label']

In [62]:
# Predict test-set labels with the fitted SVM from the standardized BERT features.
# X_test_scaled is referenced directly so the result does not depend on X_test
# having been rebound to the scaled array by a previously executed cell.
y_pred = best_svm.predict(X_test_scaled)

In [63]:
from sklearn.metrics import precision_recall_fscore_support

# Binary-averaged precision / recall / F1 of the test predictions against y_test.
# The F1 value is bound to `f1`, not `f1_score`: reusing that name would replace
# sklearn's f1_score function with a float and break make_scorer(f1_score, ...)
# if the scorer cell is run again in the same kernel.
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 0.0
Recall: 0.0
F1-Score: 0.0


In [64]:
# Keep the identifying columns of every test page and attach the predicted label.
predictions_df = test_data[['file_name', 'patient_id', 'page_content']].assign(is_test=y_pred)

In [65]:
# Print the (rows, columns) of the predictions table, then display how many
# pages were assigned each predicted label.
print(predictions_df.shape)
predictions_df['is_test'].value_counts()

(419, 4)


0    417
1      2
Name: is_test, dtype: int64

In [41]:
# predictions_df.to_csv(input_dir+'predicted_test_results_svm.csv', index=False)

In [42]:
# print(input_dir)

In [36]:
pwd

'/Users/cpan/Desktop/project/notebooks'

In [36]:
# # save model to use later 
# import joblib

# # Save the trained SVM model
# model_filename = 'best_svm_model.pkl'
# joblib.dump(best_svm, model_filename)

# print("Model saved to", model_filename)


In [31]:
# Load the trained SVM model from the file
# loaded_model = joblib.load(model_filename)

### Error Analysis

In [66]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true vs. predicted test labels, printed under a heading.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Positions in the test set where the prediction disagrees with the true label.
misclassified_indices = np.where(y_test != y_pred)[0]

# Show the first few misclassified pages for manual inspection.
num_examples_to_print = 5
print("\nMisclassified Examples:")
for i, index in enumerate(misclassified_indices[:num_examples_to_print]):
    print(f"Example {i+1}:")
    print("True Label:", y_test.iloc[index])
    print("Predicted Label:", y_pred[index])
    print("Text:")
    print(test_data['page_content'].iloc[index])
    print("="*50)

# Count how often each cleaned word occurs across the misclassified pages.
misclassified_text = test_data['page_content'].iloc[misclassified_indices]
misclassified_text_clean = [clean_text(text) for text in misclassified_text]
common_words = {}
for text in misclassified_text_clean:
    for word in text.split():
        common_words[word] = common_words.get(word, 0) + 1

# Print the ten most frequent words, highest count first.
print("Common Words in Misclassified Examples:")
sorted_common_words = sorted(common_words.items(), key=lambda x: x[1], reverse=True)
for word, count in sorted_common_words[:10]:
    print(f"{word}: {count}")

Confusion Matrix:
[[409   2]
 [  8   0]]

Misclassified Examples:
Example 1:
True Label: 1
Predicted Label: 0
Text:
to: 18772418203 page 33 of 46 2020-08-17 17:06:47 (gmt) 5055917000 from: 5055917000 confidential imt 53799932 myrisk management tool name: [**REDACTED**], [**REDACTED**] oct 24, 1955 accession #: 03630003-bld report date: may 21, 2020 * whatis meant by “high risk" and "elevated risk"? in the genetic test result summary, a gene-associated cancer risk is described as "high risk" for a cancer type if all of the following conditions are met: the absolute risk of cancer is approximately 5% or higher, the increase in risk over the general population is approximately 3-fold or higher, and thers is significant data from multiple studies supporting the cancer risk estimate. a gene is described as "elevated risk" for a cancer type if there is sufficient data to support an increase in cancer risk over the general population risk, but not all criteria far "high risk" are met. informa

In [67]:
# Number of test pages whose predicted label differs from the true label.
print(len(misclassified_indices))

10


The misclassifications might be due to the fact that these pages are test results, but they are Guardant test results rather than competitors' test results.

In [68]:
import re

# Clean the text of every misclassified page.
misclassified_text = test_data['page_content'].iloc[misclassified_indices]
misclassified_text_clean = [clean_text(page) for page in misclassified_text]

# Count the pages whose cleaned text contains "guardant" as a whole word
# (the \b anchors exclude longer words that merely contain it).
guardant_count = sum(
    1
    for cleaned in misclassified_text_clean
    if re.search(r'\bguardant\b', cleaned) is not None
)

# Report how many misclassified pages mention "guardant".
print("Number of Misclassified Examples Containing 'guardant':", guardant_count)


Number of Misclassified Examples Containing 'guardant': 1


### Data Filtering
- Because 3 out of 7 misclassified text files contain the word "guardant", the model might be misclassifying Guardant test results as competitors' test results. We can therefore add a filtering step that resets a positive prediction to negative when the page contains only Guardant test results and no competitor's test results. (Note: these counts appear to come from an earlier run; the outputs above report 10 misclassified pages, 1 of which contains "guardant".)

In [69]:
# Preview the first rows of the predictions table.
predictions_df.head()

Unnamed: 0,file_name,patient_id,page_content,is_test
0,A0428547_MedRec1/image-009.txt,A0428547,moffitt cancer 4/28/2022 2:98:33 pm [**REDACTE...,0
1,A0129768/image-046.txt,A0129768,foundationone'cdx q orfy g59959 patient pfelff...,0
2,A0329908/image-009.txt,A0329908,"[**REDACTED**], [**REDACTED**] (a0329908) dob:...",0
3,A0428569_MedRec1/image-022.txt,A0428569,"to: +17184924575 tan, cuilian (a0428569} eimg ...",0
4,A0596283_MedRec1/image-011.txt,A0596283,12/19/2622 @4:51pm 5612892134 hematology -onco...,0


In [70]:
# Print the (rows, columns) of the predictions table before filtering.
print(predictions_df.shape)

(419, 4)


In [71]:
import re

# Competitor test names that are matched case-insensitively.
# 'Genestrat' and 'InVisionFirst-Lung' are separate entries; fused into one
# comma-joined string they could only match that exact combined phrase.
company_test_list = ['FoundationOne Liquid', 'Plasma Focus', 'Liquid Hallmark', 'Genestrat', 'InVisionFirst-Lung',
                     'FoundationOne CDx', 'Caris Molecular Intelligence', 'OncoExtra', 'Altera', 'Invitae Cancer Screen', 'MyRisk']
# Short competitor test names that are matched case-sensitively.
# NOTE(review): the page_content samples displayed in this notebook look
# lower-cased; if the text is lower-cased upstream these names can never
# match. Confirm against the raw data.
not_lower_company_test_list = ['xF', 'xF+', 'Assure', 'xT']


def _name_pattern(names, flags=0):
    """Compile a regex that matches any of `names` as a standalone token.

    The lookarounds require that the name is not glued to other word
    characters, so e.g. 'Altera' does not match inside 'alterations'.
    """
    alternatives = '|'.join(re.escape(name) for name in sorted(names, key=len, reverse=True))
    return re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)', flags)


# Partial, case-insensitive match for "guardant".
guardant_pattern = re.compile(r'guardant', re.IGNORECASE)
competitor_pattern_ci = _name_pattern(company_test_list, re.IGNORECASE)
competitor_pattern_cs = _name_pattern(not_lower_company_test_list)

# Work on a copy so the unfiltered predictions stay available for comparison.
filtered_predictions_df = predictions_df.copy()

for index, row in filtered_predictions_df.iterrows():
    text_content = row['page_content']

    # Only positive predictions with usable text can be demoted.
    if row['is_test'] != 1 or not isinstance(text_content, str):
        continue
    if not guardant_pattern.search(text_content):
        continue

    # A page keeps its positive label if it names a competitor test from
    # EITHER list; it is reset to 0 only when it mentions Guardant and no
    # competitor test at all.
    mentions_competitor = (
        competitor_pattern_cs.search(text_content) is not None
        or competitor_pattern_ci.search(text_content) is not None
    )
    if not mentions_competitor:
        filtered_predictions_df.at[index, 'is_test'] = 0

# Binary-averaged precision / recall / F1 of the filtered predictions against y_test.
# The F1 value is bound to `f1`, not `f1_score`, so sklearn's f1_score function
# is not overwritten with a float in the kernel namespace.
precision, recall, f1, _ = precision_recall_fscore_support(y_test, filtered_predictions_df['is_test'], average='binary')

print("Precision after filtering:", precision)
print("Recall after filtering:", recall)
print("F1-Score after filtering:", f1)


Precision after filtering: 0.0
Recall after filtering: 0.0
F1-Score after filtering: 0.0
