In [1]:
!pip install transformers



In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler

In [3]:
input_dir = '/Users/cpan/OneDrive - Guardant Health/page_classification_data/'
train_data = pd.read_csv(input_dir + 'train_docs.csv')
valid_data = pd.read_csv(input_dir + 'valid_docs.csv')
test_data = pd.read_csv(input_dir + 'test_docs.csv')

In [4]:
train_data.head()

Unnamed: 0,file_name,patient_id,page_content,label
0,A0311306/image-022.txt,A0311306,"orsinger, veronika (a0311306) guardant 36 dob:...",0
1,A0329908/image-002.txt,A0329908,"[**REDACTED**], [**REDACTED**] (40329908) guar...",0
2,A0125350/image-000.txt,A0125350,10/29/2018 4:04 pm fax 2066861268 lifespring c...,0
3,A0311306/image-021.txt,A0311306,"orsinger, veronika (a0311306) dob: oct-12-1961...",0
4,A0400913_MedRec1/image-007.txt,A0400913,"vendice, [**REDACTED**] (a040091 3) guardant 3...",0


In [5]:
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Function to extract BERT embeddings for text data
def extract_bert_features(df):
    embeddings = []
    for text in df['page_content']:
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        avg_pooling = torch.mean(last_hidden_states, dim=1)
        embeddings.append(avg_pooling.squeeze().numpy())
    return np.vstack(embeddings)

# Extract BERT features for train, validation, and test sets
train_features = extract_bert_features(train_data)
val_features = extract_bert_features(valid_data)
test_features = extract_bert_features(test_data)

In [7]:
X_train = train_features
y_train = train_data['label']
X_val = val_features
y_val = valid_data['label']

In [8]:
# Apply oversampling to the training set since imbalanced data distribution 
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)
# not apply resamplling to validation data --> cuz it will introduce bias to the model's generalization 

In [9]:
# Initialize the logistic regression classifier with hyperparameter tuning (L1, L2, and elasticnet regularization)
log_reg_l1 = LogisticRegressionCV(Cs=10, cv=5, penalty='l1', solver='liblinear', max_iter=100, scoring='f1')
log_reg_l2 = LogisticRegressionCV(Cs=10, cv=5, penalty='l2', solver='liblinear', max_iter=100, scoring='f1')
log_reg_elasticnet = LogisticRegressionCV(Cs=10, cv=5, penalty='elasticnet', solver='saga', max_iter=100, l1_ratios=[0.5], scoring='f1')

# Perform 5-fold cross-validation with hyperparameter tuning on the combined train and validation data
cv_scores_l1 = cross_val_score(log_reg_l1, X_val, y_val, cv=5)
cv_scores_l2 = cross_val_score(log_reg_l2, X_val, y_val, cv=5)
cv_scores_elasticnet = cross_val_score(log_reg_elasticnet, X_val, y_val, cv=5)

# Print cross-validation scores and mean F1 score
print("Cross-Validation F1 Scores (L1):", cv_scores_l1)
print("Mean F1 Score (L1):", np.mean(cv_scores_l1))

print("Cross-Validation F1 Scores (L2):", cv_scores_l2)
print("Mean F1 Score (L2):", np.mean(cv_scores_l2))

print("Cross-Validation F1 Scores (ElasticNet):", cv_scores_elasticnet)
print("Mean F1 Score (ElasticNet):", np.mean(cv_scores_elasticnet))









Cross-Validation F1 Scores (L1): [0.5        0.75       0.33333333 0.66666667 0.66666667]
Mean F1 Score (L1): 0.5833333333333333
Cross-Validation F1 Scores (L2): [0.66666667 0.75       0.33333333 0.4        0.8       ]
Mean F1 Score (L2): 0.5900000000000001
Cross-Validation F1 Scores (ElasticNet): [0.4        0.66666667 0.4        0.4        0.5       ]
Mean F1 Score (ElasticNet): 0.47333333333333333




In [12]:
# Choose the best model based on cross-validation F1 scores
X_test = test_features
y_test = test_data['label']
best_model = log_reg_l1

# Train the best model on the combined training and validation data
best_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the best model on the test set using extracted BERT features
y_pred = best_model.predict(X_test)

# Calculate F1 score on the test set
test_f1_score = f1_score(y_test, y_pred)
print("Test F1 Score:", test_f1_score)

Test F1 Score: 0.6190476190476191


In [13]:
predictions_df = pd.DataFrame({"file_name": test_data["file_name"], 'patient_id': test_data['patient_id'], 'page_content':test_data['page_content'],"predicted_test_name": y_pred})

In [14]:
predictions_df.head()

Unnamed: 0,file_name,patient_id,page_content,predicted_test_name
0,A0184575/image-024.txt,A0184575,41/29/21 14:17:28 city of hope > 18772418243 c...,0
1,A0285660_MedRec1/image-050.txt,A0285660,"[**REDACTED**], [**REDACTED**]a0285660) cuarda...",0
2,A0557898_MedRec1/image-003.txt,A0557898,[**REDACTED**] 109) 11/1/2022 12:00:06 pm cdt ...,0
3,A0266158/image-020.txt,A0266158,ucdhs-o1 3/7/2021 3:14:44 pm page 23/035 fax b...,0
4,A0394647_medrec1/image-064.txt,A0394647,a more detailed guardant360 patient report is ...,0


In [15]:
print(predictions_df.shape)
predictions_df['predicted_test_name'].value_counts()

(419, 4)


0    396
1     23
Name: predicted_test_name, dtype: int64

In [16]:
from sklearn.metrics import precision_recall_fscore_support

# Assuming y_true contains the true labels and y_pred contains the predicted labels
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Precision: 0.5652173913043478
Recall: 0.6842105263157895
F1-Score: 0.6190476190476191


In [17]:
predictions_df.to_csv(input_dir+'predicted_test_results1.csv', index=False)

### Error Analysis 

In [18]:
from sklearn.metrics import confusion_matrix

# Print confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Find indices of misclassified examples
misclassified_indices = np.where(y_test != y_pred)[0]

# Print some of the misclassified examples for analysis
num_examples_to_print = 5
print("\nMisclassified Examples:")
for i in range(min(num_examples_to_print, len(misclassified_indices))):
    index = misclassified_indices[i]
    print(f"Example {i+1}:")
    print("True Label:", y_test.iloc[index])
    print("Predicted Label:", y_pred[index])
    print("Text:")
    print(test_data['page_content'].iloc[index])
    print("="*50)

# Analyze common words or phrases in misclassified examples
misclassified_text = test_data['page_content'].iloc[misclassified_indices]
common_words = {}
for text in misclassified_text:
    words = text.split()
    for word in words:
        if word not in common_words:
            common_words[word] = 1
        else:
            common_words[word] += 1

# Print most common words in misclassified examples
print("Common Words in Misclassified Examples:")
sorted_common_words = sorted(common_words.items(), key=lambda x: x[1], reverse=True)
for word, count in sorted_common_words[:10]:
    print(f"{word}: {count}")

# Additional analysis could involve checking if there are specific types of pages (e.g., medical, technical) that the model struggles with,
# and using this information to fine-tune features or the model itself.

Confusion Matrix:
[[390  10]
 [  6  13]]

Misclassified Examples:
Example 1:
True Label: 0
Predicted Label: 1
Text:
2022/02/04 13:50:24 76 /110 physical exam general: patient appears: chronically ill. in no apparent distress. psychiatric: judgment and insight are normal. appropriate mood and affect. memory appears intact. eyes: lids without lesions, conjunctiva clear. positive for scleral icterus [**REDACTED**]. ears, nose, mouth & throat: masking in place for [**REDACTED**] neck: neck is symmetrical with no masses and midline trachea. cardiovascular: heart with regular rate and rhythm. no murmur present. lower extremity edema trace. respiratory: normal respiratory effort. clear, no rales/rhonchi. no wheezing. gastrointestional: abdomen tender. positive for [**REDACTED**]. abdomen firm. no [**REDACTED**]. no ascites. hematologic/lymphatic: no petechiae. ecchymosis present. skin: jaundiced appearing musculoskeletal: no joint swelling. no cyanosis. no clubbing. abnormal gait and station,

In [20]:
print(len(misclassified_indices))

16


In [22]:
import re

# Initialize a counter for occurrences of "guardant" or its part
guardant_count = 0

# Analyze common words or phrases in misclassified examples
misclassified_text = test_data['page_content'].iloc[misclassified_indices]

for text in misclassified_text:
    if re.findall(r'\bguardant\b', text):  # Using \b to match whole word "guardant"
        guardant_count += 1

# Print the number of misclassified examples containing "guardant" or its part
print("Number of Misclassified Examples Containing 'guardant':", guardant_count)


Number of Misclassified Examples Containing 'guardant': 0
