In [None]:
!pip install transformers datasets torch pandas scikit-learn openpyxl accelerate -U

import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

# Column names used throughout the script (confirm them against the header printed below)
correct_comment_column = 'careprovidercomments'
wait_score_column = 'Combined_Wait'
bert_wait_column = 'BERT_Wait'

# Read the workbook from the local directory
file_path = r"C:\Users\Michael\Downloads\BERTmodel\comments.xlsx"  # Update with your local path
data = pd.read_excel(file_path)

# Show the header so the column names above can be verified
print("Columns in the dataset:", data.columns)

# Keep only the first 1000 rows, then discard the ones without a wait score
data = data.iloc[:1000].dropna(subset=[wait_score_column])

# Show which label values remain once the NaN rows are gone
print(
    "Unique values in the wait score column after dropping NaNs:",
    data[wait_score_column].unique(),
)

# Function to clean punctuation
def clean_text(text):
    """Return *text* with all punctuation except periods removed.

    Word characters, whitespace and '.' are kept.  Any non-string input
    (for example NaN from an empty spreadsheet cell) yields ''.
    """
    if not isinstance(text, str):
        return ''
    return re.sub(r'[^\w\s.]', '', text)

# Clean the comments: add a 'cleaned_comment' column holding clean_text() applied to each raw comment
data['cleaned_comment'] = data[correct_comment_column].apply(clean_text)

# Ensure wait scores are 0 and 1
def ensure_wait_scores(data, column):
    """Validate that every value in ``data[column]`` is exactly 0 or 1.

    Args:
        data: DataFrame holding the label column.
        column: Name of the label column to check.

    Raises:
        ValueError: If any value is something other than 0 or 1
            (fractional values such as 0.5 and NaN are rejected).
    """
    # A range check (0 <= x <= 1) would let fractional scores through;
    # membership in {0, 1} enforces what the error message promises.
    if not data[column].isin([0, 1]).all():
        raise ValueError("Wait scores must be 0 or 1.")

ensure_wait_scores(data, wait_score_column)


# Split the data into training and testing sets (80/20 split).
# Stratify on the label so both splits keep the same 0/1 proportion; an
# unstratified split can leave very few positive rows in one split when one
# class is rare.  Stratification needs at least two rows per class, so fall
# back to a plain random split when that does not hold.
label_counts = data[wait_score_column].value_counts()
stratify_labels = data[wait_score_column] if label_counts.min() >= 2 else None
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Print the sizes of the training and testing sets
print(f"Training set size: {len(train_data)} comments")
print(f"Test set size: {len(test_data)} comments")

# Free up memory by deleting the original dataframe and the split helpers
del data, label_counts, stratify_labels

# Convert the data to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Free up memory by deleting the Pandas dataframes
del train_data, test_data

# Initialize the tokenizer (the sequence length limit is applied in tokenize_function)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize the 'cleaned_comment' entries of *examples*.

    Every sequence is truncated and padded to exactly 64 tokens using the
    module-level ``tokenizer``.
    """
    comments = examples['cleaned_comment']
    return tokenizer(
        comments,
        max_length=64,
        truncation=True,
        padding='max_length',
    )

# Tokenize the train and test splits; batched=True makes map() pass tokenize_function batches of rows
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

def format_labels(examples):
    """Add a "labels" entry to *examples* with the wait scores cast to int.

    The batch is modified in place and also returned.
    """
    raw_scores = examples[wait_score_column]
    examples["labels"] = list(map(int, raw_scores))
    return examples

tokenized_datasets = tokenized_datasets.map(format_labels, batched=True)

# Free up memory by deleting the original datasets
del dataset_dict

# Print tokenized dataset sizes for debugging
print(f"Tokenized training set size: {len(tokenized_datasets['train'])} comments")
print(f"Tokenized test set size: {len(tokenized_datasets['test'])} comments")

# Load the model.  ensure_wait_scores() restricts the labels to 0 and 1, so
# the classification head needs exactly two outputs; a third output would be
# a class that never appears in the training data.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training arguments with reduced batch sizes
training_args = TrainingArguments(
    output_dir=r"C:\Users\Michael\Downloads\BERT_MODEL_TRAINING_RESULTS-20240701T162037Z-002",  # Update with your local path
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the script also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    logging_strategy="steps",
    logging_steps=50,
)

# Define evaluation metrics
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        p: Evaluation output exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    gold = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)
    weighted = precision_recall_fscore_support(gold, predicted, average='weighted')
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": weighted[2],
        "precision": weighted[0],
        "recall": weighted[1],
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

# Print expected number of steps per epoch.  Round up: a final, partially
# filled batch still counts as a step (dataloader_drop_last is not set above,
# so this assumes it keeps its default).  The figure is for a single device.
expected_steps_per_epoch = -(
    -len(tokenized_datasets['train']) // training_args.per_device_train_batch_size
)
print(f"Expected number of steps per epoch: {expected_steps_per_epoch}")

# Train the model
trainer.train()

# Unified predict function
def predict(texts, model, tokenizer, batch_size=4, max_length=128):
    """Classify *texts* in batches and return labels plus class probabilities.

    Args:
        texts: List of strings, or any object with ``tolist()`` (for example
            a pandas Series) that yields such a list.
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Tokenizer matching *model*.
        batch_size: Number of texts encoded and scored per forward pass.
        max_length: Truncation length in tokens.  NOTE(review): training in
            this file tokenizes with max_length=64; pass 64 here if
            inference should see exactly the same truncation.

    Returns:
        Tuple ``(predictions, probabilities)`` of NumPy arrays: the argmax
        class index per text and the softmax distribution per text.  Both
        are empty when *texts* is empty.
    """
    model.eval()
    if not isinstance(texts, list):
        texts = texts.tolist()

    # The Trainer may have moved the model to an accelerator; the encoded
    # inputs must be placed on the same device or the forward pass fails.
    device = next(model.parameters()).device

    predictions = []
    probabilities = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        encoded_inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        ).to(device)

        # Inference only: skip gradient tracking
        with torch.no_grad():
            logits = model(**encoded_inputs).logits

        batch_probabilities = torch.nn.functional.softmax(logits, dim=-1)
        batch_predictions = torch.argmax(batch_probabilities, dim=-1)

        predictions.extend(batch_predictions.cpu().numpy())
        probabilities.extend(batch_probabilities.cpu().numpy())

    return np.array(predictions), np.array(probabilities)

def evaluate_model(test_dataset, model, tokenizer):
    """Score *model* on *test_dataset* and print the summary metrics.

    Args:
        test_dataset: Table with a 'cleaned_comment' column and the
            wait-score column named by the module-level ``wait_score_column``.
        model: Model forwarded to ``predict``.
        tokenizer: Tokenizer forwarded to ``predict``.

    Returns:
        The ``(predictions, probabilities)`` pair produced by ``predict``.
    """
    texts = test_dataset['cleaned_comment']
    true_labels = test_dataset[wait_score_column].tolist()

    predictions, probabilities = predict(texts, model, tokenizer)

    # Weighted averages first, then plain accuracy
    weighted = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    accuracy = accuracy_score(true_labels, predictions)

    report = (
        ("Accuracy", accuracy),
        ("Precision", weighted[0]),
        ("Recall", weighted[1]),
        ("F1 Score", weighted[2]),
    )
    for metric_name, metric_value in report:
        print(f"{metric_name}: {metric_value:.4f}")

    return predictions, probabilities

# Convert the test dataset to a pandas DataFrame
test_data = tokenized_datasets['test'].to_pandas()

# Evaluate the model
predictions, probabilities = evaluate_model(test_data, model, tokenizer)

# Inspect the predictions and their probabilities
for text, true_label, predicted_label, class_probabilities in zip(
    test_data['cleaned_comment'],
    test_data[wait_score_column],
    predictions,
    probabilities,
):
    print(f"Text: {text}")
    print(f"True Label: {true_label}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Probabilities: {class_probabilities}")
    print()

# Re-load the full dataset including comments without wait scores
full_data = pd.read_excel(file_path)

# Clean the comments
full_data['cleaned_comment'] = full_data[correct_comment_column].apply(clean_text)

# Get predictions
comments_to_predict = full_data['cleaned_comment'].tolist()
predicted_labels, predicted_probabilities = predict(comments_to_predict, model, tokenizer)

# Add predicted labels to the data in the BERT_Wait column
full_data[bert_wait_column] = predicted_labels

# List of columns to keep.  Each column is listed once: repeating a name
# would write duplicate columns, and leaving out the *_Mistake columns would
# delete them from the workbook, because the file is overwritten below.
columns_to_keep = [
    'ID',
    'unidentifiableid',
    'Combined_Sentiment',
    'Combined_Wait',
    'Combined_Mistake',
    'BERT_Sentiment',
    'BERT_Wait',
    'BERT_Mistake',
    'careprovidercomments',
]

# Drop columns not in the list (this removes the temporary 'cleaned_comment'
# column); names that are absent from the sheet are skipped rather than
# raising after the predictions have already been computed.
full_data = full_data[[column for column in columns_to_keep if column in full_data.columns]]

# Save the full data with predictions back to the original Excel file
full_data.to_excel(file_path, index=False)  # Overwrite the original file




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Columns in the dataset: Index(['ID', 'unidentifiableid', 'Combined_Sentiment', 'Combined_Wait',
       'Combined_Mistake', 'BERT_Sentiment', 'BERT_Wait', 'BERT_Mistake',
       'careprovidercomments'],
      dtype='object')
Unique values in the wait score column after dropping NaNs: [0. 1.]
Training set size: 800 comments
Test set size: 200 comments


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenized training set size: 800 comments
Tokenized test set size: 200 comments


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Expected number of steps per epoch: 400


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.299,0.12365,0.98,0.970101,0.9604,0.98
2,0.1079,0.086537,0.985,0.980557,0.985226,0.985
3,0.0008,0.108838,0.975,0.973246,0.971743,0.975


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.9750
Precision: 0.9717
Recall: 0.9750
F1 Score: 0.9732
Text: in all fairness to him i have hearing loss and havent been able to get aids.  i know communicating with me can be irritating.  i felt the dr was irritated if i asked him to repeat something i didnt catch the first time.  the subject of pain was so absorbing the whole appt. that i didnt have a chance to discuss other problems i am having.
True Label: 0.0
Predicted Label: 0
Probabilities: [9.9979156e-01 1.4863558e-04 5.9847800e-05]

Text: Dr. Cooper has always been responsive courteous personable.
True Label: 0.0
Predicted Label: 0
Probabilities: [9.99798834e-01 1.13559116e-04 8.76385238e-05]

Text: Very courteous and helpful
True Label: 0.0
Predicted Label: 0
Probabilities: [9.997900e-01 1.238531e-04 8.614035e-05]

Text: I have really grown to like and respect Bonnie Stephens.  I only hope to be able to continue to see her in the future.
True Label: 0.0
Predicted Label: 0
Probabilities: [9.9977762e-01 1.2930507e-04