In [None]:
from google.colab import files

# Upload your CSV file.
# The loading cells read 'processed_data.csv' by that exact name, so warn
# immediately when the upload was stored under a different filename (Colab
# keys the returned dict by the name each uploaded file was saved as).
uploaded = files.upload()
if 'processed_data.csv' not in uploaded:
    print(
        "⚠️ Expected an upload named 'processed_data.csv' but received "
        f"{list(uploaded)}; later cells read that exact filename."
    )

In [None]:
# Read the verified dataset from disk and report how many rows it holds
import pandas as pd

df = pd.read_csv('processed_data.csv')
n_examples = len(df)
print(f"Loaded {n_examples} examples for BERT training")

In [None]:
# Install required libraries
!pip install transformers torch datasets accelerate

# Import necessary libraries
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import pandas as pd

# Confirm the imports worked and show the runtime details that matter for training
print("✅ Libraries installed and imported successfully!")
for _caption, _detail in (
    ("PyTorch version", torch.__version__),
    ("CUDA available", torch.cuda.is_available()),
):
    print(f"{_caption}: {_detail}")

In [None]:
import pandas as pd
import io

# Read and fix the CSV formatting (same issue as before).
# 'utf-8-sig' already drops a BOM at the very start of the file; the explicit
# replace below additionally removes BOM characters that appear further in.
with open('processed_data.csv', 'r', encoding='utf-8-sig') as f:
    content = f.read()

# Remove BOM (written as an escape so the invisible character is visible in source)
content = content.replace('\ufeff', '')
lines = content.split('\n')

# Fix each non-blank line: drop one pair of wrapping quotes (only when the line
# both starts and ends with a quote), then collapse doubled quotes to single ones
fixed_lines = []
for line in lines:
    if line.strip():
        if line.startswith('"') and line.endswith('"'):
            line = line[1:-1]
        line = line.replace('""', '"')
        fixed_lines.append(line)

# Create properly formatted CSV
fixed_content = '\n'.join(fixed_lines)
df = pd.read_csv(io.StringIO(fixed_content))

print(f"Loaded {len(df)} examples for BERT training")
print(f"Columns: {df.columns.tolist()}")

# The quote-unwrapping above is a heuristic; fail with a clear message if it did
# not produce the two columns the rest of the notebook relies on.
missing_columns = [name for name in ('text', 'pattern') if name not in df.columns]
if missing_columns:
    raise KeyError(
        f"processed_data.csv is missing required column(s) {missing_columns}; "
        f"found {df.columns.tolist()}"
    )

print(f"Patterns: {df['pattern'].value_counts()}")

# Prepare data for BERT: narrative texts as inputs, pattern names as targets
X = df['text'].values
y = df['pattern'].values

# Build the two-way mapping between pattern names and integer class ids;
# ids follow the order returned by df['pattern'].unique()
unique_labels = df['pattern'].unique()
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Label mapping: {label2id}")

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_checkpoint)

# Torch dataset pairing raw texts with their integer class ids
class WorkplaceDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')``.
        max_length: length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        raw_text = str(self.texts[idx])
        class_id = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Collapse each returned tensor to 1-D so examples can be stacked into a batch
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(class_id, dtype=torch.long)
        return item

# Encode every pattern name as its integer class id
y_numeric = [label2id[name] for name in y]

# Stratified 80/20 split with a fixed random_state
split_options = dict(test_size=0.2, random_state=42, stratify=y_numeric)
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, **split_options)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# Wrap each split in a WorkplaceDataset that shares the same tokenizer
train_dataset, test_dataset = (
    WorkplaceDataset(split_texts, split_targets, tokenizer)
    for split_texts, split_targets in ((X_train, y_train), (X_test, y_test))
)

print("✅ Datasets created successfully!")

In [None]:
# Seed Python, NumPy and PyTorch *before* the model is built. The classification
# head added by from_pretrained() is randomly initialised, and Trainer only
# applies its own seed once it is constructed, which is after this cell has
# already created the model. Without this, results differ between runs.
from transformers import set_seed

SEED = 42
set_seed(SEED)

# Load pre-trained DistilBERT model with one output per pattern class
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Define training arguments (corrected parameter names)
# NOTE(review): warmup_steps=100 may exceed the total number of optimisation
# steps on a small dataset — confirm against len(train_dataset).
training_args = TrainingArguments(
    output_dir='./bert_results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",  # 'accuracy' from compute_metrics, with the eval_ prefix
    greater_is_better=True,
    dataloader_pin_memory=False,
    report_to="none", # Disable Weights & Biases logging
    seed=SEED,  # same value as the Trainer default, stated explicitly
)

# Define metrics
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds one
            score per class for every example (argmax is taken over axis 1);
            ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    class_scores, labels = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)

    # zero_division=0 yields the same value as the default ("warn") when a class
    # is never predicted, but without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_ids, average='macro', zero_division=0
    )
    accuracy = accuracy_score(labels, predicted_ids)

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Summarise the setup that was just created
num_classes = len(label2id)
print("✅ Model and training setup complete!")
print(f"Model loaded: DistilBERT with {num_classes} classes")
print("Training epochs: 3")
print("Batch size: 8 (CPU optimized)")

In [None]:
# Assemble the Trainer from the model, arguments, datasets and metric function.
# NOTE(review): the test split is also the eval_dataset, and training_args sets
# load_best_model_at_end, so the checkpoint is selected on the same data that is
# reported afterwards — consider a separate validation split.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

print("🚀 Starting BERT fine-tuning...")
print("This will take 10-15 minutes on CPU...")

# Run the fine-tuning loop
trainer.train()

print("✅ Training completed!")

In [None]:
# Final evaluation and results (RUN AFTER TRAINING COMPLETES)
from sklearn.metrics import classification_report, confusion_matrix

print("=== BERT FINE-TUNING RESULTS ===")

# Score the held-out split and take the highest-scoring class per example
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)

# Translate class ids back into pattern names for a readable report
y_test_labels = list(map(id2label.__getitem__, y_test))
y_pred_labels = list(map(id2label.__getitem__, y_pred))

print("Classification Report:")
print(classification_report(y_test_labels, y_pred_labels))

# Headline numbers come from compute_metrics, reported under the test_ prefix
test_f1 = predictions.metrics['test_f1']
test_accuracy = predictions.metrics['test_accuracy']
target_status = '✅ EXCEEDED' if test_f1 > 0.75 else '⚠️ BELOW TARGET'

print(f"\nBERT F1 Score: {test_f1:.3f}")
print(f"BERT Accuracy: {test_accuracy:.3f}")
print(f"Target Achievement: {target_status}")

In [None]:
# Test with a new workplace narrative (DEVICE-CORRECTED)
test_narrative = """
After I questioned the new remote work policy in a team meeting, my manager suddenly started requiring me to submit daily written reports about my activities. These reports were never required before and no one else has to do them. Every email I send now gets forwarded to HR with additional commentary about my communication style. Small issues like joining a meeting two minutes late are now documented in writing when they never were before.
"""

# Tokenize, then move the inputs to whichever device the model lives on
device = next(model.parameters()).device  # Get model's device
inputs = tokenizer(test_narrative, return_tensors="pt", truncation=True, padding=True, max_length=512)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Inference only: disable dropout and gradient tracking.
# The probabilities get their own name so the `predictions` result of the
# evaluation cell is not overwritten with a tensor.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    class_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(class_probs, dim=-1).item()
    confidence = torch.max(class_probs).item()

print("=== REAL USER TEST ===")
# strip() so the preview does not begin with the newline that opens the
# triple-quoted string
print(f"Input: {test_narrative.strip()[:100]}...")
print(f"Predicted Pattern: {id2label[predicted_class]}")
print(f"Confidence: {confidence:.3f}")
print("All Probabilities:")
for i, prob in enumerate(class_probs[0]):
    print(f"  {id2label[i]}: {prob.item():.3f}")

In [None]:
# Test multiple workplace scenarios with confidence analysis
test_scenarios = [
    {
        "text": "Placed on formal improvement plan after questioning budget decisions. Goals are vague and timeline unrealistic.",
        "expected": "pip_tactics"
    },
    {
        "text": "Manager gives different instructions in meetings versus private conversations. Won't clarify priorities in writing.",
        "expected": "strategic_ambiguity"
    },
    {
        "text": "Removed from team meetings without explanation. Colleagues avoid sharing project information with me.",
        "expected": "isolation_tactics"
    },
    {
        "text": "Every conversation now requires written follow-up. Minor issues become formal policy violations.",
        "expected": "documentation_building"
    },
    {
        "text": "Work has been stressful lately. Manager seems busy. Not sure what's happening with the project.",
        "expected": "unclear/ambiguous"
    }
]

# Confidence below LOW is flagged for review; above HIGH is treated as reliable
LOW_CONFIDENCE = 0.6
HIGH_CONFIDENCE = 0.9

print("=== CONFIDENCE THRESHOLD ANALYSIS ===")
device = next(model.parameters()).device

# Put the model in inference mode here instead of relying on an earlier cell
# having done so; otherwise dropout would make the probabilities non-deterministic.
model.eval()

for i, scenario in enumerate(test_scenarios):
    # Tokenize and predict
    inputs = tokenizer(scenario["text"], return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # The probabilities get their own name so the `predictions` result of the
    # evaluation cell is not overwritten with a tensor.
    with torch.no_grad():
        outputs = model(**inputs)
        class_probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(class_probs, dim=-1).item()
        confidence = torch.max(class_probs).item()

    print(f"\n--- Scenario {i+1} ---")
    print(f"Text: {scenario['text'][:80]}...")
    print(f"Expected: {scenario['expected']}")
    print(f"Predicted: {id2label[predicted_class]}")
    print(f"Confidence: {confidence:.3f}")

    # Flag low confidence predictions
    if confidence < LOW_CONFIDENCE:
        print("⚠️ LOW CONFIDENCE - Human review recommended")
    elif confidence > HIGH_CONFIDENCE:
        print("✅ HIGH CONFIDENCE - Reliable prediction")
    else:
        print("🔄 MEDIUM CONFIDENCE - Consider additional context")