In [None]:
from google.colab import files

# Upload your CSV file
# Runs Colab's `files.upload()` and keeps the call's result in `uploaded`.
# NOTE: the loading cell that follows opens 'processed_data.csv' by its
# literal name, so the uploaded file has to end up saved under that name.
uploaded = files.upload()

Saving processed_data.csv to processed_data.csv


In [None]:
import pandas as pd
import io

# Read the file and fix the formatting.
#
# The export wrapped every record in one extra pair of double quotes, which
# also doubled each quote inside the record. The steps below undo that single
# layer of quoting and then hand the repaired text to pandas.


def _unwrap_quoted_line(line):
    """Undo one layer of whole-line CSV quoting.

    A line that was quoted as a single field starts and ends with a double
    quote and has every embedded quote doubled. Such a line is returned with
    the outer pair removed and each `""` collapsed back to `"`.

    Any line that is not wrapped is returned unchanged: in a well-formed row
    `""` is a legitimate escape (or an empty quoted field), so collapsing it
    would corrupt the row.
    """
    # Ignore trailing whitespace when deciding whether the line is wrapped.
    candidate = line.rstrip()
    is_wrapped = (
        len(candidate) >= 2
        and candidate.startswith('"')
        and candidate.endswith('"')
    )
    if not is_wrapped:
        return line
    return candidate[1:-1].replace('""', '"')


with open('processed_data.csv', 'r', encoding='utf-8-sig') as f:
    content = f.read()

# 'utf-8-sig' only drops a BOM at the very start of the file; remove any
# stray U+FEFF that may appear elsewhere in the text as well.
content = content.replace('\ufeff', '')
lines = content.split('\n')

# Skip blank lines and repair each remaining record.
fixed_lines = [_unwrap_quoted_line(line) for line in lines if line.strip()]

# Create new content
fixed_content = '\n'.join(fixed_lines)

# Parse the fixed CSV
df = pd.read_csv(io.StringIO(fixed_content))

print("=== FIXED! ===")
print(f"Columns: {df.columns.tolist()}")
print(f"Total rows: {len(df)}")
print("\nPattern distribution:")
print(df['pattern'].value_counts())
print("\nSource distribution:")
print(df['source'].value_counts())
print("\n✅ SUCCESS! Data is ready for model training!")

=== FIXED! ===
Columns: ['narrative_id', 'text', 'pattern', 'source']
Total rows: 300

Pattern distribution:
pattern
pip_tactics               75
strategic_ambiguity       75
isolation_tactics         75
documentation_building    75
Name: count, dtype: int64

Source distribution:
source
synthetic    149
reddit        76
glassdoor     75
Name: count, dtype: int64

✅ SUCCESS! Data is ready for model training!


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Features are the narrative texts, targets are the pattern labels.
X, y = df['text'], df['pattern']

# Hold out 20% for testing; stratify so every pattern keeps its share
# in both partitions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Bag-of-words counts: the vocabulary is learned from the training texts
# only and then reused, unchanged, to encode the test texts.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit the multinomial Naive Bayes classifier (fit() returns the estimator).
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out texts.
y_pred = nb_model.predict(X_test_vec)

# Report split sizes followed by the per-class metrics.
print(
    "=== NAIVE BAYES RESULTS ===",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    sep="\n",
)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

=== NAIVE BAYES RESULTS ===
Training samples: 240
Test samples: 60

Classification Report:
                        precision    recall  f1-score   support

documentation_building       1.00      1.00      1.00        15
     isolation_tactics       1.00      1.00      1.00        15
           pip_tactics       1.00      1.00      1.00        15
   strategic_ambiguity       1.00      1.00      1.00        15

              accuracy                           1.00        60
             macro avg       1.00      1.00      1.00        60
          weighted avg       1.00      1.00      1.00        60



In [None]:
# Check for potential overfitting indicators
from sklearn.feature_extraction.text import TfidfVectorizer

print("=== OVERFITTING INVESTIGATION ===")

# 1. Check for duplicate texts (exact matches only)
print("Checking for text similarities...")

# keep=False flags every member of a duplicate group, so the count is the
# number of rows that share their text with at least one other row.
duplicates = df[df.duplicated(subset=['text'], keep=False)]
print(f"Exact duplicates: {len(duplicates)}")

# 2. Most characteristic words of each pattern, ranked by mean TF-IDF weight
print("\nTop words per pattern:")
top_n = 10
for pattern in df['pattern'].unique():
    pattern_texts = df[df['pattern'] == pattern]['text']
    # Fit on the full vocabulary: `max_features` would pre-select terms by raw
    # frequency, and get_feature_names_out() lists terms alphabetically, so the
    # ranking has to be derived from the TF-IDF matrix itself.
    vectorizer_check = TfidfVectorizer(stop_words='english')
    pattern_vec = vectorizer_check.fit_transform(pattern_texts)
    mean_weights = np.asarray(pattern_vec.mean(axis=0)).ravel()
    # Stable sort on the negated weights: descending order, ties keep
    # alphabetical order, so the output is deterministic.
    top_idx = np.argsort(-mean_weights, kind='stable')[:top_n]
    feature_names = vectorizer_check.get_feature_names_out()[top_idx]
    print(f"{pattern}: {list(feature_names)}")

# 3. Check prediction confidence (probability of the predicted class)
pred_proba = nb_model.predict_proba(X_test_vec)
confidence = pred_proba.max(axis=1)
print(f"\nAverage prediction confidence: {np.mean(confidence):.3f}")
print(f"Minimum confidence: {np.min(confidence):.3f}")

# 4. Manual spot check
print("\nSpot check - First test example:")
print(f"Text: {X_test.iloc[0][:100]}...")
print(f"True label: {y_test.iloc[0]}")
print(f"Predicted: {y_pred[0]}")

=== OVERFITTING INVESTIGATION ===
Checking for text similarities...
Exact duplicates: 0

Top words per pattern:
pip_tactics: ['enhance', 'evidence', 'improve', 'improvement', 'include', 'management', 'metrics', 'plan', 'previous', 'questioning']
strategic_ambiguity: ['clarification', 'coordination', 'different', 'emphasized', 'guidance', 'inability', 'management', 'requirements', 'specifications', 'stressed']
isolation_tactics: ['access', 'colleagues', 'concerns', 'despite', 'implementation', 'information', 'notification', 'participation', 'planning', 'systematically']
documentation_building: ['documentation', 'explanation', 'formal', 'interactions', 'protocols', 'questioning', 'require', 'requiring', 'violations', 'written']

Average prediction confidence: 1.000
Minimum confidence: 1.000

Spot check - First test example:
Text: Marketing requirements dramatically increased after reporting violations. Routine interactions requi...
True label: documentation_building
Predicted: documentat