In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import json
import warnings
warnings.filterwarnings('ignore')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load dataset
print("Loading dataset...")
# UPDATE THIS PATH to where your dataset is located
DATASET_PATH = '/content/drive/MyDrive/Model epoch 3/News_Category_Dataset_v3.json'
df = pd.read_json(DATASET_PATH, lines=True)

print(f"Total articles loaded: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

# Combine headline and short_description for better context.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN, and the dataset class later calls str() on each text, which
# would turn such rows into the literal training text "nan".
df['text'] = (
    df['headline'].fillna('') + ' [SEP] ' + df['short_description'].fillna('')
)

# Filter out categories with too few samples (keeps categories with 100+ samples)
category_counts = df['category'].value_counts()
print(f"\nOriginal number of categories: {len(category_counts)}")
top_categories = category_counts[category_counts >= 100].index
# .copy() detaches the filtered rows from the original frame, so the 'label'
# column added afterwards is written to an independent DataFrame rather than
# to a slice of the unfiltered one.
df = df[df['category'].isin(top_categories)].copy()

print(f"Dataset shape after filtering: {df.shape}")
print(f"Number of categories (100+ samples): {df['category'].nunique()}")
print(f"\nTop 10 categories:")
print(df['category'].value_counts().head(10))

# Encode labels: every category string becomes an integer class id
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])

# Save label encoder for later use (class id -> category name)
label_mapping = dict(enumerate(label_encoder.classes_))
with open('label_mapping.json', 'w') as mapping_file:
    json.dump(label_mapping, mapping_file, indent=2)

print(f"\n‚úÖ Saved label_mapping.json with {len(label_mapping)} categories")

# Split data: 15% is held out for validation, stratified on the label so
# both parts keep the same class proportions; the seed is fixed at 42.
X_train, X_val, y_train, y_val = train_test_split(
    df['text'].values,
    df['label'].values,
    stratify=df['label'],
    test_size=0.15,
    random_state=42,
)

print(f"\nTraining samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

# Custom Dataset class
class NewsDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so the DataLoader can
        # stack items into a batch.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Initialize tokenizer and model: pretrained bert-base-uncased with a
# classification head sized to the number of encoded categories.
print("\nLoading BERT model...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_labels
).to(device)

print(f"Model loaded with {num_labels} output classes")

# Hyperparameters
BATCH_SIZE = 32  # Increased for better GPU utilization
EPOCHS = 3
LEARNING_RATE = 2e-5
ACCUMULATION_STEPS = 2  # Gradient accumulation for effective batch size of 64

# Create datasets and dataloaders (only the training loader shuffles)
train_dataset = NewsDataset(X_train, y_train, tokenizer)
val_dataset = NewsDataset(X_val, y_val, tokenizer)

_loader_options = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, **_loader_options)

print(f"\nTraining configuration:")
for _config_line in (
    f"  Batch size: {BATCH_SIZE}",
    f"  Effective batch size (with accumulation): {BATCH_SIZE * ACCUMULATION_STEPS}",
    f"  Epochs: {EPOCHS}",
    f"  Learning rate: {LEARNING_RATE}",
    f"  Training batches per epoch: {len(train_loader)}",
    f"  Validation batches: {len(val_loader)}",
):
    print(_config_line)

# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# The scheduler is stepped once per optimizer update, and train_epoch only
# updates once every ACCUMULATION_STEPS batches. Counting raw batches here
# would make the linear schedule twice as long as the run, so the learning
# rate would never decay to zero. The per-epoch update count is rounded up
# (ceil division) so the schedule can never run out before training ends.
updates_per_epoch = -(-len(train_loader) // ACCUMULATION_STEPS)
total_steps = updates_per_epoch * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training function with gradient accumulation
def train_epoch(model, data_loader, optimizer, scheduler, device, accumulation_steps=2):
    """Train ``model`` for one pass over ``data_loader`` with gradient accumulation.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) that also exposes
            ``.dataset`` for the sample count.
        optimizer: Optimizer over ``model.parameters()``.
        scheduler: Learning-rate scheduler, stepped once per optimizer update.
        device: Device the batch tensors are moved to.
        accumulation_steps: Number of batches whose gradients are summed
            before each optimizer update.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor
        (correct predictions / dataset size), mean_loss the mean of the
        un-normalized per-batch losses.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    model.train()
    losses = []
    # A tensor accumulator keeps the return type stable even for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    def _apply_update():
        # Clip, step optimizer and scheduler, then clear the summed gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Start from clean gradients so nothing left over from earlier work
    # leaks into this epoch's first update.
    optimizer.zero_grad()
    pending_batches = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        # Normalize loss for accumulation so the summed gradient matches the
        # average over the accumulated batches.
        (loss / accumulation_steps).backward()
        pending_batches += 1

        # Update weights every accumulation_steps
        if pending_batches == accumulation_steps:
            _apply_update()
            pending_batches = 0

    # Flush a trailing partial group: when the batch count is not a multiple
    # of accumulation_steps these gradients would otherwise never be applied
    # and would carry over into the next epoch.
    if pending_batches:
        _apply_update()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Evaluation function
def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) that also exposes
            ``.dataset`` for the sample count.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, mean_loss)``: correct predictions divided by the
        dataset size, and the mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            labels = batch['labels'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=labels,
            )

            # The predicted class is the index of the largest logit per row.
            preds = outputs.logits.argmax(dim=1)
            correct_predictions += (preds == labels).sum()
            batch_losses.append(outputs.loss.item())

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return accuracy, np.mean(batch_losses)

# Training loop: train and validate once per epoch, record the metrics, and
# checkpoint the weights whenever validation accuracy improves.
rule = "=" * 70
print(f"\n{rule}")
print("STARTING TRAINING")
print(rule)

best_accuracy = 0
history = {key: [] for key in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}

for epoch in range(EPOCHS):
    print(f'\nEpoch {epoch + 1}/{EPOCHS}')
    print('-' * 70)

    train_acc, train_loss = train_epoch(
        model, train_loader, optimizer, scheduler, device, ACCUMULATION_STEPS
    )
    print(f'Train Loss: {train_loss:.4f} | Train Accuracy: {train_acc:.4f}')

    val_acc, val_loss = eval_model(model, val_loader, device)
    print(f'Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.4f}')

    # Save history (accuracies are tensors, so store their Python values)
    epoch_metrics = {
        'train_loss': train_loss,
        'train_acc': train_acc.item(),
        'val_loss': val_loss,
        'val_acc': val_acc.item(),
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

    # Save best model
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model_state.bin')
        print(f'‚úÖ Best model saved! Accuracy: {best_accuracy:.4f}')

print(f"\n{rule}")
print("TRAINING COMPLETE!")
print(rule)
print(f"Best validation accuracy: {best_accuracy:.4f}")

# Save the final model: the weights as they stand after the last epoch, plus
# the tokenizer, go into one directory.
print("\nSaving final model...")
output_dir = 'news_classifier_bert'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

for saved_message in (
    "‚úÖ Model saved to 'news_classifier_bert/' directory",
    "‚úÖ Label mapping saved to 'label_mapping.json'",
    "‚úÖ Best weights saved to 'best_model_state.bin'",
):
    print(saved_message)

# Print training summary
summary_rule = "=" * 70
print(f"\n{summary_rule}")
print("TRAINING SUMMARY")
print(summary_rule)
for summary_line in (
    f"Total epochs: {EPOCHS}",
    f"Final train accuracy: {history['train_acc'][-1]:.4f}",
    f"Final validation accuracy: {history['val_acc'][-1]:.4f}",
    f"Best validation accuracy: {best_accuracy:.4f}",
    f"Number of categories: {num_labels}",
    f"Training samples: {len(X_train)}",
    f"Validation samples: {len(X_val)}",
):
    print(summary_line)
print(summary_rule)

# Save training history
with open('training_history.json', 'w') as history_file:
    json.dump(history, history_file, indent=2)
print("\n‚úÖ Training history saved to 'training_history.json'")

In [None]:
# Mount Google Drive at /content/drive; the model and dataset paths used in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import os


# from google.colab import drive
# drive.mount('/content/drive')


MODEL_PATH = '/content/drive/MyDrive/Model epoch 3/news_classifier_bert'
DATASET_PATH = '/content/drive/MyDrive/Model epoch 3/News_Category_Dataset_v3.json'

# ===== FIX: Load the dataset =====
print("Loading dataset...")
df = pd.read_json(DATASET_PATH, lines=True)  # lines=True because it's a JSON Lines file
print(f"‚úÖ Dataset loaded: {len(df)} articles")
print(f"Columns: {df.columns.tolist()}")
print(f"Sample categories: {df['category'].unique()[:10]}\n")
# ================================

# Keep only categories with 100+ samples. The training cell applies this same
# filter before fitting its LabelEncoder; without it, a rare category would
# shift the class indices and the mapping would no longer match the indices
# the saved model was trained on.
category_counts = df['category'].value_counts()
top_categories = category_counts[category_counts >= 100].index
df = df[df['category'].isin(top_categories)].copy()

# Create label mapping (class index -> category name)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
label_mapping = {idx: label for idx, label in enumerate(label_encoder.classes_)}

with open('label_mapping.json', 'w') as f:
    json.dump(label_mapping, f, indent=2)

print(f"‚úÖ Created label_mapping.json with {len(label_mapping)} categories\n")

# Defining classifier
class NewsClassifier:
    """Inference wrapper around a fine-tuned BERT news-category classifier.

    Loads a tokenizer and a ``BertForSequenceClassification`` model from a
    local directory, plus a class-index -> category-name mapping from JSON,
    and classifies single articles through :meth:`predict`.
    """

    def __init__(self, model_path, label_mapping_path='label_mapping.json'):
        """Load the label mapping, tokenizer and model.

        Args:
            model_path: Directory holding the saved tokenizer and model; it
                is loaded with ``local_files_only=True``.
            label_mapping_path: JSON file mapping class indices to category
                names.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        with open(label_mapping_path, 'r', encoding='utf-8') as f:
            raw_mapping = json.load(f)
        # JSON object keys are always strings; restore the integer indices.
        self.label_mapping = {int(k): v for k, v in raw_mapping.items()}

        print(f"Loading model from: {model_path}")
        self.tokenizer = BertTokenizer.from_pretrained(model_path, local_files_only=True)
        self.model = BertForSequenceClassification.from_pretrained(model_path, local_files_only=True)
        self.model = self.model.to(self.device)
        self.model.eval()

        print("‚úÖ Model loaded successfully!")
        print(f"Number of categories: {len(self.label_mapping)}\n")

    def predict(self, text=None, headline=None, description=None):
        """Classify one article and return the top predictions.

        Args:
            text: Ready-made input text. Used when ``headline`` and
                ``description`` are not both non-empty.
            headline: Article headline.
            description: Article description; may be empty or omitted when a
                headline is given.

        Returns:
            Dict with 'predicted_category' (str), 'confidence' (float) and
            'top_predictions' (list of up to five
            ``{'category', 'confidence'}`` dicts, most likely first).

        Raises:
            ValueError: If neither ``text`` nor ``headline`` is provided.
        """
        if headline and description:
            input_text = f"{headline} [SEP] {description}"
        elif text:
            input_text = text
        elif headline:
            # The training text is always built as headline + ' [SEP] ' +
            # short_description, including rows whose description is empty,
            # so a headline on its own is a valid input, not an error.
            input_text = f"{headline} [SEP] {description or ''}"
        else:
            raise ValueError("Must provide either 'text' or both 'headline' and 'description'")

        encoding = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)

        # topk fails when k exceeds the number of classes the model emits,
        # so k is bounded by the model output as well as by the mapping.
        k = min(5, len(self.label_mapping), probabilities.shape[1])
        top_probs, top_indices = torch.topk(probabilities, k=k)
        top_probs = top_probs.cpu().numpy()[0]
        top_indices = top_indices.cpu().numpy()[0]

        top_predictions = [
            {'category': self.label_mapping[int(idx)], 'confidence': float(prob)}
            for idx, prob in zip(top_indices, top_probs)
        ]

        return {
            'predicted_category': top_predictions[0]['category'],
            'confidence': top_predictions[0]['confidence'],
            'top_predictions': top_predictions
        }

# Initializing classifier: print the suite banner, then load the saved model
banner = "=" * 80
print(banner)
print("BERT NEWS CLASSIFIER - COMPREHENSIVE TEST SUITE")
print(f"{banner}\n")

classifier = NewsClassifier(MODEL_PATH)

# Sample Test Articles
# Hand-written cases for a manual sanity check of the classifier. Each entry
# has a "headline", a "description" and an "expected" category; "expected"
# may list several acceptable categories separated by " or " (the test loop
# below splits on that separator).
test_articles = [
    {
        "headline": "Apple Unveils iPhone 16 with Revolutionary AI Chip",
        "description": "Tech giant Apple announced its latest smartphone featuring a groundbreaking neural processing unit that promises 10x faster AI performance and week-long battery life.",
        "expected": "TECH or BUSINESS"
    },
    {
        "headline": "LeBron James Becomes NBA's All-Time Leading Scorer",
        "description": "The Los Angeles Lakers star broke Kareem Abdul-Jabbar's long-standing record with a fadeaway jumper in the third quarter against the Oklahoma City Thunder.",
        "expected": "SPORTS"
    },
    {
        "headline": "New Study Links Mediterranean Diet to 30% Lower Dementia Risk",
        "description": "Researchers at Harvard Medical School conducted a 20-year study showing that people who follow a Mediterranean diet rich in olive oil, fish, and vegetables have significantly lower rates of cognitive decline.",
        "expected": "HEALTH or WELLNESS"
    },
    {
        "headline": "Federal Reserve Cuts Interest Rates by 0.5% to Boost Economy",
        "description": "In a surprise move, the Fed announced a larger-than-expected rate cut aimed at preventing a potential recession as inflation continues to moderate and unemployment rises.",
        "expected": "BUSINESS or MONEY"
    },
    {
        "headline": "Taylor Swift's Eras Tour Breaks Records with $2 Billion in Ticket Sales",
        "description": "The pop superstar's worldwide concert tour has become the highest-grossing music tour in history, surpassing previous records and boosting local economies in every city visited.",
        "expected": "ENTERTAINMENT"
    },
    {
        "headline": "Scientists Discover Habitable Exoplanet Just 40 Light-Years Away",
        "description": "Astronomers using the James Webb Space Telescope have identified a rocky planet in the habitable zone of its star with atmospheric conditions that could support liquid water and potentially life.",
        "expected": "SCIENCE"
    },
    {
        "headline": "Supreme Court Rules on Landmark Environmental Protection Case",
        "description": "In a 6-3 decision, the court upheld federal regulations limiting carbon emissions from power plants, marking a significant victory for environmental advocates and the Biden administration.",
        "expected": "POLITICS"
    },
    {
        "headline": "Viral TikTok Recipe for Cloud Bread Takes Internet by Storm",
        "description": "A simple three-ingredient recipe for fluffy, Instagram-worthy bread has garnered over 100 million views, with home bakers worldwide attempting the trendy technique that requires just eggs, sugar, and cornstarch.",
        "expected": "FOOD & DRINK or TASTE"
    },
    {
        "headline": "Paris Fashion Week Showcases Sustainable Luxury Collections",
        "description": "Major fashion houses including Chanel, Dior, and Louis Vuitton presented eco-friendly haute couture lines featuring recycled materials and carbon-neutral production methods.",
        "expected": "STYLE & BEAUTY or STYLE"
    },
    {
        "headline": "New Parenting App Uses AI to Detect Signs of Postpartum Depression",
        "description": "A Silicon Valley startup has developed an application that monitors new mothers' speech patterns, sleep, and activity levels to identify early warning signs of postpartum depression and connect them with mental health resources.",
        "expected": "PARENTING"
    }
]

#Testing
# Classify every sample article, print the top predictions with a
# confidence bar, and count a hit when the prediction is one of the
# article's expected categories.
rule = "=" * 80
print(f"\n{rule}")
print("TESTING MODEL ON 10 DIVERSE NEWS ARTICLES")
print(rule)

correct_predictions = 0
total_tests = len(test_articles)
all_results = []  # Predictions are kept so the summary does not re-run them

for case_no, article in enumerate(test_articles, start=1):
    print(f"\n{rule}")
    print(f"TEST CASE #{case_no}")
    print(rule)
    print(f"üì∞ Headline: {article['headline']}")
    print(f"üìù Description: {article['description']}")
    print(f"üéØ Expected Category: {article['expected']}")
    print('-' * 80)

    result = classifier.predict(
        headline=article['headline'],
        description=article['description'],
    )
    all_results.append(result)

    predicted = result['predicted_category']
    print(f"\n‚úÖ PREDICTED: {predicted}")
    print(f"üíØ CONFIDENCE: {result['confidence']:.2%}")

    # "expected" may name several acceptable categories joined by " or "
    expected_categories = article['expected'].split(' or ')
    if predicted in expected_categories:
        print("‚úì CORRECT PREDICTION!")
        correct_predictions += 1
    else:
        print("‚úó Prediction differs from expected")

    print("\nüìä TOP 5 PREDICTIONS:")
    for rank, pred in enumerate(result['top_predictions'], start=1):
        # Bar of 50 cells, filled in proportion to the confidence
        filled = int(pred['confidence'] * 50)
        bar = "‚ñà" * filled + "‚ñë" * (50 - filled)

        marker = " "
        if pred['category'] in expected_categories:
            marker = "‚úì"
        print(f"  {marker} {rank}. {pred['category']:<25} {bar} {pred['confidence']:.2%}")


print(f"\n{rule}")
print("TEST SUMMARY")
print(rule)
print(f"Total test cases: {total_tests}")
print(f"Correct predictions: {correct_predictions}")
print(f"Accuracy: {(correct_predictions/total_tests)*100:.1f}%")
print(f"Model categories available: {len(classifier.label_mapping)}")
print(f"Device used: {classifier.device}")

# Average confidence using stored results (more efficient)
confidence_total = sum(stored['confidence'] for stored in all_results)
avg_confidence = confidence_total / len(all_results)
print(f"Average confidence: {avg_confidence:.2%}")

print(f"\nüìã Available categories in model:")
categories_list = list(classifier.label_mapping.values())
# Print the category names five per line
for start in range(0, len(categories_list), 5):
    row = categories_list[start:start + 5]
    print("  " + ", ".join(row))

print(f"\n{rule}")
print("TESTING COMPLETE! üéâ")
print(rule)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import os

# Mount Google Drive at /content/drive so MODEL_PATH and DATASET_PATH below
# (both under /content/drive/MyDrive) can be resolved.
from google.colab import drive
drive.mount('/content/drive')

# # STEP 0: Find your files
# print("Searching for dataset and model...")
# print("\nFiles in Google Drive root:")
# !ls "/content/drive/MyDrive/" | head -20


# Locations of the saved model directory and the dataset file on the mounted Drive.
MODEL_PATH = '/content/drive/MyDrive/Model epoch 3/news_classifier_bert'
DATASET_PATH = '/content/drive/MyDrive/Model epoch 3/News_Category_Dataset_v3.json'



# Verify files exist
# Fail fast with FileNotFoundError when either path is missing. Before
# raising, the `!find` lines search /content/drive/MyDrive for candidates.
# They are IPython shell escapes, so this cell only runs inside a notebook,
# not as a plain Python script.
if not os.path.exists(MODEL_PATH):
    print(f"‚ùå Model not found at: {MODEL_PATH}")
    print("\nSearching for model folder...")
    !find /content/drive/MyDrive -name "news_classifier_bert" -type d 2>/dev/null
    raise FileNotFoundError(f"Model not found. Please update MODEL_PATH")

if not os.path.exists(DATASET_PATH):
    print(f"‚ùå Dataset not found at: {DATASET_PATH}")
    print("\nSearching for dataset...")
    !find /content/drive/MyDrive -name "*.json" -type f 2>/dev/null | grep -i news | head -5
    raise FileNotFoundError(f"Dataset not found. Please update DATASET_PATH")

print(f"‚úÖ Model found at: {MODEL_PATH}")
print(f"‚úÖ Dataset found at: {DATASET_PATH}")

# STEP 1: Load dataset and create label_mapping.json
print("\nLoading dataset...")
try:
    df = pd.read_json(DATASET_PATH, lines=True)
    print(f"‚úÖ Loaded {len(df)} articles")
except Exception as load_error:
    # Report the problem, then let the original exception propagate.
    print(f"Error loading dataset: {load_error}")
    raise

# Filter categories: keep only those with at least 100 articles
category_counts = df['category'].value_counts()
print(f"\nTotal unique categories: {len(category_counts)}")
is_frequent = category_counts >= 100
top_categories = category_counts[is_frequent].index
keep_rows = df['category'].isin(top_categories)
df = df[keep_rows]
print(f"Filtered to {len(top_categories)} categories with 100+ samples")

# Create label mapping (class index -> category name) and write it to disk
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
label_mapping = dict(enumerate(label_encoder.classes_))

with open('label_mapping.json', 'w') as mapping_file:
    json.dump(label_mapping, mapping_file, indent=2)

print(f"‚úÖ Created label_mapping.json with {len(label_mapping)} categories\n")

# STEP 2: Define classifier
class NewsClassifier:
    """Load a saved BERT news classifier and predict article categories.

    The tokenizer and ``BertForSequenceClassification`` model come from a
    local directory; category names come from a JSON file that maps class
    indices to names.
    """

    def __init__(self, model_path, label_mapping_path='label_mapping.json'):
        """Read the label mapping and load tokenizer and model.

        Args:
            model_path: Directory with the saved tokenizer and model, loaded
                with ``local_files_only=True``.
            label_mapping_path: Path of the JSON index -> category mapping.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        with open(label_mapping_path, 'r', encoding='utf-8') as f:
            raw_mapping = json.load(f)
        # Keys come back from JSON as strings; convert them to int indices.
        self.label_mapping = {int(k): v for k, v in raw_mapping.items()}

        print(f"Loading model from: {model_path}")
        self.tokenizer = BertTokenizer.from_pretrained(model_path, local_files_only=True)
        self.model = BertForSequenceClassification.from_pretrained(model_path, local_files_only=True)
        self.model = self.model.to(self.device)
        self.model.eval()

        print("‚úÖ Model loaded successfully!")
        print(f"Number of categories: {len(self.label_mapping)}\n")

    def predict(self, text=None, headline=None, description=None):
        """Return the most likely categories for one article.

        Args:
            text: Ready-made input text; used when ``headline`` and
                ``description`` are not both non-empty.
            headline: Article headline.
            description: Article description; optional when a headline is
                given.

        Returns:
            Dict with 'predicted_category' (str), 'confidence' (float) and
            'top_predictions', a list of up to five
            ``{'category', 'confidence'}`` dicts ordered from most to least
            likely.

        Raises:
            ValueError: If neither ``text`` nor ``headline`` is provided.
        """
        if headline and description:
            input_text = f"{headline} [SEP] {description}"
        elif text:
            input_text = text
        elif headline:
            # A headline whose description is empty or missing is still
            # classifiable: it is formatted as "<headline> [SEP] ", the same
            # shape as the two-part input above.
            input_text = f"{headline} [SEP] {description or ''}"
        else:
            raise ValueError("Must provide either 'text' or both 'headline' and 'description'")

        encoding = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)

        # Bound k by the model's output width too: topk raises when k is
        # larger than the number of classes in the logits.
        k = min(5, len(self.label_mapping), probabilities.shape[1])
        top_probs, top_indices = torch.topk(probabilities, k=k)
        top_probs = top_probs.cpu().numpy()[0]
        top_indices = top_indices.cpu().numpy()[0]

        top_predictions = [
            {'category': self.label_mapping[int(idx)], 'confidence': float(prob)}
            for idx, prob in zip(top_indices, top_probs)
        ]

        return {
            'predicted_category': top_predictions[0]['category'],
            'confidence': top_predictions[0]['confidence'],
            'top_predictions': top_predictions
        }

# STEP 3: Initialize classifier (banner first, then load the saved model)
banner = "=" * 80
print(banner)
print("BERT NEWS CLASSIFIER - COMPREHENSIVE TEST SUITE")
print(f"{banner}\n")

classifier = NewsClassifier(MODEL_PATH)

# STEP 4: Diverse test articles
# Hand-written cases for a manual look at the classifier's output. Each entry
# has a "headline", a "description" and a single "expected" category. The
# loop in STEP 5 prints predictions only; it does not read "expected".
test_articles = [
    {
        "headline": "Apple Unveils iPhone 16 with Revolutionary AI Chip",
        "description": "Tech giant Apple announced its latest smartphone featuring a groundbreaking neural processing unit that promises 10x faster AI performance.",
        "expected": "TECH"
    },
    {
        "headline": "LeBron James Becomes NBA's All-Time Leading Scorer",
        "description": "The Los Angeles Lakers star broke Kareem Abdul-Jabbar's long-standing record with a fadeaway jumper in the third quarter.",
        "expected": "SPORTS"
    },
    {
        "headline": "Mediterranean Diet Linked to 30% Lower Dementia Risk",
        "description": "Harvard researchers conducted a 20-year study showing people following a Mediterranean diet have significantly lower rates of cognitive decline.",
        "expected": "HEALTHY LIVING"
    },
    {
        "headline": "Federal Reserve Cuts Interest Rates by 0.5%",
        "description": "The Fed announced a larger-than-expected rate cut aimed at preventing recession as inflation moderates and unemployment rises.",
        "expected": "BUSINESS"
    },
    {
        "headline": "Taylor Swift's Eras Tour Breaks Records with $2 Billion",
        "description": "The pop superstar's worldwide concert tour has become the highest-grossing music tour in history, boosting local economies everywhere.",
        "expected": "ENTERTAINMENT"
    },
    {
        "headline": "Scientists Discover Habitable Exoplanet 40 Light-Years Away",
        "description": "Astronomers using James Webb Telescope identified a rocky planet with atmospheric conditions that could support liquid water.",
        "expected": "SCIENCE"
    },
    {
        "headline": "Supreme Court Rules on Environmental Protection Case",
        "description": "In a 6-3 decision, the court upheld federal regulations limiting carbon emissions, marking a victory for environmental advocates.",
        "expected": "POLITICS"
    },
    {
        "headline": "Viral TikTok Cloud Bread Recipe Takes Internet by Storm",
        "description": "A simple three-ingredient recipe has garnered 100 million views, with home bakers worldwide attempting the trendy fluffy technique.",
        "expected": "TASTE"
    },
    {
        "headline": "Paris Fashion Week Showcases Sustainable Luxury",
        "description": "Chanel, Dior, and Louis Vuitton presented eco-friendly haute couture featuring recycled materials and carbon-neutral production.",
        "expected": "STYLE & BEAUTY"
    },
    {
        "headline": "AI App Detects Postpartum Depression in New Mothers",
        "description": "A startup developed an application that monitors speech patterns and sleep to identify early warning signs of postpartum depression.",
        "expected": "PARENTING"
    }
]

# STEP 5: Run tests
# Classify each article and print the winning category plus the top
# predictions, each with a text bar scaled to its confidence.
rule = "=" * 80
print(f"\n{rule}")
print("TESTING 10 DIVERSE ARTICLES")
print(rule)

for case_no, article in enumerate(test_articles, start=1):
    print(f"\n{rule}")
    print(f"TEST #{case_no}")
    print(rule)
    print(f"üì∞ {article['headline']}")
    # Only the first 80 characters of the description are shown
    preview = article['description'][:80]
    print(f"üìù {preview}...")
    print('-' * 80)

    result = classifier.predict(
        headline=article['headline'],
        description=article['description'],
    )

    print(f"\nüéØ PREDICTED: {result['predicted_category']} ({result['confidence']:.2%})")
    print("\nTop 5:")
    for rank, pred in enumerate(result['top_predictions'], start=1):
        # Up to 30 bar cells, proportional to the confidence
        cells = int(pred['confidence'] * 30)
        bar = "‚ñà" * cells
        print(f"  {rank}. {pred['category']:<25} {bar} {pred['confidence']:.1%}")

print(f"\n{rule}")
print("COMPLETE! üéâ")
print(rule)