# BSC Dataset Labeling

This script applies the optimal BART model configuration to label the complete BSC dataset.
The model uses the combined input (tweet + note) and was trained without the "other" class.
With a confidence threshold of 0.9, it achieved 90.4% accuracy on committed (above-threshold) predictions at 91.2% coverage.

## Configuration
- **Model**: Combined input (tweet + note)
- **Classes**: economics, health, lifestyle, politics, science, sports
- **Confidence threshold**: 0.9
- **Expected performance**: 90.4% accuracy, 91.2% coverage

## Dependencies and Imports

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BartForSequenceClassification, BartTokenizer
from scipy.special import softmax
import os
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

## BSC Dataset Labeler Class
Core labeling functionality that loads the optimal BART model and applies it to the dataset with confidence thresholding.

In [None]:
class BSCDatasetLabeler:
    """
    Dataset labeler using optimal BART configuration for tweet-note classification.

    Configuration:
    - Model: Combined input (tweet + note)
    - Classes: economics, health, lifestyle, politics, science, sports
    - Confidence threshold: 0.9
    - Expected performance: 90.4% accuracy, 91.2% coverage

    Predictions whose top class probability is below the confidence threshold
    are labeled 'low_confidence' and flagged for manual review.
    """

    def __init__(self, model_path, confidence_threshold=0.9):
        """
        Initialize the dataset labeler and load the model.

        Args:
            model_path (str): Path to the trained BART model
            confidence_threshold (float): Minimum top-class probability for a
                prediction to be accepted (compared with ``>=``)

        Raises:
            RuntimeError: If the model or tokenizer cannot be loaded, or the
                model's number of output classes does not match the labels.
        """
        self.model_path = model_path
        self.confidence_threshold = confidence_threshold
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # NOTE(review): the order is assumed to match the label indices used
        # at training time — confirm against the training code.
        self.class_labels = ['economics', 'health', 'lifestyle', 'politics', 'science', 'sports']

        print("Initializing BSC Dataset Labeler")
        print(f"Model path: {model_path}")
        print(f"Confidence threshold: {confidence_threshold}")
        print(f"Device: {self.device}")

        self._load_model()

    def _load_model(self):
        """
        Load the trained BART model and tokenizer and put the model in eval mode.

        Raises:
            RuntimeError: If loading fails, or if the model's number of output
                classes differs from ``len(self.class_labels)``.
        """
        print("Loading model and tokenizer...")

        try:
            self.model = BartForSequenceClassification.from_pretrained(self.model_path)
            self.tokenizer = BartTokenizer.from_pretrained(self.model_path)

            self.model.to(self.device)
            self.model.eval()
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f"Failed to load model: {str(e)}") from e

        # Fail early: a mismatch would otherwise surface later as an
        # IndexError or as silently wrong labels when indices are mapped.
        num_labels = self.model.config.num_labels
        if num_labels != len(self.class_labels):
            raise RuntimeError(
                f"Model has {num_labels} output classes but "
                f"{len(self.class_labels)} class labels are configured"
            )

        print(f"Model loaded successfully")
        print(f"Number of classes: {num_labels}")
        print(f"Vocabulary size: {self.tokenizer.vocab_size}")

    # Shared helpers

    @staticmethod
    def _percentage(count, total):
        """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
        return count / total * 100 if total else 0.0

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the parent directory of ``path`` if the path has one."""
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Data Preprocessing Methods
    # Functions for cleaning and preparing dataset for model input

    def _clean_text(self, text):
        """
        Clean and prepare text for model input.

        Missing values (None / NaN) become an empty string. Control characters
        below code point 32 are removed, except tabs, newlines and carriage
        returns, and surrounding whitespace is stripped.

        Args:
            text: Raw cell value (any type; converted with ``str``)

        Returns:
            str: Cleaned text, possibly empty
        """
        if text is None or pd.isna(text):
            return ""

        # Remove non-printable characters except tabs, newlines, returns
        text = ''.join(char for char in str(text) if ord(char) >= 32 or char in '\t\n\r')
        # Strip last so whitespace exposed by the removal above is trimmed and
        # inputs made only of control characters and spaces come out empty.
        return text.strip()

    def _prepare_data(self, df):
        """
        Prepare the dataset for prediction.

        Adds cleaned 'tweet_text' (from 'full_text') and 'note_text' (from
        'summary') columns and drops rows where either one is empty.

        Args:
            df (pd.DataFrame): Input dataframe with 'full_text' and 'summary'

        Returns:
            pd.DataFrame: Cleaned copy ready for prediction

        Raises:
            KeyError: If 'full_text' or 'summary' is missing from ``df``
        """
        print("Preparing data...")

        missing = [col for col in ('full_text', 'summary') if col not in df.columns]
        if missing:
            raise KeyError(f"Input data is missing required column(s): {missing}")

        df_clean = df.copy()

        # Map dataset columns to model expected format
        df_clean['tweet_text'] = df_clean['full_text'].apply(self._clean_text)
        df_clean['note_text'] = df_clean['summary'].apply(self._clean_text)

        # Filter out rows with empty text in either field
        initial_count = len(df_clean)
        has_both = (df_clean['tweet_text'] != "") & (df_clean['note_text'] != "")
        df_clean = df_clean[has_both].copy()
        final_count = len(df_clean)

        print(f"Samples processed: {initial_count}")
        print(f"Valid samples: {final_count}")
        print(f"Removed samples: {initial_count - final_count}")

        return df_clean

    # Batch Prediction Engine
    # Core prediction functionality with confidence thresholding

    def _apply_threshold(self, probabilities):
        """
        Convert class probabilities into labels using the confidence threshold.

        Args:
            probabilities (np.ndarray): 2-D array, one row of class
                probabilities per sample

        Returns:
            tuple: (labels, confidences). A label is the class name when the
            row maximum is >= the threshold, otherwise 'low_confidence'; the
            confidence is the row maximum.
        """
        confidences = probabilities.max(axis=1)
        class_indices = probabilities.argmax(axis=1)

        labels = [
            self.class_labels[idx] if conf >= self.confidence_threshold else 'low_confidence'
            for idx, conf in zip(class_indices, confidences)
        ]
        return labels, list(confidences)

    def predict_batch(self, tweet_texts, note_texts, batch_size=16):
        """
        Generate predictions for a batch of tweet-note pairs.

        Args:
            tweet_texts (list): List of tweet texts
            note_texts (list): List of corresponding note texts
            batch_size (int): Batch size for processing

        Returns:
            tuple: (predictions, confidences) lists

        Raises:
            ValueError: If the two input lists differ in length
        """
        if len(tweet_texts) != len(note_texts):
            raise ValueError(
                f"tweet_texts and note_texts must have the same length "
                f"({len(tweet_texts)} != {len(note_texts)})"
            )

        all_predictions = []
        all_confidences = []
        total = len(tweet_texts)

        print(f"Processing {total} samples in batches of {batch_size}")

        for i in range(0, total, batch_size):
            batch_tweets = tweet_texts[i:i+batch_size]
            batch_notes = note_texts[i:i+batch_size]

            # Tokenize the batch. Pad only to the longest pair in the batch:
            # padded positions are masked out, so padding every batch to 1024
            # tokens adds compute and should not change predictions beyond
            # floating-point noise.
            inputs = self.tokenizer(
                batch_tweets,
                batch_notes,
                truncation="longest_first",
                padding="longest",
                max_length=1024,
                return_tensors="pt"
            )

            # Move inputs to device
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Generate predictions
            with torch.no_grad():
                outputs = self.model(**inputs)
                probabilities = softmax(outputs.logits.cpu().numpy(), axis=1)

            # Apply confidence threshold to this batch
            batch_labels, batch_confidences = self._apply_threshold(probabilities)
            all_predictions.extend(batch_labels)
            all_confidences.extend(batch_confidences)

            # Progress reporting (every 20 batches)
            if (i // batch_size + 1) % 20 == 0:
                completed = min(i + batch_size, total)
                print(f"Processed {completed}/{total} samples")

        return all_predictions, all_confidences

    # Dataset Labeling Pipeline
    # Main function orchestrating the complete labeling process

    def label_dataset(self, input_csv, output_csv=None, batch_size=16):
        """
        Label the complete dataset and save results.

        Args:
            input_csv (str): Path to input CSV file
            output_csv (str): Path for output CSV (optional; defaults to a
                timestamped file under 'labeling_results/')
            batch_size (int): Batch size for processing

        Returns:
            pd.DataFrame: Cleaned dataframe with 'predicted_classification',
            'prediction_confidence' and 'manual_review_required' columns

        Raises:
            RuntimeError: If the input CSV cannot be read
            ValueError: If no valid samples remain after cleaning
        """
        print(f"Starting dataset labeling process")
        print(f"Input file: {input_csv}")

        # Load the dataset
        try:
            df = pd.read_csv(input_csv)
        except Exception as e:
            raise RuntimeError(f"Failed to load dataset: {str(e)}") from e
        print(f"Dataset loaded: {len(df)} rows, {len(df.columns)} columns")

        # Prepare data for prediction
        df_clean = self._prepare_data(df)

        if len(df_clean) == 0:
            raise ValueError("No valid samples found after data cleaning")

        # Generate predictions
        print("Generating predictions...")
        start_time = datetime.now()

        predictions, confidences = self.predict_batch(
            df_clean['tweet_text'].tolist(),
            df_clean['note_text'].tolist(),
            batch_size=batch_size
        )

        processing_time = (datetime.now() - start_time).total_seconds()

        # Add predictions to dataframe
        df_clean['predicted_classification'] = predictions
        df_clean['prediction_confidence'] = confidences
        df_clean['manual_review_required'] = df_clean['predicted_classification'] == 'low_confidence'

        # Calculate performance metrics
        total_samples = len(predictions)
        confident_predictions = sum(p != 'low_confidence' for p in predictions)
        coverage = self._percentage(confident_predictions, total_samples)

        print(f"\nPrediction Results:")
        print(f"Total samples: {total_samples}")
        print(f"Confident predictions: {confident_predictions}")
        print(f"Coverage: {coverage:.1f}%")
        print(f"Manual review needed: {total_samples - confident_predictions} ({100-coverage:.1f}%)")
        print(f"Processing time: {processing_time:.1f} seconds")

        # Show class distribution
        pred_counts = pd.Series(predictions).value_counts()
        print(f"\nClass Distribution:")
        for class_name, count in pred_counts.items():
            percentage = self._percentage(count, total_samples)
            print(f"  {class_name}: {count} ({percentage:.1f}%)")

        # Save results
        if output_csv is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_csv = f"labeling_results/bsc_dataset_labeled_{timestamp}.csv"

        # Saving is best-effort: the predictions are still returned to the
        # caller if the directory cannot be created or the file written.
        try:
            self._ensure_parent_dir(output_csv)
            df_clean.to_csv(output_csv, index=False)
            print(f"\nResults saved to: {output_csv}")
        except Exception as e:
            print(f"Warning: Could not save results - {str(e)}")

        return df_clean

    # Summary Report Generation
    # Creates comprehensive summary report with detailed statistics

    def generate_summary_report(self, labeled_df, output_path=None):
        """
        Generate a summary report of the labeling results.

        Args:
            labeled_df (pd.DataFrame): DataFrame with 'predicted_classification'
                and 'prediction_confidence' columns
            output_path (str): Path for summary report (optional; defaults to
                a timestamped file under 'labeling_results/')
        """
        if output_path is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f"labeling_results/labeling_summary_{timestamp}.txt"

        # Create output directory (skipped for bare filenames)
        self._ensure_parent_dir(output_path)

        # Calculate summary statistics
        total_samples = len(labeled_df)
        confident_predictions = int((labeled_df['predicted_classification'] != 'low_confidence').sum())
        coverage = self._percentage(confident_predictions, total_samples)

        confidence_stats = labeled_df['prediction_confidence'].describe()
        class_distribution = labeled_df['predicted_classification'].value_counts()

        # Generate report
        report = [
            "BSC Dataset Labeling Summary Report",
            "=" * 50,
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Model: Combined BART (tweet + note), without 'other' class",
            f"Confidence threshold: {self.confidence_threshold}",
            "",
            "Dataset Statistics:",
            f"  Total samples: {total_samples:,}",
            f"  Confident predictions: {confident_predictions:,}",
            f"  Coverage: {coverage:.1f}%",
            f"  Manual review required: {total_samples - confident_predictions:,} ({100-coverage:.1f}%)",
            "",
            "Confidence Score Statistics:",
            f"  Mean: {confidence_stats['mean']:.3f}",
            f"  Median: {confidence_stats['50%']:.3f}",
            f"  Standard deviation: {confidence_stats['std']:.3f}",
            f"  Minimum: {confidence_stats['min']:.3f}",
            f"  Maximum: {confidence_stats['max']:.3f}",
            "",
            "Class Distribution:",
        ]

        report.extend(
            f"  {class_name}: {count:,} ({self._percentage(count, total_samples):.1f}%)"
            for class_name, count in class_distribution.items()
        )

        # Save report
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(report))

        print(f"Summary report saved to: {output_path}")

    # Detailed Analysis Functions
    # Examining prediction confidence distributions and label comparisons

    def analyze_predictions(self, labeled_df):
        """
        Print detailed confidence and distribution statistics for the predictions.

        Args:
            labeled_df (pd.DataFrame): DataFrame with 'prediction_confidence'
                and 'predicted_classification' columns; an optional
                'classification' column triggers a comparison of the original
                and predicted label distributions
        """
        print(f"\nDetailed Analysis:")

        total = len(labeled_df)
        confidence = labeled_df['prediction_confidence']

        # Confidence distribution statistics
        confidence_stats = confidence.describe()
        print(f"\nConfidence Score Distribution:")
        print(f"   Mean: {confidence_stats['mean']:.3f}")
        print(f"   Median: {confidence_stats['50%']:.3f}")
        print(f"   Standard deviation: {confidence_stats['std']:.3f}")
        print(f"   Minimum: {confidence_stats['min']:.3f}")
        print(f"   Maximum: {confidence_stats['max']:.3f}")

        # Confidence tier analysis. The lower bound is inclusive to match the
        # ">=" comparison used when predictions are thresholded, so every
        # confident prediction falls into exactly one tier.
        high_conf = labeled_df[confidence > 0.95]
        moderate_conf = labeled_df[
            (confidence >= self.confidence_threshold) &
            (confidence <= 0.95)
        ]

        print(f"\nConfidence Tier Analysis:")
        print(f"   Very high confidence (>0.95): {len(high_conf)} samples ({self._percentage(len(high_conf), total):.1f}%)")
        print(f"   Moderate confidence ({self.confidence_threshold}-0.95): {len(moderate_conf)} samples ({self._percentage(len(moderate_conf), total):.1f}%)")

        # Compare with original labels if they exist
        if 'classification' in labeled_df.columns:
            print(f"\nComparison with Original Labels:")
            original_counts = labeled_df['classification'].value_counts()
            predicted_counts = labeled_df['predicted_classification'].value_counts()

            print("Original label distribution:")
            for label, count in original_counts.items():
                print(f"   {label}: {count} ({self._percentage(count, total):.1f}%)")

            print("Predicted label distribution:")
            for label, count in predicted_counts.items():
                print(f"   {label}: {count} ({self._percentage(count, total):.1f}%)")

## Main Execution
Execute the complete dataset labeling pipeline, including analysis of prediction confidence and class distributions.

**Configuration:**
- Model path: `./trained_models/final_without_other_combined/`
- Input dataset: `for_bsc_project.csv`
- Confidence threshold: 0.9
- Expected performance: ~90.4% accuracy on confident predictions

**Output Files:**
- `labeling_results/bsc_dataset_labeled_{timestamp}.csv`: Labeled dataset (rows with empty tweet or note text are removed)
- `labeling_results/labeling_summary_{timestamp}.txt`: Summary statistics report

In [None]:
def main():
    """
    Run the full labeling pipeline with the notebook's fixed configuration.

    Checks that the model directory and the input CSV exist, then labels the
    dataset, prints the detailed analysis and writes the summary report.
    Problems are reported on stdout; the function always returns None.
    """
    print("BSC Dataset Labeling Tool")
    print("=" * 50)

    # Fixed run configuration
    model_dir = "./trained_models/final_without_other_combined/"
    dataset_csv = "for_bsc_project.csv"
    threshold = 0.9
    batch = 16

    # Each prerequisite: (path, error line, hint line), checked in this order.
    prerequisites = (
        (
            model_dir,
            f"Error: Model not found at {model_dir}",
            "Please ensure the model path is correct.",
        ),
        (
            dataset_csv,
            f"Error: Dataset not found at {dataset_csv}",
            "Please ensure the CSV file is in the current directory.",
        ),
    )
    for required_path, problem, hint in prerequisites:
        if os.path.exists(required_path):
            continue
        print(problem)
        print(hint)
        return

    # Any failure inside the pipeline is reported rather than propagated.
    try:
        labeler = BSCDatasetLabeler(
            model_path=model_dir,
            confidence_threshold=threshold,
        )
        labeled_df = labeler.label_dataset(input_csv=dataset_csv, batch_size=batch)

        # Console analysis first, then the on-disk summary report.
        labeler.analyze_predictions(labeled_df)
        labeler.generate_summary_report(labeled_df)

        print("\nDataset labeling completed successfully!")
        print("Expected performance: ~90.4% accuracy on confident predictions")
    except Exception as e:
        print(f"Error during processing: {e}")


# Execute the labeling process when run as a script
if __name__ == "__main__":
    main()