In [1]:
import os
import pandas as pd
import torch
import numpy as np
from transformers import (
    XLMRobertaTokenizer, 
    XLMRobertaForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback,
    logging
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm.auto import tqdm
import gc
from sklearn.utils.class_weight import compute_class_weight

# Set logging
logging.set_verbosity_error()

class AdvancedBinaryClassifier:
    def __init__(self, model_name='xlm-roberta-base'):
        """Advanced classifier with confidence boosting techniques"""
        self.model_name = model_name
        
        # GPU setup
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
            print(f"✅ GPU: {torch.cuda.get_device_name()}")
            print(f"✅ Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
            torch.cuda.empty_cache()
        else:
            self.device = torch.device('cpu')
            print("❌ Using CPU")
        
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        self.df = None
        self.class_weights = None
        
        # Advanced settings
        torch.backends.cudnn.benchmark = True
        if torch.cuda.is_available():
            torch.cuda.set_per_process_memory_fraction(0.9)

    def load_data(self, file_path):
        try:
            print(f"📂 Loading: {file_path}")
            
            if file_path.endswith('.csv'):
                encodings = ['utf-8', 'utf-8-sig', 'latin1', 'cp1252', 'windows-1256']
                for encoding in encodings:
                    try:
                        self.df = pd.read_csv(file_path, encoding=encoding)
                        print(f"✅ Loaded with {encoding}")
                        break
                    except:
                        continue
                else:
                    self.df = pd.read_csv(file_path, encoding='utf-8', errors='replace')
            elif file_path.endswith(('.xlsx', '.xls')):
                self.df = pd.read_excel(file_path)
            
            # Advanced data cleaning
            self.df = self.df.dropna()
            self.df = self.df.drop_duplicates()
            
            print(f"📊 Shape: {self.df.shape}")
            print(f"📂 Columns: {list(self.df.columns)}")
            return list(self.df.columns)
            
        except Exception as e:
            print(f"❌ Error: {e}")
            raise

    def prepare_data(self, label_column='label', text_column='text'):
        try:
            print(f"🔄 Preparing data...")
            
            if label_column not in self.df.columns:
                raise ValueError(f"Label column '{label_column}' not found")
            if text_column not in self.df.columns:
                raise ValueError(f"Text column '{text_column}' not found")
            
            # Get texts
            text_data = self.df[text_column].astype(str).tolist()
            
            # Handle labels
            labels = self.df[label_column].astype(int).tolist()
            
            # Ensure same length
            min_length = min(len(text_data), len(labels))
            text_data = text_data[:min_length]
            labels = labels[:min_length]
            
            # Calculate class weights for imbalanced datasets
            unique_labels = np.unique(labels)
            self.class_weights = compute_class_weight(
                'balanced', 
                classes=unique_labels, 
                y=labels
            )
            self.class_weights = torch.tensor(self.class_weights, dtype=torch.float).to(self.device)
            
            label_counts = pd.Series(labels).value_counts().sort_index()
            print(f"📊 Label distribution: {dict(label_counts)}")
            print(f"⚖️ Class weights: {self.class_weights.cpu().numpy()}")
            
            return text_data, labels
            
        except Exception as e:
            print(f"❌ Error preparing data: {e}")
            raise

    def create_dataset(self, texts, labels, max_length=384):
        print(f"🔄 Creating dataset with {len(texts)} samples...")
        
        class EnhancedTextDataset(Dataset):
            def __init__(self, texts, labels, tokenizer, max_length):
                self.texts = texts
                self.labels = labels
                self.tokenizer = tokenizer
                self.max_length = max_length
            
            def __len__(self):
                return len(self.texts)
            
            def __getitem__(self, idx):
                text = str(self.texts[idx])
                label = int(self.labels[idx])
                
                encoding = self.tokenizer(
                    text,
                    truncation=True,
                    padding='max_length',
                    max_length=self.max_length,
                    return_tensors='pt',
                    add_special_tokens=True,
                    return_attention_mask=True
                )
                
                return {
                    'input_ids': encoding['input_ids'].flatten(),
                    'attention_mask': encoding['attention_mask'].flatten(),
                    'labels': torch.tensor(label, dtype=torch.long)
                }
        
        return EnhancedTextDataset(texts, labels, self.tokenizer, max_length)

    def compute_metrics(self, eval_pred):
        predictions, labels = eval_pred
        preds = np.argmax(predictions, axis=1)
        
        probs = torch.softmax(torch.tensor(predictions), dim=-1).numpy()
        max_probs = np.max(probs, axis=1)
        avg_confidence = np.mean(max_probs)
        
        accuracy = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
        
        tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        balanced_accuracy = (recall + specificity) / 2
        
        return {
            'accuracy': float(accuracy),
            'balanced_accuracy': float(balanced_accuracy),
            'f1': float(f1),
            'precision': float(precision),
            'recall': float(recall),
            'specificity': float(specificity),
            'avg_confidence': float(avg_confidence),
            'true_positives': int(tp),
            'true_negatives': int(tn),
            'false_positives': int(fp),
            'false_negatives': int(fn)
        }

    def train(self, text_train, y_train, text_val, y_val, 
              save_model_path='./advanced_model',
              num_epochs=5,
              batch_size=12,
              learning_rate=1e-5,
              max_length=384):
        try:
            print(f"🚀 Advanced Training Started...")
            print(f"📊 Training: {len(text_train)} | Validation: {len(text_val)}")
            
            train_dataset = self.create_dataset(text_train, y_train, max_length)
            val_dataset = self.create_dataset(text_val, y_val, max_length)
            
            print(f"🔄 Loading enhanced model...")
            model = XLMRobertaForSequenceClassification.from_pretrained(
                self.model_name,
                num_labels=2,
                problem_type="single_label_classification",
                hidden_dropout_prob=0.1,
                attention_probs_dropout_prob=0.1,
                classifier_dropout=0.2
            )
            
            if self.class_weights is not None:
                model.loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
            
            model = model.to(self.device)
            print(f"✅ Model on: {next(model.parameters()).device}")
            
            training_args = TrainingArguments(
                output_dir='./results',
                num_train_epochs=num_epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                gradient_accumulation_steps=2,
                warmup_ratio=0.1,
                weight_decay=0.01,
                learning_rate=learning_rate,
                
                eval_strategy="epoch",
                save_strategy="epoch",
                logging_steps=25,
                
                load_best_model_at_end=True,
                metric_for_best_model="f1",
                greater_is_better=True,
                
                fp16=True,
                gradient_checkpointing=True,
                dataloader_pin_memory=False,
                dataloader_num_workers=0,
                
                max_grad_norm=1.0,
                
                seed=42,
                report_to="none",
                remove_unused_columns=True,
            )
            
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=self.compute_metrics,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
            )
            
            print(f"🚀 Training with advanced techniques...")
            trainer.train()
            
            print(f"💾 Saving to {save_model_path}")
            trainer.save_model(save_model_path)
            self.tokenizer.save_pretrained(save_model_path)
            
            print(f"📊 Final evaluation...")
            eval_results = trainer.evaluate()
            
            print("\n" + "="*60)
            print("🏆 ADVANCED TRAINING RESULTS:")
            for key, value in eval_results.items():
                if isinstance(value, float):
                    if 'confidence' in key:
                        print(f"{key}: {value:.1%}")
                    else:
                        print(f"{key}: {value:.4f}")
                else:
                    print(f"{key}: {value}")
            print("="*60)
            
            return eval_results
            
        except Exception as e:
            print(f"❌ Training error: {e}")
            raise
        finally:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()

    def predict_with_analysis(self, texts, model_path, confidence_threshold=0.9):
        
        try:
            print(f"🔄 Loading model for enhanced prediction...")
            model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
            tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
            
            model = model.to(self.device)
            model.eval()
            
            results = []
            all_confidences = []
            
            print(f"🔮 Analyzing {len(texts)} samples...")
            
            for text in tqdm(texts):
                processed_text = text  # ✅ keep raw text
                
                inputs = tokenizer(
                    processed_text,
                    return_tensors='pt',
                    truncation=True,
                    padding=True,
                    max_length=384
                ).to(self.device)
                
                with torch.no_grad():
                    outputs = model(**inputs)
                    logits = outputs.logits
                    probs = torch.softmax(logits, dim=-1)
                    
                    pred = torch.argmax(logits, dim=-1).cpu().item()
                    conf = torch.max(probs).cpu().item()
                    
                    prob_not_allowed = probs[0][0].cpu().item()
                    prob_allowed = probs[0][1].cpu().item()
                    
                    pred_label = "not allowed" if pred == 0 else "allowed"
                    confidence_level = "HIGH" if conf >= confidence_threshold else "MEDIUM" if conf >= 0.7 else "LOW"
                    
                    results.append({
                        'text': text,
                        'processed_text': processed_text,
                        'prediction': pred_label,
                        'confidence': conf,
                        'confidence_level': confidence_level,
                        'prob_not_allowed': prob_not_allowed,
                        'prob_allowed': prob_allowed,
                        'prediction_numeric': pred
                    })
                    
                    all_confidences.append(conf)
            
            avg_confidence = np.mean(all_confidences)
            high_confidence_count = sum(1 for c in all_confidences if c >= confidence_threshold)
            
            print(f"\n📊 PREDICTION ANALYSIS:")
            print(f"Average confidence: {avg_confidence:.1%}")
            print(f"High confidence predictions: {high_confidence_count}/{len(texts)} ({high_confidence_count/len(texts):.1%})")
            
            return results
            
        except Exception as e:
            print(f"❌ Prediction error: {e}")
            raise
        finally:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                
def main():
    print("="*60)
    print("🚀 ADVANCED BINARY TEXT CLASSIFIER")
    print("="*60)
    
    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name()}")
    
    classifier = AdvancedBinaryClassifier()
    data_path = "/kaggle/input/finalfinal/yarbitslh.csv"
    
    try:
        classifier.load_data(data_path)
        texts, labels = classifier.prepare_data()
        
        text_train, text_val, y_train, y_val = train_test_split(
            texts, labels,
            test_size=0.15,
            random_state=42,
            stratify=labels
        )
        
        print(f"📊 Training: {len(text_train)} | Validation: {len(text_val)}")
        
        results = classifier.train(
            text_train, y_train, text_val, y_val,
            save_model_path='./advanced_binary_model',
            num_epochs=5,
            batch_size=12,
            learning_rate=1e-5
        )
        sample_texts = [
            "This is a test message",
            "Another example text"
        ]
        
        detailed_results = classifier.predict_with_analysis(
            sample_texts, 
            './advanced_binary_model'
        )
        
        print("\n🔮 DETAILED PREDICTIONS:")
        for result in detailed_results:
            print(f"Text: {result['text']}")
            print(f"Prediction: {result['prediction']} ({result['confidence_level']} confidence)")
            print(f"Confidence: {result['confidence']:.1%}")
            print(f"Probabilities: Not Allowed={result['prob_not_allowed']:.1%}, Allowed={result['prob_allowed']:.1%}")
            print("-" * 50)
        
    except Exception as e:
        print(f"❌ Error: {e}")
        raise

if __name__ == "__main__":
    main()


2025-09-08 18:29:03.171436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757356143.378573      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757356143.436575      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🚀 ADVANCED BINARY TEXT CLASSIFIER
PyTorch: 2.6.0+cu124
CUDA: True
GPU: Tesla P100-PCIE-16GB
✅ GPU: Tesla P100-PCIE-16GB
✅ Memory: 17.1 GB


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

📂 Loading: /kaggle/input/finaldatamodel/finalDataModel.csv
✅ Loaded with utf-8
📊 Shape: (19388, 2)
📂 Columns: ['text', 'label']
🔄 Preparing data...
📊 Label distribution: {0: 10173, 1: 9215}
⚖️ Class weights: [0.9529146 1.0519805]
📊 Training: 16479 | Validation: 2909
🚀 Advanced Training Started...
📊 Training: 16479 | Validation: 2909
🔄 Creating dataset with 16479 samples...
🔄 Creating dataset with 2909 samples...
🔄 Loading enhanced model...


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

✅ Model on: cuda:0
🚀 Training with advanced techniques...
{'loss': 0.7015, 'grad_norm': 2.4555821418762207, 'learning_rate': 6.976744186046513e-07, 'epoch': 0.036390101892285295}
{'loss': 0.7019, 'grad_norm': 1.9594006538391113, 'learning_rate': 1.424418604651163e-06, 'epoch': 0.07278020378457059}
{'loss': 0.6989, 'grad_norm': 3.6748130321502686, 'learning_rate': 2.1511627906976745e-06, 'epoch': 0.1091703056768559}
{'loss': 0.6893, 'grad_norm': 2.1100189685821533, 'learning_rate': 2.8779069767441865e-06, 'epoch': 0.14556040756914118}
{'loss': 0.6644, 'grad_norm': 3.3923587799072266, 'learning_rate': 3.6046511627906977e-06, 'epoch': 0.1819505094614265}
{'loss': 0.597, 'grad_norm': 3.772350788116455, 'learning_rate': 4.302325581395349e-06, 'epoch': 0.2183406113537118}
{'loss': 0.4882, 'grad_norm': 3.699237108230591, 'learning_rate': 5e-06, 'epoch': 0.2547307132459971}
{'loss': 0.4123, 'grad_norm': 4.390717506408691, 'learning_rate': 5.697674418604652e-06, 'epoch': 0.29112081513828236}
{'

  0%|          | 0/8 [00:00<?, ?it/s]


📊 PREDICTION ANALYSIS:
Average confidence: 99.1%
High confidence predictions: 8/8 (100.0%)

✅ ALLOWED SAMPLES:
✅ allowed (99.3%): Hello, how are you today?
✅ allowed (98.9%): What's the weather like?
✅ allowed (99.6%): Thank you for your help
✅ allowed (99.1%): Good morning everyone
✅ allowed (99.1%): I hope you have a great day
✅ allowed (98.8%): This is a normal conversation
✅ allowed (99.3%): Can you help me with this question?
✅ allowed (98.8%): That's very interesting, tell me more
🔄 Loading model for enhanced prediction...
🔮 Analyzing 4 samples...


  0%|          | 0/4 [00:00<?, ?it/s]


📊 PREDICTION ANALYSIS:
Average confidence: 97.5%
High confidence predictions: 4/4 (100.0%)

❌ NOT ALLOWED SAMPLES:
❌ allowed (96.7%): This contains inappropriate content
❌ allowed (98.7%): Offensive language example
❌ allowed (97.2%): Hate speech sample
❌ allowed (97.3%): Violent content description
🔄 Loading model for enhanced prediction...
🔮 Analyzing 2 samples...


  0%|          | 0/2 [00:00<?, ?it/s]


📊 PREDICTION ANALYSIS:
Average confidence: 98.8%
High confidence predictions: 2/2 (100.0%)

🔮 DETAILED PREDICTIONS:
Text: This is a test message
Prediction: allowed (HIGH confidence)
Confidence: 98.6%
Probabilities: Not Allowed=1.4%, Allowed=98.6%
--------------------------------------------------
Text: Another example text
Prediction: allowed (HIGH confidence)
Confidence: 98.9%
Probabilities: Not Allowed=1.1%, Allowed=98.9%
--------------------------------------------------


In [29]:
# Quick test - run this in your notebook:
try:
    from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
    import torch
    
    # Load your model
    model = XLMRobertaForSequenceClassification.from_pretrained('./advanced_binary_model')
    tokenizer = XLMRobertaTokenizer.from_pretrained('./advanced_binary_model')
    
    # Test it
    test_text ="Genocide in gaza increases day after day"
    inputs = tokenizer(test_text, return_tensors='pt', padding=True, truncation=True)
    
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=-1).item()
        conf = torch.softmax(outputs.logits, dim=-1).max().item()
    
    result = "allowed" if pred == 1 else "not allowed"
    print(f"✅ SUCCESS! Model works: '{result}' (confidence: {conf:.1%})")
    
except Exception as e:
    print(f"❌ Error: {e}")

✅ SUCCESS! Model works: 'allowed' (confidence: 93.4%)


In [32]:
try:
    from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
    import torch

    # Function to load a model and test a text
    def test_model(model_path, test_text):
        # Load model + tokenizer
        model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
        tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)

        # Prepare inputs
        inputs = tokenizer(test_text, return_tensors='pt', padding=True, truncation=True)

        # Run inference
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=-1).item()
            conf = torch.softmax(outputs.logits, dim=-1).max().item()

        result = "allowed" if pred == 1 else "not allowed"
        return {"model": model_path, "prediction": result, "confidence": conf}

    # 🔎 Your test text
    test_text = ""

    # Two models to compare
    model1_path = "./advanced_binary_model"
    model2_path = "/kaggle/input/model3/transformers/default/1/ainexus_model"   # ⚠️ change this to your second model folder

    # Run both models
    result1 = test_model(model1_path, test_text)
    result2 = test_model(model2_path, test_text)

    # Display comparison
    print("🔮 COMPARISON RESULTS:")
    for r in [result1, result2]:
        print(f"Model: {r['model']}")
        print(f"Prediction: {r['prediction']} (confidence: {r['confidence']:.1%})")
        print("-" * 50)

except Exception as e:
    print(f"❌ Error: {e}")


🔮 COMPARISON RESULTS:
Model: ./advanced_binary_model
Prediction: allowed (confidence: 95.9%)
--------------------------------------------------
Model: /kaggle/input/model3/transformers/default/1/ainexus_model
Prediction: allowed (confidence: 89.7%)
--------------------------------------------------
