In [None]:
!python fast_fixed_meta_learning_1.py

Learn2Clean components loaded successfully
Initial number of rows: 801
After deduplication: Number of rows: 800
Initial number of rows: 800
After deduplication: Number of rows: 799
Initial number of rows: 2000
After deduplication: Number of rows: 1998
Initial number of rows: 2000
After deduplication: Number of rows: 1998
Initial number of rows: 2000
After deduplication: Number of rows: 1998
Initial number of rows: 2000
After deduplication: Number of rows: 1998
Initial number of rows: 2000
After deduplication: Number of rows: 1998
Initial number of rows: 2000
After deduplication: Number of rows: 1998
Initial number of rows: 2000
After deduplication: Number of rows: 1998
Initial number of rows: 2000
After deduplication: Number of rows: 1998


2025-07-27 23:42:37,341 - INFO - FAST FIXED META-LEARNING EXPERIMENT - ANLI R1 ONLY
2025-07-27 23:42:37,341 - INFO - Start time: 2025-07-27 23:42:37
2025-07-27 23:42:37,341 - INFO - Loading ANLI R1 dataset...
2025-07-28 00:09:53,089 - INFO - ANLI R1 loaded in 27m 15.7s
2025-07-28 00:09:53,094 - INFO - Sizes - Train: 16946, Val: 1000, Test: 1000
2025-07-28 00:09:53,099 - INFO - 
2025-07-28 00:09:53,100 - INFO - PHASE 1: HYBRID EXPLORATORY META-LEARNING (Training data only)
2025-07-28 00:09:53,142 - INFO - Using 2000 training samples for meta-learning
2025-07-28 00:09:53,142 - INFO - Starting meta-learning...
2025-07-28 00:09:53,151 - INFO - Generation 1/5
2025-07-28 00:09:54,137 - INFO - Pipeline 1: fitness=0.3854, retention=0.40 (799/2000)
2025-07-28 00:09:54,137 - INFO - 🏆 New best fitness: 0.3854 (retention: 0.40)
2025-07-28 00:09:54,322 - INFO - Learned length thresholds: 331 - 477
2025-07-28 00:09:54,322 - INFO - Learned quality threshold: 0.661
2025-07-28 00:09:54,511 - INFO - Fil

: 

In [None]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
import os
import time
from datetime import datetime

# Experiment settings -- edit these to point at your own data.
TRAIN_FILE = 'train.csv'           # CSV the model is trained on
TEST_FILE = 'test.csv'             # CSV used for prediction (and evaluation when labelled)
MODEL_PATH = 'autogluon_models'    # handed to TabularPredictor as its `path`
TARGET_COLUMN = 'label'            # handed to TabularPredictor as its `label`

def run_autogluon_experiment(time_limit=600, presets='best_quality'):
    """Train and evaluate an AutoGluon ``TabularPredictor`` on the saved CSV files.

    The input locations and label name come from the module-level settings
    ``TRAIN_FILE``, ``TEST_FILE``, ``MODEL_PATH`` and ``TARGET_COLUMN``.
    The run proceeds in stages: validate inputs, load both CSVs, fit the
    predictor, print the leaderboard, predict on the test set (scoring it
    when the test CSV also carries the target column), save the predictions
    and, when it can be computed, the feature importance.

    Args:
        time_limit: Training budget in seconds, forwarded to ``fit``.
        presets: AutoGluon preset name forwarded to ``fit`` (for example
            'best_quality', 'high_quality', 'good_quality', 'medium_quality').

    Returns:
        None. Progress and results are printed. Predictions and feature
        importance are written to timestamped CSV files in the current
        working directory. Missing inputs, a missing target column, and
        training or prediction failures are reported with a message and
        end the run early.
    """

    print("🚀 Starting AutoGluon Experiment")
    print("=" * 60)

    # Bail out early with a clear message when an input file is missing.
    if not os.path.exists(TRAIN_FILE):
        print(f"❌ Train file '{TRAIN_FILE}' not found!")
        return

    if not os.path.exists(TEST_FILE):
        print(f"❌ Test file '{TEST_FILE}' not found!")
        return

    # Load data
    print("📂 Loading data...")
    start_time = time.time()

    train_data = pd.read_csv(TRAIN_FILE)
    test_data = pd.read_csv(TEST_FILE)

    load_time = time.time() - start_time

    print(f"✅ Data loaded successfully in {load_time:.2f} seconds")
    print(f"   Train shape: {train_data.shape}")
    print(f"   Test shape: {test_data.shape}")

    # Without the target column there is nothing to train on.
    if TARGET_COLUMN not in train_data.columns:
        print(f"❌ Target column '{TARGET_COLUMN}' not found in train data!")
        print(f"Available columns: {list(train_data.columns)}")
        return

    # A labelled test set lets us report accuracy and use held-out data
    # for feature importance further down.
    test_has_labels = TARGET_COLUMN in test_data.columns

    # Display data info
    print("\n📊 Dataset Information:")
    print(f"   Target column: {TARGET_COLUMN}")
    print(f"   Target distribution:")
    print(train_data[TARGET_COLUMN].value_counts().to_string())

    # Object-dtype columns other than the target are reported as text columns.
    text_columns = [col for col in train_data.columns if train_data[col].dtype == 'object' and col != TARGET_COLUMN]
    print(f"   Text columns: {text_columns}")

    # AutoGluon has no generic 'classification' problem type; its documented
    # classification options are 'binary' and 'multiclass', so pick between
    # them from the number of distinct labels in the training data.
    n_classes = train_data[TARGET_COLUMN].nunique()
    problem_type = 'binary' if n_classes <= 2 else 'multiclass'

    # Setup AutoGluon predictor
    print("\n🤖 Setting up AutoGluon TabularPredictor...")

    predictor = TabularPredictor(
        label=TARGET_COLUMN,
        path=MODEL_PATH,
        problem_type=problem_type,
        eval_metric='accuracy',
        verbosity=2
    )

    print(f"   Problem type: {problem_type}")
    print(f"   Time limit: {time_limit} seconds")
    print(f"   Presets: {presets}")

    # Train the model
    print(f"\n🏋️ Training AutoGluon model (started at {datetime.now().strftime('%H:%M:%S')})...")
    train_start = time.time()

    try:
        # 'auto' leaves CPU/GPU detection to AutoGluon. A DataFrame has no
        # `.device` attribute, so there is nothing to inspect here.
        predictor.fit(
            train_data=train_data,
            time_limit=time_limit,
            presets=presets,
            num_cpus='auto',
            num_gpus='auto'
        )

        train_time = time.time() - train_start
        print(f"✅ Training completed in {train_time/60:.2f} minutes")

    except Exception as e:
        # Deliberate best-effort: report the failure and stop the run.
        print(f"❌ Training failed: {e}")
        return

    # Get model information. No data is passed, so the leaderboard shows
    # validation scores rather than overfit scores on the training rows.
    print("\n📋 Model Information:")
    leaderboard = predictor.leaderboard(silent=True)
    print("Top 5 models:")
    print(leaderboard.head().to_string())

    # Make predictions on test set
    print("\n🔮 Making predictions on test set...")
    pred_start = time.time()
    accuracy = None  # stays None when the test set has no labels

    try:
        if test_has_labels:
            # Test data has labels - we can evaluate
            test_labels = test_data[TARGET_COLUMN]
            test_features = test_data.drop(columns=[TARGET_COLUMN])

            predictions = predictor.predict(test_features)
            pred_proba = predictor.predict_proba(test_features)

            # Calculate accuracy
            from sklearn.metrics import accuracy_score, classification_report
            accuracy = accuracy_score(test_labels, predictions)

            print(f"✅ Test Accuracy: {accuracy:.4f}")
            print("\n📊 Classification Report:")
            print(classification_report(test_labels, predictions))

        else:
            # Test data doesn't have labels - just predict
            predictions = predictor.predict(test_data)
            pred_proba = predictor.predict_proba(test_data)

            print(f"✅ Predictions generated for {len(predictions)} samples")

        pred_time = time.time() - pred_start
        print(f"⏱️ Prediction time: {pred_time:.2f} seconds")

    except Exception as e:
        print(f"❌ Prediction failed: {e}")
        return

    # One timestamp for every artifact of this run, so the output files
    # can be matched to each other afterwards.
    run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Save predictions
    print("\n💾 Saving predictions...")
    predictions_df = pd.DataFrame({
        'predictions': predictions
    })

    # Add one probability column per class when probabilities are tabular.
    if pred_proba is not None and hasattr(pred_proba, 'columns'):
        for col in pred_proba.columns:
            predictions_df[f'prob_{col}'] = pred_proba[col]

    predictions_file = f"predictions_{run_stamp}.csv"
    predictions_df.to_csv(predictions_file, index=False)
    print(f"✅ Predictions saved to: {predictions_file}")

    # Feature importance (optional: a failure here does not abort the run).
    try:
        print("\n🎯 Feature Importance:")
        # Prefer held-out labelled data; importance measured on the rows
        # the models were fitted on is biased by overfitting. Training data
        # is only the fallback when the test set carries no labels.
        importance_data = test_data if test_has_labels else train_data
        feature_importance = predictor.feature_importance(importance_data)
        print("Top 10 most important features:")
        print(feature_importance.head(10).to_string())

        # Save feature importance
        importance_file = f"feature_importance_{run_stamp}.csv"
        feature_importance.to_csv(importance_file)
        print(f"✅ Feature importance saved to: {importance_file}")

    except Exception as e:
        print(f"⚠️ Could not compute feature importance: {e}")

    # Summary
    total_time = time.time() - start_time
    print("\n" + "=" * 60)
    print("🎉 AutoGluon Experiment Summary:")
    print(f"   Total time: {total_time/60:.2f} minutes")
    print(f"   Training time: {train_time/60:.2f} minutes")
    print(f"   Model path: {MODEL_PATH}")
    print(f"   Predictions file: {predictions_file}")
    if accuracy is not None:
        print(f"   Test accuracy: {accuracy:.4f}")
    print("=" * 60)

# Run the experiment with the file paths and target column configured above.
# The function prints its progress and results and returns nothing.
run_autogluon_experiment()

In [None]:
# Quick AutoGluon configuration: adjust the values below to match your data.
import os

TRAIN_FILE = "train.csv"       # training CSV file name
TEST_FILE = "test.csv"         # test CSV file name
TARGET_COLUMN = "label"        # name of the target column
TIME_LIMIT = 300               # training budget in seconds (5 minutes)
PRESETS = 'medium_quality'     # one of: 'best_quality', 'high_quality', 'good_quality', 'medium_quality'


def _file_status(path):
    """Return the found / not-found marker shown next to ``path``."""
    if os.path.exists(path):
        return '✅ Found'
    return '❌ Not found'


# Echo the active settings so the cell output records what was used.
print(
    "Current Configuration:",
    f"  Train file: {TRAIN_FILE}",
    f"  Test file: {TEST_FILE}",
    f"  Target column: {TARGET_COLUMN}",
    f"  Time limit: {TIME_LIMIT} seconds ({TIME_LIMIT/60:.1f} minutes)",
    f"  Quality preset: {PRESETS}",
    sep="\n",
)

# Report whether each input file is present in the working directory.
print("\nFile Status:")
for _path in (TRAIN_FILE, TEST_FILE):
    print(f"  {_path}: {_file_status(_path)}")

# Preview the training data, but only when the file is actually there.
if os.path.exists(TRAIN_FILE):
    import pandas as pd
    df = pd.read_csv(TRAIN_FILE)
    print("\nTrain Data Preview:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    if TARGET_COLUMN not in df.columns:
        # Point out the mismatch and list what is available instead.
        print(f"  ⚠️ Target column '{TARGET_COLUMN}' not found!")
        print(f"  Available columns: {list(df.columns)}")
    else:
        print("  Target distribution:")
        print(f"    {df[TARGET_COLUMN].value_counts().to_dict()}")