# Learn2Clean Example: ScienceQA Dataset

This notebook demonstrates how to apply Learn2Clean to the ScienceQA dataset, focusing on cleaning text features for multimodal question answering.

## 0) Setup Learn2Clean Environment

In [None]:
# Install Learn2Clean in development mode
import os
if os.path.exists('../python-package'):
    %cd ../python-package
    !pip install -e .
    %cd ../examples
else:
    print("Learn2Clean python-package directory not found. Please check the path.")

## 1) Dataset Loading and Preparation

In [None]:
# Load required libraries
import copy
import os

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

def load_scienceqa_dataset():
    """Load ScienceQA and flatten every example into a text-focused row.

    Fetches the ``scienceqa`` configuration of
    ``MothMalone/SLMS-KD-Benchmarks`` with ``load_dataset`` and converts each
    available split into a DataFrame. Only the splits that are missing from
    the download are carved out of the training data, so an existing
    validation or test split is never replaced.

    Returns:
        tuple: ``(train_df, val_df, test_df)``. If anything fails while
        loading, the error is printed and ``(None, None, None)`` is returned
        instead of raising.
    """
    print("Loading ScienceQA dataset...")

    try:
        dataset = load_dataset("MothMalone/SLMS-KD-Benchmarks", "scienceqa")

        def prepare_scienceqa_data(split_data):
            """Build a DataFrame with one row per example of ``split_data``."""
            data = []
            for item in split_data:
                # Text features only: the question, followed by each optional
                # part that is present, introduced by a bracketed marker
                text_features = item['question']
                if item['choices']:
                    text_features += " [CHOICES] " + " | ".join(item['choices'])
                if item['hint']:
                    text_features += " [HINT] " + item['hint']
                if item['lecture']:
                    text_features += " [LECTURE] " + item['lecture']

                row = {
                    'text': text_features,
                    'question': item['question'],
                    'task': item['task'],
                    'grade': item['grade'],
                    'subject': item['subject'],
                    'topic': item['topic'],
                    'category': item['category'],
                    'answer': item['answer']
                }

                # Numeric features derived from the text, for the cleaning steps
                row['text_length'] = len(text_features)
                row['question_length'] = len(item['question'])
                row['has_choices'] = 1 if item['choices'] else 0
                row['has_hint'] = 1 if item['hint'] else 0
                row['has_lecture'] = 1 if item['lecture'] else 0

                data.append(row)
            return pd.DataFrame(data)

        # Use existing train/val/test splits if available
        train_df = prepare_scienceqa_data(dataset['train'])
        val_df = prepare_scienceqa_data(dataset['validation']) if 'validation' in dataset else None
        test_df = prepare_scienceqa_data(dataset['test']) if 'test' in dataset else None

        # Create only the splits that are actually missing. Re-splitting both
        # whenever either one is absent would throw away a split that the
        # dataset already provides.
        if val_df is None and test_df is None:
            print("Creating validation/test splits from train data")
            temp_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)
            train_df, val_df = train_test_split(temp_df, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2
        elif val_df is None:
            print("Creating validation split from train data")
            train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
        elif test_df is None:
            print("Creating test split from train data")
            train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)

        print(f"ScienceQA loaded: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")
        return train_df, val_df, test_df

    except Exception as e:
        # Best effort: report the problem and let the later cells skip their work
        print(f"Error loading ScienceQA: {e}")
        return None, None, None

# Load the dataset. All three names are None when loading failed, because
# load_scienceqa_dataset() prints the error and returns (None, None, None).
train_df, val_df, test_df = load_scienceqa_dataset()

In [None]:
# Summarise the loaded splits (skipped entirely when loading failed)
if train_df is not None:
    print(
        "Dataset shape:",
        f"Train: {train_df.shape}",
        f"Validation: {val_df.shape}",
        f"Test: {test_df.shape}",
        sep="\n",
    )

    print("\nFirst few rows:")
    display(train_df.head())

    # Value counts of the target column and of the subject column
    print("\nAnswer distribution:", train_df['answer'].value_counts(), sep="\n")
    print("\nSubject distribution:", train_df['subject'].value_counts(), sep="\n")

    # Means of the engineered length and presence-flag columns
    print(
        "\nText feature statistics:",
        f"Average text length: {train_df['text_length'].mean():.2f}",
        f"Average question length: {train_df['question_length'].mean():.2f}",
        f"Percentage with choices: {train_df['has_choices'].mean()*100:.1f}%",
        f"Percentage with hints: {train_df['has_hint'].mean()*100:.1f}%",
        f"Percentage with lectures: {train_df['has_lecture'].mean()*100:.1f}%",
        sep="\n",
    )

## 2) Prepare Data for Learn2Clean

Learn2Clean works with CSV files, so we need to save our data and create a reader function.

In [None]:
# Make sure the output folder is present before anything is written
os.makedirs('../datasets/scienceqa', exist_ok=True)

# One CSV per split - train, validation and test are never merged,
# so evaluation rows cannot leak into the training file
if train_df is not None:
    train_df.to_csv(
        '../datasets/scienceqa/scienceqa_train.csv',
        encoding='utf-8',
        index=False,
    )
    val_df.to_csv(
        '../datasets/scienceqa/scienceqa_val.csv',
        encoding='utf-8',
        index=False,
    )
    test_df.to_csv(
        '../datasets/scienceqa/scienceqa_test.csv',
        encoding='utf-8',
        index=False,
    )

    # Report the row count written for each split
    print(
        "Datasets saved successfully!",
        f"Train size: {len(train_df)}",
        f"Validation size: {len(val_df)}",
        f"Test size: {len(test_df)}",
        "\nIMPORTANT: Train/val/test kept separate to avoid data leakage for AutoGluon!",
        sep="\n",
    )

In [None]:
# Define dataset reader function for Learn2Clean
def read_dataset(name):
    """Load one of the saved ScienceQA CSV splits for Learn2Clean processing.

    Args:
        name: Dataset key. ``"scienceqa"`` reads the training CSV,
            ``"scienceqa_val"`` the validation CSV and ``"scienceqa_test"``
            the test CSV, all located under ``../datasets/scienceqa/``.

    Returns:
        pandas.DataFrame parsed from the matching CSV file.

    Raises:
        ValueError: If ``name`` is not one of the known dataset keys.
    """
    import pandas as pd

    # Every split written by the export cell is reachable through the reader
    csv_paths = {
        "scienceqa": '../datasets/scienceqa/scienceqa_train.csv',
        "scienceqa_val": '../datasets/scienceqa/scienceqa_val.csv',
        "scienceqa_test": '../datasets/scienceqa/scienceqa_test.csv',
    }
    try:
        csv_path = csv_paths[name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names, which are just as invalid
        raise ValueError('Invalid dataset name') from None
    return pd.read_csv(csv_path, sep=',', encoding='utf-8')

# Smoke-test the reader on the training split
test_load = read_dataset("scienceqa")
print("Loaded dataset shape:", test_load.shape)
print("Columns:", list(test_load.columns))

## 3) Data Profiling with Learn2Clean

In [None]:
import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import pandas as pd

# Execute profiling function for ScienceQA dataset: the training CSV is read
# through read_dataset() and handed to Learn2Clean's profile_summary.
# NOTE(review): plot=False presumably disables plotting - confirm against
# learn2clean.loading.reader.
rd.profile_summary(read_dataset('scienceqa'), plot=False)

In [None]:
# Inspect the target column and the engineered length columns of the training CSV
scienceqa_data = read_dataset('scienceqa')
print("Target variable (answer) distribution:", scienceqa_data['answer'].value_counts(), sep="\n")
print("\nTarget variable head:", scienceqa_data['answer'].head(), sep="\n")

# Summary statistics of the two length columns
print("\nText features analysis:")
print("Text length statistics:", scienceqa_data['text_length'].describe(), sep="\n")
print("\nQuestion length statistics:", scienceqa_data['question_length'].describe(), sep="\n")

## 4) Learn2Clean Data Processing

Now we'll use Learn2Clean's Reader class to process the ScienceQA dataset.

In [None]:
# Create Learn2Clean reader with encoding for classification
# NOTE(review): encoding=True presumably switches on categorical encoding
# inside Reader - confirm against learn2clean.loading.reader.
d_enc = rd.Reader(sep=',', verbose=True, encoding=True) 

# Process ScienceQA dataset - ONLY TRAIN DATA for Learn2Clean optimization
# This avoids data leakage by not using validation data in preprocessing decisions
scienceqa_files = ["../datasets/scienceqa/scienceqa_train.csv"]
# 'answer' is passed as the target column. The returned object is indexed by
# 'train' and 'target' below, and by 'test' in a later cell.
scienceqa_encoded = d_enc.train_test_split(scienceqa_files, 'answer')

print("\nProcessed dataset structure (TRAIN ONLY):")
print(f"Train shape: {scienceqa_encoded['train'].shape}")
print(f"Target shape: {scienceqa_encoded['target'].shape}")
print(f"Target name: {scienceqa_encoded['target'].name}")
print("\nNote: Only training data used for Learn2Clean to avoid data leakage!")

## 5) Manual Data Cleaning Pipeline for Text Features

Let's create a manual preprocessing pipeline focusing on text feature cleaning.

In [None]:
# Import Learn2Clean modules for manual pipeline
import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import learn2clean.feature_selection.feature_selector as fs
import learn2clean.duplicate_detection.duplicate_detector as dd
import learn2clean.outlier_detection.outlier_detector as od
import learn2clean.imputation.imputer as imp
import learn2clean.classification.classifier as cl

# Create an independent copy of the dataset for manual processing.
# A plain .copy() on the encoded dataset is shallow: the new container would
# still point at the very same train/test/target objects, so any in-place
# change made by a cleaning step would also alter scienceqa_encoded and every
# later "fresh" copy taken from it. deepcopy duplicates the contents as well.
manual_dataset = copy.deepcopy(scienceqa_encoded)

print("Starting manual preprocessing pipeline for text features...")

# Each step below receives the current dataset and its transform() result
# becomes the input of the next step, so the order of the steps matters.
# NOTE(review): the strategy identifiers ('median', 'LOF', 'drop_duplicates',
# 'WR', 'standard') are passed through unchanged - confirm that each one is
# accepted by the installed Learn2Clean version.

# Step 1: Handle missing values in text features
print("\n1. Imputation - Replace missing values")
imputer = imp.Imputer(dataset=manual_dataset, strategy='median', verbose=True)
manual_dataset = imputer.transform()

# Step 2: Outlier detection for text length features
print("\n2. Outlier Detection for text features")
outlier_detector = od.Outlier_detector(dataset=manual_dataset, strategy='LOF', verbose=True)
manual_dataset = outlier_detector.transform()

# Step 3: Duplicate detection
print("\n3. Duplicate Detection")
dup_detector = dd.Duplicate_detector(dataset=manual_dataset, strategy='drop_duplicates', verbose=True)
manual_dataset = dup_detector.transform()

# Step 4: Feature selection focusing on text-related features
# ('answer' is excluded so the target column is not considered for removal)
print("\n4. Feature Selection for text features")
feat_selector = fs.Feature_selector(dataset=manual_dataset, strategy='WR', exclude='answer', verbose=True)
manual_dataset = feat_selector.transform()

# Step 5: Normalization of numerical text features
# ('answer' is excluded so the target values are left untouched)
print("\n5. Normalization of text length features")
normalizer = nl.Normalizer(dataset=manual_dataset, strategy='standard', exclude='answer', verbose=True)
manual_dataset = normalizer.transform()

print("\nManual preprocessing completed!")
print(f"Final train shape: {manual_dataset['train'].shape}")
print(f"Final test shape: {manual_dataset['test'].shape}")

## 6) Classification with Manual Pipeline

In [None]:
# Test classification with manually cleaned data
print("Testing classification with manually cleaned text features...")

# Try different classifiers suitable for multiclass classification
classifiers = ['CART', 'NB', 'LDA']

# Every classifier is built on the same manual_dataset object; a failure of one
# classifier is printed and the loop moves on to the next.
# NOTE(review): confirm that Classifier accepts the goal/target_goal keywords -
# a TypeError raised here would be caught below and only show up as a printed
# "Error with ..." line.
for clf_name in classifiers:
    try:
        print(f"\nTesting {clf_name} classifier:")
        classifier = cl.Classifier(dataset=manual_dataset, goal=clf_name, target_goal='answer', verbose=True)
        result = classifier.transform()
        print(f"{clf_name} classification completed successfully")
    except Exception as e:
        print(f"Error with {clf_name}: {e}")

## 7) Automated Learn2Clean Pipeline

Now let's use Learn2Clean's Q-learning approach to automatically find the best preprocessing pipeline for text features.

In [None]:
import learn2clean.qlearning.qlearner as ql

# Create a fresh, fully independent copy of the dataset for Learn2Clean.
# deepcopy is used because a plain .copy() is shallow and would share the
# underlying train/test/target objects with scienceqa_encoded, so in-place
# changes made during the search would leak into the other experiments.
l2c_dataset = copy.deepcopy(scienceqa_encoded)

print("Starting Learn2Clean automated pipeline for text features...")
print("This may take several minutes to find the optimal preprocessing sequence.")

# Learn2Clean for CART classification with 'answer' as the target column
l2c_classification = ql.Qlearner(
    dataset=l2c_dataset,
    goal='CART', 
    target_goal='answer',
    threshold=0.6, 
    target_prepare=None, 
    file_name='scienceqa_example', 
    verbose=False
)

# Run Learn2Clean optimization
l2c_classification.learn2clean()

## 8) Alternative Classifier Testing

In [None]:
# Test Learn2Clean with different classifiers
classifiers_to_test = ['NB', 'LDA']

for clf in classifiers_to_test:
    try:
        print(f"\nTesting Learn2Clean with {clf} classifier...")
        l2c_alt = ql.Qlearner(
            # Each run gets its own deep copy: a shallow .copy() would hand
            # every run the same underlying objects, so changes made in place
            # by one run could affect the next one.
            dataset=copy.deepcopy(scienceqa_encoded),
            goal=clf,
            target_goal='answer',
            threshold=0.6,
            target_prepare=None,
            # One result-file prefix per classifier, e.g. 'scienceqa_nb_example'
            file_name=f'scienceqa_{clf.lower()}_example',
            verbose=False
        )
        l2c_alt.learn2clean()
        print(f"Learn2Clean with {clf} completed")
    except Exception as e:
        # Keep going with the remaining classifiers; the failure is reported
        print(f"Error with {clf}: {e}")

## 9) Random Baseline Comparison

In [None]:
# Compare with random preprocessing pipeline.
# The baseline works on a deep copy so that it starts from the same untouched
# data as the other experiments; a shallow .copy() would share the underlying
# objects with scienceqa_encoded.
random_dataset = copy.deepcopy(scienceqa_encoded)

print("Running random preprocessing pipeline for comparison...")

# Random preprocessing pipeline for CART classification
random_pipeline = ql.Qlearner(
    dataset=random_dataset,
    goal='CART',
    target_goal='answer',
    target_prepare=None, 
    verbose=False
)

try:
    # The argument is the name under which the random run stores its results
    random_pipeline.random_cleaning('scienceqa_random_example')
    print("Random pipeline completed successfully")
except Exception as e:
    # The baseline is optional: report the failure instead of stopping the notebook
    print(f"Random pipeline error: {e}")

## 10) Results Analysis

The results of Learn2Clean and random cleaning are stored in the 'save' directory as text files.

In [None]:
# Check if results files exist and display them
import os

# Result files expected from the runs above, relative to the working directory
results_files = [
    'save/scienceqa_example_results.txt',
    'save/scienceqa_nb_example_results.txt',
    'save/scienceqa_lda_example_results.txt',
    'save/scienceqa_random_example_results_file.txt'
]

for file_path in results_files:
    # Report missing files and move on to the next candidate
    if not os.path.exists(file_path):
        print(f"Results file not found: {file_path}")
        continue

    print(f"\n=== Results from {file_path} ===")
    with open(file_path, 'r') as f:
        content = f.read()
    # Only the tail of the file is shown
    print(content[-500:])  # Show last 500 characters

## Summary

This notebook demonstrated how to apply Learn2Clean to the ScienceQA dataset with a focus on cleaning text features. The key steps were:

1. **Data Loading**: Loaded the ScienceQA dataset and prepared text features for analysis
2. **Text Feature Engineering**: Created additional features like text length, presence of choices/hints/lectures
3. **Data Preparation**: Converted the dataset to CSV format for Learn2Clean compatibility
4. **Profiling**: Used Learn2Clean's profiling capabilities to understand text feature distributions
5. **Manual Pipeline**: Created a comprehensive preprocessing pipeline focusing on text feature cleaning
6. **Automated Pipeline**: Used Learn2Clean's Q-learning approach to automatically optimize text feature preprocessing
7. **Multi-Classifier Testing**: Tested different classifiers (CART, NB, LDA) to find the best approach
8. **Comparison**: Compared Learn2Clean results with random preprocessing baselines

Learn2Clean uses Q-learning to search for a sequence of data cleaning operations that maximizes classification performance on the text-derived features of the ScienceQA dataset. Note that this notebook works with the text fields only (question, choices, hint, and lecture); the image inputs of the original multimodal dataset are not loaded or cleaned here.