# Learn2Clean Example: Temperature Rain Dataset

This notebook demonstrates how to apply Learn2Clean to the Temperature Rain time series dataset. Since Learn2Clean is designed for tabular data cleaning, we'll focus on cleanable features that can be extracted from the time series data.

## 0) Setup Learn2Clean Environment

In [None]:
# Install Learn2Clean in development mode
import os
if os.path.exists('../python-package'):
    %cd ../python-package
    !pip install -e .
    %cd ../examples
else:
    print("Learn2Clean python-package directory not found. Please check the path.")

## 1) Dataset Loading and Preparation

In [1]:
# Load required libraries
import pandas as pd
import numpy as np
import os
import zipfile
import sys
from sklearn.model_selection import train_test_split

def load_temperature_rain_dataset():
    """Load the Temperature Rain dataset as train/validation/test feature tables.

    Sources are tried in this order:

    1. Pickled splits under ``./temp_data/temperature_rain_cache`` when all
       three cache files exist and can be read.
    2. The TSF archive under ``./monash_tsf/data``, parsed with
       ``convert_tsf_to_dataframe`` (imported from ``./monash_tsf/utils``),
       summarised by ``convert_to_tabular_features``, split 60/20/20 with a
       fixed random state, and then written to the cache.
    3. ``create_synthetic_temperature_data()`` when the archive is missing or
       any step of option 2 raises.

    Returns:
        tuple: ``(train_df, val_df, test_df)`` pandas DataFrames.
    """
    print("Loading Temperature Rain dataset...")

    # Check if cached processed data exists
    cache_dir = './temp_data/temperature_rain_cache'
    train_cache = os.path.join(cache_dir, 'train_df.pkl')
    val_cache = os.path.join(cache_dir, 'val_df.pkl')
    test_cache = os.path.join(cache_dir, 'test_df.pkl')

    if all(os.path.exists(path) for path in (train_cache, val_cache, test_cache)):
        print("Loading cached temperature_rain data...")
        try:
            # NOTE: unpickling executes arbitrary code; only keep cache files
            # in this directory that were produced by this notebook.
            train_df = pd.read_pickle(train_cache)
            val_df = pd.read_pickle(val_cache)
            test_df = pd.read_pickle(test_cache)

            print(f"Cached Temperature Rain loaded: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")
            return train_df, val_df, test_df
        except Exception as e:
            # A corrupt or incompatible cache is not fatal: rebuild from source.
            print(f"Error loading cached data: {e}. Will reload from source.")

    try:
        # Add monash_tsf to path for utils (only once, so repeated calls do
        # not keep growing sys.path)
        monash_dir = './monash_tsf'
        if monash_dir not in sys.path:
            sys.path.append(monash_dir)
        from utils import convert_tsf_to_dataframe

        zip_path = './monash_tsf/data/temperature_rain_dataset_with_missing_values.zip'
        tsf_file = 'temperature_rain_dataset_with_missing_values.tsf'

        if not os.path.exists(zip_path):
            print(f"Temperature rain zip file not found at {zip_path}")
            print("Creating synthetic temperature data for demonstration...")
            return create_synthetic_temperature_data()

        # Extract the TSF file temporarily and parse it; the extracted copy is
        # removed even when extraction or parsing fails.
        os.makedirs('./temp_data', exist_ok=True)
        tsf_path = f'./temp_data/{tsf_file}'
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extract(tsf_file, './temp_data/')

            print("Parsing TSF file...")
            df, frequency, _forecast_horizon, contain_missing_values, _contain_equal_length = convert_tsf_to_dataframe(
                tsf_path,
                replace_missing_vals_with=np.nan,
                value_column_name="target"
            )
        finally:
            if os.path.exists(tsf_path):
                os.remove(tsf_path)

        print("TSF file parsed successfully:")
        print(f"  - Shape: {df.shape}")
        print(f"  - Frequency: {frequency}")
        print(f"  - Contains missing values: {contain_missing_values}")

        # Convert to tabular format for Learn2Clean
        tabular_data = convert_to_tabular_features(df)

        # 60% train, then the remaining 40% split evenly into validation/test
        train_df, temp_df = train_test_split(tabular_data, test_size=0.4, random_state=42)
        val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

        print(f"Temperature Rain loaded: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

        # Cache the processed data
        os.makedirs(cache_dir, exist_ok=True)
        train_df.to_pickle(train_cache)
        val_df.to_pickle(val_cache)
        test_df.to_pickle(test_cache)

        return train_df, val_df, test_df

    except Exception as e:
        # Deliberate best-effort: the notebook stays runnable without the real
        # data by falling back to synthetic features.
        print(f"Error loading Temperature Rain: {e}")
        print("Creating synthetic temperature data for demonstration...")
        return create_synthetic_temperature_data()

def _parse_series_values(raw_values):
    """Return ``(observed, n_raw)`` for one series.

    ``observed`` is a float array of the non-missing points and ``n_raw`` is
    the number of points before missing values were dropped.
    """
    if isinstance(raw_values, str):
        # Whitespace-separated numbers, e.g. "1.0 2.5 NaN"
        parsed = np.array([float(tok) for tok in raw_values.split()], dtype=float)
    else:
        # Scalars and any array-like (list, NumPy array, pandas array/Series)
        parsed = np.atleast_1d(np.asarray(raw_values, dtype=float)).ravel()
    observed = parsed[~np.isnan(parsed)]
    return observed, len(parsed)


def convert_to_tabular_features(df):
    """Convert time series data to tabular features suitable for Learn2Clean.

    Each row of ``df`` is one series whose values live in the ``'target'``
    column. Series with no observed (non-missing) value are skipped.

    Args:
        df: DataFrame with a ``'target'`` column and, optionally, a
            ``'series_name'`` or ``'station_id'`` column used as the item id
            (otherwise ``series_<index label>`` is used).

    Returns:
        DataFrame with one row per retained series and the columns
        ``item_id``, ``series_length`` (observed points), ``mean_value``,
        ``std_value``, ``min_value``, ``max_value``, ``median_value``,
        ``q25_value``, ``q75_value``, ``skewness``, ``kurtosis``,
        ``missing_ratio`` (missing points / raw points), ``zero_ratio``,
        ``trend`` (slope of a least-squares line over the observed points)
        and ``target_class`` (``'high'`` when the series mean is strictly
        above the median of all series means, else ``'low'``). An empty
        DataFrame is returned when no series has observed values.
    """
    print("Converting time series to tabular features...")

    n_series = len(df)
    # Resolve the id column once instead of per row
    id_col = next((c for c in ('series_name', 'station_id') if c in df.columns), None)

    records = []
    # Use the row position for progress reporting so non-integer index labels
    # do not break the modulo check
    for position, (idx, row) in enumerate(df.iterrows()):
        if position % 100 == 0:
            print(f"Processing series {position}/{n_series}")

        item_id = row[id_col] if id_col is not None else f'series_{idx}'

        observed, n_raw = _parse_series_values(row['target'])
        n_obs = len(observed)
        if n_obs == 0:
            continue

        as_series = pd.Series(observed)
        # A line cannot be fitted through a single point; report a flat trend
        if n_obs >= 2:
            trend = np.polyfit(np.arange(n_obs), observed, 1)[0]
        else:
            trend = 0.0

        records.append({
            'item_id': str(item_id),
            'series_length': n_obs,
            'mean_value': np.mean(observed),
            'std_value': np.std(observed),
            'min_value': np.min(observed),
            'max_value': np.max(observed),
            'median_value': np.median(observed),
            'q25_value': np.percentile(observed, 25),
            'q75_value': np.percentile(observed, 75),
            'skewness': as_series.skew(),
            'kurtosis': as_series.kurtosis(),
            # Measured against the raw length; after filtering there are no
            # NaNs left to count
            'missing_ratio': (n_raw - n_obs) / n_raw,
            'zero_ratio': np.sum(observed == 0) / n_obs,
            'trend': trend,
        })

    features = pd.DataFrame(records)
    if features.empty:
        return features

    # The class threshold needs every series mean, so it is derived in a
    # second pass over the collected features
    threshold = features['mean_value'].median()
    features['target_class'] = [
        'high' if mean > threshold else 'low' for mean in features['mean_value']
    ]
    return features

def create_synthetic_temperature_data():
    """Generate a reproducible stand-in dataset of per-station summary features.

    Seeds NumPy's global random state with 42, draws feature values for 100
    synthetic stations, labels each station 'high' when its base temperature
    exceeds 20 (otherwise 'low'), and splits the table 60/20/20.

    Returns:
        tuple: ``(train_df, val_df, test_df)`` pandas DataFrames.
    """
    print("Creating synthetic temperature data...")

    np.random.seed(42)
    station_count = 100
    rnd = np.random  # the module-level generator seeded on the line above

    records = []
    for station_idx in range(station_count):
        # The draws below must stay in this order to reproduce the same data
        length = rnd.randint(300, 700)
        centre = rnd.normal(20, 10)  # base temperature of the station

        records.append({
            'item_id': f'station_{station_idx}',
            'series_length': length,
            'mean_value': centre + rnd.normal(0, 2),
            'std_value': rnd.uniform(5, 15),
            'min_value': centre - rnd.uniform(15, 25),
            'max_value': centre + rnd.uniform(15, 25),
            'median_value': centre + rnd.normal(0, 1),
            'q25_value': centre - rnd.uniform(5, 10),
            'q75_value': centre + rnd.uniform(5, 10),
            'skewness': rnd.normal(0, 0.5),
            'kurtosis': rnd.normal(0, 1),
            'missing_ratio': rnd.uniform(0, 0.1),
            'zero_ratio': rnd.uniform(0, 0.05),
            'trend': rnd.normal(0, 0.01),
            'target_class': 'low' if not centre > 20 else 'high',
        })

    stations = pd.DataFrame(records)

    # 60% train, then the held-out 40% halved into validation and test
    train_df, holdout_df = train_test_split(stations, test_size=0.4, random_state=42)
    val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

    print(f"Synthetic Temperature Rain created: Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")
    return train_df, val_df, test_df

# Load the dataset: binds the train/validation/test feature tables as
# notebook-level names that the following cells read (display, CSV export).
train_df, val_df, test_df = load_temperature_rain_dataset()

Loading Temperature Rain dataset...
Error loading Temperature Rain: No module named 'utils'
Creating synthetic temperature data for demonstration...
Creating synthetic temperature data...
Synthetic Temperature Rain created: Train=60, Val=20, Test=20


In [None]:
# Display basic information about the dataset
if train_df is not None:
    print("Dataset shape:")
    print(f"Train: {train_df.shape}")
    print(f"Validation: {val_df.shape}")
    print(f"Test: {test_df.shape}")

    print("\nFirst few rows:")
    display(train_df.head())

    print("\nTarget class distribution:")
    print(train_df['target_class'].value_counts())

    print("\nFeature statistics:")
    print(train_df.describe())

    print("\nColumn info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    train_df.info()

## 2) Prepare Data for Learn2Clean

Learn2Clean works with CSV files, so we need to save our tabular features and create a reader function.

In [None]:
# Make sure the export directory exists before writing into it
os.makedirs('../datasets/temperature_rain', exist_ok=True)

# Write every split to its own CSV file; train, validation and test are never
# merged, so no split can leak into another.
if train_df is not None:
    csv_exports = (
        (train_df, '../datasets/temperature_rain/temperature_rain_train.csv'),
        (val_df, '../datasets/temperature_rain/temperature_rain_val.csv'),
        (test_df, '../datasets/temperature_rain/temperature_rain_test.csv'),
    )
    for split_frame, csv_path in csv_exports:
        split_frame.to_csv(csv_path, index=False, encoding='utf-8')

    print("Datasets saved successfully!")
    print(f"Train size: {len(train_df)}")
    print(f"Validation size: {len(val_df)}")
    print(f"Test size: {len(test_df)}")
    print("\nIMPORTANT: Train/val/test kept separate to avoid data leakage for AutoGluon!")

In [None]:
# Define dataset reader function for Learn2Clean
def read_dataset(name):
    """Return the exported CSV registered under *name* as a DataFrame.

    Recognised names are "temperature_rain" (training split) and
    "temperature_rain_test" (test split).

    Raises:
        ValueError: if *name* is not one of the recognised dataset names.
    """
    import pandas as pd

    known_sources = (
        ("temperature_rain", '../datasets/temperature_rain/temperature_rain_train.csv'),
        ("temperature_rain_test", '../datasets/temperature_rain/temperature_rain_test.csv'),
    )
    for dataset_name, csv_path in known_sources:
        if name == dataset_name:
            return pd.read_csv(csv_path, sep=',', encoding='utf-8')
    raise ValueError('Invalid dataset name')

# Smoke-test the reader on the training split and report what came back
test_load = read_dataset("temperature_rain")
loaded_shape = test_load.shape
loaded_columns = test_load.columns.tolist()
print(f"Loaded dataset shape: {loaded_shape}")
print(f"Columns: {loaded_columns}")

## 3) Data Profiling with Learn2Clean

In [None]:
import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import pandas as pd

# Profile the training split of the Temperature Rain features (plots disabled)
profiling_input = read_dataset('temperature_rain')
rd.profile_summary(profiling_input, plot=False)

In [None]:
# Inspect the class balance, then compare per-class feature means
temp_rain_data = read_dataset('temperature_rain')
print("Target variable (target_class) distribution:")
print(temp_rain_data['target_class'].value_counts())

print("\nNumerical feature correlations with target:")
numerical_cols = temp_rain_data.select_dtypes(include=[np.number]).columns

# Row masks for the two classes, built once and reused for every feature
is_high = temp_rain_data['target_class'] == 'high'
is_low = temp_rain_data['target_class'] == 'low'

for col in numerical_cols:
    if col == 'target_class':
        continue
    high_mean = temp_rain_data.loc[is_high, col].mean()
    low_mean = temp_rain_data.loc[is_low, col].mean()
    print(f"{col}: High={high_mean:.3f}, Low={low_mean:.3f}")

## 4) Learn2Clean Data Processing

Now we'll use Learn2Clean's Reader class to process the Temperature Rain tabular features.

In [None]:
# Create Learn2Clean reader with encoding for classification
# (comma-separated input, verbose output, encoding=True).
d_enc = rd.Reader(sep=',', verbose=True, encoding=True) 

# Process Temperature Rain dataset - ONLY TRAIN DATA for Learn2Clean optimization
# This avoids data leakage by not using validation data in preprocessing decisions
temp_rain_files = ["../datasets/temperature_rain/temperature_rain_train.csv"]
# 'target_class' is passed as the name of the target column. The result is
# indexed by 'train' and 'target' below and reused by the later pipeline cells.
temp_rain_encoded = d_enc.train_test_split(temp_rain_files, 'target_class')

# Report the shapes of the feature table and the target returned by the reader
print("\nProcessed dataset structure (TRAIN ONLY):")
print(f"Train shape: {temp_rain_encoded['train'].shape}")
print(f"Target shape: {temp_rain_encoded['target'].shape}")
print(f"Target name: {temp_rain_encoded['target'].name}")
print("\nNote: Only training data used for Learn2Clean to avoid data leakage!")

## 5) Manual Data Cleaning Pipeline for Time Series Features

Let's create a manual preprocessing pipeline focusing on the statistical features extracted from time series.

In [None]:
# Import Learn2Clean modules for manual pipeline
import learn2clean.loading.reader as rd 
import learn2clean.normalization.normalizer as nl 
import learn2clean.feature_selection.feature_selector as fs
import learn2clean.duplicate_detection.duplicate_detector as dd
import learn2clean.outlier_detection.outlier_detector as od
import learn2clean.imputation.imputer as imp
import learn2clean.classification.classifier as cl

# Manual preprocessing pipeline: five Learn2Clean steps applied in sequence.
# Each step is built from the current `manual_dataset`, and `manual_dataset`
# is then rebound to the result of that step's transform().

# Create a copy of the dataset for manual processing
# NOTE(review): if temp_rain_encoded is a plain dict, .copy() is shallow and
# the contained objects are shared with the original - confirm this is intended.
manual_dataset = temp_rain_encoded.copy()

print("Starting manual preprocessing pipeline for time series features...")

# Step 1: Handle missing values (strategy='median')
print("\n1. Imputation - Replace missing values")
imputer = imp.Imputer(dataset=manual_dataset, strategy='median', verbose=True)
manual_dataset = imputer.transform()

# Step 2: Outlier detection for statistical features
# (strategy='LOF' - presumably Local Outlier Factor; confirm against Learn2Clean)
print("\n2. Outlier Detection")
outlier_detector = od.Outlier_detector(dataset=manual_dataset, strategy='LOF', verbose=True)
manual_dataset = outlier_detector.transform()

# Step 3: Normalization of statistical features
# (strategy='standard'; the target column is passed via `exclude`)
print("\n3. Normalization")
normalizer = nl.Normalizer(dataset=manual_dataset, strategy='standard', exclude='target_class', verbose=True)
manual_dataset = normalizer.transform()

# Step 4: Feature selection (strategy='WR'; the target column is passed via `exclude`)
print("\n4. Feature Selection")
feat_selector = fs.Feature_selector(dataset=manual_dataset, strategy='WR', exclude='target_class', verbose=True)
manual_dataset = feat_selector.transform()

# Step 5: Duplicate detection (strategy='drop_duplicates')
print("\n5. Duplicate Detection")
dup_detector = dd.Duplicate_detector(dataset=manual_dataset, strategy='drop_duplicates', verbose=True)
manual_dataset = dup_detector.transform()

print("\nManual preprocessing completed!")
print(f"Final train shape: {manual_dataset['train'].shape}")
# NOTE(review): the Reader was given only the training CSV; confirm that the
# dataset still carries a usable 'test' entry, otherwise this line will fail.
print(f"Final test shape: {manual_dataset['test'].shape}")

## 6) Classification with Manual Pipeline

In [None]:
# Test classification with manually cleaned data
print("Testing classification with manually cleaned time series features...")

# Try different classifiers; each name is passed to cl.Classifier as `goal`
classifiers = ['CART', 'NB', 'LDA']

for clf_name in classifiers:
    # Best-effort loop: a failure of one classifier is reported and the loop
    # continues with the next one.
    try:
        print(f"\nTesting {clf_name} classifier:")
        classifier = cl.Classifier(dataset=manual_dataset, goal=clf_name, target_goal='target_class', verbose=True)
        # `result` is overwritten on every iteration and not read in this cell
        result = classifier.transform()
        print(f"{clf_name} classification completed successfully")
    except Exception as e:
        print(f"Error with {clf_name}: {e}")

## 7) Automated Learn2Clean Pipeline

Now let's use Learn2Clean's Q-learning approach to automatically find the best preprocessing pipeline for time series features.

In [None]:
import learn2clean.qlearning.qlearner as ql

# Create a fresh copy of the dataset for Learn2Clean
# (copied from the Reader output, not from the manually cleaned dataset)
l2c_dataset = temp_rain_encoded.copy()

print("Starting Learn2Clean automated pipeline for time series features...")
print("This may take several minutes to find the optimal preprocessing sequence.")

# Learn2Clean for CART classification
# NOTE(review): the meaning of threshold=0.6 and target_prepare=None is not
# visible in this notebook - confirm against the Qlearner documentation.
# file_name presumably determines the results file read in the Results
# Analysis cell ('save/temperature_rain_example_results.txt') - confirm.
l2c_classification = ql.Qlearner(
    dataset=l2c_dataset,
    goal='CART', 
    target_goal='target_class',
    threshold=0.6, 
    target_prepare=None, 
    file_name='temperature_rain_example', 
    verbose=False
)

# Run Learn2Clean optimization
l2c_classification.learn2clean()

## 8) Alternative Approaches for Time Series

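Since the features summarise time series data, let's also try a regression formulation (predicting `mean_value` from the other statistical features), which might be more suitable than the binary high/low classification.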

In [None]:
# Test Learn2Clean with regression using mean_value as target
# The whole experiment is best-effort: any exception is reported, not raised.
try:
    print("Testing Learn2Clean with regression approach...")
    
    # Create dataset for regression (predicting mean temperature)
    # NOTE(review): unlike the classification cells, the test CSV is passed to
    # the Reader here, and it also contains the 'mean_value' column - confirm
    # how Reader.train_test_split treats a second file that carries the target.
    regression_files = ["../datasets/temperature_rain/temperature_rain_train.csv", "../datasets/temperature_rain/temperature_rain_test.csv"]
    d_reg = rd.Reader(sep=',', verbose=True, encoding=False)  # No encoding for regression
    temp_rain_regression = d_reg.train_test_split(regression_files, 'mean_value')
    
    # Learn2Clean for regression; same threshold and target_prepare settings
    # as the classification run, with goal='LASSO' and its own file_name
    l2c_regression = ql.Qlearner(
        dataset=temp_rain_regression,
        goal='LASSO',  # Regression approach
        target_goal='mean_value',
        threshold=0.6,
        target_prepare=None,
        file_name='temperature_rain_regression_example',
        verbose=False
    )
    
    l2c_regression.learn2clean()
    print("Regression approach completed")
    
except Exception as e:
    print(f"Error with regression approach: {e}")

## 9) Random Baseline Comparison

In [None]:
# Compare with random preprocessing pipeline
# Starts again from a copy of the Reader output, like the Q-learning run.
random_dataset = temp_rain_encoded.copy()

print("Running random preprocessing pipeline for comparison...")

# Random preprocessing pipeline for CART classification
# No `threshold` or `file_name` is passed here, unlike the Q-learning runs.
random_pipeline = ql.Qlearner(
    dataset=random_dataset,
    goal='CART',
    target_goal='target_class',
    target_prepare=None, 
    verbose=False
)

# Best-effort: a failure of the random baseline is reported, not raised.
try:
    # The string argument is presumably the name used for the output file
    # checked in the Results Analysis cell - confirm.
    random_pipeline.random_cleaning('temperature_rain_random_example')
    print("Random pipeline completed successfully")
except Exception as e:
    print(f"Random pipeline error: {e}")

## 10) Results Analysis

In [None]:
# Print the tail of every results file that exists on disk
import os

results_files = [
    'save/temperature_rain_example_results.txt',
    'save/temperature_rain_regression_example_results.txt',
    'save/temperature_rain_random_example_results_file.txt'
]

for file_path in results_files:
    # Report missing files and move on to the next candidate
    if not os.path.exists(file_path):
        print(f"Results file not found: {file_path}")
        continue

    print(f"\n=== Results from {file_path} ===")
    with open(file_path, 'r') as f:
        content = f.read()
        tail = content[-500:]  # last 500 characters only
        print(tail)

## Summary

This notebook demonstrated how to apply Learn2Clean to the Temperature Rain time series dataset by extracting cleanable tabular features. The key adaptations were:

1. **Feature Extraction**: Converted time series data into statistical features (mean, std, min, max, skewness, etc.)
2. **Target Creation**: Created both classification (high/low temperature) and regression (mean temperature) targets
3. **Data Preparation**: Converted extracted features to CSV format for Learn2Clean compatibility
4. **Profiling**: Used Learn2Clean's profiling capabilities to understand feature distributions
5. **Manual Pipeline**: Created preprocessing pipeline for statistical features including outlier detection, normalization, and feature selection
6. **Automated Pipeline**: Used Learn2Clean's Q-learning for both classification and regression approaches
7. **Comparison**: Compared Learn2Clean results with random preprocessing baselines

**Important Note**: Since Learn2Clean is designed for tabular data cleaning rather than time series analysis, this approach focuses on cleaning statistical features extracted from the time series. For pure time series forecasting tasks, specialized time series preprocessing and AutoGluon's TimeSeriesPredictor would be more appropriate.

The extracted features allow Learn2Clean to optimize preprocessing for tasks like:
- Classifying weather stations by temperature patterns
- Predicting average temperature from statistical features
- Detecting anomalous weather stations based on their time series characteristics