## *Since this is not part of the main labeling process, some dependencies need to be installed (including the kernel).*

In [None]:
import sys
import os
from datetime import datetime, timedelta
from collections import defaultdict
import matplotlib.pyplot as plt

# Add the project root directory to the import path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(project_root)

# Imports for working with the data
from src.data_formats import TimeSeriesDataset, TimeSeries, TimeSeriesPoint, JSONAdapter
import json

## **Here you need to load your dataset from your source in TimeSeriesDataset format.**

In [None]:
# Placeholder cell: bind `dataset` to a TimeSeriesDataset obtained from any source.
# The bare annotation below only declares the expected type; it assigns nothing.
dataset: TimeSeriesDataset

## Loading data from JSON

In [None]:
def load_dataset_from_json(input_path):
    """Read a TimeSeriesDataset from the JSON file at ``input_path``.

    Loading is best-effort: any exception raised while creating the adapter
    or reading the file is reported on stdout and ``None`` is returned
    instead of the error being propagated.
    """
    try:
        loaded = JSONAdapter().load_data(input_path)
        print(f"Dataset successfully loaded from {input_path}")
    except Exception as exc:
        print(f"Error loading dataset: {exc}")
        return None
    return loaded

# Load dataset from JSON
input_file = "../data/csdeals_sales_dataset_.json"
dataset = load_dataset_from_json(input_file)

# load_dataset_from_json returns None on failure, so test for None explicitly.
# The dataset supports len() (used below), so a successfully loaded but empty
# dataset may evaluate as falsy and would otherwise be reported as a failure.
if dataset is not None:
    print(f"\nDataset loaded successfully!")
    print(f"File: {input_file}")
    
    # Show statistics
    print(f"\nDataset statistics:")
    print(f"    - Total time series: {len(dataset)}")
    print(f"    - Unlabeled series: {len(dataset.get_unlabeled_series())}")
    print(f"    - Labeled series: {len(dataset.get_labeled_series())}")
    
    # Show series examples (first three series only)
    if len(dataset) > 0:
        print(f"\nTime series examples:")
        for i, series in enumerate(dataset.series[:3]):
            # NOTE(review): assumes every series' metadata carries 'min_price' and 'max_price' - confirm
            print(f"  {i+1}. {series.name} - {series.length()} points, price: {series.metadata['min_price']:.2f}-{series.metadata['max_price']:.2f}")
    
    # Now the dataset is ready for use in the labeling application
    # or for further processing
else:
    print("Failed to load dataset")


# Filtering process based on trimmed_mean_hours + number of sales over different time periods
main idea:

1) *calculate the usual trimmed_mean for intervals, removing extreme min/max values*
2) *based on time periods, we establish limits:*

Example limits (the values actually applied are the ones set in `FILTER_SETTINGS` below): {1 day - 2 sales, 3 days - 5 sales, 7 days - 10 sales, 1 month - 18 sales}


In [None]:
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
from dataclasses import dataclass


@dataclass
class FilterResult:
    """Result of checking a single filter"""
    # True when the series satisfies the check
    passed: bool
    # The measured quantity the check compared against its threshold, if any
    value: Optional[float] = None
    # Extra information describing how the check was evaluated
    details: Optional[Dict[str, Any]] = None


class TimeSeriesFilter:
    """Keep only the time series that satisfy every configured check.

    Four checks are evaluated per series, each driven by ``settings``:

    * ``window_size`` - minimum number of points;
    * ``avg_trimmed_hours_max`` - upper bound for the trimmed mean interval
      between consecutive points, in hours;
    * ``max_consecutive_empty_days`` (with the optional
      ``consecutive_empty_days_period``, default 14) - longest allowed run of
      days without points inside the last N days;
    * ``periods`` - mapping ``days -> minimum number of points`` over the
      last ``days`` days.

    Every check result is written to ``ts.metadata["filter_results"]`` of the
    inspected series, i.e. the series passed in are modified in place. The
    stored ``passed`` flags are plain Python ``bool`` values and the averaged
    interval is a plain ``float``, so the metadata can be handed to
    ``json.dumps`` without NumPy scalar types leaking into it.
    """

    def __init__(self, settings: Dict[str, Any], reference_date: Optional[datetime] = None):
        """Store the settings and the reference date.

        Args:
            settings: Thresholds for the checks (see the class docstring).
            reference_date: "Today" for the period-based checks; defaults to
                ``datetime.now()`` taken at construction time.
        """
        self.settings = settings
        self.reference_date = reference_date or datetime.now()
    
    def filter_dataset(self, dataset: TimeSeriesDataset) -> TimeSeriesDataset:
        """Filter the entire dataset.

        Returns a new dataset holding only the series that pass all checks.
        The original metadata is carried over and extended with the filter
        settings and the series counts before/after filtering. Every series
        of ``dataset`` (kept or not) gets its ``filter_results`` metadata
        updated as a side effect.
        """
        filtered_series = [ts for ts in dataset.series if self._should_keep_series(ts)]
        
        return TimeSeriesDataset(
            name=f"{dataset.name}_filtered",
            description=f"Filtered version of {dataset.description or dataset.name}",
            series=filtered_series,
            metadata={
                **(dataset.metadata or {}),
                "filter_settings": self.settings,
                "original_count": len(dataset.series),
                "filtered_count": len(filtered_series)
            },
            created_at=dataset.created_at
        )
    
    def _should_keep_series(self, ts: TimeSeries) -> bool:
        """Run all checks on ``ts``, record them in its metadata, return the verdict."""
        # Initialize filter metadata (previous results, if any, are discarded)
        if ts.metadata is None:
            ts.metadata = {}
        filter_results: Dict[str, Any] = {}
        ts.metadata["filter_results"] = filter_results
        
        # Convert data to convenient format
        df = self._prepare_dataframe(ts)
        
        # All checks are always evaluated so that every result gets recorded
        checks = [
            ("window_size", self._check_window_size(df)),
            ("avg_trimmed_hours", self._check_avg_trimmed_hours(df)),
            ("max_consecutive_empty_days", self._check_consecutive_empty_days(df)),
            ("periods", self._check_periods(df))
        ]
        
        all_passed = True
        for check_name, result in checks:
            # bool() guards against numpy.bool_ ending up in the metadata
            passed = bool(result.passed)
            filter_results[check_name] = {
                "passed": passed,
                "value": result.value,
                "details": result.details
            }
            all_passed = all_passed and passed
        
        filter_results["overall_passed"] = all_passed
        return all_passed
    
    def _prepare_dataframe(self, ts: TimeSeries) -> pd.DataFrame:
        """Build a DataFrame with ``timestamp``, ``value`` and ``date`` columns.

        Rows are sorted by timestamp. Numeric timestamps are interpreted with
        ``datetime.fromtimestamp`` (local time); anything else goes through
        ``pd.to_datetime``. A series without points yields an empty frame
        that still has the three columns, so the checks can run on it.
        """
        rows = []
        for point in ts.points:
            # Convert timestamp to datetime if needed
            if isinstance(point.timestamp, (int, float)):
                dt = datetime.fromtimestamp(point.timestamp)
            else:
                dt = pd.to_datetime(point.timestamp)
            rows.append({'timestamp': dt, 'value': point.value})
        
        if not rows:
            # pd.DataFrame([]) has no columns at all, which would make the
            # sort below (and the .dt accessor in the checks) fail.
            return pd.DataFrame({
                'timestamp': pd.Series(dtype='datetime64[ns]'),
                'value': pd.Series(dtype='float64'),
                'date': pd.Series(dtype='object'),
            })
        
        df = pd.DataFrame(rows)
        df = df.sort_values('timestamp').reset_index(drop=True)
        df['date'] = df['timestamp'].dt.date
        return df
    
    def _check_window_size(self, df: pd.DataFrame) -> FilterResult:
        """Check minimum window size (number of points in the series)"""
        length = len(df)
        required = self.settings['window_size']
        
        return FilterResult(
            passed=bool(length >= required),
            value=length,
            details={"required": required, "actual": length}
        )
    
    def _check_avg_trimmed_hours(self, df: pd.DataFrame) -> FilterResult:
        """Check average time between sales (in hours), ignoring extreme intervals"""
        if len(df) < 2:
            return FilterResult(
                passed=False,
                value=None,
                details={"reason": "Not enough data points for interval calculation"}
            )
        
        # Calculate intervals between consecutive points
        intervals = df['timestamp'].diff().dropna()
        intervals_hours = intervals.dt.total_seconds() / 3600
        
        # Remove extreme values (trimming)
        trimmed_intervals = self._trim_outliers(intervals_hours.values)
        # float() turns numpy.float64 into a plain Python float
        avg_hours = float(np.mean(trimmed_intervals)) if len(trimmed_intervals) > 0 else float('inf')
        
        max_allowed = self.settings['avg_trimmed_hours_max']
        
        return FilterResult(
            passed=bool(avg_hours <= max_allowed),
            value=avg_hours,
            details={
                "max_allowed": max_allowed,
                "intervals_count": len(intervals_hours),
                "trimmed_count": len(trimmed_intervals)
            }
        )

    def _check_consecutive_empty_days(self, df: pd.DataFrame) -> FilterResult:
        """Check maximum number of consecutive empty days in the last N days.

        The window covers ``consecutive_empty_days_period`` calendar days
        ending on (and including) the reference date.
        """
        period_days = self.settings.get('consecutive_empty_days_period', 14)
        max_allowed = self.settings['max_consecutive_empty_days']
        
        # Define period for analysis (last N days from reference_date)
        reference_date = self.reference_date.date()
        start_date = reference_date - timedelta(days=period_days - 1)
        
        # Create full calendar for the period
        all_dates = pd.date_range(start=start_date, end=reference_date, freq='D').date
        
        # Get dates with sales in the period
        point_days = df['timestamp'].dt.date
        period_df = df[(point_days >= start_date) & (point_days <= reference_date)]
        sales_dates = set(period_df['date'].unique()) if not period_df.empty else set()
    
        # Find maximum consecutive sequence of empty days
        max_consecutive = 0
        current_consecutive = 0
        empty_periods = []
        period_start = None
        
        for day in all_dates:
            if day not in sales_dates:
                # Empty day: open a new run or extend the current one
                if current_consecutive == 0:
                    period_start = day
                current_consecutive += 1
                max_consecutive = max(max_consecutive, current_consecutive)
            else:
                # Day with sales - close the run that ended yesterday, if any
                if current_consecutive > 0:
                    empty_periods.append({
                        "start": str(period_start),
                        "end": str(day - timedelta(days=1)),
                        "days": current_consecutive
                    })
                current_consecutive = 0
        
        # If period ends with empty days
        if current_consecutive > 0:
            empty_periods.append({
                "start": str(period_start),
                "end": str(reference_date),
                "days": current_consecutive
            })
        
        return FilterResult(
            passed=bool(max_consecutive <= max_allowed),
            value=max_consecutive,
            details={
                "max_allowed": max_allowed,
                "period_start": str(start_date),
                "period_end": str(reference_date),
                "period_days": period_days,
                "total_days_checked": len(all_dates),
                "sales_days": len(sales_dates),
                "empty_days": len(all_dates) - len(sales_dates),
                "empty_periods": empty_periods
            }
        )

    def _check_periods(self, df: pd.DataFrame) -> FilterResult:
        """Check period requirements.

        For every ``days -> min_sales`` entry of ``settings['periods']`` the
        number of points within the last ``days`` calendar days (reference
        date included) must be at least ``min_sales``. ``value`` is always
        ``None`` for this check; per-period outcomes are in ``details``.
        """
        if df.empty:
            return FilterResult(
                passed=False,
                value=None,
                details={"reason": "No data available"}
            )
        
        # Use passed date or current date
        reference_date = self.reference_date.date()
        
        # Calendar day of every point; computed once, reused for each period
        point_days = df['timestamp'].dt.date
        
        period_results = {}
        all_passed = True
        
        for period_days, min_sales in self.settings['periods'].items():
            start_date = reference_date - timedelta(days=period_days - 1)
            
            # Count sales in the period
            in_period = (point_days >= start_date) & (point_days <= reference_date)
            actual_sales = int(in_period.sum())
            
            passed = actual_sales >= min_sales
            all_passed = all_passed and passed
            
            period_results[f"last_{period_days}_days"] = {
                "required": min_sales,
                "actual": actual_sales,
                "passed": passed,
                "period_start": str(start_date),
                "period_end": str(reference_date),
                "reference_date": str(reference_date)
            }
        
        return FilterResult(
            passed=bool(all_passed),
            value=None,
            details=period_results
        )

    def _trim_outliers(self, values: np.ndarray, trim_percent: float = 0.1) -> np.ndarray:
        """Remove outliers from value array.

        Keeps the values lying between the ``trim_percent`` and
        ``1 - trim_percent`` percentiles (both bounds inclusive). Arrays with
        two or fewer elements are returned unchanged.
        """
        if len(values) <= 2:
            return values
        
        q_low = np.percentile(values, trim_percent * 100)
        q_high = np.percentile(values, (1 - trim_percent) * 100)
        
        return values[(values >= q_low) & (values <= q_high)]


# Function for convenient usage
def apply_filter(dataset: TimeSeriesDataset, 
                filter_settings: Dict[str, Any], 
                reference_date: Optional[datetime] = None) -> TimeSeriesDataset:
    """
    Run a one-off TimeSeriesFilter over a dataset.
    
    Args:
        dataset: Dataset whose series are checked
        filter_settings: Thresholds handed to TimeSeriesFilter
        reference_date: Date the period checks are anchored to; when omitted
            the filter falls back to the current time
    
    Returns:
        The dataset produced by TimeSeriesFilter.filter_dataset
    """
    return TimeSeriesFilter(filter_settings, reference_date).filter_dataset(dataset)



# Function for analyzing filter results
def analyze_filter_results(dataset: TimeSeriesDataset) -> Dict[str, Any]:
    """Summarize the ``filter_results`` metadata stored on the series.

    Returns a dict with the number of series, how many passed overall, the
    overall pass rate and, per filter, the passed count / total / pass rate.
    Series with empty or missing metadata count as not passed. Rates are 0
    when the dataset has no series.
    """
    filter_names = ["window_size", "avg_trimmed_hours", "max_consecutive_empty_days", "periods"]
    total = len(dataset.series)

    def rate(count):
        return count / total if total > 0 else 0

    overall_hits = 0
    hits = {name: 0 for name in filter_names}
    for ts in dataset.series:
        if not ts.metadata:
            continue
        recorded = ts.metadata.get("filter_results", {})
        if recorded.get("overall_passed", False):
            overall_hits += 1
        for name in filter_names:
            if recorded.get(name, {}).get("passed", False):
                hits[name] += 1

    per_filter = {}
    for name in filter_names:
        per_filter[name] = {
            "passed": hits[name],
            "total": total,
            "pass_rate": rate(hits[name]),
        }

    return {
        "total_series": total,
        "passed_series": overall_hits,
        "overall_pass_rate": rate(overall_hits),
        "filter_stats": per_filter,
    }


In [None]:
# Thresholds consumed by TimeSeriesFilter; a series is kept only if every check passes.
FILTER_SETTINGS = {
    'window_size': 10,  # Minimum number of points a series must contain
    'avg_trimmed_hours_max': 70,  # Upper bound for the trimmed mean interval between points, in hours
    'max_consecutive_empty_days': 5,  # No more than 5 consecutive gap days
    'consecutive_empty_days_period': 14,  # Within the last 14 days
    # Maps "last N days" -> minimum number of points required in that window
    'periods': {
        3: 2,
        7: 5,
        14: 9,
    }
}


# Apply filter
filtered_dataset = apply_filter(dataset, FILTER_SETTINGS)


# Analyze results
# The unfiltered dataset is analyzed on purpose: filtering records its results
# in each series' metadata in place, so rejected series are counted as well.
results = analyze_filter_results(dataset)
print(f"Passed filtering: {results['passed_series']} out of {results['total_series']}")
print(f"Overall pass rate: {results['overall_pass_rate']:.2%}")


# Detailed statistics for each filter
for filter_name, stats in results['filter_stats'].items():
    print(f"{filter_name}: {stats['passed']}/{stats['total']} ({stats['pass_rate']:.2%})")


## Visualization of accepted series

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime, timedelta


def show_accepted_series_analysis(dataset, max_examples=10, filter_settings=None):
    """Show analysis of accepted series with individual plots and timestamps, visualizing filter periods.

    Prints overall acceptance statistics, then draws one matplotlib figure
    per accepted series (best quality score first, at most ``max_examples``).
    Each figure shows the points of the longest configured period, a virtual
    "current date" point, the period bands and a text box with the recorded
    filter results.

    Args:
        dataset: Dataset whose series carry ``filter_results`` metadata.
            Series without metadata are treated as not accepted.
        max_examples: Maximum number of series to plot.
        filter_settings: Filter settings; when ``None`` they are read from
            ``dataset.metadata['filter_settings']`` if available.

    Returns:
        None. Output goes to stdout and matplotlib.
    """
    
    print(f"\nAnalysis of accepted series (showing up to {max_examples} examples):")
    
    if not dataset or len(dataset.series) == 0:
        print("No accepted series for analysis")
        return
    
    def get_filter_results(series):
        # metadata may legitimately be None for series that were never filtered
        return (series.metadata or {}).get('filter_results', {})
    
    def mark(flag):
        return '✓' if flag else '✗'
    
    # Collect only accepted series
    accepted_series = [
        series for series in dataset.series
        if get_filter_results(series).get('overall_passed', False)
    ]
    
    if not accepted_series:
        print("No accepted series found")
        return
    
    print(f"\n=== Overall statistics ===")
    print(f"Total accepted: {len(accepted_series)}")
    print(f"Total in dataset: {len(dataset.series)}")
    print(f"Acceptance rate: {len(accepted_series)/len(dataset.series)*100:.1f}%")
    
    # Get filter settings
    if filter_settings is None:
        # Try to extract from dataset metadata
        filter_settings = (dataset.metadata or {}).get('filter_settings', {})
    
    periods = filter_settings.get('periods', {14: 9})
    max_period_days = max(periods.keys()) if periods else 14
    current_date = datetime.now()
    
    # Sort by quality (fewer gaps = better)
    def get_quality_score(series):
        filter_results = get_filter_results(series)
        consecutive_empty = filter_results.get('max_consecutive_empty_days', {}).get('value', 0)
        avg_hours = filter_results.get('avg_trimmed_hours', {}).get('value', 999)
        # A recorded value of None gets the same fallback as a missing entry
        if consecutive_empty is None:
            consecutive_empty = 0
        if avg_hours is None:
            avg_hours = 999
        return consecutive_empty * 10 + avg_hours  # Penalty for gaps is higher
    
    sorted_accepted = sorted(accepted_series, key=get_quality_score)
    
    # Show best examples
    examples_to_show = min(max_examples, len(sorted_accepted))
    print(f"\n=== Showing {examples_to_show} best examples ===")
    
    for i in range(examples_to_show):
        series = sorted_accepted[i]
        filter_results = get_filter_results(series)
        
        # Prepare data; list() makes a private copy so that the virtual point
        # appended below can never end up in the series' own storage
        values = list(series.get_values())
        
        # Convert timestamp to datetime
        dates_all = [
            datetime.fromtimestamp(ts) if isinstance(ts, (int, float))
            else pd.to_datetime(ts).to_pydatetime()
            for ts in series.get_timestamps()
        ]
        
        # Filter data by maximum period
        start_date = current_date - timedelta(days=max_period_days)
        recent = [(dt, val) for dt, val in zip(dates_all, values) if dt >= start_date]
        
        if recent:
            dates_filtered = [dt for dt, _ in recent]
            values_filtered = [val for _, val in recent]
        else:
            # If no data in period, take last points (slicing copies the lists)
            dates_filtered = dates_all[-5:]
            values_filtered = values[-5:]
        
        # Add current date as virtual point
        dates_filtered.append(current_date)
        values_filtered.append(values_filtered[-1] if values_filtered else 0)
        
        # Create plot
        plt.figure(figsize=(14, 8))
        
        # Calculate intervals between sales in hours (last one runs up to "now")
        diffs_hours = [
            round((later - earlier).total_seconds() / 3600, 2)
            for earlier, later in zip(dates_filtered, dates_filtered[1:])
        ]
        
        print(f"Series {i+1} - Time intervals (hours): {diffs_hours}")
        
        # Plot historical sales
        plt.plot(dates_filtered[:-1], values_filtered[:-1], 'g-o', 
                markersize=6, linewidth=2, label='Historical sales')
        
        # Current date
        plt.plot(dates_filtered[-1], values_filtered[-1], 'r*', 
                markersize=12, label='Current date')
        
        # Dashed line to current date
        if len(dates_filtered) > 1:
            plt.plot([dates_filtered[-2], dates_filtered[-1]], 
                    [values_filtered[-2], values_filtered[-1]], 
                    'r--', alpha=0.7, linewidth=1)
        
        # Highlight periods with colored bands
        colors = ['lightblue', 'lightgreen', 'lightyellow', 'lightcoral', 'lightpink']
        for idx, (period_days, min_sales) in enumerate(sorted(periods.items())):
            period_start = current_date - timedelta(days=period_days)
            plt.axvspan(period_start, current_date, alpha=0.2, 
                       color=colors[idx % len(colors)], 
                       label=f'{period_days}d period (need {min_sales} sales)')
        
        # Plot formatting
        quality_score = get_quality_score(series)
        plt.xlabel('Time', fontsize=12)
        plt.ylabel('Price', fontsize=12)
        plt.title(f'ACCEPTED SERIES #{i+1}: {series.name or series.id}\n'
                 f'Quality Score: {quality_score:.1f} (lower = better)', 
                 fontsize=14, fontweight='bold')
        plt.grid(True, alpha=0.3)
        plt.legend(fontsize=10)
        plt.xticks(rotation=45)
        
        # Information block with filter results
        info_lines = ["FILTER RESULTS:"]
        
        # Window size
        window_result = filter_results.get('window_size', {})
        info_lines.append(f"• Window size: {window_result.get('value', 'N/A')} "
                         f"({mark(window_result.get('passed'))})")
        
        # Average trimmed hours (value is None when it could not be computed)
        avg_hours_result = filter_results.get('avg_trimmed_hours', {})
        avg_value = avg_hours_result.get('value', 0)
        avg_text = f"{avg_value:.1f}h" if avg_value is not None else "N/A"
        info_lines.append(f"• Avg interval: {avg_text} "
                         f"({mark(avg_hours_result.get('passed'))})")
        
        # Consecutive empty days
        empty_days_result = filter_results.get('max_consecutive_empty_days', {})
        info_lines.append(f"• Max gaps: {empty_days_result.get('value', 0)} days "
                         f"({mark(empty_days_result.get('passed'))})")
        
        # Periods
        periods_result = filter_results.get('periods', {})
        info_lines.append("• Periods:")
        if periods_result.get('details'):
            for period_key, period_data in periods_result['details'].items():
                period_days = period_key.replace('last_', '').replace('_days', '')
                info_lines.append(f"   - {period_days}d: {period_data['actual']}/{period_data['required']} "
                                f"({mark(period_data['passed'])})")
        
        info_text = '\n'.join(info_lines)
        
        plt.text(0.02, 0.98, info_text, transform=plt.gca().transAxes, 
                verticalalignment='top', fontsize=9,
                bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.8))
        
        plt.tight_layout()
        plt.show()
        
        print(f"   Series {i+1}: {series.name or series.id} - Quality Score: {quality_score:.1f}")


def show_filter_statistics(dataset, filter_settings):
    """Show detailed filter statistics.

    Prints, for the whole dataset and for each individual filter, how many
    series passed/failed and the min/max/mean of the recorded values split
    by outcome. Series without metadata count as failed; an empty dataset
    prints 0.0% rates instead of raising.

    Args:
        dataset: Dataset whose series carry ``filter_results`` metadata.
        filter_settings: Not used by this function; kept so existing calls
            keep working.

    Returns:
        None. Output goes to stdout.
    """
    
    def percent(part, whole):
        # Guard against division by zero for an empty dataset
        return part / whole * 100 if whole else 0.0
    
    print("\n" + "="*60)
    print("DETAILED FILTER STATISTICS")
    print("="*60)
    
    # metadata may be None for series that were never filtered
    all_results = [(s.metadata or {}).get('filter_results', {}) for s in dataset.series]
    
    total_series = len(all_results)
    passed_series = sum(1 for fr in all_results if fr.get('overall_passed', False))
    
    print(f"Overall: {passed_series}/{total_series} ({percent(passed_series, total_series):.1f}%) passed")
    
    # Analysis for each filter
    filters = ['window_size', 'avg_trimmed_hours', 'max_consecutive_empty_days', 'periods']
    
    for filter_name in filters:
        print(f"\n📊 {filter_name.upper()}:")
        
        passed = 0
        failed_values = []
        passed_values = []
        
        for fr in all_results:
            result = fr.get(filter_name, {})
            is_passed = bool(result.get('passed'))
            if is_passed:
                passed += 1
            value = result.get('value')
            if value is not None:
                (passed_values if is_passed else failed_values).append(value)
        
        failed = total_series - passed
        print(f"   Passed: {passed}/{total_series} ({percent(passed, total_series):.1f}%)")
        print(f"   Failed: {failed}/{total_series} ({percent(failed, total_series):.1f}%)")
        
        if failed_values:
            print(f"   Failed values - min: {min(failed_values):.2f}, "
                  f"max: {max(failed_values):.2f}, "
                  f"avg: {np.mean(failed_values):.2f}")
        
        if passed_values:
            print(f"   Passed values - min: {min(passed_values):.2f}, "
                  f"max: {max(passed_values):.2f}, "
                  f"avg: {np.mean(passed_values):.2f}")


# Usage:
# Statistics are computed on the unfiltered dataset so that rejected series are
# included; the plots use the filtered dataset, which holds only the kept series.
show_filter_statistics(dataset, FILTER_SETTINGS)
show_accepted_series_analysis(filtered_dataset, max_examples=10, filter_settings=FILTER_SETTINGS)


## Visualization of rejected series


In [None]:
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import numpy as np



def show_rejected_for_filter(dataset, filter_name, max_examples=10, filter_settings=None):
    """
    Show rejected series for a specific filter, sorted by closeness to acceptance
    
    Every series whose recorded result for ``filter_name`` is "not passed" is
    collected (regardless of the other filters), ordered by
    calculate_filter_distance, summarized on stdout and the closest
    ``max_examples`` are plotted with matplotlib.
    
    Args:
        dataset: Dataset with filter results; series without metadata are skipped
        filter_name: Filter name ('window_size', 'avg_trimmed_hours', 'max_consecutive_empty_days', 'periods')
        max_examples: Maximum number of examples to show
        filter_settings: Filter settings; when None they are read from
            dataset.metadata['filter_settings'] if available
    
    Returns:
        None
    """
    
    print(f"\n{'='*60}")
    print(f"REJECTED SERIES ANALYSIS FOR FILTER: {filter_name.upper()}")
    print(f"{'='*60}")
    
    if not dataset or len(dataset.series) == 0:
        print("No series in dataset")
        return
    
    # Resolve the settings first: the distance calculation below needs them
    if filter_settings is None:
        filter_settings = (dataset.metadata or {}).get('filter_settings', {})
    
    # Collect series rejected specifically by this filter
    rejected_by_filter = []
    
    for series in dataset.series:
        # metadata may be None for series that were never filtered
        filter_results = (series.metadata or {}).get('filter_results', {})
        
        if filter_name not in filter_results:
            continue
        filter_result = filter_results[filter_name]
        
        if not filter_result.get('passed', True):
            # Calculate distance to threshold for this filter
            distance = calculate_filter_distance(filter_name, filter_result, filter_settings)
            rejected_by_filter.append((series, distance, filter_result))
    
    if not rejected_by_filter:
        print(f"No series rejected specifically by filter '{filter_name}'")
        return
    
    # Sort by closeness to threshold (smaller distance = closer to acceptance)
    rejected_by_filter.sort(key=lambda item: item[1])
    
    total_rejected_by_filter = len(rejected_by_filter)
    print(f"Total series rejected by '{filter_name}': {total_rejected_by_filter}")
    
    # Distance statistics over the series with a finite distance only
    distances = [dist for _, dist, _ in rejected_by_filter if dist != np.inf]
    if distances:
        print(f"Distance statistics:")
        print(f"  - Closest to acceptance: {min(distances):.2f}")
        print(f"  - Farthest from acceptance: {max(distances):.2f}")
        print(f"  - Average distance: {np.mean(distances):.2f}")
    
    # Show examples
    examples_to_show = min(max_examples, total_rejected_by_filter)
    print(f"\nShowing {examples_to_show} series closest to passing '{filter_name}':")
    
    # Get settings for visualization
    periods = filter_settings.get('periods', {14: 9})
    max_period_days = max(periods.keys()) if periods else 14
    current_date = datetime.now()
    
    for i in range(examples_to_show):
        series, distance, filter_result = rejected_by_filter[i]
        
        # Prepare data for plot; list() makes a private copy so that the
        # virtual point appended below never ends up in the series itself
        values = list(series.get_values())
        
        # Convert timestamp
        dates_all = [
            datetime.fromtimestamp(ts) if isinstance(ts, (int, float))
            else pd.to_datetime(ts).to_pydatetime()
            for ts in series.get_timestamps()
        ]
        
        # Filter by maximum period
        start_date = current_date - timedelta(days=max_period_days)
        recent = [(dt, val) for dt, val in zip(dates_all, values) if dt >= start_date]
        
        if recent:
            dates_filtered = [dt for dt, _ in recent]
            values_filtered = [val for _, val in recent]
        else:
            # If no data in period, take the last ones (slicing copies the lists)
            dates_filtered = dates_all[-10:]
            values_filtered = values[-10:]
        
        # Add current date
        dates_filtered.append(current_date)
        values_filtered.append(values_filtered[-1] if values_filtered else 0)
        
        # Create plot
        plt.figure(figsize=(15, 8))
        
        # Calculate intervals between sales (last one runs up to "now")
        diffs_hours = [
            round((later - earlier).total_seconds() / 3600, 2)
            for earlier, later in zip(dates_filtered, dates_filtered[1:])
        ]
        
        print(f"\nSeries {i+1} intervals (hours): {diffs_hours}")
        
        # Sales plot
        plt.plot(dates_filtered[:-1], values_filtered[:-1], 'orange', 
                marker='o', markersize=6, linewidth=2, label='Historical sales')
        
        # Current date
        plt.plot(dates_filtered[-1], values_filtered[-1], 'r*', 
                markersize=12, label='Current date')
        
        # Dashed line to current date  
        if len(dates_filtered) > 1:
            plt.plot([dates_filtered[-2], dates_filtered[-1]], 
                    [values_filtered[-2], values_filtered[-1]], 
                    'r--', alpha=0.7, linewidth=1)
        
        # Highlight periods with orange shades
        colors = ['moccasin', 'peachpuff', 'navajowhite', 'papayawhip', 'bisque']
        for idx, (period_days, min_sales) in enumerate(sorted(periods.items())):
            period_start = current_date - timedelta(days=period_days)
            plt.axvspan(period_start, current_date, alpha=0.2, 
                       color=colors[idx % len(colors)], 
                       label=f'{period_days}d period (need {min_sales} sales)')
        
        # Plot formatting
        plt.xlabel('Time', fontsize=12)
        plt.ylabel('Price', fontsize=12)
        plt.title(f'REJECTED BY {filter_name.upper()} #{i+1}: {series.name or series.id}\n'
                 f'Distance to passing: {distance:.2f} (closer to 0 = closer to acceptance)', 
                 fontsize=14, fontweight='bold', color='darkorange')
        plt.grid(True, alpha=0.3)
        plt.legend(fontsize=10)
        plt.xticks(rotation=45)
        
        # Detailed filter information
        info_lines = create_filter_info(filter_name, filter_result, filter_settings, distance)
        info_text = '\n'.join(info_lines)
        
        plt.text(0.02, 0.98, info_text, transform=plt.gca().transAxes, 
                verticalalignment='top', fontsize=10,
                bbox=dict(boxstyle='round,pad=0.5', facecolor='moccasin', alpha=0.8))
        
        plt.tight_layout()
        plt.show()
        
        print(f"   #{i+1}: {series.name or series.id} - Distance: {distance:.2f}")


def calculate_filter_distance(filter_name, filter_result, filter_settings):
    """Calculate distance from current value to acceptance threshold.

    Args:
        filter_name: 'window_size', 'avg_trimmed_hours',
            'max_consecutive_empty_days' or 'periods'.
        filter_result: The recorded result dict of that filter
            (keys 'passed', 'value', 'details').
        filter_settings: Filter settings holding the thresholds; ``None`` is
            treated as an empty dict, so the built-in defaults apply.

    Returns:
        A non-negative number, 0 meaning "at the threshold". For 'periods'
        it is the smallest shortage of sales among the failed periods (0 if
        no period failed). ``np.inf`` is returned when no distance can be
        computed: unknown filter, missing value, or no per-period data.
    """
    filter_settings = filter_settings or {}
    
    # 'periods' must be handled before the value check: that filter always
    # records value=None and keeps its numbers in 'details'.
    if filter_name == 'periods':
        details = filter_result.get('details') or {}
        # Skip non-dict entries such as {"reason": "..."} recorded for empty series
        period_entries = [entry for entry in details.values() if isinstance(entry, dict)]
        if not period_entries:
            return np.inf
        
        # For periods take minimum missing number of sales
        shortages = [
            entry.get('required', 0) - entry.get('actual', 0)
            for entry in period_entries
            if not entry.get('passed', True)
        ]
        return min(shortages) if shortages else 0
    
    actual = filter_result.get('value')
    if actual is None:
        return np.inf
    
    if filter_name == 'window_size':
        threshold = filter_settings.get('window_size', 10)
        return max(0, threshold - actual)  # Need more data
    
    if filter_name == 'avg_trimmed_hours':
        threshold = filter_settings.get('avg_trimmed_hours_max', 700)
        return max(0, actual - threshold)  # Need fewer hours
    
    if filter_name == 'max_consecutive_empty_days':
        max_consec = filter_settings.get('max_consecutive_empty_days', 5)
        # The setting may be a plain number or a dict with a 'max_gaps' entry
        if isinstance(max_consec, dict):
            threshold = max_consec.get('max_gaps', 5)
        else:
            threshold = max_consec
        return max(0, actual - threshold)  # Need fewer gaps
    
    return np.inf


def create_filter_info(filter_name, filter_result, filter_settings, distance):
    """Create information block for specific filter"""
    
    info_lines = [f"üîç FILTER: {filter_name.upper()}"]
    info_lines.append(f"Distance to passing: {distance:.2f}")
    info_lines.append("-" * 25)
    
    if filter_name == 'window_size':
        actual = filter_result.get('value', 0)
        required = filter_settings.get('window_size', 10)
        info_lines.append(f"Actual length: {actual}")
        info_lines.append(f"Required: {required}")
        info_lines.append(f"Need {required - actual} more points")
    
    elif filter_name == 'avg_trimmed_hours':
        actual = filter_result.get('value', 0)
        max_allowed = filter_settings.get('avg_trimmed_hours_max', 700)
        info_lines.append(f"Avg interval: {actual:.1f} hours")
        info_lines.append(f"Max allowed: {max_allowed} hours")
        info_lines.append(f"Exceeds by: {actual - max_allowed:.1f} hours")
    
    elif filter_name == 'max_consecutive_empty_days':
        actual = filter_result.get('value', 0)
        max_consec = filter_settings.get('max_consecutive_empty_days', 5)
        if isinstance(max_consec, dict):
            max_allowed = max_consec.get('max_gaps', 5)
            period = max_consec.get('period', 14)
            info_lines.append(f"Max gaps in {period}d: {actual}")
        else:
            max_allowed = max_consec
            info_lines.append(f"Max consecutive gaps: {actual}")
        info_lines.append(f"Max allowed: {max_allowed}")
        info_lines.append(f"Exceeds by: {actual - max_allowed} days")
    
    elif filter_name == 'periods':
        info_lines.append("Period requirements:")
        details = filter_result.get('details', {})
        for period_key, period_data in details.


In [None]:
# Optional: inspect rejections for a single filter (uncomment to run)
#show_rejected_for_filter(dataset, 'max_consecutive_empty_days', max_examples=10, filter_settings=FILTER_SETTINGS)


# Optional: the same inspection for the 'periods' filter
#show_rejected_for_filter(dataset, 'periods', max_examples=10, filter_settings=FILTER_SETTINGS) 


# Analysis of all filters at once, capped at 7 examples per filter
analyze_all_filters(dataset, FILTER_SETTINGS, max_per_filter=7)


# The filtering process that leads to FILTER_SETTINGS: the final filtering stage, based on several statistical measures (CV, outlier ratio, max gap ratio, range relative to mean, etc.)


In [None]:
def save_dataset_to_json(dataset, output_path):
    """Write a TimeSeriesDataset to ``output_path`` through JSONAdapter.

    Returns True when the adapter call completes, and False (after printing
    the error) when any exception is raised along the way.
    """
    try:
        JSONAdapter().save_data(dataset, output_path)
        print(f"Dataset successfully saved to {output_path}")
    except Exception as exc:
        # Best-effort helper: report the failure instead of propagating it
        print(f"Error saving dataset: {exc}")
        return False
    return True


# Report on the dataset that would be saved to JSON
if dataset:
    output_file = "../data/dataset.json"
    # NOTE(review): the save call below is commented out, so nothing is written
    # in this cell; the messages that follow still name output_file.
    #success = save_dataset_to_json(dataset, output_file)


    print(f"\nDataset ready for use in the annotation application!")
    print(f"File: {output_file}")
    
    # Display statistics
    print(f"\nDataset statistics:")
    print(f"  - Total time series: {len(dataset)}")
    print(f"  - Unlabeled series: {len(dataset.get_unlabeled_series())}")
    print(f"  - Labeled series: {len(dataset.get_labeled_series())}")
    
    # Display the first three series as a sample
    if len(dataset) > 0:
        print(f"\nSample time series:")
        for i, series in enumerate(dataset.series[:3]):
            # 'min_price' and 'max_price' are indexed directly, so a series
            # whose metadata lacks either key raises KeyError here.
            print(f"  {i+1}. {series.name} - {series.length()} points, price: {series.metadata['min_price']:.2f}-{series.metadata['max_price']:.2f}")


Filter Dataset by Series Length


In [None]:
# Additional function to filter by series length
def filter_dataset_by_length(dataset, min_length=8, max_length=None):
    """Delegate to ``dataset.filter_by_length`` and print a before/after summary.

    Returns whatever ``dataset.filter_by_length(min_length, max_length)``
    returns.
    """
    kept = dataset.filter_by_length(min_length, max_length)

    summary = (
        "Filtering by series length:",
        f"  - Minimum length: {min_length}",
        # A falsy max_length is reported as 'unlimited'
        f"  - Maximum length: {max_length or 'unlimited'}",
        f"  - Original series count: {len(dataset)}",
        f"  - After filtering: {len(kept)}",
    )
    for line in summary:
        print(line)

    return kept



# Keep only series with 10 to 9999 points
filtered_dataset_by_length = filter_dataset_by_length(dataset, min_length=10, max_length=9999)


# Save filtered dataset (skipped when the filter left no series)
if len(filtered_dataset_by_length) > 0:
    filtered_output_file = "../data/dataset_filtered_by_length.json"
    save_dataset_to_json(filtered_dataset_by_length, filtered_output_file)


Remove Duplicate Timestamps from Series
This step is very specific to my dataset: the minimum sales interval is 1 day, so several sales can fall on the same day and share a timestamp. In my case these duplicates only distorted the picture, so a single point is kept per timestamp.

In [None]:
def remove_duplicate_timestamps(series):
    """Collapse points that share a timestamp into one point per timestamp.

    For each timestamp the point with the highest ``value`` wins; on a tie
    the point seen first is kept. A series without points is returned as is.
    Otherwise a new TimeSeries is built with the surviving points sorted by
    timestamp and with duplicate-removal counters added to its metadata.
    """
    if not series.points:
        return series

    # timestamp -> best point seen so far
    best_by_timestamp = {}
    for candidate in series.points:
        kept = best_by_timestamp.get(candidate.timestamp)
        if kept is None or candidate.value > kept.value:
            best_by_timestamp[candidate.timestamp] = candidate

    ordered_points = list(best_by_timestamp.values())
    ordered_points.sort(key=lambda pt: pt.timestamp)

    before = len(series.points)
    after = len(ordered_points)

    return TimeSeries(
        id=series.id,
        name=series.name,
        points=ordered_points,
        metadata={
            **series.metadata,
            'duplicates_removed': True,
            'original_length': before,
            'cleaned_length': after,
            'duplicates_count': before - after
        },
        labeled_values=series.labeled_values
    )


def filter_dataset_remove_duplicates(dataset):
    """Build a new dataset in which every series has unique timestamps.

    Each series is passed through ``remove_duplicate_timestamps``; progress
    is printed every 100 series and a summary is printed at the end. The
    returned TimeSeriesDataset carries the removal totals in its metadata.
    """
    print("Removing duplicate timestamps...")

    series_total = len(dataset)
    deduplicated = []
    removed_total = 0

    for index, source_series in enumerate(dataset.series):
        if index % 100 == 0:
            print(f"Processing series {index+1}/{series_total}")

        cleaned = remove_duplicate_timestamps(source_series)
        # Series returned untouched carry no counter, hence the 0 default
        removed_total += cleaned.metadata.get('duplicates_count', 0)
        deduplicated.append(cleaned)

    cleaned_dataset = TimeSeriesDataset(
        name=f"{dataset.name} (Duplicates Removed)",
        description=f"Dataset with removed duplicate timestamps. Created {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        series=deduplicated,
        metadata={
            **dataset.metadata,
            'duplicates_removed': True,
            'total_duplicates_removed': removed_total,
            'original_series_count': series_total
        },
        created_at=datetime.now()
    )

    print(f"\nDuplicate removal completed:")
    print(f"  - Original series count: {series_total}")
    print(f"  - Total duplicates removed: {removed_total}")
    print(f"  - Remaining series: {len(cleaned_dataset)}")

    return cleaned_dataset


# Apply duplicate filtering to the length-filtered dataset
# NOTE(review): the guard tests `dataset`, but the input is
# filtered_dataset_by_length. If the guard is false,
# filtered_dataset_by_duplicates is never assigned and the later
# filter_dataset_by_sale_time call that reads it will raise NameError.
if dataset:
    print("Applying duplicate filtering...")
    filtered_dataset_by_duplicates = filter_dataset_remove_duplicates(filtered_dataset_by_length)


## Calculate Sale Time Statistics and Filter by Sale Time (main logic of filtering process)
This cell defines functions to calculate sale time statistics (including trimmed averages, variation metrics, and outlier detection) for series and to filter the dataset based on these statistics using configurable settings. It applies the filter with specified parameters, updates the dataset, and prints filtering results.

In [None]:
def calculate_sale_time_statistics(series, window_size=10, current_timestamp=None):
    """Compute interval statistics over the most recent sales of a series.

    The last ``window_size`` points of ``series.points`` are sorted by
    timestamp, the reference "now" is appended, and the intervals between
    consecutive timestamps are analysed. The smallest and largest intervals
    are trimmed before computing statistics, except that the interval from
    the last sale to "now" is never trimmed away for being the min or max.

    Parameters:
    - series: object exposing ``length()`` and ``points`` (each point has a
      ``timestamp``). Timestamps are assumed to be numeric seconds, since
      intervals are divided by 3600 to obtain hours.
    - window_size: number of trailing points to analyse (default 10).
    - current_timestamp: reference "now" in the same unit as the point
      timestamps. Defaults to ``datetime.now().timestamp()``; pass an
      explicit value to make the result reproducible.

    Returns:
    - None when the series has fewer than ``window_size`` points (or there
      is no interval to analyse), otherwise a dict of statistics in hours.
    """
    if series.length() < window_size:
        return None

    if current_timestamp is None:
        current_timestamp = datetime.now().timestamp()

    # Chronological timestamps of the analysed window, followed by "now"
    timestamps = sorted(point.timestamp for point in series.points[-window_size:])
    timestamps.append(current_timestamp)

    # Intervals between consecutive sales in seconds (the last one reaches "now")
    time_intervals = [later - earlier for earlier, later in zip(timestamps, timestamps[1:])]
    if not time_intervals:
        # Only reachable with window_size=0 on an empty series
        return None

    current_interval = time_intervals[-1]
    is_current_min = current_interval == min(time_intervals)
    is_current_max = current_interval == max(time_intervals)

    # Trim the extremes, but never drop the interval to "now" for being one
    ordered_intervals = sorted(time_intervals)
    if is_current_min and is_current_max:
        # All intervals are identical: nothing to trim
        trimmed_intervals = ordered_intervals
    elif is_current_min:
        trimmed_intervals = ordered_intervals[:-1]  # Remove only max
    elif is_current_max:
        trimmed_intervals = ordered_intervals[1:]  # Remove only min
    else:
        trimmed_intervals = ordered_intervals[1:-1]  # Remove min and max
    removed_count = len(time_intervals) - len(trimmed_intervals)

    # Convert to hours; the list stays in ascending order
    intervals_hours = [interval / 3600 for interval in trimmed_intervals]
    interval_count = len(intervals_hours)

    avg_interval_hours = sum(intervals_hours) / interval_count
    min_interval_hours = min(intervals_hours)
    max_interval_hours = max(intervals_hours)

    # 1. Coefficient of variation (CV): population standard deviation / mean
    mean_val = avg_interval_hours
    variance = sum((x - mean_val) ** 2 for x in intervals_hours) / interval_count
    std_dev = variance ** 0.5
    coefficient_of_variation = std_dev / mean_val if mean_val > 0 else float('inf')

    # 2. Range relative to mean
    range_ratio = (max_interval_hours - min_interval_hours) / mean_val if mean_val > 0 else float('inf')

    # 3. Clustering: largest step between neighbouring sorted intervals
    #    relative to the average step (infinite when every step is zero
    #    or there is only one interval)
    gaps = [upper - lower for lower, upper in zip(intervals_hours, intervals_hours[1:])]
    avg_gap = sum(gaps) / len(gaps) if gaps else 0
    max_gap = max(gaps) if gaps else 0
    gap_ratio = max_gap / avg_gap if avg_gap > 0 else float('inf')

    # 4. Share of intervals further than 2 standard deviations from the mean
    outliers = [x for x in intervals_hours if abs(x - mean_val) > 2 * std_dev]
    outlier_ratio = len(outliers) / interval_count

    # Interval from the last sale to "now", judged against the same 2-sigma rule
    current_interval_hours = current_interval / 3600
    is_current_interval_outlier = abs(current_interval_hours - mean_val) > 2 * std_dev

    return {
        'avg_interval_hours': avg_interval_hours,
        'min_interval_hours': min_interval_hours,
        'max_interval_hours': max_interval_hours,
        'coefficient_of_variation': coefficient_of_variation,
        'range_ratio': range_ratio,
        'gap_ratio': gap_ratio,
        'outlier_ratio': outlier_ratio,
        'total_intervals': interval_count,
        'std_dev': std_dev,
        'intervals_hours': intervals_hours,  # Save for analysis
        'current_interval_hours': current_interval_hours,  # Interval to current date
        'is_current_interval_outlier': is_current_interval_outlier,  # Is it an outlier
        'is_current_min': is_current_min,  # Is interval to current date minimal
        'is_current_max': is_current_max,  # Is interval to current date maximal
        'analysis_includes_current': True,  # Flag that analysis includes current date
        'trimming_info': f"Outliers removed: {removed_count} (current interval {'preserved' if (is_current_min or is_current_max) else 'was not min/max'})"
    }
    


def filter_dataset_by_sale_time(dataset, 
                               window_size=10, 
                               avg_trimmed_hours_max=168,
                               max_cv=0.8,           # Maximum coefficient of variation
                               max_range_ratio=2.0,  # Maximum range/mean
                               max_gap_ratio=5.0,    # Maximum gap
                               max_outlier_ratio=0.3): # Maximum outlier ratio
    """
    Filter dataset by sale time
    
    Parameters:
    - dataset: TimeSeriesDataset for filtering
    - window_size: number of last sales for analysis (default 10)
    - avg_trimmed_hours_max: maximum mean time between sales in hours (trimmed mean)
    - max_cv: maximum coefficient of variation (standard deviation / mean)
    - max_range_ratio: maximum range relative to mean ((max-min)/mean)
    - max_gap_ratio: maximum gap between neighboring values
    - max_outlier_ratio: maximum ratio of outliers (values > 2*std_dev from mean)

    Returns:
    - (filtered_dataset, rejected_series): a new TimeSeriesDataset with the
      accepted series, and a dict mapping 'cv', 'range_ratio', 'gap_ratio'
      and 'outlier_ratio' to lists of (series, time_stats) tuples rejected
      by that check. Series skipped for insufficient data or for too long
      an average interval are counted but not collected.

    Side effects:
    - Prints the settings, progress every 100 series, and a summary.
    - Stores the computed statistics in ``series.metadata['sale_time_stats']``
      of every accepted series (the input series objects are mutated).
    """
    
    print("Filtering dataset by sale time...")
    print(f"  - Analysis window: last {window_size} sales")
    print(f"  - Max trimmed mean time: {avg_trimmed_hours_max} hours")
    print(f"  - Max coefficient of variation: {max_cv}")
    print(f"  - Max range/mean: {max_range_ratio}")
    print(f"  - Max gap: {max_gap_ratio}")
    print(f"  - Max outlier ratio: {max_outlier_ratio}")
    
    # Distribution checks in evaluation order:
    # (rejection bucket, statistics key, upper limit).
    # A series is filed under the first check it fails.
    distribution_checks = (
        ('cv', 'coefficient_of_variation', max_cv),
        ('range_ratio', 'range_ratio', max_range_ratio),
        ('gap_ratio', 'gap_ratio', max_gap_ratio),
        ('outlier_ratio', 'outlier_ratio', max_outlier_ratio),
    )
    
    original_count = len(dataset)
    filtered_series = []
    skipped_count = 0
    skipped_time_count = 0
    
    # Collect info about rejected series for analysis
    rejected_series = {bucket: [] for bucket, _, _ in distribution_checks}
    
    for i, series in enumerate(dataset.series):
        if i % 100 == 0:
            print(f"Processing series {i+1}/{original_count}")
        
        # Calculate sale time statistics
        time_stats = calculate_sale_time_statistics(series, window_size)
        
        if time_stats is None:
            # Fewer than window_size points
            skipped_count += 1
            continue
        
        # Check trimmed mean sale time
        if time_stats['avg_interval_hours'] > avg_trimmed_hours_max:
            skipped_time_count += 1
            continue
        
        # Check distribution quality and save rejected
        failed_bucket = next(
            (bucket for bucket, stat_key, limit in distribution_checks
             if time_stats[stat_key] > limit),
            None,
        )
        if failed_bucket is not None:
            rejected_series[failed_bucket].append((series, time_stats))
            continue
        
        # Add statistics to metadata
        series.metadata['sale_time_stats'] = time_stats
        filtered_series.append(series)
    
    # Create new dataset
    filtered_dataset = TimeSeriesDataset(
        name=f"{dataset.name} (Filtered by Sale Time)",
        description=f"Dataset filtered by sale time. Created {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        series=filtered_series,
        metadata={
            **dataset.metadata,
            'filter_by_sale_time': True,
            'sale_time_window': window_size,
            'avg_trimmed_hours_max': avg_trimmed_hours_max,
            'max_cv': max_cv,
            'max_range_ratio': max_range_ratio,
            'max_gap_ratio': max_gap_ratio,
            'max_outlier_ratio': max_outlier_ratio,
            'original_count': original_count,
            'filtered_count': len(filtered_series)
        },
        created_at=datetime.now()
    )
    
    # An empty input dataset would otherwise divide by zero in the summary
    retained_percentage = len(filtered_dataset) / original_count * 100 if original_count else 0.0
    
    print(f"\nFiltering completed:")
    print(f"  - Original series count: {original_count}")
    print(f"  - Skipped (insufficient data): {skipped_count}")
    print(f"  - Skipped (too long sale time): {skipped_time_count}")
    print(f"  - Skipped (high variation): {len(rejected_series['cv'])}")
    print(f"  - Skipped (large range): {len(rejected_series['range_ratio'])}")
    print(f"  - Skipped (large gaps): {len(rejected_series['gap_ratio'])}")
    print(f"  - Skipped (many outliers): {len(rejected_series['outlier_ratio'])}")
    print(f"  - Remaining series: {len(filtered_dataset)}")
    print(f"  - Retained percentage: {retained_percentage:.1f}%")
    
    return filtered_dataset, rejected_series



# Sale Time Filtering Settings
# This dictionary defines parameters for the filter_dataset_by_sale_time function.
# The filter analyzes time intervals between last sales (including to current date)
# to select series with regular, stable sales. Core idea: calculate interval statistics
# (after trimming outliers), check against thresholds, and reject series with rare,
# unstable, or anomalous sales. Math includes min/max trimming, CV, ranges, etc.
# for distribution quality detection. Rejected series are saved in rejected_series for analysis.


# check below for more details (next cell)
FILTER_SETTINGS = {


    'window_size': 10,              # Number of last sales (points in time series) for time interval analysis.
                                    # Math: Slice recent_points = series.points[-window_size:], sort by timestamp,
                                    # add current date as virtual point, calculate Δt_i = t_i - t_{i-1} (in seconds),
                                    # total window_size intervals (including to current date).
                                    # Core: Defines "window" for stability assessment; if points < window_size, series skipped (time_stats = None).
                                    # Impact: Larger value provides reliable stats (longer history),
                                    # but rejects short-history series; smaller allows more but stats may be noisy.
                                    # Recommended: 5–20 for balance between data and accuracy.
                                    # Example: For window_size=10 and 8 sales — skip; for 10 sales — 10 intervals for analysis (before trimming).
    
    'avg_trimmed_hours_max': 50,    # Maximum trimmed mean time between sales (in hours) after outlier trimming.
                                    # Math: trimmed_intervals = intervals without min/max (preserving current interval if min/max);
                                    # intervals_hours = [Δt_i / 3600]; avg = (∑ intervals_hours) / len; if avg > threshold, skip.
                                    # Core: Selects series with frequent enough sales; trimming ignores anomalies, focusing on typical pace.
                                    # Impact: Smaller value requires frequent sales (e.g., daily),
                                    # larger allows rare (e.g., weekly), but risks including inactive items.
                                    # Recommended: 24–168 hours depending on market (e.g., 50 for moderately active items).
                                    # Example: trimmed_intervals = [10, 15, 20] h; avg = 15 < 50 — pass; avg = 60 — skip (skipped_time_count += 1).
    
    'max_cv': 1,                  # Maximum coefficient of variation (CV) for intervals — measure of relative variability.
                                    # Math: μ = avg_interval_hours; σ² = (∑ (x_i - μ)²) / n; σ = √σ²; CV = σ / μ (if μ>0, else ∞);
                                    # if CV > threshold, skip and save to rejected_series['cv'].
                                    # Core: Detects interval stability — low CV means predictable sales, high — chaos (e.g., bursts).
                                    # Impact: Low threshold (0.5–0.8) retains few but quality series; high (1.7+) allows more, including slightly unstable.
                                    # Recommended: 0.5–1.0 for strict stability, up to 2.0 for flexibility.
                                    # Example: Intervals [10,10,30]; μ≈16.67, σ≈9.43, CV≈0.57 < 1 — pass; [1,1,200]; CV≈1.39 > 1 — skip.
    
    'max_range_ratio': 1.5,         # Maximum range of intervals relative to mean ((max - min) / mean).
                                    # Math: range = max(intervals_hours) - min; range_ratio = range / μ (if μ>0, else ∞);
                                    # if > threshold, skip and save to rejected_series['range_ratio'].
                                    # Core: Checks uniformity — large range indicates extreme pauses/bursts, even if CV is ok.
                                    # Impact: Low threshold (1.5–2.5) requires uniformity, high (3.0+) tolerates differences.
                                    # Recommended: 2.0–4.0 for balance between strict selection and inclusivity
                                    # (the value configured here is stricter than that range).
                                    # Example: [5,10,15]; μ=10, range=10, ratio=1.0 < 1.5 — pass; [1,1,100]; μ=34, range=99, ratio≈2.91 > 1.5 — skip.
    
    'max_gap_ratio': 3.0,           # Maximum gap between neighboring sorted intervals (max_gap / avg_gap).
                                    # Math: sorted_intervals; gaps_i = sorted_{i+1} - sorted_i; avg_gap = (∑ gaps) / (n-1);
                                    # max_gap = max(gaps); gap_ratio = max_gap / avg_gap (if avg_gap>0, else ∞); if > threshold, skip to rejected_series['gap_ratio'].
                                    # Core: Detects clustering (groups of close intervals with large gaps) — sign of uneven sales (e.g., "bursts" + pauses).
                                    # Impact: Low threshold (3.0–5.0) avoids clusters, high allows seasonal data.
                                    # Recommended: 4.0–6.0 for data with potential seasonality.
                                    # Example: sorted [1,2,10]; gaps [1,8]; avg_gap=4.5, max_gap=8, ratio≈1.78 < 3.0 — pass;
                                    # sorted [1,2,3,4,50]; gaps [1,1,1,46]; avg_gap=12.25, ratio≈3.76 > 3.0 — skip.
    
    'max_outlier_ratio': 0.3        # Maximum ratio of outliers in intervals (values > 2*σ from μ).
                                    # Math: outliers = [x where |x - μ| > 2*σ]; outlier_ratio = len(outliers) / len(intervals_hours);
                                    # if > threshold, skip and save to rejected_series['outlier_ratio'].
                                    # Core: Checks data quality — many outliers mean anomalies (e.g., rare events), indicating instability.
                                    # Impact: Low threshold (0.1–0.3) requires few anomalies, high allows noisy data.
                                    # Recommended: 0.2–0.4 for balance between strictness and data volume.
                                    # Example: Intervals [10,11,12,11,10,12,50]; μ≈16.57, σ≈13.67; |50 - μ|≈33.43 > 2σ≈27.34,
                                    # so outliers=[50] and ratio≈0.14 < 0.3 — pass.
                                    # NOTE(review): σ here is the population standard deviation (variance divided by n), so by
                                    # Chebyshev's inequality fewer than 25% of values can lie more than 2σ from μ. A threshold
                                    # of 0.25 or higher (including 0.3) can therefore never reject a series — confirm the intended value.
    }


    
# Echo the settings that are about to be applied
print("Applying filtering with settings:")
for key, value in FILTER_SETTINGS.items():
    print(f"  - {key}: {value}")


# The keys of FILTER_SETTINGS are passed as keyword arguments, so they must
# match the parameter names of filter_dataset_by_sale_time.
filtered_dataset, rejected_series = filter_dataset_by_sale_time(filtered_dataset_by_duplicates, **FILTER_SETTINGS)     
# rejected_series is kept for later analysis; nothing is displayed in this cell.
# Update main dataset
dataset = filtered_dataset


## Complementary Analysis of FILTER_SETTINGS Parameters


These four parameters (`max_cv`, `max_range_ratio`, `max_gap_ratio`, and `max_outlier_ratio`) in the `FILTER_SETTINGS` dictionary form a multi-faceted system for filtering time series data based on sale interval stability. They analyze the distribution of time intervals between sales (after trimming outliers) to detect instability, irregularity, or anomalies. By combining global measures (like overall variability) with local ones (like gaps or specific outliers), they provide a robust, layered filter that reduces false positives/negatives compared to using any single metric.

The parameters complement each other by addressing different aspects of distribution quality:
- **Global vs. Local Focus**: CV and range_ratio assess overall spread, while gap_ratio and outlier_ratio zoom in on specific patterns (e.g., clusters or anomalies).
- **Relative vs. Absolute**: CV and range_ratio normalize by the mean (relative), making them scale-invariant, whereas gap_ratio examines differences in sorted data, and outlier_ratio counts explicit deviations.
- **Holistic Coverage**: Together, they catch scenarios where one metric might pass unstable data (e.g., high CV might miss clustered data that gap_ratio catches).

Below, I'll describe:
1. **How They Complement Each Other**: Pairwise explanations of synergies.
2. **Strengths and Weaknesses**: For each parameter, its strong sides, weak sides, and how others compensate.

### 1. How the Parameters Complement Each Other

These metrics are applied sequentially in the filter (after checking average interval), so a series must pass *all* to be retained. This creates a "defense in depth" where each catches issues the others might miss, ensuring only stable, uniform distributions pass.

- **max_cv and max_range_ratio**: CV measures overall relative dispersion (how spread out intervals are around the mean), but it can be insensitive to extreme values if they're balanced. Range_ratio complements by focusing solely on the extrema (max - min relative to mean), catching cases with rare but large deviations (e.g., one long pause) that don't heavily skew CV. Conversely, range_ratio ignores the distribution in between, so CV fills this by quantifying the full variability.

- **max_cv and max_gap_ratio**: CV detects general instability but might pass clustered data (e.g., many short intervals + a few long ones averaging out). Gap_ratio complements by scanning sorted intervals for large "jumps," identifying non-uniformity like bimodal distributions (clusters of quick sales followed by gaps). CV provides the broad picture, while gap_ratio adds granularity for uneven spacing.

- **max_cv and max_outlier_ratio**: CV incorporates outliers into its calculation (via standard deviation), but if outliers are few, CV might stay low. Outlier_ratio explicitly counts them (using a 2σ threshold), catching sparse anomalies that don't dominate CV. Outlier_ratio adds a direct "quality check," while CV ensures the overall spread isn't too high even without outliers.

- **max_range_ratio and max_gap_ratio**: Range_ratio spots large overall spreads but doesn't distinguish if the spread is due to even dispersion or clusters. Gap_ratio complements by detecting if the spread comes from big gaps in the sorted list, revealing clustering (e.g., tight groups separated by voids). Range_ratio sets a "ceiling" on extremes, and gap_ratio ensures the intervals are evenly distributed within that range.

- **max_range_ratio and max_outlier_ratio**: Range_ratio can be inflated by a single outlier, but since it's calculated on trimmed data, it might understate issues if trimming removes too much. Outlier_ratio complements by quantifying how many values are anomalous relative to the mean and std dev, catching persistent outliers that range might normalize. Outlier_ratio provides a count-based view, while range_ratio focuses on magnitude.

- **max_gap_ratio and max_outlier_ratio**: Gap_ratio identifies structural issues like clustering but might miss small, scattered outliers that don't create large gaps. Outlier_ratio complements by flagging individual extreme values (e.g., isolated long intervals not forming a "gap" in sorted order). Gap_ratio handles distributional shape, while outlier_ratio ensures low anomaly density.

**Overall Synergy**: Using all four creates a comprehensive filter. For example, a series with moderate CV but one huge outlier might pass CV but fail outlier_ratio or range_ratio. Clustered data might pass range_ratio (if extremes aren't too far) but fail gap_ratio. Because a series must pass every check, each additional metric can only reject more series; the benefit is that individual thresholds can be kept relatively lenient (retaining more data) while still preventing unstable series from slipping through.

### 2. Strengths and Weaknesses of Each Parameter

For each parameter, I'll outline its strengths, weaknesses, and how the others compensate (filling gaps).

#### max_cv (Coefficient of Variation)
- **Strengths**:
  - Captures relative variability (scale-invariant), making it robust for intervals of different magnitudes.
  - Holistic: Considers all data points via mean and std dev, good for detecting general instability.
  - Sensitive to both high and low variability (e.g., chaotic bursts).
- **Weaknesses**:
  - Can be skewed by the mean (e.g., if μ is small, even minor variations inflate CV).
  - Insensitive to distribution shape (e.g., passes clustered or multimodal data if overall spread is low).
  - Averages out extremes if they're not dominant.
- **How Others Compensate**:
  - Range_ratio fills the extreme-focus gap by emphasizing max-min differences.
  - Gap_ratio addresses shape insensitivity by detecting clusters.
  - Outlier_ratio adds explicit anomaly counting, catching what CV averages over.

#### max_range_ratio (Range Relative to Mean)
- **Strengths**:
  - Simple and direct: Focuses on extrema, catching rare but impactful deviations (e.g., long pauses).
  - Relative to mean, so scale-invariant like CV.
  - Effective after trimming, as it ignores absolute outliers and checks core spread.
- **Weaknesses**:
  - Ignores internal distribution (e.g., passes if extremes are far but middle is uniform).
  - Sensitive to sample size (small n can exaggerate range).
  - Doesn't distinguish between even spread and clustered extremes.
- **How Others Compensate**:
  - CV provides overall variability, filling the "internal" gap.
  - Gap_ratio detects if the range is due to clustering/gaps.
  - Outlier_ratio quantifies if the range is driven by multiple anomalies.

#### max_gap_ratio (Max Gap Relative to Average Gap)
- **Strengths**:
  - Specifically detects non-uniformity and clustering in sorted data (e.g., seasonal bursts).
  - Focuses on local differences, revealing patterns like bimodality that global metrics miss.
  - Robust to overall scale, as it's relative to average gap.
- **Weaknesses**:
  - Less effective for uniformly high variability (no large gaps but consistent noise).
  - Requires sorted data and enough points (small n might yield infinite ratios).
  - Ignores the magnitude of intervals, focusing only on gaps between them.
- **How Others Compensate**:
  - CV covers uniform high variability.
  - Range_ratio handles overall magnitude of spreads.
  - Outlier_ratio catches isolated extremes that don't form detectable gaps.

#### max_outlier_ratio (Outlier Ratio)
- **Strengths**:
  - Directly quantifies data quality by counting anomalies (using statistical threshold: >2σ).
  - Flexible for noisy data; allows some outliers without rejecting everything.
  - Independent of distribution shape, focusing on deviations from the mean.
- **Weaknesses**:
  - Depends on accurate mean/std dev (if data is skewed, "outliers" might be misidentified).
  - Doesn't capture overall spread or patterns (e.g., passes if outliers are few but clustered).
  - Threshold (2σ) is arbitrary; might over/under-count in non-normal distributions.
- **How Others Compensate**:
  - CV incorporates outliers into global variability.
  - Range_ratio catches extreme magnitudes even if not counted as outliers.
  - Gap_ratio detects if outliers create structural gaps/clusters.

This setup ensures balanced filtering: strengths amplify coverage, while weaknesses are mitigated by complements. If your data has specific characteristics (e.g., heavy seasonality), adjust thresholds to emphasize certain metrics (e.g., higher `max_gap_ratio`). For further tuning, consider correlating these with domain knowledge or visualizing rejected series.

## Visualization of accepted series

In [None]:
def show_accepted_series_analysis(dataset, max_examples=10, window_size=10):
    """Print summary statistics and plot the best accepted series.

    Only series whose ``metadata`` carries a truthy ``'sale_time_stats'``
    entry are considered. They are ranked by ``avg_interval_hours`` in
    ascending order (lower is treated as better) and the first
    ``max_examples`` of them are plotted one figure at a time, with the
    current moment appended as a virtual point at the last known price.

    Args:
        dataset: Object exposing ``series`` and supporting ``len()``. Each
            series must provide ``name``, ``metadata``, ``get_values()`` and
            ``get_timestamps()``.
        max_examples: Maximum number of series to plot.
        window_size: Number of most recent points of each series to plot.

    Returns:
        None. Output is printed and shown through matplotlib.
    """
    print(f"\nAnalysis of accepted series (showing up to {max_examples} examples):")

    if not dataset or len(dataset) == 0:
        print("No accepted series for analysis")
        return

    # Only series with sale-time statistics can be ranked and described.
    series_with_stats = []
    for series in dataset.series:
        stats = series.metadata.get('sale_time_stats')
        if stats:
            series_with_stats.append((series, stats))

    # Rank by average sale time (best = lower values).
    sorted_accepted = sorted(series_with_stats, key=lambda x: x[1]['avg_interval_hours'])

    # Aggregate statistics over every accepted series.
    if series_with_stats:
        avg_times = [stats['avg_interval_hours'] for _, stats in series_with_stats]
        cvs = [stats['coefficient_of_variation'] for _, stats in series_with_stats]
        current_intervals = [stats.get('current_interval_hours', 0) for _, stats in series_with_stats]

        print("\n=== Statistics for all accepted series ===")
        print(f"Total accepted: {len(series_with_stats)}")
        print(f"    - Average sale time: {sum(avg_times)/len(avg_times):.2f} hours")
        print(f"    - Average CV: {sum(cvs)/len(cvs):.3f}")
        print(f"    - Average interval to current date: {sum(current_intervals)/len(current_intervals):.2f} hours")
        print(f"    - Best sale time: {min(avg_times):.2f} hours")
        print(f"    - Worst sale time: {max(avg_times):.2f} hours")

    examples_to_show = min(max_examples, len(sorted_accepted))

    print(f"\n=== Showing {examples_to_show} best examples ===")

    # max(..., 0) keeps a negative max_examples from turning into a tail slice.
    for plot_no, (series, stats) in enumerate(sorted_accepted[:max(examples_to_show, 0)], start=1):
        values = series.get_values()
        timestamps = series.get_timestamps()

        # An empty series has no "last price" to extend to the current date,
        # so it cannot be plotted; report it instead of raising IndexError.
        if len(values) == 0 or len(timestamps) == 0:
            print(f"    Plot {plot_no}/{examples_to_show}: {series.name} - skipped (no data points)")
            continue

        # Take only the last N points for the plot.
        if len(values) > window_size:
            values = values[-window_size:]
            timestamps = timestamps[-window_size:]

        # Build the x axis as datetimes and append "now" as a virtual point.
        # The virtual point must have the same type as the historical ones,
        # otherwise the interval arithmetic below fails.
        if isinstance(timestamps[0], (int, float)):
            # Unix timestamps: convert to (naive, local) datetimes.
            dates = [datetime.fromtimestamp(ts) for ts in timestamps]
            dates.append(datetime.now())
        else:
            # Already date-like: match the awareness of the existing points.
            dates = list(timestamps)
            dates.append(datetime.now(getattr(timestamps[0], 'tzinfo', None)))

        # For the price at the current moment, use the last known price.
        last_price = values[-1]

        # Create a separate plot for each series.
        plt.figure(figsize=(12, 6))

        # Print the difference in hours between consecutive dates.
        diffs_hours = [
            round((later - earlier).total_seconds() / 3600, 2)
            for earlier, later in zip(dates, dates[1:])
        ]
        print(f"Series {plot_no} - Differences between dates (hours): {diffs_hours}")

        # Historical points, the virtual "now" point, and a dashed connector.
        plt.plot(dates[:-1], values, 'g-o', markersize=4, linewidth=2, label='Historical sales')
        plt.plot(dates[-1], last_price, 'r*', markersize=8, label='Current date')
        plt.plot([dates[-2], dates[-1]], [last_price, last_price], 'r--', alpha=0.7)

        # Axis setup.
        plt.xlabel('Time')
        plt.ylabel('Price')

        quality_score = f"Quality: {stats['avg_interval_hours']:.1f}h (lower is better)"

        plt.title(f'ACCEPTED SERIES: {series.name}\n{quality_score}\n(Showing last {len(values)} points + current date)')
        plt.grid(True, alpha=0.3)
        plt.legend()

        # Rotate time labels for better readability.
        plt.xticks(rotation=45)

        # Statistics box shown in the top-left corner of the axes.
        current_interval_info = f"‚Ä¢ Interval to current date: {stats.get('current_interval_hours', 0):.2f} hours"
        if stats.get('is_current_interval_outlier', False):
            current_interval_info += " (outlier!)"

        cv_text = f"‚Ä¢ Coefficient of variation: {stats['coefficient_of_variation']:.3f} ‚úì"
        range_text = f"‚Ä¢ Range/mean: {stats['range_ratio']:.3f} ‚úì"
        gap_text = f"‚Ä¢ Max gap: {stats['gap_ratio']:.3f} ‚úì"
        outlier_text = f"‚Ä¢ Outlier ratio: {stats['outlier_ratio']:.3f} ‚úì"

        info_text = f"""Sale time statistics (last {window_size} sales + current date):
‚Ä¢ Average time: {stats['avg_interval_hours']:.2f} hours ‚úì
{cv_text}
{range_text}
{gap_text}
{outlier_text}
‚Ä¢ Number of intervals: {stats['total_intervals']}
{current_interval_info}"""

        plt.text(0.02, 0.98, info_text, transform=plt.gca().transAxes,
                 verticalalignment='top', fontsize=10,
                 bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.8))

        plt.tight_layout()
        plt.show()

        print(f"    Plot {plot_no}/{examples_to_show}: {series.name} - Average time: {stats['avg_interval_hours']:.2f}h")

# Usage: print summary statistics for the loaded dataset and plot up to 10 of
# its best-ranked series, each limited to its last 10 points.
show_accepted_series_analysis(dataset, max_examples=10, window_size=10)


## Visualization of rejected series


In [None]:
def show_rejected_series_analysis(rejected_series, max_examples=10, custom_rejected_type=None, window_size=10):
    """Print summary statistics and plot borderline rejected series.

    For every rejection type in ``rejected_series`` the rejected entries are
    sorted by the metric that caused the rejection (ascending, so the cases
    plotted first are the most borderline ones), and up to ``max_examples``
    of them are plotted one figure at a time. Thresholds shown in the plots
    are read from the module-level ``FILTER_SETTINGS`` mapping.

    Args:
        rejected_series: Mapping of rejection type (e.g. ``'cv'``,
            ``'range_ratio'``, ``'gap_ratio'``, ``'outlier_ratio'``) to a
            list of ``(series, stats)`` pairs.
        max_examples: Maximum number of series to plot per rejection type.
        custom_rejected_type: If given, only this rejection type is shown.
        window_size: Number of most recent points of each series to plot.

    Returns:
        None. Output is printed and shown through matplotlib.
    """
    # rejection type -> (stats key, short label, FILTER_SETTINGS key)
    metric_info = {
        'cv': ('coefficient_of_variation', 'CV', 'max_cv'),
        'range_ratio': ('range_ratio', 'Range', 'max_range_ratio'),
        'gap_ratio': ('gap_ratio', 'Gap', 'max_gap_ratio'),
        'outlier_ratio': ('outlier_ratio', 'Outliers', 'max_outlier_ratio'),
    }

    print(f"\nAnalysis of rejected series (showing up to {max_examples} examples of each type):")

    for rejection_type, rejected_list in rejected_series.items():
        if not rejected_list:
            continue

        if custom_rejected_type and custom_rejected_type != rejection_type:
            continue

        print(f"\n=== Rejected by {rejection_type.upper()} ===")
        print(f"Total rejected: {len(rejected_list)}")

        # Aggregate statistics for this rejection type.
        avg_times = [stats['avg_interval_hours'] for _, stats in rejected_list]
        cvs = [stats['coefficient_of_variation'] for _, stats in rejected_list]
        current_intervals = [stats.get('current_interval_hours', 0) for _, stats in rejected_list]
        print(f"    - Average sale time: {sum(avg_times)/len(avg_times):.2f} hours")
        print(f"    - Average CV: {sum(cvs)/len(cvs):.3f}")
        print(f"    - Average interval to current date: {sum(current_intervals)/len(current_intervals):.2f} hours")

        # Sort ascending by the rejecting metric; unknown rejection types
        # keep their original order.
        metric = metric_info.get(rejection_type)
        if metric is not None:
            sort_key = metric[0]
            sorted_rejected = sorted(rejected_list, key=lambda x: x[1][sort_key])
        else:
            sorted_rejected = rejected_list

        examples_to_show = min(max_examples, len(sorted_rejected))

        print(f"    Showing {examples_to_show} most borderline cases:")

        # max(..., 0) keeps a negative max_examples from turning into a tail slice.
        for plot_no, (series, stats) in enumerate(sorted_rejected[:max(examples_to_show, 0)], start=1):
            values = series.get_values()
            timestamps = series.get_timestamps()

            # An empty series has no "last price" to extend to the current
            # date, so it cannot be plotted; report it instead of raising.
            if len(values) == 0 or len(timestamps) == 0:
                print(f"    Plot {plot_no}/{examples_to_show}: {series.name} - skipped (no data points)")
                continue

            # Take only the last N points for the plot.
            if len(values) > window_size:
                values = values[-window_size:]
                timestamps = timestamps[-window_size:]

            # Build the x axis as datetimes and append "now" as a virtual
            # point of the same type as the historical ones, otherwise the
            # interval arithmetic below fails.
            if isinstance(timestamps[0], (int, float)):
                # Unix timestamps: convert to (naive, local) datetimes.
                dates = [datetime.fromtimestamp(ts) for ts in timestamps]
                dates.append(datetime.now())
            else:
                # Already date-like: match the awareness of the existing points.
                dates = list(timestamps)
                dates.append(datetime.now(getattr(timestamps[0], 'tzinfo', None)))

            # For the price at the current moment, use the last known price.
            last_price = values[-1]

            # Create a separate plot for each series.
            plt.figure(figsize=(12, 6))

            # Print the difference in hours between consecutive dates.
            diffs_hours = [
                round((later - earlier).total_seconds() / 3600, 2)
                for earlier, later in zip(dates, dates[1:])
            ]
            print("Differences between dates (hours):", diffs_hours)

            # Historical points, the virtual "now" point, and a dashed connector.
            plt.plot(dates[:-1], values, 'b-o', markersize=4, linewidth=2, label='Historical sales')
            plt.plot(dates[-1], last_price, 'r*', markersize=8, label='Current date')
            plt.plot([dates[-2], dates[-1]], [last_price, last_price], 'r--', alpha=0.7)

            # Axis setup.
            plt.xlabel('Time')
            plt.ylabel('Price')

            # Describe the value that led to rejection next to its threshold.
            threshold_value = ""
            if metric is not None:
                stats_key, label, setting_key = metric
                threshold_value = f"{label}: {stats[stats_key]:.3f} (threshold: {FILTER_SETTINGS[setting_key]})"

            plt.title(f'Series: {series.name}\nRejected by: {rejection_type.upper()}\n{threshold_value}\nAvg: {stats["avg_interval_hours"]:.1f}h\n(Showing last {len(values)} points + current date)')
            plt.grid(True, alpha=0.3)
            plt.legend()

            # Rotate time labels for better readability.
            plt.xticks(rotation=45)

            # Statistics box shown in the top-left corner of the axes.
            current_interval_info = f"‚Ä¢ Interval to current date: {stats.get('current_interval_hours', 0):.2f} hours"
            if stats.get('is_current_interval_outlier', False):
                current_interval_info += " (outlier!)"

            cv_text = f"‚Ä¢ Coefficient of variation: {stats['coefficient_of_variation']:.3f}"
            range_text = f"‚Ä¢ Range/mean: {stats['range_ratio']:.3f}"
            gap_text = f"‚Ä¢ Max gap: {stats['gap_ratio']:.3f}"
            outlier_text = f"‚Ä¢ Outlier ratio: {stats['outlier_ratio']:.3f}"

            # Highlight the metric that caused the rejection.
            if rejection_type == 'cv':
                cv_text += " ‚Üê PROBLEM"
            elif rejection_type == 'range_ratio':
                range_text += " ‚Üê PROBLEM"
            elif rejection_type == 'gap_ratio':
                gap_text += " ‚Üê PROBLEM"
            elif rejection_type == 'outlier_ratio':
                outlier_text += " ‚Üê PROBLEM"

            info_text = f"""Sale time statistics (last {window_size} sales + current date):
‚Ä¢ Average time: {stats['avg_interval_hours']:.2f} hours - {FILTER_SETTINGS['avg_trimmed_hours_max']}
{cv_text} - {FILTER_SETTINGS['max_cv']}
{range_text} - {FILTER_SETTINGS['max_range_ratio']}
{gap_text} - {FILTER_SETTINGS['max_gap_ratio']}
{outlier_text} - {FILTER_SETTINGS['max_outlier_ratio']}
‚Ä¢ Number of intervals: {stats['total_intervals']}
{current_interval_info}"""

            plt.text(0.02, 0.98, info_text, transform=plt.gca().transAxes,
                     verticalalignment='top', fontsize=10,
                     bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

            plt.tight_layout()
            plt.show()

            print(f"    Plot {plot_no}/{examples_to_show}: {series.name} - {threshold_value}")

# Plot up to 10 of the most borderline series, restricted to those rejected
# by the range_ratio filter.
show_rejected_series_analysis(rejected_series, max_examples=10, custom_rejected_type='range_ratio')
