In [6]:
# gel_data_generator.py
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

class GelColumnRunGenerator:
    """Generate synthetic gel-column data at run, batch and time-series level.

    The generator builds a schedule of column runs, splits every run into
    consecutive batches, derives per-batch pressure / gel-score metrics and
    can expand every batch into a short time series with injected anomalies.

    Args:
        config: Optional overrides for ``DEFAULT_CONFIG``. Keys that are not
            supplied keep their default; for the nested ``{'min', 'max'}``
            entries a partial dict (e.g. only ``'min'``) is merged as well.
        seed: Optional integer seed. When given, the instance draws from its
            own private ``random.Random`` / ``numpy.random.RandomState`` so
            results are reproducible. When omitted, the process-wide
            ``random`` and ``numpy.random`` generators are used.
    """

    DEFAULT_CONFIG = {
        'run_duration_hrs': {'min': 4, 'max': 12},  # Hours per column run
        'batches_per_run': {'min': 3, 'max': 8},    # Batches per run
        'batch_duration_mins': {'min': 30, 'max': 120},  # Minutes per batch
        'degradation_per_run': 0.02,  # Gel score increase per run
        'pressure_noise': 0.5,        # Measurement noise
        'run_break_hours': {'min': 1, 'max': 4}  # Downtime between runs
    }

    # First run of every schedule starts here.
    SCHEDULE_START = datetime(2024, 1, 1, 8, 0)
    # Runs may only start within [WORKDAY_START_HOUR, LAST_START_HOUR).
    WORKDAY_START_HOUR = 8
    LAST_START_HOUR = 20

    ANOMALY_TYPES = [
        'pressure_spike',
        'pressure_drop',
        'gradual_drift',
        'stuck_value'
    ]

    # Explicit column lists keep empty results well-formed.
    RUN_COLUMNS = [
        'run_id', 'start_time', 'end_time', 'duration_hrs', 'n_batches',
        'status'
    ]
    BATCH_COLUMNS = [
        'batch_id', 'run_id', 'batch_in_run', 'start_time', 'end_time',
        'duration_minutes', 'gel_score', 'elution_pressure_avg',
        'flow_pressure_avg', 'column_runs_completed', 'column_age_factor',
        'phase'
    ]
    TIME_SERIES_COLUMNS = [
        'timestamp', 'run_id', 'batch_id', 'batch_in_run',
        'elution_pressure', 'flow_pressure', 'gel_score', 'column_runs',
        'is_operational', 'phase'
    ]

    # Class-level fallbacks: the global generators. ``__init__`` replaces
    # them with private, seeded generators when a seed is supplied.
    _py_rng = random
    _np_rng = np.random

    def __init__(self, config=None, seed=None):
        # Copy the defaults (one level deep) so instances never share state.
        merged = {
            key: dict(value) if isinstance(value, dict) else value
            for key, value in self.DEFAULT_CONFIG.items()
        }
        for key, value in (config or {}).items():
            default = merged.get(key)
            if isinstance(default, dict) and isinstance(value, dict):
                # Allow overriding just 'min' or just 'max'.
                merged[key] = {**default, **value}
            else:
                merged[key] = value
        self.config = merged

        if seed is not None:
            self._py_rng = random.Random(seed)
            self._np_rng = np.random.RandomState(seed)

    def _next_allowed_start(self, candidate):
        """Return ``candidate`` moved to the next permitted run start time.

        Times at or after ``LAST_START_HOUR`` move to the next morning and
        times before ``WORKDAY_START_HOUR`` (i.e. after midnight) move to the
        same day's morning; everything else is returned unchanged.
        """
        if candidate.hour >= self.LAST_START_HOUR:
            day = candidate.date() + timedelta(days=1)
        elif candidate.hour < self.WORKDAY_START_HOUR:
            day = candidate.date()
        else:
            return candidate
        return datetime(day.year, day.month, day.day,
                        self.WORKDAY_START_HOUR, 0)

    def generate_run_schedule(self, total_days=30):
        """Generate the column run schedule.

        Runs are laid out back to back with a random downtime in between.
        A run only *starts* on one of the ``total_days`` calendar days
        beginning at ``SCHEDULE_START`` and only during working hours; it may
        still finish later.

        Args:
            total_days: Number of calendar days on which runs may start.

        Returns:
            DataFrame with one row per run and the columns ``RUN_COLUMNS``
            (empty, but with those columns, when ``total_days <= 0``).
        """
        duration_cfg = self.config['run_duration_hrs']
        batches_cfg = self.config['batches_per_run']
        break_cfg = self.config['run_break_hours']

        first_start = self.SCHEDULE_START
        # Exclusive bound: midnight after the last allowed calendar day.
        schedule_end = (
            datetime(first_start.year, first_start.month, first_start.day)
            + timedelta(days=total_days)
        )

        runs = []
        current_time = first_start
        run_id = 1
        while current_time < schedule_end:
            run_duration = self._py_rng.uniform(
                duration_cfg['min'], duration_cfg['max'])
            n_batches = self._py_rng.randint(
                batches_cfg['min'], batches_cfg['max'])

            run_end = current_time + timedelta(hours=run_duration)
            runs.append({
                'run_id': run_id,
                'start_time': current_time,
                'end_time': run_end,
                'duration_hrs': run_duration,
                'n_batches': n_batches,
                'status': 'planned'
            })

            # Downtime between runs, then snap to working hours.
            downtime = self._py_rng.uniform(break_cfg['min'], break_cfg['max'])
            current_time = self._next_allowed_start(
                run_end + timedelta(hours=downtime))
            run_id += 1

        return pd.DataFrame(runs, columns=self.RUN_COLUMNS)

    def generate_batch_level_data(self, run_schedule):
        """Generate one row of metrics per batch for every run.

        Random batch durations are rescaled so that the batches of a run
        exactly fill the run's duration; batches are consecutive.

        Args:
            run_schedule: DataFrame as returned by ``generate_run_schedule``
                (uses ``run_id``, ``start_time``, ``duration_hrs`` and
                ``n_batches``).

        Returns:
            DataFrame with the columns ``BATCH_COLUMNS``.
        """
        duration_cfg = self.config['batch_duration_mins']
        degradation = self.config['degradation_per_run']

        all_batches = []
        batch_counter = 1

        for _, run in run_schedule.iterrows():
            n_batches = int(run['n_batches'])
            if n_batches <= 0:
                # Nothing to generate; also avoids dividing by a zero total.
                continue

            # Base gel degradation for this run (increases with each run)
            base_gel_score = min(0.1 + (run['run_id'] - 1) * degradation, 0.95)

            batch_durations = self._np_rng.uniform(
                duration_cfg['min'], duration_cfg['max'], size=n_batches)
            # Rescale so the batches fit the run duration exactly.
            batch_durations = batch_durations * (
                (run['duration_hrs'] * 60) / batch_durations.sum())

            current_batch_start = run['start_time']
            for batch_idx, batch_duration in enumerate(batch_durations):
                batch_end = current_batch_start + timedelta(
                    minutes=float(batch_duration))
                all_batches.append(self._generate_batch_metrics(
                    batch_counter,
                    run['run_id'],
                    batch_idx,
                    current_batch_start,
                    batch_end,
                    base_gel_score
                ))
                current_batch_start = batch_end
                batch_counter += 1

        return pd.DataFrame(all_batches, columns=self.BATCH_COLUMNS)

    def _generate_batch_metrics(self, batch_id, run_id, batch_idx, start_time, end_time, base_gel_score):
        """Generate the metrics dict for a single batch.

        Args:
            batch_id: Global batch counter.
            run_id: Id of the run the batch belongs to (1-based).
            batch_idx: Zero-based position of the batch inside its run.
            start_time: Batch start.
            end_time: Batch end.
            base_gel_score: Gel score of the run before per-batch effects.

        Returns:
            Dict keyed by ``BATCH_COLUMNS``.
        """
        duration_minutes = (end_time - start_time).total_seconds() / 60

        # Gel degradation increases slightly from batch to batch and within
        # the batch; the reported score is the batch average.
        batch_gel_start = base_gel_score + (batch_idx * 0.005)
        batch_gel_end = batch_gel_start + 0.01
        batch_gel_score = (batch_gel_start + batch_gel_end) / 2

        # Baseline pressures increase with gel degradation.
        elution_pressure = 12 + (batch_gel_start * 8)
        flow_pressure = 6 + (batch_gel_start * 4)

        # Batch phase effects
        if batch_idx == 0:  # First batch might have priming effects
            elution_pressure += 2
        elif batch_idx > 5:  # Later batches show more degradation
            elution_pressure += 1

        # Measurement noise (flow pressure gets half the spread)
        noise = self.config['pressure_noise']
        elution_pressure += self._np_rng.normal(0, noise)
        flow_pressure += self._np_rng.normal(0, noise * 0.5)

        # Column age effect: +1% per run, capped at +30% (reached at run 30).
        column_age_factor = min(run_id / 100, 0.3)
        elution_pressure *= (1 + column_age_factor)
        flow_pressure *= (1 + column_age_factor * 0.7)

        return {
            'batch_id': batch_id,
            'run_id': run_id,
            'batch_in_run': batch_idx + 1,
            'start_time': start_time,
            'end_time': end_time,
            'duration_minutes': duration_minutes,
            'gel_score': batch_gel_score,
            'elution_pressure_avg': elution_pressure,
            'flow_pressure_avg': flow_pressure,
            'column_runs_completed': run_id - 1,  # Runs before this one
            'column_age_factor': column_age_factor,
            'phase': 'loading' if batch_idx == 0 else 'elution'
        }

    def generate_high_resolution_data(self, batch_data, samples_per_batch=10, anomaly_rate=0.02):
        """Expand every batch into an evenly spaced time series.

        Args:
            batch_data: DataFrame as returned by
                ``generate_batch_level_data``.
            samples_per_batch: Number of evenly spaced samples per batch
                (start and end of the batch included).
            anomaly_rate: Fraction of samples used as anomaly seeds, passed
                to ``_add_anomalies``.

        Returns:
            DataFrame with the columns ``TIME_SERIES_COLUMNS`` plus
            ``anomaly`` and ``anomaly_type``.
        """
        # Identical for every batch, so computed once.
        time_fraction = np.linspace(0, 1, samples_per_batch)
        phases = ['elution' if frac > 0.2 else 'loading'
                  for frac in time_fraction]

        high_res_records = []
        for _, batch in batch_data.iterrows():
            time_points = pd.date_range(
                start=batch['start_time'],
                end=batch['end_time'],
                periods=samples_per_batch
            )

            # Base pressure with gradual increase during batch
            pressure_trend = batch['elution_pressure_avg'] + (time_fraction * 0.5)

            # Operational noise (pump fluctuations, etc.)
            operational_noise = self._np_rng.normal(0, 0.3, samples_per_batch)

            # Periodic effect: one sine cycle per 2 minutes of batch time.
            if batch['duration_minutes'] > 2:
                pump_cycle = 0.2 * np.sin(
                    2 * np.pi * time_fraction * (batch['duration_minutes'] / 2))
            else:
                pump_cycle = 0

            elution_pressure = pressure_trend + operational_noise + pump_cycle
            # NOTE(review): the series is centred on *half* of
            # flow_pressure_avg, which looks inconsistent with the column
            # name; kept as-is, confirm whether the 0.5 factor is intended.
            flow_pressure = batch['flow_pressure_avg'] * 0.5 + operational_noise * 0.3

            # Gel degradation increases slightly during batch, capped at 1.0
            gel_score = np.minimum(
                batch['gel_score'] + (time_fraction * 0.005), 1.0)

            for timestamp, elution, flow, gel, phase in zip(
                    time_points, elution_pressure, flow_pressure,
                    gel_score, phases):
                high_res_records.append({
                    'timestamp': timestamp,
                    'run_id': batch['run_id'],
                    'batch_id': batch['batch_id'],
                    'batch_in_run': batch['batch_in_run'],
                    'elution_pressure': elution,
                    'flow_pressure': flow,
                    'gel_score': gel,
                    'column_runs': batch['column_runs_completed'],
                    'is_operational': 1,
                    'phase': phase
                })

        df = pd.DataFrame(high_res_records, columns=self.TIME_SERIES_COLUMNS)
        return self._add_anomalies(df, anomaly_rate=anomaly_rate)

    def _add_anomalies(self, df, anomaly_rate=0.02):
        """Inject anomalies into ``df`` in place and return it.

        ``int(len(df) * anomaly_rate)`` distinct rows are picked as anomaly
        seeds and each gets a random type from ``ANOMALY_TYPES``:

        * ``pressure_spike``: pressure x1.5 and gel score +0.1 on one row.
        * ``pressure_drop``: pressure x0.7 on one row.
        * ``gradual_drift``: growing pressure / gel-score offset over the
          seed row and up to 5 following rows.
        * ``stuck_value``: the seed row's pressure repeated over up to 4 rows.

        Multi-row anomalies are clipped at the end of the frame. Rows are
        addressed by position, so any index on ``df`` is fine. Adds the
        columns ``anomaly`` (0/1) and ``anomaly_type`` (``'none'`` for
        normal rows).

        Raises:
            ValueError: If ``anomaly_rate`` is outside ``[0, 1]``.
        """
        if not 0 <= anomaly_rate <= 1:
            raise ValueError("anomaly_rate must be between 0 and 1")

        n_rows = len(df)
        if n_rows == 0:
            df['anomaly'] = 0
            df['anomaly_type'] = 'none'
            return df

        n_anomalies = int(n_rows * anomaly_rate)
        anomaly_indices = self._np_rng.choice(n_rows, n_anomalies, replace=False)

        # Work on plain arrays: positional, index-agnostic and much cheaper
        # than repeated scalar ``df.loc`` writes.
        pressure = df['elution_pressure'].to_numpy(dtype=float, copy=True)
        gel = df['gel_score'].to_numpy(dtype=float, copy=True)
        flags = np.zeros(n_rows, dtype=np.int64)
        labels = np.full(n_rows, 'none', dtype=object)

        for idx in anomaly_indices.tolist():
            anomaly_type = self._py_rng.choice(self.ANOMALY_TYPES)
            stop = idx + 1  # Exclusive end of the affected rows

            if anomaly_type == 'pressure_spike':
                pressure[idx] *= 1.5  # 50% spike
                gel[idx] = min(gel[idx] + 0.1, 1.0)

            elif anomaly_type == 'pressure_drop':
                pressure[idx] *= 0.7  # 30% drop

            elif anomaly_type == 'gradual_drift':
                # Affects this and next 5 samples
                stop = min(idx + 6, n_rows)
                steps = np.arange(1, stop - idx + 1)
                pressure[idx:stop] *= 1 + (0.05 * steps)
                gel[idx:stop] = np.minimum(gel[idx:stop] + (0.02 * steps), 1.0)

            elif anomaly_type == 'stuck_value':
                # Value gets stuck for several readings
                stop = min(idx + 4, n_rows)
                pressure[idx:stop] = pressure[idx]

            flags[idx:stop] = 1
            labels[idx:stop] = anomaly_type

        df['elution_pressure'] = pressure
        df['gel_score'] = gel
        df['anomaly'] = flags
        df['anomaly_type'] = labels.tolist()
        return df

    def generate_complete_dataset(self, total_days=30, resolution='high'):
        """Generate the full run -> batch -> time-series hierarchy.

        Progress is reported with ``print``.

        Args:
            total_days: Passed to ``generate_run_schedule``.
            resolution: ``'high'`` additionally generates the time series;
                any other value stops at batch level.

        Returns:
            Dict with ``'run_schedule'`` and ``'batch_data'`` DataFrames and,
            for ``resolution == 'high'``, ``'time_series'``.
        """
        print("Generating column run schedule...")
        run_schedule = self.generate_run_schedule(total_days)

        print(f"Generated {len(run_schedule)} column runs")
        print(f"Total operational hours: {run_schedule['duration_hrs'].sum():.1f}")

        print("\nGenerating batch-level data...")
        batch_data = self.generate_batch_level_data(run_schedule)

        print(f"Generated {len(batch_data)} batches")
        print(f"Average batches per run: {batch_data.groupby('run_id').size().mean():.1f}")

        dataset = {
            'run_schedule': run_schedule,
            'batch_data': batch_data
        }

        if resolution == 'high':
            print("\nGenerating high-resolution time series...")
            time_series_data = self.generate_high_resolution_data(batch_data)
            print(f"Generated {len(time_series_data)} time points")
            dataset['time_series'] = time_series_data

        return dataset

ModuleNotFoundError: No module named 'gel_data_generator'

Unnamed: 0,timestamp,gel_score,elution_pressure,flow_pressure,column_runs,anomaly
0,2024-01-01 00:00:00,0.091802,9.983162,5.248398,2,0
1,2024-01-01 00:15:00,0.088615,11.431219,4.598737,3,0
2,2024-01-01 00:30:00,0.06705,10.941074,5.015491,7,0
3,2024-01-01 00:45:00,0.046035,10.515478,5.527602,9,0
4,2024-01-01 01:00:00,0.158832,9.112687,5.781643,11,0
