In [None]:
#!/usr/bin/env python3
"""
Gas Benchmark Analysis Script

This script analyzes the gas benchmark results from GasBenchmark.ts
and builds a linear regression model to estimate gas costs.

Usage:
    python analyze_gas_results.py gas_benchmark_results.csv

Requirements:
    pip install pandas scikit-learn matplotlib seaborn
"""

import sys
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_prepare_data(csv_path):
    """Load the benchmark CSV and derive the numeric ``FilterBytes`` feature.

    Args:
        csv_path: Path of the CSV file to read. It must contain a
            ``FilterComplexity`` column holding one of the labels
            ``none``, ``simple``, ``medium`` or ``complex``.

    Returns:
        The loaded DataFrame with an added ``FilterBytes`` column.

    Raises:
        ValueError: If ``FilterComplexity`` contains a label that has no
            bytecode-length mapping (including missing values).
    """
    print(f"\n{'='*60}")
    print("Loading data...")
    print(f"{'='*60}\n")

    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} test cases")
    print(f"\nColumns: {list(df.columns)}")

    # Map filter complexity to approximate bytecode lengths
    filter_bytecode_map = {
        'none': 0,
        'simple': 7,
        'medium': 15,
        'complex': 30
    }
    df['FilterBytes'] = df['FilterComplexity'].map(filter_bytecode_map)

    # Series.map leaves NaN for labels missing from the dict. Fail here with
    # a clear message instead of letting NaN reach the regression fit.
    unmapped = df.loc[df['FilterBytes'].isna(), 'FilterComplexity']
    if not unmapped.empty:
        labels = sorted({str(value) for value in unmapped})
        raise ValueError(
            f"Unrecognized FilterComplexity value(s): {labels}; "
            f"expected one of {list(filter_bytecode_map)}"
        )

    print("\nData preview:")
    print(df.head())

    return df

def analyze_basic_statistics(df):
    """Print summary tables of gas usage.

    Reports ``TotalGas`` aggregated per ``Operation`` and per
    ``FilterComplexity``, then the average gas of each job phase together
    with its share of the average total.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print("Basic Statistics")
    print(f"{rule}\n")

    total_gas = df['TotalGas']

    print("Total Gas by Operation:")
    by_operation = total_gas.groupby(df['Operation'])
    print(by_operation.agg(['mean', 'min', 'max', 'std']))

    print("\n\nTotal Gas by Filter Complexity:")
    by_filter = total_gas.groupby(df['FilterComplexity'])
    print(by_filter.agg(['mean', 'min', 'max']))

    print("\n\nGas Phase Breakdown (averages):")
    total_mean = total_gas.mean()
    phase_columns = (
        ("OpenJob:", 'OpenJobGas'),
        ("PushRow Total:", 'PushRowTotal'),
        ("Finalize:", 'FinalizeGas'),
    )
    for label, column in phase_columns:
        phase_mean = df[column].mean()
        share = phase_mean / total_mean * 100
        # Labels are padded to 15 characters so the numbers line up.
        print(f"  {label:<15}{phase_mean:,.0f} gas ({share:.1f}%)")
    print(f"  {'Total:':<15}{total_mean:,.0f} gas")

def build_regression_model(df, target='TotalGas', include_interaction=True):
    """Fit an ordinary least-squares model of ``target`` and report its fit.

    Features are ``Rows``, ``Columns``, ``FilterBytes``, optionally the
    ``Rows`` x ``Columns`` product, and one indicator column per operation
    other than COUNT (COUNT is the reference category). R² and MAPE are
    computed on the same rows the model was fitted on.

    Args:
        df: Benchmark data with the feature columns, ``Operation`` and
            the ``target`` column.
        target: Name of the column to predict.
        include_interaction: Whether to add the ``Rows_x_Columns`` feature.

    Returns:
        Tuple ``(model, X, y, y_pred, coef_df)`` where ``coef_df`` lists
        the coefficients sorted from largest to smallest.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Building Regression Model for {target}")
    if include_interaction:
        print("(with Rows × Columns interaction)")
    print(f"{rule}\n")

    # Numeric features first
    features = df[['Rows', 'Columns', 'FilterBytes']].copy()
    if include_interaction:
        # Decoding cost is expected to scale with rows × columns
        features['Rows_x_Columns'] = df['Rows'] * df['Columns']

    # One indicator per operation; dropping Op_COUNT makes COUNT the baseline
    op_indicators = pd.get_dummies(df['Operation'], prefix='Op').drop(columns='Op_COUNT')
    features = pd.concat([features, op_indicators], axis=1)

    actual = df[target]

    model = LinearRegression().fit(features, actual)

    # In-sample predictions and goodness of fit
    predicted = model.predict(features)
    r2 = r2_score(actual, predicted)
    mape = mean_absolute_percentage_error(actual, predicted) * 100

    print("Model Performance:")
    print(f"  R² Score:  {r2:.4f} ({r2*100:.1f}% variance explained)")
    print(f"  MAPE:      {mape:.2f}%")
    print("  Target:    R² ≥ 0.60, MAPE ≤ 40%")

    r2_verdict = "  ✓ Model meets R² target!" if r2 >= 0.60 else "  ✗ Model below R² target"
    print(r2_verdict)

    mape_verdict = (
        "  ✓ Model meets MAPE target!"
        if mape <= 40
        else "  ✗ Model above MAPE target - consider more terms"
    )
    print(mape_verdict)

    print("\nModel Coefficients:")
    print(f"  Intercept: {model.intercept_:,.0f} gas")

    coef_table = pd.DataFrame({
        'Feature': features.columns,
        'Coefficient': model.coef_,
        'Impact': model.coef_
    })
    coef_table = coef_table.sort_values('Coefficient', ascending=False)

    print("\n" + coef_table.to_string(index=False))

    return model, features, actual, predicted, coef_table

def analyze_per_row_costs(df):
    """Print the average per-row gas of each operation and its cost relative to COUNT.

    Args:
        df: Benchmark data with ``Operation`` and ``PushRowAvg`` columns.

    The relative comparison uses the mean ``PushRowAvg`` of the COUNT
    operation as the 1.0x baseline. When the data contains no COUNT rows
    the comparison is skipped instead of raising ``KeyError``.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print("Per-Row Cost Analysis")
    print(f"{rule}\n")

    print("Average gas per row by operation:")
    per_row = df.groupby('Operation')['PushRowAvg'].agg(['mean', 'std'])
    print(per_row)

    # Without a COUNT group there is no baseline to normalise against.
    if 'COUNT' not in per_row.index:
        print("\nNo COUNT rows in data; skipping relative cost comparison.")
        return

    # Calculate relative costs (COUNT = baseline)
    baseline = per_row.loc['COUNT', 'mean']
    per_row['Relative'] = per_row['mean'] / baseline

    print("\nRelative to COUNT (1.0x):")
    print(per_row[['Relative']].sort_values('Relative', ascending=False))

def visualize_results(df, y_pred, coef_df):
    """Draw a 2x2 summary figure and save it as ``gas_analysis.png``.

    Panels: actual vs predicted total gas, total gas distribution per
    operation, the first ten rows of ``coef_df`` as a bar chart, and total
    gas against row count per operation.

    Args:
        df: Benchmark data with ``TotalGas``, ``Operation`` and ``Rows``.
        y_pred: Predicted total gas, aligned with the rows of ``df``.
        coef_df: Coefficient table with ``Feature`` and ``Coefficient``
            columns; only its first ten rows are plotted.
    """
    print(f"\n{'='*60}")
    print("Generating Visualizations")
    print(f"{'='*60}\n")

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Actual vs Predicted
    ax = axes[0, 0]
    ax.scatter(df['TotalGas'], y_pred, alpha=0.6)
    # Diagonal reference line: points on it are perfect predictions
    ax.plot([df['TotalGas'].min(), df['TotalGas'].max()],
            [df['TotalGas'].min(), df['TotalGas'].max()],
            'r--', lw=2)
    ax.set_xlabel('Actual Gas')
    ax.set_ylabel('Predicted Gas')
    ax.set_title('Actual vs Predicted Total Gas')
    ax.grid(True, alpha=0.3)

    # 2. Gas by Operation
    ax = axes[0, 1]
    df.boxplot(column='TotalGas', by='Operation', ax=ax)
    # pandas adds a figure-level "Boxplot grouped by Operation" suptitle,
    # which would caption all four panels; remove it.
    fig.suptitle('')
    ax.set_xlabel('Operation')
    ax.set_ylabel('Total Gas')
    ax.set_title('Gas Distribution by Operation')
    plt.sca(ax)
    plt.xticks(rotation=45)

    # 3. Coefficient Importance
    ax = axes[1, 0]
    coef_plot = coef_df.head(10).copy()
    ax.barh(coef_plot['Feature'], coef_plot['Coefficient'])
    ax.set_xlabel('Coefficient (Gas Impact)')
    ax.set_title('Top 10 Feature Coefficients')
    ax.grid(True, alpha=0.3)

    # 4. Gas Scaling with Rows
    ax = axes[1, 1]
    for op in df['Operation'].unique():
        op_data = df[df['Operation'] == op]
        ax.scatter(op_data['Rows'], op_data['TotalGas'], label=op, alpha=0.6)
    ax.set_xlabel('Number of Rows')
    ax.set_ylabel('Total Gas')
    ax.set_title('Gas Scaling with Rows (by Operation)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('gas_analysis.png', dpi=150)
    print("Saved visualization to: gas_analysis.png")

def generate_estimator_code(model, coef_df):
    """Generate a TypeScript ``estimateJobGas`` function from a fitted model.

    The generated source is printed and written to ``estimateJobGas.ts``
    in the current working directory.

    Args:
        model: Fitted model; only its ``intercept_`` attribute is read.
        coef_df: Table with ``Feature`` and ``Coefficient`` columns.
            Recognised features are ``Rows``, ``Columns``, ``FilterBytes``,
            ``Rows_x_Columns`` and ``Op_<NAME>``; any that are absent
            contribute a coefficient of 0. The interaction term is emitted
            only when ``Rows_x_Columns`` is present.
    """
    print(f"\n{'='*60}")
    print("Gas Estimator Function (TypeScript)")
    print(f"{'='*60}\n")

    # Single feature -> coefficient lookup instead of repeated frame filtering
    coef_by_feature = dict(zip(coef_df['Feature'], coef_df['Coefficient']))

    intercept = model.intercept_
    row_coef = coef_by_feature.get('Rows', 0)
    col_coef = coef_by_feature.get('Columns', 0)
    filter_coef = coef_by_feature.get('FilterBytes', 0)

    has_interaction = 'Rows_x_Columns' in coef_by_feature
    interaction_coef = coef_by_feature.get('Rows_x_Columns', 0)

    # Operation coefficients keyed by operation name. Only the leading
    # "Op_" prefix is stripped so names containing "Op_" stay intact.
    op_prefix = 'Op_'
    op_coefs = {
        feature[len(op_prefix):]: coefficient
        for feature, coefficient in coef_by_feature.items()
        if feature.startswith(op_prefix)
    }

    # The two model variants differ only in these three template pieces.
    if has_interaction:
        interaction_doc = " * Model includes Rows × Columns interaction term for better accuracy\n"
        column_comment = "// Add per-column cost"
        interaction_block = (
            "  // Add row × column interaction (decoding cost scales with both)\n"
            f"  gas += (rows * columns) * {interaction_coef:.0f};\n"
            "  \n"
        )
    else:
        interaction_doc = ""
        column_comment = "// Add per-column cost (decoding)"
        interaction_block = ""

    code = f"""
/**
 * Estimates gas cost for a JobManager job based on parameters
 * 
 * Model Accuracy: See analysis output for R² and MAPE
{interaction_doc} * 
 * @param rows Number of rows in dataset
 * @param columns Number of columns in dataset
 * @param operation Operation type
 * @param filterBytes Approximate filter bytecode length
 * @returns Estimated total gas cost
 */
function estimateJobGas(
  rows: number,
  columns: number,
  operation: 'COUNT' | 'SUM' | 'AVG_P' | 'WEIGHTED_SUM' | 'MIN' | 'MAX',
  filterBytes: number
): number {{
  // Base cost (intercept)
  let gas = {intercept:.0f};
  
  // Add per-row cost
  gas += rows * {row_coef:.0f};
  
  {column_comment}
  gas += columns * {col_coef:.0f};
  
{interaction_block}  // Add filter complexity cost
  gas += filterBytes * {filter_coef:.0f};
  
  // Add operation-specific costs (relative to COUNT baseline)
  const operationCosts = {{
    'COUNT': 0,  // baseline
    'SUM': {op_coefs.get('SUM', 0):.0f},
    'AVG_P': {op_coefs.get('AVG_P', 0):.0f},
    'WEIGHTED_SUM': {op_coefs.get('WEIGHTED_SUM', 0):.0f},
    'MIN': {op_coefs.get('MIN', 0):.0f},
    'MAX': {op_coefs.get('MAX', 0):.0f},
  }};
  
  gas += operationCosts[operation];
  
  return Math.round(gas);
}}

// Example usage:
const estimatedGas = estimateJobGas(50, 15, 'SUM', 7);
console.log(`Estimated gas: ${{estimatedGas.toLocaleString()}}`);
"""

    print(code)

    # The template contains non-ASCII characters (×, ²); write UTF-8
    # explicitly rather than relying on the platform's default encoding.
    with open('estimateJobGas.ts', 'w', encoding='utf-8') as f:
        f.write(code)
    print("\nSaved estimator function to: estimateJobGas.ts")

def main():
    """Run the full analysis pipeline on the CSV named on the command line.

    Loads the data, prints statistics, fits regression models for
    ``TotalGas`` and ``PushRowTotal``, saves the plots and writes the
    TypeScript estimator. Exits with status 1 when no argument is given,
    when the CSV file is missing, or when any step raises.
    """
    if len(sys.argv) < 2:
        print("Usage: python analyze_gas_results.py <csv_file>")
        print("\nExample: python analyze_gas_results.py gas_benchmark_results.csv")
        sys.exit(1)

    # Honour the documented <csv_file> argument. An option-style first
    # argument (for example the flags a notebook kernel is launched with)
    # is not a path, so fall back to the default results file in that case.
    requested_path = sys.argv[1]
    if requested_path.startswith('-'):
        csv_path = "gas_benchmark_results.csv"
    else:
        csv_path = requested_path

    try:
        # Load data
        df = load_and_prepare_data(csv_path)

        # Basic statistics
        analyze_basic_statistics(df)

        # Per-row analysis
        analyze_per_row_costs(df)

        # Build regression model
        model, X, y, y_pred, coef_df = build_regression_model(df, 'TotalGas')

        # Optionally analyze per-phase costs
        print("\n\n--- Analyzing PushRow Costs ---")
        build_regression_model(df, 'PushRowTotal')

        # Generate visualizations
        visualize_results(df, y_pred, coef_df)

        # Generate estimator code
        generate_estimator_code(model, coef_df)

        print(f"\n{'='*60}")
        print("Analysis Complete!")
        print(f"{'='*60}\n")
        print("Generated files:")
        print("  - gas_analysis.png (visualizations)")
        print("  - estimateJobGas.ts (estimator function)")
        print("\nNext steps:")
        print("  1. Review R² score (target: ≥0.60)")
        print("  2. Check MAPE (target: ≤40%)")
        print("  3. Use estimateJobGas.ts in your application")
        print("  4. Validate with holdout test cases")

    except FileNotFoundError:
        print(f"Error: File '{csv_path}' not found")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure with a traceback and exit non-zero
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()



In [8]:
#!/usr/bin/env python3
"""
Gas Benchmark Analysis with LOG TRANSFORMATION

This addresses the MAPE issue for small cases by using log-transformed regression.
Gas costs span 100x range (2.5M to 250M), so log transformation is appropriate.
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_percentage_error

# Load data
csv_path = "gas_benchmark_results.csv"
df = pd.read_csv(csv_path)

# Map filter complexity labels to approximate bytecode lengths
filter_bytecode_map = {'none': 0, 'simple': 7, 'medium': 15, 'complex': 30}
df['FilterBytes'] = df['FilterComplexity'].map(filter_bytecode_map)

print(f"\n{'='*80}")
print("LOG-TRANSFORMED REGRESSION MODEL")
print(f"{'='*80}\n")

# Prepare features
X = df[['Rows', 'Columns', 'FilterBytes']].copy()
X['Rows_x_Columns'] = df['Rows'] * df['Columns']

# Add operation dummies (COUNT is reference)
operation_dummies = pd.get_dummies(df['Operation'], prefix='Op')
operation_dummies = operation_dummies.drop('Op_COUNT', axis=1)
X = pd.concat([X, operation_dummies], axis=1)

# Transform target to log scale so the fit is not dominated by the largest jobs
y = df['TotalGas']
y_log = np.log(y)

# Fit model on log scale
model_log = LinearRegression()
model_log.fit(X, y_log)

# Predict and transform back to original scale
y_pred_log = model_log.predict(X)
y_pred = np.exp(y_pred_log)

# Calculate metrics on original scale (in-sample: same rows as the fit)
r2 = r2_score(y, y_pred)
mape = mean_absolute_percentage_error(y, y_pred) * 100

print(f"Model Performance:")
print(f"  R² Score:  {r2:.4f} ({r2*100:.1f}% variance explained)")
print(f"  MAPE:      {mape:.2f}%")
print(f"  Target:    R² ≥ 0.60, MAPE ≤ 40%")

if r2 >= 0.60:
    print(f"  ✓ Model meets R² target!")
else:
    print(f"  ✗ Model below R² target")

if mape <= 40:
    print(f"  ✓✓✓ Model meets MAPE target! ✓✓✓")
else:
    print(f"  ✗ Model above MAPE target")

# Print coefficients
print(f"\nLog-Scale Model Coefficients:")
print(f"  Intercept: {model_log.intercept_:.6f} (log gas)")

coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model_log.coef_
}).sort_values('Coefficient', ascending=False)

print("\n" + coef_df.to_string(index=False))

# Detailed error analysis
df['Predicted'] = y_pred
df['Error'] = y - y_pred
df['PctError'] = np.abs(df['Error'] / y) * 100

print(f"\n{'='*80}")
print("Error Analysis by Row Count:")
print(f"{'='*80}\n")
by_rows = df.groupby('Rows')[['PctError']].agg(['mean', 'min', 'max'])
print(by_rows)

# Predictions come from np.exp, so this check is expected to always pass;
# it is kept for parity with the linear-model diagnostics.
print(f"\n{'='*80}")
print("Negative Predictions Check:")
print(f"{'='*80}")
negative = df[df['Predicted'] < 0]
if len(negative) > 0:
    print(f"  ✗ {len(negative)} negative predictions found!")
else:
    print(f"  ✓ All predictions are positive!")

print(f"\n{'='*80}")
print("Sample Predictions (small vs large cases):")
print(f"{'='*80}\n")
sample = df[df['TestID'].isin(['B4-49', 'B2-25', 'B1-2', 'B2-27', 'B4-60', 'B4-58'])]
print(sample[['TestID', 'Rows', 'Columns', 'Operation', 'TotalGas', 'Predicted', 'PctError']].to_string(index=False))

# Generate TypeScript estimator with log transformation
print(f"\n{'='*80}")
print("TYPESCRIPT ESTIMATOR (Log-Transformed Model)")
print(f"{'='*80}\n")

intercept = model_log.intercept_
row_coef = coef_df[coef_df['Feature'] == 'Rows']['Coefficient'].values[0]
col_coef = coef_df[coef_df['Feature'] == 'Columns']['Coefficient'].values[0]
filter_coef = coef_df[coef_df['Feature'] == 'FilterBytes']['Coefficient'].values[0]
interaction_coef = coef_df[coef_df['Feature'] == 'Rows_x_Columns']['Coefficient'].values[0]

# Collect per-operation coefficients keyed by operation name
op_coefs = {}
for _, row in coef_df.iterrows():
    if row['Feature'].startswith('Op_'):
        op_name = row['Feature'].replace('Op_', '')
        op_coefs[op_name] = row['Coefficient']

ts_code = f"""
/**
 * Estimates gas cost for a JobManager job based on parameters
 * 
 * Model: Log-transformed linear regression
 * Accuracy: R² = {r2:.4f} ({r2*100:.1f}%), MAPE = {mape:.2f}%
 * 
 * @param rows Number of rows in dataset
 * @param columns Number of columns in dataset
 * @param operation Operation type
 * @param filterBytes Approximate filter bytecode length (0=none, 7=simple, 15=medium, 30=complex)
 * @returns Estimated total gas cost
 */
export function estimateJobGas(
  rows: number,
  columns: number,
  operation: 'COUNT' | 'SUM' | 'AVG_P' | 'WEIGHTED_SUM' | 'MIN' | 'MAX',
  filterBytes: number
): number {{
  // Log-scale linear model
  let logGas = {intercept:.8f};
  
  // Add feature contributions
  logGas += rows * {row_coef:.8f};
  logGas += columns * {col_coef:.8f};
  logGas += (rows * columns) * {interaction_coef:.8f};
  logGas += filterBytes * {filter_coef:.8f};
  
  // Add operation-specific costs (relative to COUNT baseline)
  const operationLogCosts = {{
    'COUNT': 0,  // baseline
    'SUM': {op_coefs.get('SUM', 0):.8f},
    'AVG_P': {op_coefs.get('AVG_P', 0):.8f},
    'WEIGHTED_SUM': {op_coefs.get('WEIGHTED_SUM', 0):.8f},
    'MIN': {op_coefs.get('MIN', 0):.8f},
    'MAX': {op_coefs.get('MAX', 0):.8f},
  }};
  
  logGas += operationLogCosts[operation];
  
  // Transform back from log scale to gas units
  const gas = Math.exp(logGas);
  
  return Math.round(gas);
}}

// Example usage:
const gas1 = estimateJobGas(5, 3, 'COUNT', 7);    // Small case
const gas2 = estimateJobGas(25, 10, 'SUM', 15);   // Medium case
const gas3 = estimateJobGas(100, 32, 'AVG_P', 30); // Large case

console.log(`Small (5×3 COUNT):    ${{gas1.toLocaleString()}} gas`);
console.log(`Medium (25×10 SUM):   ${{gas2.toLocaleString()}} gas`);
console.log(`Large (100×32 AVG_P): ${{gas3.toLocaleString()}} gas`);
"""

print(ts_code)

# The generated source contains non-ASCII characters (², ×); write UTF-8
# explicitly rather than relying on the platform's default encoding.
with open('estimateJobGas_log.ts', 'w', encoding='utf-8') as f:
    f.write(ts_code)

print("\n✓ Saved to: estimateJobGas_log.ts")
print(f"\n{'='*80}")
# Only claim success when both accuracy targets were actually met.
if r2 >= 0.60 and mape <= 40:
    print("SUCCESS! Log transformation resolves the MAPE issue.")
else:
    print("Targets not met - log transformation alone does not resolve the MAPE issue.")
print(f"{'='*80}")



LOG-TRANSFORMED REGRESSION MODEL

Model Performance:
  R² Score:  0.9091 (90.9% variance explained)
  MAPE:      34.27%
  Target:    R² ≥ 0.60, MAPE ≤ 40%
  ✓ Model meets R² target!
  ✓✓✓ Model meets MAPE target! ✓✓✓

Log-Scale Model Coefficients:
  Intercept: 14.716354 (log gas)

        Feature  Coefficient
Op_WEIGHTED_SUM     1.050678
       Op_AVG_P     0.186049
         Op_SUM     0.183671
         Op_MAX     0.160466
         Op_MIN     0.160352
        Columns     0.086790
           Rows     0.031711
    FilterBytes     0.012810
 Rows_x_Columns    -0.000556

Error Analysis by Row Count:

       PctError                       
           mean        min         max
Rows                                  
5     74.680308  55.735227  117.089075
25    23.896922   5.188281   40.343891
100   24.979165   7.165950   50.009550

Negative Predictions Check:
  ✓ All predictions are positive!

Sample Predictions (small vs large cases):

TestID  Rows  Columns Operation  TotalGas    Predicted

In [None]:
#!/usr/bin/env python3
"""
Diagnose MAPE issues by looking at prediction errors
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_percentage_error

# Read the benchmark results
df = pd.read_csv('gas_benchmark_results.csv')

# Translate each filter complexity label into an approximate bytecode length
filter_bytecode_map = {'none': 0, 'simple': 7, 'medium': 15, 'complex': 30}
df['FilterBytes'] = df['FilterComplexity'].map(filter_bytecode_map)

# Feature matrix: raw dimensions plus the rows × columns interaction
X = df[['Rows', 'Columns', 'FilterBytes']].assign(
    Rows_x_Columns=df['Rows'] * df['Columns']
)

# Operation indicators; dropping Op_COUNT makes COUNT the reference category
operation_dummies = pd.get_dummies(df['Operation'], prefix='Op').drop(columns='Op_COUNT')
X = pd.concat([X, operation_dummies], axis=1)

y = df['TotalGas']

# Fit the linear model and predict on the same rows
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)

# Per-case error columns
df['Predicted'] = y_pred
df['Error'] = y - y_pred
df['AbsError'] = df['Error'].abs()
df['PctError'] = (df['Error'] / y).abs() * 100

# Overall fit quality
r2 = r2_score(y, y_pred)
mape = mean_absolute_percentage_error(y, y_pred) * 100

print(f"Overall R² = {r2:.4f}, MAPE = {mape:.2f}%\n")

# Ten cases with the largest percentage error
print("=" * 100)
print("Top 10 Worst Predictions (by % error):")
print("=" * 100)
worst = df.nlargest(10, 'PctError')[[
    'TestID', 'Rows', 'Columns', 'Operation', 'FilterComplexity',
    'TotalGas', 'Predicted', 'Error', 'PctError',
]]
print(worst.to_string(index=False))

# Ten cases with the smallest percentage error
print(f"\n{'=' * 100}")
print("Top 10 Best Predictions (by % error):")
print("=" * 100)
best = df.nsmallest(10, 'PctError')[[
    'TestID', 'Rows', 'Columns', 'Operation', 'FilterComplexity',
    'TotalGas', 'Predicted', 'Error', 'PctError',
]]
print(best.to_string(index=False))

print(f"\n{'=' * 100}")
print("Error Statistics by Operation:")
print("=" * 100)
by_op = df.groupby('Operation')[['PctError', 'AbsError']].agg(['mean', 'std', 'min', 'max'])
print(by_op)

print(f"\n{'=' * 100}")
print("Error Statistics by Row Count:")
print("=" * 100)
by_rows = df.groupby('Rows')[['PctError', 'AbsError']].agg(['mean', 'std', 'min', 'max'])
print(by_rows)

# A linear fit can produce negative gas estimates; list any such cases
print(f"\n{'=' * 100}")
print("Cases with negative predictions:")
print("=" * 100)
negative = df.loc[df['Predicted'].lt(0)]
if negative.empty:
    print("None - all predictions are positive!")
else:
    print(negative[['TestID', 'Rows', 'Columns', 'Operation', 'TotalGas', 'Predicted']])

# Smallest benchmark configuration
print(f"\n{'=' * 100}")
print("Small cases (Rows=5, Columns=3):")
print("=" * 100)
small = df.loc[df['Rows'].eq(5) & df['Columns'].eq(3)]
print(small[['TestID', 'Operation', 'TotalGas', 'Predicted', 'Error', 'PctError']].to_string(index=False))

