In [1]:
import sys
import numpy as np
from pathlib import Path

# Put the directory one level above the current working directory on sys.path
# (only once) so that the package imported below can be found.
module_path = str(Path.cwd().parent)
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from pd_1d_rule_solver.pd_1d_rule_solver import RuleFinder  # This registers the extension automatically

In [2]:
import numpy as np
import pandas as pd

def generate_rule_data(n_samples=100000, n_features=12, random_state=42):
    """Build a synthetic frame with a hidden three-column rule planted in it.

    Features ``x0 .. x{n_features-1}`` are standard-normal draws and ``target``
    is drawn from N(10, 2). Three distinct feature columns and four sorted
    thresholds are then picked at random; rows that fall inside the resulting
    rule get their ``target`` doubled. The rule and the number of rows it hits
    are printed.

    Note: this seeds NumPy's *global* random state with ``random_state``.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns (three distinct ones are drawn
            for the rule).
        random_state: Seed passed to ``np.random.seed``.

    Returns:
        DataFrame with the feature columns followed by ``target``.
    """
    np.random.seed(random_state)

    # Feature matrix first, then the baseline target (draw order matters for
    # reproducibility of the seeded stream).
    column_names = [f'x{idx}' for idx in range(n_features)]
    df = pd.DataFrame(
        np.random.normal(0, 1, (n_samples, n_features)),
        columns=column_names,
    )
    df['target'] = np.random.normal(10, 2, n_samples)

    # Three distinct feature indices take part in the rule
    idx_a, idx_b, idx_c = np.random.choice(n_features, 3, replace=False)

    # Four sorted cut points give two [low, high) intervals; both upper bounds
    # are pushed out by a fixed amount so the planted signal is stronger.
    interval_width = 0.5
    low_a, high_a, low_b, high_b = np.sort(np.random.uniform(-2, 2, 4))
    high_a += interval_width
    high_b += interval_width

    series_a = df[f'x{idx_a}']
    series_b = df[f'x{idx_b}']
    inside_a = (series_a >= low_a) & (series_a < high_a)
    inside_b = (series_b >= low_b) & (series_b < high_b)
    # The third condition reuses the (widened) upper bound of the first interval
    above_c = df[f'x{idx_c}'] >= high_a
    mask = inside_a & inside_b & above_c

    # Rows satisfying the rule get a doubled target
    df.loc[mask, 'target'] *= 2

    rule_text = (
        f"Rule: IF ({low_a:.3f} <= x{idx_a} < {high_a:.3f} AND "
        f"{low_b:.3f} <= x{idx_b} < {high_b:.3f} AND "
        f"x{idx_c} >= {high_a:.3f})"
    )
    print(rule_text)
    print("Then: target *= 2")
    print(f"Rule applies to {mask.sum()} samples ({mask.mean()*100:.1f}%)")

    return df

In [3]:
# Build the synthetic dataset with the default arguments
# (100000 rows, 12 features, seed 42); the planted rule is printed below.
df = generate_rule_data()

Rule: IF (0.154 <= x11 < 0.700 AND 0.486 <= x2 < 2.263 AND x1 >= 0.700)
Then: target *= 2
Rule applies to 1467 samples (1.5%)


In [4]:
# Candidate feature names: every column except the last one, which is the
# 'target' column that generate_rule_data appends after the features.
list(df.columns[:-1])

['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11']

In [7]:
# Search a random 5,000-row subsample for a rule built from up to 3 conditions
dfcopy = df.sample(5000)
result = dfcopy.findrule(
    target='target',
    direction='maximize',
    variables=list(df.columns[:-1]),  # all columns except the trailing 'target'
    visualize=True,
    depth=3,
)

# Show how the rule changed from one iteration to the next
print("Rule Evolution:")
previous_score = None
for i, (rule, metrics) in enumerate(result['evolution']):
    current_score = metrics['score']
    print(f"\nIteration {i}:")
    print(f"Rule: {rule}")
    print(f"Score: {current_score:.3f}")
    print(f"Coverage: {metrics['coverage']:.1%}")
    if previous_score is not None:
        # Relative change in score compared with the preceding iteration
        improvement = (current_score - previous_score) / abs(previous_score)
        print(f"Improvement: {improvement:.1%}")
    previous_score = current_score

# Text visualization returned for the final rule
print("\nVisualization of final rule:")
print(result['visualization'])

# Headline numbers for the final rule
final_metrics = result['metrics']
print("\nDetailed metrics for the final rule:")
print(f"Score: {final_metrics['score']:.3f}")
print(f"Coverage: {final_metrics['coverage']:.1%}")
print(f"Matching samples: {final_metrics['matching_samples']}")

# Rank the single-variable rules by score (highest first) and keep three
print("\nTop 3 individual variable rules by score:")
ranked_1d_rules = sorted(
    result['onedim_rules'].items(),
    key=lambda entry: entry[1]['metrics']['score'],
    reverse=True,
)
sorted_1d_rules = ranked_1d_rules[:3]

for var, rule_info in sorted_1d_rules:
    var_metrics = rule_info['metrics']
    print(f"\n{var}:")
    print(f"Interval: {rule_info['rule'][var]}")
    print(f"Score: {var_metrics['score']:.3f}")
    print(f"Coverage: {var_metrics['coverage']:.1%}")

Rule Evolution:

Iteration 0:
Rule: {'x1': (np.float64(0.6970632375634072), np.float64(1.8011695245588597))}
Score: 0.033
Coverage: 22.5%

Iteration 1:
Rule: {'x1': (np.float64(0.6970632375634072), np.float64(1.8011695245588597)), 'x11': (np.float64(0.15784220705488444), np.float64(0.7052288041328205))}
Score: 0.126
Coverage: 4.7%
Improvement: 285.8%

Iteration 2:
Rule: {'x1': (np.float64(0.6970632375634072), np.float64(1.8011695245588597)), 'x11': (np.float64(0.15784220705488444), np.float64(0.7052288041328205)), 'x2': (np.float64(0.4857958627222803), np.float64(2.1583091797193545))}
Score: 0.980
Coverage: 1.6%
Improvement: 678.3%

Visualization of final rule:
Rule Impact Analysis

Rule Conditions:
  x1: 0.697 to 1.801
  x11: 0.158 to 0.705
  x2: 0.486 to 2.158

Matching Samples: 78 (1.6% of data)

target Distribution:
Original median: 10.04
Rule median: 19.82

     3.1 │░                                                           
     5.5 │░░░░░░░░░░░░░░░░░░░░░                       