In [1]:
"""
FEATURE ENGINEERING FOR PRICE OPTIMIZATION
==========================================

BUSINESS OBJECTIVE: Create features that capture pricing dynamics
TECHNICAL OBJECTIVE: Transform raw data into predictive features for ML

FEATURE CATEGORIES:
1. Price Features - How our price relates to market
2. Temporal Features - Time-based patterns
3. Competitive Features - Market positioning
4. Inventory Features - Stock-level pressure to sell
5. Interaction Features - Combined effects
6. Historical Features - Past behavior indicators
7. Categorical Encoding - Numeric versions of text columns
8. Target Engineering - What we're predicting
"""

import json
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')



In [2]:
# Read the raw pricing records, converting the `date` column to datetimes in
# the same step, then report the size of the starting dataset.
df = pd.read_csv('lab_equipment_pricing.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

print(f"Starting with {len(df):,} records and {df.shape[1]} columns", "", sep="\n")

Starting with 10,000 records and 17 columns



In [4]:
# ============================================================================
# CATEGORY 1: PRICE FEATURES
# ============================================================================
# Section banner followed by the rationale for modelling relative price.
print("="*80, "CATEGORY 1: PRICE FEATURES", "="*80, "", sep="\n")
print(
    "WHY: Absolute price doesn't matter as much as RELATIVE price",
    "BUSINESS LOGIC: Customers compare our price to competitors and their own budget",
    "",
    sep="\n",
)

print("1.1 Competitive Position Features", "-"*80, sep="\n")

# Absolute gap: our price minus the competitor's price.
df['price_diff_competitor'] = df['price'].sub(df['competitor_price'])
print(
    "✓ price_diff_competitor: Dollar difference from competitor",
    "  Business: Are we $500 more expensive or cheaper?",
    "",
    sep="\n",
)

# Multiplicative gap: our price divided by the competitor's price.
df['price_ratio_competitor'] = df['price'].div(df['competitor_price'])
print(
    "✓ price_ratio_competitor: Ratio of our price to competitor",
    "  Business: Are we 1.1x (10% premium) or 0.9x (10% discount)?",
    "",
    sep="\n",
)

# Same gap expressed as a percentage of the competitor's price (2 decimals).
df['price_pct_vs_competitor'] = (
    df['price']
    .sub(df['competitor_price'])
    .div(df['competitor_price'])
    .mul(100)
    .round(2)
)
print(
    "✓ price_pct_vs_competitor: Percentage premium/discount",
    "  Business: +15% means we're 15% more expensive",
    "",
    sep="\n",
)

# 1 only when we are strictly more expensive; equal prices therefore map to 0.
df['is_premium_vs_competitor'] = df['price'].gt(df['competitor_price']).astype(int)
print(
    "✓ is_premium_vs_competitor: Binary flag (1=more expensive, 0=cheaper)",
    "  Business: Quick filter for premium vs discount positioning",
    "",
    sep="\n",
)



CATEGORY 1: PRICE FEATURES

WHY: Absolute price doesn't matter as much as RELATIVE price
BUSINESS LOGIC: Customers compare our price to competitors and their own budget

1.1 Competitive Position Features
--------------------------------------------------------------------------------
✓ price_diff_competitor: Dollar difference from competitor
  Business: Are we $500 more expensive or cheaper?

✓ price_ratio_competitor: Ratio of our price to competitor
  Business: Are we 1.1x (10% premium) or 0.9x (10% discount)?

✓ price_pct_vs_competitor: Percentage premium/discount
  Business: +15% means we're 15% more expensive

✓ is_premium_vs_competitor: Binary flag (1=more expensive, 0=cheaper)
  Business: Quick filter for premium vs discount positioning



In [5]:
print("1.2 Product-Specific Price Features")
print("-"*80)

# Typical price per product: the median of every observed price for that
# product, kept as a dict so it can double as a lookup table.
product_base_prices = df.groupby('product')['price'].median().to_dict()
df['product_base_price'] = df['product'].map(product_base_prices)
print("✓ product_base_price: Typical price for this product")
print("  Business: Benchmark for 'normal' pricing")
print()

# Percentage deviation of each row's price from its product's median price.
df['price_vs_product_norm'] = ((df['price'] - df['product_base_price']) / df['product_base_price'] * 100).round(2)
print("✓ price_vs_product_norm: How far from typical price")
print("  Business: +20% means this is priced 20% above normal for this product")
print()

# Price positioning (low/medium/high for product)
PRICE_TIER_LABELS = ['Low', 'Medium', 'High']


def _assign_price_tier(prices):
    """Bucket one product's prices into Low/Medium/High terciles.

    The plain value-based ``pd.qcut`` is tried first. When tied prices make
    two tercile edges coincide, ``pd.qcut`` raises ``ValueError``; passing
    ``duplicates='drop'`` does not help, because the three fixed labels then
    no longer match the reduced number of bins. In that case the prices are
    ranked with ``method='first'`` (ties broken by row order) and the ranks
    are cut instead, which always yields three distinct edges when the group
    has at least two rows. A single-row group still raises.

    Parameters
    ----------
    prices : pd.Series
        Prices belonging to a single product group.

    Returns
    -------
    pd.Series
        Categorical tier label for every row of ``prices``.
    """
    try:
        return pd.qcut(prices, q=3, labels=PRICE_TIER_LABELS)
    except ValueError:
        # Fallback for collapsed tercile edges caused by repeated prices.
        return pd.qcut(prices.rank(method='first'), q=3, labels=PRICE_TIER_LABELS)


df['price_tier'] = df.groupby('product')['price'].transform(_assign_price_tier)
print("✓ price_tier: Price bucket within product (Low/Medium/High)")
print("  Business: Is this a budget, standard, or premium instance?")
print()


1.2 Product-Specific Price Features
--------------------------------------------------------------------------------
✓ product_base_price: Typical price for this product
  Business: Benchmark for 'normal' pricing

✓ price_vs_product_norm: How far from typical price
  Business: +20% means this is priced 20% above normal for this product

✓ price_tier: Price bucket within product (Low/Medium/High)
  Business: Is this a budget, standard, or premium instance?



In [6]:
# ============================================================================
# CATEGORY 2: TEMPORAL FEATURES
# ============================================================================
# Section banner followed by the rationale for time-based features.
print("\n" + "="*80, "CATEGORY 2: TEMPORAL FEATURES", "="*80, "", sep="\n")
print(
    "WHY: Demand fluctuates by time (academic calendar, budget cycles)",
    "BUSINESS LOGIC: Can charge more in high season, must discount in low season",
    "",
    sep="\n",
)

print("2.1 Calendar Features", "-"*80, sep="\n")

# Calendar components derived from the parsed `date` column.
df['year'] = df['date'].dt.year
df['week_of_year'] = df['date'].dt.isocalendar()['week']
df['day_of_year'] = df['date'].dt.dayofyear
print(
    "✓ year, week_of_year, day_of_year: Basic calendar features",
    "  Business: Identify long-term trends",
    "",
    sep="\n",
)

print("2.2 Seasonality Indicators", "-"*80, sep="\n")

# The month-based flags below read the existing `month` column as-is.
df['is_academic_start'] = df['month'].isin([9, 10]).astype(int)
print(
    "✓ is_academic_start: Flag for Sep/Oct (high season)",
    "  Business: Universities start new year, budgets refresh",
    "",
    sep="\n",
)

df['is_summer_slowdown'] = df['month'].isin([6, 7, 8]).astype(int)
print(
    "✓ is_summer_slowdown: Flag for summer months",
    "  Business: Academic slowdown, fewer orders",
    "",
    sep="\n",
)

df['is_year_end'] = df['month'].eq(12).astype(int)
print(
    "✓ is_year_end: Flag for December",
    "  Business: Budget flush (spend remaining budget)",
    "",
    sep="\n",
)

df['is_quarter_end'] = df['month'].isin([3, 6, 9, 12]).astype(int)
print(
    "✓ is_quarter_end: Flag for quarter-end months",
    "  Business: Procurement departments have quarterly goals",
    "",
    sep="\n",
)

print("2.3 Business Cycle Features", "-"*80, sep="\n")

# Weekend flag: 1 when `day_of_week` is 5 or greater.
# NOTE(review): presumably day_of_week is encoded Monday=0 .. Sunday=6 - confirm against the data.
df['is_weekend'] = df['day_of_week'].ge(5).astype(int)
print(
    "✓ is_weekend: Flag for Sat/Sun",
    "  Business: B2B sales typically lower on weekends",
    "",
    sep="\n",
)
# Season encoding
def get_season(month):
    """Return the season name for a month number.

    Months 12, 1, 2 map to 'Winter'; 3-5 to 'Spring'; 6-8 to 'Summer'.
    Every other value (including 9-11) maps to 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    return 'Fall'

# Label every row with the season returned by get_season for its month.
df['season'] = df['month'].map(get_season)
print(
    "✓ season: Meteorological season",
    "  Business: Broader seasonal patterns",
    "",
    sep="\n",
)


CATEGORY 2: TEMPORAL FEATURES

WHY: Demand fluctuates by time (academic calendar, budget cycles)
BUSINESS LOGIC: Can charge more in high season, must discount in low season

2.1 Calendar Features
--------------------------------------------------------------------------------
✓ year, week_of_year, day_of_year: Basic calendar features
  Business: Identify long-term trends

2.2 Seasonality Indicators
--------------------------------------------------------------------------------
✓ is_academic_start: Flag for Sep/Oct (high season)
  Business: Universities start new year, budgets refresh

✓ is_summer_slowdown: Flag for summer months
  Business: Academic slowdown, fewer orders

✓ is_year_end: Flag for December
  Business: Budget flush (spend remaining budget)

✓ is_quarter_end: Flag for quarter-end months
  Business: Procurement departments have quarterly goals

2.3 Business Cycle Features
--------------------------------------------------------------------------------
✓ is_weekend: Flag 

In [7]:
# ============================================================================
# CATEGORY 3: COMPETITIVE FEATURES
# ============================================================================
# Builds features describing competitor pressure and the recency of our own
# promotions from the existing `competitor_promotion`,
# `price_diff_competitor` and `days_since_promotion` columns.
print("\n" + "="*80)
print("CATEGORY 3: COMPETITIVE FEATURES")
print("="*80)
print()
print("WHY: Our pricing doesn't exist in vacuum - competitors matter")
print("BUSINESS LOGIC: Must respond to competitive pressure")
print()

print("3.1 Competitive Pressure Indicators")
print("-"*80)

# Competitor promotion flag already exists
print("✓ competitor_promotion: Already in data (0/1 flag)")
print("  Business: When competitor discounts, we face pressure")
print()

# Competitive intensity (price gap when they're promoting)
# NOTE(review): abs() makes this large when we are cheaper than the competitor
# too, whereas the printed description only mentions being expensive -
# confirm which behavior is intended.
df['competitive_intensity'] = df['competitor_promotion'] * np.abs(df['price_diff_competitor'])
print("✓ competitive_intensity: Price gap × competitor promotion")
print("  Business: High when competitor promotes AND we're expensive")
print()

# Days since our last promotion
print("✓ days_since_promotion: Already in data")
print("  Business: Recent promo = customers may wait for next one")
print()

# Promotion recency categories.
# The last bin is open-ended so that values above 180 days land in 'Long_Ago'
# (documented below as "90+") instead of silently becoming NaN, which is what
# a hard upper edge of 180 produced.
df['promo_recency'] = pd.cut(df['days_since_promotion'],
                              bins=[0, 30, 90, np.inf],
                              labels=['Recent', 'Medium', 'Long_Ago'],
                              include_lowest=True)
print("✓ promo_recency: Categorical version of promo timing")
print("  Business: Recent (<30 days), Medium (30-90), Long ago (90+)")
print()


CATEGORY 3: COMPETITIVE FEATURES

WHY: Our pricing doesn't exist in vacuum - competitors matter
BUSINESS LOGIC: Must respond to competitive pressure

3.1 Competitive Pressure Indicators
--------------------------------------------------------------------------------
✓ competitor_promotion: Already in data (0/1 flag)
  Business: When competitor discounts, we face pressure

✓ competitive_intensity: Price gap × competitor promotion
  Business: High when competitor promotes AND we're expensive

✓ days_since_promotion: Already in data
  Business: Recent promo = customers may wait for next one

✓ promo_recency: Categorical version of promo timing
  Business: Recent (<30 days), Medium (30-90), Long ago (90+)



In [8]:
# ============================================================================
# CATEGORY 4: INVENTORY FEATURES
# ============================================================================
# Derives a percentage, a categorical bucket and a top-quartile flag from the
# existing `inventory_level` column.
print("\n" + "="*80)
print("CATEGORY 4: INVENTORY FEATURES")
print("="*80)
print()
print("WHY: High inventory creates urgency to sell")
print("BUSINESS LOGIC: Holding costs + obsolescence risk = pressure to discount")
print()

print("4.1 Inventory Pressure Metrics")
print("-"*80)

# Inventory level already exists
print("✓ inventory_level: Already in data (0-200 scale)")
print()

# Inventory as percentage of the largest inventory level observed in the data
df['inventory_pct'] = (df['inventory_level'] / df['inventory_level'].max() * 100).round(1)
print("✓ inventory_pct: Inventory as % of maximum")
print("  Business: 80% = high inventory pressure")
print()

# Inventory categories.
# include_lowest=True keeps an inventory level of exactly 0 in the 'Low'
# bucket; without it the first interval excludes its left edge and 0 becomes
# NaN. This also matches how promo_recency is binned in this notebook.
df['inventory_status'] = pd.cut(df['inventory_level'],
                                 bins=[0, 50, 100, 150, 200],
                                 labels=['Low', 'Medium', 'High', 'Very_High'],
                                 include_lowest=True)
print("✓ inventory_status: Categorical inventory levels")
print("  Business: Very High = need to move stock")
print()

# High inventory flag (top 25%): strictly above the 75th percentile
threshold = df['inventory_level'].quantile(0.75)
df['high_inventory_flag'] = (df['inventory_level'] > threshold).astype(int)
print(f"✓ high_inventory_flag: 1 if inventory > {threshold:.0f}")
print("  Business: Trigger for promotional consideration")
print()


CATEGORY 4: INVENTORY FEATURES

WHY: High inventory creates urgency to sell
BUSINESS LOGIC: Holding costs + obsolescence risk = pressure to discount

4.1 Inventory Pressure Metrics
--------------------------------------------------------------------------------
✓ inventory_level: Already in data (0-200 scale)

✓ inventory_pct: Inventory as % of maximum
  Business: 80% = high inventory pressure

✓ inventory_status: Categorical inventory levels
  Business: Very High = need to move stock

✓ high_inventory_flag: 1 if inventory > 149
  Business: Trigger for promotional consideration



In [9]:
# ============================================================================
# CATEGORY 5: INTERACTION FEATURES
# ============================================================================
# Section banner followed by the rationale for combining features.
print("\n" + "="*80, "CATEGORY 5: INTERACTION FEATURES", "="*80, "", sep="\n")
print(
    "WHY: Combined effects matter more than individual features",
    "BUSINESS LOGIC: High price + high inventory = big problem",
    "",
    sep="\n",
)

print("5.1 Price-Inventory Interactions", "-"*80, sep="\n")

# Product of the two percentage features, each rescaled by 1/100 first.
df['price_inventory_pressure'] = (
    df['price_vs_product_norm'].div(100).mul(df['inventory_pct'].div(100))
)
print(
    "✓ price_inventory_pressure: Normalized price × inventory",
    "  Business: High = overpriced product with high stock (bad)",
    "",
    sep="\n",
)

# 1 when the price is above the product norm AND the high-inventory flag is set.
above_norm = df['price_vs_product_norm'].gt(0)
df['overpriced_overstocked'] = (above_norm & df['high_inventory_flag'].eq(1)).astype(int)
del above_norm  # temporary mask only; keep the notebook namespace unchanged
print(
    "✓ overpriced_overstocked: 1 if both price and inventory high",
    "  Business: Red flag situation",
    "",
    sep="\n",
)

print("5.2 Competitive-Temporal Interactions", "-"*80, sep="\n")

# Product of the competitor-promotion column and the Sep/Oct flag.
df['competitor_promo_high_season'] = df['competitor_promotion'].mul(df['is_academic_start'])
print(
    "✓ competitor_promo_high_season: Competitor promotion in peak season",
    "  Business: Worst time for competitor to promote",
    "",
    sep="\n",
)

# Product of the premium flag and the summer-slowdown flag.
df['premium_in_low_season'] = df['is_premium_vs_competitor'].mul(df['is_summer_slowdown'])
print(
    "✓ premium_in_low_season: Expensive during slow period",
    "  Business: Risky strategy - may lose sales",
    "",
    sep="\n",
)

print("5.3 Product-Segment Interactions", "-"*80, sep="\n")

# Single key joining product and customer segment with an underscore.
df['product_segment_combo'] = df['product'] + '_' + df['customer_segment']
print(
    "✓ product_segment_combo: Unique product-segment pairs",
    "  Business: Pharma buying Centrifuge ≠ Academic buying Centrifuge",
    "",
    sep="\n",
)



CATEGORY 5: INTERACTION FEATURES

WHY: Combined effects matter more than individual features
BUSINESS LOGIC: High price + high inventory = big problem

5.1 Price-Inventory Interactions
--------------------------------------------------------------------------------
✓ price_inventory_pressure: Normalized price × inventory
  Business: High = overpriced product with high stock (bad)

✓ overpriced_overstocked: 1 if both price and inventory high
  Business: Red flag situation

5.2 Competitive-Temporal Interactions
--------------------------------------------------------------------------------
✓ competitor_promo_high_season: Competitor promotion in peak season
  Business: Worst time for competitor to promote

✓ premium_in_low_season: Expensive during slow period
  Business: Risky strategy - may lose sales

5.3 Product-Segment Interactions
--------------------------------------------------------------------------------
✓ product_segment_combo: Unique product-segment pairs
  Business: Pharma

In [10]:
# ============================================================================
# CATEGORY 6: HISTORICAL/LAG FEATURES
# ============================================================================
# Builds per-product rolling averages of price and quantity plus two derived
# trend features. The quantity averages are lagged by one row so a row never
# sees its own `quantity_sold` (see the helper below).
print("\n" + "="*80)
print("CATEGORY 6: HISTORICAL FEATURES")
print("="*80)
print()
print("WHY: Past behavior predicts future (momentum, trends)")
print("BUSINESS LOGIC: If we raised prices recently, demand still adjusting")
print()

print("6.1 Rolling Statistics (by Product)")
print("-"*80)

# Sort by product and date for rolling calculations
df = df.sort_values(['product', 'date']).reset_index(drop=True)


def _rolling_mean_by_product(column, window, lag=0):
    """Return the per-product rolling mean of ``column`` over ``window`` rows.

    Parameters
    ----------
    column : str
        Name of the ``df`` column to average.
    window : int
        Number of consecutive rows (within one product, in the current sort
        order) in each window. This is a row count, not a calendar span.
    lag : int, default 0
        Rows to shift the series by before averaging. ``lag=1`` excludes the
        current row, so the result only uses earlier rows; the first row of
        each product then has no history and yields NaN.

    Returns
    -------
    pd.Series
        Rolling mean aligned to ``df``'s index.
    """
    return df.groupby('product')[column].transform(
        lambda values: values.shift(lag).rolling(window=window, min_periods=1).mean()
    )


# 7-day rolling average price (the current price is a known input, so it is kept)
# NOTE(review): the window counts rows, so it spans 7 days only if each product
# has exactly one row per day - confirm against the data.
df['price_ma_7d'] = _rolling_mean_by_product('price', window=7)
print("✓ price_ma_7d: 7-day moving average price")
print("  Business: Recent average price for context")
print()

# 30-day rolling average price
df['price_ma_30d'] = _rolling_mean_by_product('price', window=30)
print("✓ price_ma_30d: 30-day moving average price")
print("  Business: Longer-term price trend")
print()

# Price momentum (current vs 30-day average)
df['price_momentum'] = ((df['price'] - df['price_ma_30d']) / df['price_ma_30d'] * 100).round(2)
print("✓ price_momentum: Current price vs 30-day trend")
print("  Business: +10% = prices rising, -10% = prices falling")
print()

# Quantity rolling averages.
# Lagged by one row: the current row's quantity_sold drives the profit target,
# so including it in the window would leak the target into the feature.
# The first row of every product is NaN - impute or drop before modeling.
df['qty_ma_7d'] = _rolling_mean_by_product('quantity_sold', window=7, lag=1)
df['qty_ma_30d'] = _rolling_mean_by_product('quantity_sold', window=30, lag=1)
print("✓ qty_ma_7d, qty_ma_30d: Demand moving averages")
print("  Business: Is demand trending up or down?")
print()

# Demand trend (7d vs 30d); NaN wherever the lagged averages are NaN
df['demand_trend'] = df['qty_ma_7d'] - df['qty_ma_30d']
print("✓ demand_trend: Short-term demand vs long-term")
print("  Business: Positive = accelerating demand")
print()



CATEGORY 6: HISTORICAL FEATURES

WHY: Past behavior predicts future (momentum, trends)
BUSINESS LOGIC: If we raised prices recently, demand still adjusting

6.1 Rolling Statistics (by Product)
--------------------------------------------------------------------------------
✓ price_ma_7d: 7-day moving average price
  Business: Recent average price for context

✓ price_ma_30d: 30-day moving average price
  Business: Longer-term price trend

✓ price_momentum: Current price vs 30-day trend
  Business: +10% = prices rising, -10% = prices falling

✓ qty_ma_7d, qty_ma_30d: Demand moving averages
  Business: Is demand trending up or down?

✓ demand_trend: Short-term demand vs long-term
  Business: Positive = accelerating demand



In [11]:
# ============================================================================
# CATEGORY 7: CATEGORICAL ENCODING
# ============================================================================
# Section banner followed by the rationale for the two encodings.
print("\n" + "="*80, "CATEGORY 7: CATEGORICAL ENCODING", "="*80, "", sep="\n")
print(
    "WHY: ML models need numbers, not text",
    "TECHNICAL: Create both label encoding (trees) and one-hot (linear models)",
    "",
    sep="\n",
)

print("7.1 Label Encoding (for tree-based models)", "-"*80, sep="\n")

# Products: fit the encoder, then store the integer codes next to the text column.
le_product = LabelEncoder().fit(df['product'])
df['product_encoded'] = le_product.transform(df['product'])
print(
    f"✓ product_encoded: {len(le_product.classes_)} products encoded as 0-{len(le_product.classes_)-1}",
    f"  Mapping: {dict(zip(le_product.classes_, range(len(le_product.classes_))))}",
    "",
    sep="\n",
)

# Customer segments: same treatment with a dedicated encoder.
le_segment = LabelEncoder().fit(df['customer_segment'])
df['segment_encoded'] = le_segment.transform(df['customer_segment'])
print(
    f"✓ segment_encoded: {len(le_segment.classes_)} segments encoded as 0-{len(le_segment.classes_)-1}",
    f"  Mapping: {dict(zip(le_segment.classes_, range(len(le_segment.classes_))))}",
    "",
    sep="\n",
)

# Seasons: same treatment; the mapping is not printed for this one.
le_season = LabelEncoder().fit(df['season'])
df['season_encoded'] = le_season.transform(df['season'])
print(f"✓ season_encoded: Seasons as numbers", "", sep="\n")

print(
    "7.2 One-Hot Encoding (for linear models)",
    "-"*80,
    "Creating one-hot encoded version for linear models...",
    sep="\n",
)

# Separate frame in which the three text columns are replaced by dummy columns;
# `df` itself keeps the original text columns.
df_onehot = pd.get_dummies(
    df.copy(),
    columns=['product', 'customer_segment', 'season'],
    prefix=['prod', 'seg', 'seas'],
)
print(f"✓ One-hot version created with {df_onehot.shape[1]} columns", "", sep="\n")



CATEGORY 7: CATEGORICAL ENCODING

WHY: ML models need numbers, not text
TECHNICAL: Create both label encoding (trees) and one-hot (linear models)

7.1 Label Encoding (for tree-based models)
--------------------------------------------------------------------------------
✓ product_encoded: 5 products encoded as 0-4
  Mapping: {'Centrifuge': 0, 'Microscope': 1, 'PCR_System': 2, 'Pipettes': 3, 'Reagent_Kit': 4}

✓ segment_encoded: 4 segments encoded as 0-3
  Mapping: {'Academic': 0, 'Biotech': 1, 'Government': 2, 'Pharma': 3}

✓ season_encoded: Seasons as numbers

7.2 One-Hot Encoding (for linear models)
--------------------------------------------------------------------------------
Creating one-hot encoded version for linear models...
✓ One-hot version created with 62 columns



In [12]:

# ============================================================================
# CATEGORY 8: TARGET ENGINEERING
# ============================================================================
# Section banner followed by the statement of what is being predicted.
print("\n" + "="*80, "CATEGORY 8: TARGET VARIABLE ENGINEERING", "="*80, "", sep="\n")
print(
    "WHY: What are we predicting? Need to define our target carefully",
    "BUSINESS: We want to maximize PROFIT, not revenue or quantity",
    "",
    sep="\n",
)

print(
    "8.1 Primary Target: Profit",
    "-"*80,
    "✓ profit: Already in data (revenue - cost)",
    "  This is what we'll optimize",
    "",
    sep="\n",
)

print("8.2 Alternative Targets (for diagnostics)", "-"*80, sep="\n")

# Profit divided by units sold; a zero quantity is swapped for 1 so the
# division never hits zero.
df['profit_per_unit'] = df['profit'].div(df['quantity_sold'].replace(0, 1)).round(2)
print(
    "✓ profit_per_unit: Margin per item sold",
    "  Business: Quality of sale (high-margin vs low-margin)",
    "",
    sep="\n",
)

# Revenue needs no computation; it is only announced here.
print(
    "✓ revenue: Already in data",
    "  Business: Top-line metric (but not what we optimize)",
    "",
    sep="\n",
)

# Profit as a percentage of revenue, with the same zero-to-1 guard on revenue.
df['margin_pct'] = df['profit'].div(df['revenue'].replace(0, 1)).mul(100).round(2)
print(
    "✓ margin_pct: Profit as % of revenue",
    "  Business: Efficiency metric",
    "",
    sep="\n",
)



CATEGORY 8: TARGET VARIABLE ENGINEERING

WHY: What are we predicting? Need to define our target carefully
BUSINESS: We want to maximize PROFIT, not revenue or quantity

8.1 Primary Target: Profit
--------------------------------------------------------------------------------
✓ profit: Already in data (revenue - cost)
  This is what we'll optimize

8.2 Alternative Targets (for diagnostics)
--------------------------------------------------------------------------------
✓ profit_per_unit: Margin per item sold
  Business: Quality of sale (high-margin vs low-margin)

✓ revenue: Already in data
  Business: Top-line metric (but not what we optimize)

✓ margin_pct: Profit as % of revenue
  Business: Efficiency metric



In [13]:

# ============================================================================
# SUMMARY: FEATURE LIST FOR MODELING
# ============================================================================
print("\n" + "="*80, "FEATURE ENGINEERING SUMMARY", "="*80, "", sep="\n")

# Column names grouped by feature family; each list is also stored on its own
# in the metadata written later in this cell.
price_features = [
    'price',
    'competitor_price',
    'price_diff_competitor',
    'price_ratio_competitor',
    'price_pct_vs_competitor',
    'is_premium_vs_competitor',
    'price_vs_product_norm',
]

temporal_features = [
    'month',
    'quarter',
    'day_of_week',
    'week_of_year',
    'is_academic_start',
    'is_summer_slowdown',
    'is_year_end',
    'is_quarter_end',
    'is_weekend',
]

competitive_features = [
    'competitor_promotion',
    'days_since_promotion',
    'competitive_intensity',
]

inventory_features = [
    'inventory_level',
    'inventory_pct',
    'high_inventory_flag',
]

interaction_features = [
    'price_inventory_pressure',
    'overpriced_overstocked',
    'competitor_promo_high_season',
    'premium_in_low_season',
]

historical_features = [
    'price_ma_7d',
    'price_ma_30d',
    'price_momentum',
    'qty_ma_7d',
    'qty_ma_30d',
    'demand_trend',
]

categorical_features = [
    'product_encoded',
    'segment_encoded',
    'season_encoded',
]

# Flat modeling list: the groups concatenated in the order defined above.
all_features = [
    *price_features,
    *temporal_features,
    *competitive_features,
    *inventory_features,
    *interaction_features,
    *historical_features,
    *categorical_features,
]

print(f"Total features created: {len(all_features)}", "", sep="\n")
print(
    "Feature breakdown:",
    f"  Price features: {len(price_features)}",
    f"  Temporal features: {len(temporal_features)}",
    f"  Competitive features: {len(competitive_features)}",
    f"  Inventory features: {len(inventory_features)}",
    f"  Interaction features: {len(interaction_features)}",
    f"  Historical features: {len(historical_features)}",
    f"  Categorical features: {len(categorical_features)}",
    "",
    sep="\n",
)

print("Target variable: profit", "", sep="\n")

# ============================================================================
# SAVE PROCESSED DATA
# ============================================================================
# Writes the engineered frame and the feature-group metadata to disk, then
# prints the closing banner and a small statistics sample.
print("="*80)
print("SAVING PROCESSED DATA")
print("="*80)
print()

# Fail fast, before any file is written, if the feature list names a column
# that was never created (e.g. a cell was skipped or a column was renamed).
missing_features = [col for col in all_features if col not in df.columns]
if missing_features:
    raise KeyError(f"Features listed for modeling are missing from df: {missing_features}")

# Save with all features
# NOTE(review): only `df` is written; the one-hot frame `df_onehot` built in
# the encoding cell is not persisted - confirm that is intended.
df.to_csv('lab_equipment_pricing_features.csv', index=False)
print("✓ Saved: lab_equipment_pricing_features.csv")
print(f"  Shape: {df.shape}")
print()

# Save feature list for later use
feature_metadata = {
    'all_features': all_features,
    'price_features': price_features,
    'temporal_features': temporal_features,
    'competitive_features': competitive_features,
    'inventory_features': inventory_features,
    'interaction_features': interaction_features,
    'historical_features': historical_features,
    'categorical_features': categorical_features,
    'target': 'profit'
}

# `json` is imported with the other imports in the first cell.
with open('feature_metadata.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)
print("✓ Saved: feature_metadata.json")
print()

print("="*80)
print("FEATURE ENGINEERING COMPLETE")
print("="*80)
print()
print("Next Steps:")
print("1. Check for multicollinearity")
print("2. Build and compare models")
print("3. Select best model")
print("4. Deploy in Streamlit")
print()

# Quick feature statistics for the first ten modeling features
print("Feature Statistics Sample:")
print(df[all_features[:10]].describe().round(2))


FEATURE ENGINEERING SUMMARY

Total features created: 35

Feature breakdown:
  Price features: 7
  Temporal features: 9
  Competitive features: 3
  Inventory features: 3
  Interaction features: 4
  Historical features: 6
  Categorical features: 3

Target variable: profit

SAVING PROCESSED DATA

✓ Saved: lab_equipment_pricing_features.csv
  Shape: (10000, 54)

✓ Saved: feature_metadata.json

FEATURE ENGINEERING COMPLETE

Next Steps:
1. Check for multicollinearity
2. Build and compare models
3. Select best model
4. Deploy in Streamlit

Feature Statistics Sample:
          price  competitor_price  price_diff_competitor  \
count  10000.00          10000.00               10000.00   
mean    7081.92           7084.95                  -3.03   
std     5991.31           6018.02                 536.95   
min      255.03            230.43               -1709.32   
25%      413.86            410.77                -182.35   
50%     7969.82           7937.68                   0.56   
75%    12765.