# llm-feat Package Test Notebook

This notebook tests the llm-feat package for automated feature engineering using LLMs.

## Setup


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import llm_feat

# Report the installed package version, then confirm the import succeeded
print(
    f"llm-feat version: {llm_feat.__version__}",
    "✓ Package imported successfully!",
    sep="\n",
)


llm-feat version: 0.1.0
✓ Package imported successfully!


In [None]:
# Configure the OpenAI API key used by llm_feat.
# Option 1 (RECOMMENDED): read the key from the OPENAI_API_KEY environment variable.
import os
api_key = os.getenv("OPENAI_API_KEY")

# Option 2 (local testing only - remove before committing!):
# uncomment the line below and paste your key if the environment variable is not set.
# api_key = "your-key-here"

# Never fall back to a placeholder string: a dummy key is truthy, so it would be
# reported as "set" and only fail later on the first API call, and the guidance
# in the else-branch would never be shown.
if api_key:
    llm_feat.set_api_key(api_key)
    print("✓ API key set")
else:
    print("⚠️  OPENAI_API_KEY not set. Set it using:")
    print("   export OPENAI_API_KEY='your-key-here' (before starting Jupyter)")
    print("   Or uncomment the line above to set it directly in the notebook")

✓ API key set


## Test 1: Simple Numerical Dataset


In [7]:
# Build the small all-numeric demo table, one tuple per person:
# (age, income, savings, expenses)
df = pd.DataFrame(
    [
        (25, 50000, 10000, 40000),
        (30, 60000, 15000, 45000),
        (35, 70000, 20000, 50000),
        (40, 80000, 25000, 55000),
        (45, 90000, 30000, 60000),
        (50, 100000, 35000, 65000),
        (28, 55000, 12000, 43000),
        (32, 65000, 18000, 47000),
        (38, 75000, 22000, 53000),
        (42, 85000, 28000, 57000),
    ],
    columns=['age', 'income', 'savings', 'expenses'],
)

# Preview the first rows, then summarise the shape and column names
print("Original DataFrame:", df.head(), sep="\n")
print(f"\nShape: {df.shape}", f"Columns: {list(df.columns)}", sep="\n")


Original DataFrame:
   age  income  savings  expenses
0   25   50000    10000     40000
1   30   60000    15000     45000
2   35   70000    20000     50000
3   40   80000    25000     55000
4   45   90000    30000     60000

Shape: (10, 4)
Columns: ['age', 'income', 'savings', 'expenses']


In [8]:
# Describe every column of the demo table for the feature generator.
# All four columns are numeric and none of them is a label.
metadata_df = pd.DataFrame(dict(
    column_name=['age', 'income', 'savings', 'expenses'],
    description=[
        'Age of the person in years',
        'Annual income in dollars',
        'Total savings in dollars',
        'Annual expenses in dollars',
    ],
    data_type=['numeric'] * 4,
    label_definition=[None] * 4,
))

print("Metadata DataFrame:", metadata_df, sep="\n")


Metadata DataFrame:
  column_name                 description data_type label_definition
0         age  Age of the person in years   numeric             None
1      income    Annual income in dollars   numeric             None
2     savings    Total savings in dollars   numeric             None
3    expenses  Annual expenses in dollars   numeric             None


### Mode 1: Generate Code

**Note:** When you run the cell below, the generated code is queued as the input of the next cell.
- In some Jupyter environments, a new cell containing the code is created automatically
- In others, create a new cell below and the code will appear in it automatically
- The code is also printed in the cell output so you can copy it manually


In [9]:
# Ask the LLM for feature-engineering code (mode='code' returns the source text).
# In Jupyter, the code will be automatically injected into the next cell.
# gpt-4o-mini keeps the cost of the call low.
code = llm_feat.generate_features(
    df,
    metadata_df,
    mode='code',
    model='gpt-4o-mini',
)
print("Generated code:", code, sep="\n")


<IPython.core.display.Javascript object>

✓ Attempted to create new cell with code - check below
  (Also set as next input - will appear in next cell you create)
Generated code:

# Generated Feature Engineering Code
import numpy as np

df['income_to_expense_ratio'] = np.where(df['expenses'] != 0, df['income'] / df['expenses'], np.nan)
df['savings_to_income_ratio'] = np.where(df['income'] != 0, df['savings'] / df['income'], np.nan)
df['net_savings'] = df['savings'] - df['expenses']
df['age_squared'] = df['age'] ** 2
df['income_growth'] = df['income'].diff().fillna(0)



In [10]:
import numpy as np

# Ratio features: NaN wherever the denominator is zero
df['income_to_expense_ratio'] = np.where(
    df['expenses'].ne(0),
    df['income'].div(df['expenses']),
    np.nan,
)
df['savings_to_income_ratio'] = np.where(
    df['income'].ne(0),
    df['savings'].div(df['income']),
    np.nan,
)

# Difference and polynomial features
df['net_savings'] = df['savings'].sub(df['expenses'])
df['age_squared'] = df['age'].pow(2)

# Row-over-row change in income; the first row has no predecessor, so it is set to 0
df['income_growth'] = df['income'].diff(periods=1).fillna(0)

### Mode 2: Direct Feature Addition


In [11]:
# mode='direct' returns a DataFrame with the generated features already added.
# gpt-4o-mini keeps the cost of the call low.
df_with_features = llm_feat.generate_features(
    df,
    metadata_df,
    mode='direct',
    model='gpt-4o-mini',
)

# Preview the result, then compare its columns against the input frame
print("DataFrame with new features:", df_with_features.head(), sep="\n")
print(
    f"\nOriginal columns: {list(df.columns)}",
    f"New columns: {[col for col in df_with_features.columns if col not in df.columns]}",
    f"\nTotal columns: {len(df_with_features.columns)} (original: {len(df.columns)})",
    sep="\n",
)


DataFrame with new features:
   age  income  savings  expenses  income_to_expense_ratio  \
0   25   50000    10000     40000                 1.250000   
1   30   60000    15000     45000                 1.333333   
2   35   70000    20000     50000                 1.400000   
3   40   80000    25000     55000                 1.454545   
4   45   90000    30000     60000                 1.500000   

   savings_to_income_ratio  net_savings  age_squared  income_growth  \
0                 0.200000       -30000          625            0.0   
1                 0.250000       -30000          900        10000.0   
2                 0.285714       -30000         1225        10000.0   
3                 0.312500       -30000         1600        10000.0   
4                 0.333333       -30000         2025        10000.0   

   savings_per_year  expenses_to_income_ratio  net_savings_per_year  \
0        400.000000                  0.800000          -1200.000000   
1        500.000000          

## Test 2: Dataset with Target Column


In [18]:
# Body-measurement dataset whose last column is the prediction target
df = pd.DataFrame(dict(
    height=[170, 175, 180, 165, 185, 172, 178, 168, 182, 174],
    weight=[70, 75, 80, 65, 85, 72, 78, 68, 83, 74],
    bmi=[24.2, 24.5, 24.7, 23.9, 24.8, 24.3, 24.6, 24.1, 25.0, 24.4],
    # Target: 1=healthy, 0=unhealthy
    health_score=[1, 1, 0, 1, 0, 1, 1, 1, 0, 1],
))

# Column descriptions; only the target column carries a label definition
metadata_df2 = pd.DataFrame(dict(
    column_name=['height', 'weight', 'bmi', 'health_score'],
    description=[
        'Height in centimeters',
        'Weight in kilograms',
        'Body Mass Index',
        'Health classification score',
    ],
    data_type=['numeric'] * 4,
    label_definition=[None] * 3 + ['1 if healthy, 0 if unhealthy'],
))

print("Dataset with target:", df.head(), sep="\n")
print("\nMetadata:", metadata_df2, sep="\n")


Dataset with target:
   height  weight   bmi  health_score
0     170      70  24.2             1
1     175      75  24.5             1
2     180      80  24.7             0
3     165      65  23.9             1
4     185      85  24.8             0

Metadata:
    column_name                  description data_type  \
0        height        Height in centimeters   numeric   
1        weight          Weight in kilograms   numeric   
2           bmi              Body Mass Index   numeric   
3  health_score  Health classification score   numeric   

               label_definition  
0                          None  
1                          None  
2                          None  
3  1 if healthy, 0 if unhealthy  


In [13]:
# Request feature-engineering code for the dataset that includes a target column.
# gpt-4o-mini keeps the cost of the call low.
code2 = llm_feat.generate_features(
    df,
    metadata_df2,
    mode='code',
    model='gpt-4o-mini',
)
print("Generated feature code:", code2, sep="\n")


<IPython.core.display.Javascript object>

✓ Attempted to create new cell with code - check below
  (Also set as next input - will appear in next cell you create)
Generated feature code:

# Generated Feature Engineering Code
import numpy as np

df['height_weight_ratio'] = df['height'] / df['weight'].replace(0, np.nan)
df['bmi_squared'] = df['bmi'] ** 2
df['weight_bmi_interaction'] = df['weight'] * df['bmi']
df['health_score_bmi_difference'] = df['health_score'] - df['bmi']
df['bmi_category'] = pd.cut(df['bmi'], bins=[0, 18.5, 24.9, 29.9, np.inf], labels=['Underweight', 'Normal', 'Overweight', 'Obese'])



In [14]:
import numpy as np

# Height-to-weight ratio; a zero weight is mapped to NaN so the division never hits zero
df['height_weight_ratio'] = df['height'] / df['weight'].replace(0, np.nan)
df['bmi_squared'] = df['bmi'] ** 2
df['weight_bmi_interaction'] = df['weight'] * df['bmi']

# NOTE: the generated 'health_score_bmi_difference' feature is deliberately not created.
# It was computed from 'health_score', which is the target column, so using it as a
# feature would leak the label into the model inputs.

# Left-closed bins follow the standard BMI cut-offs:
# Underweight < 18.5 <= Normal < 25 <= Overweight < 30 <= Obese.
# (Right-closed bins ending at 18.5 / 24.9 / 29.9 would label a BMI of exactly 18.5 as
# Underweight and values between 24.9 and 25 as Overweight.)
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    right=False,
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

## Summary

- ✓ Package imports successfully
- ✓ API key management works
- ✓ Metadata validation works
- ✓ Code generation mode works (injects into next cell in Jupyter)
- ✓ Direct feature addition mode works
- ✓ Works with datasets with and without target columns
