# Feature Implementation

## Setup

In [2]:
# Cell 1: Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Read the processed customer table produced during EDA
df = pd.read_csv('../data/processed/telco_with_basic_features.csv')

# Report the table dimensions, then preview the first three rows
row_total, column_total = df.shape
print(f"Loaded {row_total:,} customers")
print(f"Features: {column_total}")
df.head(3)

Loaded 7,043 customers
Features: 23


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_segment,service_count
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0-12 months,1
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,One year,No,Mailed check,56.95,1889.5,No,24-48 months,3
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0-12 months,3


## Feature 1 - Charges per Service Ratio

In [3]:
# Section banner: title line followed by a 60-character rule
print("FEATURE ENGINEERING: charges_per_service", "=" * 60, sep="\n")

def calculate_charges_per_service(row):
    """
    Return the monthly charge divided by the service count for one customer row.

    Reads row['service_count'] and row['MonthlyCharges'].

    Interpretation used in this notebook:
    - lower ratio  = better value perception
    - higher ratio = potential churn risk (paying a lot for little)

    Edge case: a service_count of 0 is treated as 1 service, so the result
    is simply the monthly charge and the division is always defined.
    NOTE(review): the notebook assumes zero-count rows are phone-only
    customers - confirm against how service_count was built.
    """
    # Zero services would divide by zero, so count them as a single service
    divisor = 1 if row['service_count'] == 0 else row['service_count']
    return row['MonthlyCharges'] / divisor

# Evaluate the ratio row by row and store it as a new column
per_service_ratio = df.apply(calculate_charges_per_service, axis=1)
df['charges_per_service'] = per_service_ratio

print("Feature created!")
print("\nDescriptive Stats:")
# Describe the stored column (not the temporary Series) so the printed name is kept
ratio_summary = df['charges_per_service'].describe().round(2)
print(ratio_summary)

FEATURE ENGINEERING: charges_per_service
Feature created!

Descriptive Stats:
count    7043.00
mean       26.02
std        13.83
min        10.42
25%        18.40
50%        20.88
75%        28.15
max        77.90
Name: charges_per_service, dtype: float64


### Validate Feature - Check Edge Cases

In [4]:
print("VALIDATION: Edge Cases", "=" * 60, sep="\n")

# Rows whose service counter is zero - the edge case the feature function special-cases
zero_services = df.loc[df['service_count'] == 0]
zero_total = len(zero_services)
print(f"\nCustomers with 0 services: {zero_total}")

if zero_total == 0:
    print("✅ No edge cases found - all customers have at least 1 service!")
else:
    preview_columns = ['customerID', 'service_count', 'MonthlyCharges', 'charges_per_service']
    print("\nSample of 0-service customers:")
    print(zero_services[preview_columns].head())

# Range and centre of the new feature
ratio_column = df['charges_per_service']
print(f"\nMin charges_per_service: €{ratio_column.min():.2f}")
print(f"Max charges_per_service: €{ratio_column.max():.2f}")
print(f"Mean: €{ratio_column.mean():.2f}")

VALIDATION: Edge Cases

Customers with 0 services: 80

Sample of 0-service customers:
     customerID  service_count  MonthlyCharges  charges_per_service
105  6180-YBIQI              0           24.30                24.30
185  1024-GUALD              0           24.80                24.80
211  4195-NZGTA              0           25.25                25.25
272  6366-ZGQGL              0           24.80                24.80
376  6158-HDPXZ              0           25.35                25.35

Min charges_per_service: €10.42
Max charges_per_service: €77.90
Mean: €26.02


### Business Validation - Does it correlate with Churn?

In [None]:
print("BUSINESS VALIDATION: Does this feature make sense?", "=" * 60, sep="\n")

# Split the customers by churn outcome
churn_label = df['Churn']
churned = df[churn_label == 'Yes']
retained = df[churn_label == 'No']

# Group means of the engineered ratio and the gap between them
churned_avg = churned['charges_per_service'].mean()
retained_avg = retained['charges_per_service'].mean()
avg_gap = churned_avg - retained_avg

print("\nAverage charges_per_service:")
print(f"  Churned:  €{churned_avg:.2f}")
print(f"  Retained: €{retained_avg:.2f}")
print(f"  Difference: €{avg_gap:.2f}")

# Pick the two-line verdict depending on which group pays more per service
if churned_avg > retained_avg:
    verdict_lines = (
        "\nINSIGHT: Churned customers pay MORE per service!",
        "   → Feature validates hypothesis!",
    )
else:
    verdict_lines = (
        "\nUNEXPECTED: Retained customers pay more per service?",
        "   → Need to investigate further",
    )
print(*verdict_lines, sep="\n")

# Box plot of the ratio, one box per churn outcome
churn_palette = {'Yes': '#ff6b6b', 'No': '#51cf66'}
fig = px.box(
    df,
    x='Churn',
    y='charges_per_service',
    color='Churn',
    title='Charges per Service: Churned vs Retained',
    color_discrete_map=churn_palette,
)
fig.update_layout(height=400)
fig.show()

BUSINESS VALIDATION: Does this feature make sense?

Average charges_per_service:
  Churned:  €33.62
  Retained: €23.27
  Difference: €10.35

INSIGHT: Churned customers pay MORE per service!
   → Feature validates our hypothesis!


## Feature 2 - Is Month-to-Month (Contract Risk)

In [6]:
print("FEATURE ENGINEERING: is_month_to_month", "=" * 60, sep="\n")

# Binary flag: 1 when the contract is month-to-month, 0 for any other contract value
df['is_month_to_month'] = df['Contract'].eq('Month-to-month').astype(int)

print("Feature created!")
print("\nDistribution:")
print(df['is_month_to_month'].value_counts())

# Business Validation: normalised churn label shares within each flag value
print("\nBUSINESS VALIDATION:")
m2m_churn = df.loc[df['is_month_to_month'] == 1, 'Churn'].value_counts(normalize=True)
locked_churn = df.loc[df['is_month_to_month'] == 0, 'Churn'].value_counts(normalize=True)

# Share of 'Yes' in each group; falls back to 0 when a group has no churners
m2m_rate = m2m_churn.get('Yes', 0)
locked_rate = locked_churn.get('Yes', 0)

print("\nChurn Rate:")
print(f"  Month-to-month (1): {m2m_rate*100:.1f}%")
print(f"  Locked-in (0):      {locked_rate*100:.1f}%")
print(f"  Difference:         {(m2m_rate - locked_rate)*100:.1f} percentage points")

if m2m_rate > locked_rate:
    print("\nINSIGHT: Month-to-month customers have MUCH higher churn!")
    print("   → Strong predictor!")

FEATURE ENGINEERING: is_month_to_month
Feature created!

Distribution:
is_month_to_month
1    3875
0    3168
Name: count, dtype: int64

BUSINESS VALIDATION:

Churn Rate:
  Month-to-month (1): 42.7%
  Locked-in (0):      6.8%
  Difference:         36.0 percentage points

INSIGHT: Month-to-month customers have MUCH higher churn!
   → Strong predictor!


## Feature 3 - Customer Lifecycle Stage

In [7]:
# Section banner: title line followed by a 60-character rule
print("FEATURE ENGINEERING: lifecycle_stage", "=" * 60, sep="\n")

def categorize_lifecycle(tenure):
    """
    Categorizes a customer by tenure (in months) into a lifecycle stage.

    Business Logic (upper bounds are inclusive):
    - New     (tenure <= 12):      Onboarding phase, high churn risk
    - Growing (12 < tenure <= 24): Building relationship
    - Mature  (24 < tenure <= 48): Established customer
    - Loyal   (tenure > 48):       Long-term, low churn risk

    Parameters
    ----------
    tenure : int or float
        Customer tenure; may be missing (None / NaN).

    Returns
    -------
    str or float
        One of 'New', 'Growing', 'Mature', 'Loyal', or NaN when tenure is
        missing.
    """
    # A missing tenure fails every `<=` comparison and would otherwise fall
    # through to the final branch and be mislabelled as 'Loyal'.
    if pd.isna(tenure):
        return np.nan

    if tenure <= 12:
        return 'New'
    elif tenure <= 24:
        return 'Growing'
    elif tenure <= 48:
        return 'Mature'
    else:
        return 'Loyal'

# Label every customer with the lifecycle stage derived from tenure
stage_labels = df['tenure'].apply(categorize_lifecycle)
df['lifecycle_stage'] = stage_labels

print("Feature created!")
print("\nDistribution:")
print(df['lifecycle_stage'].value_counts().sort_index())

# Business Validation: share of churned customers (in percent) per stage
print("\nBUSINESS VALIDATION:")


def _percent_churned(churn_flags):
    """Return the percentage of 'Yes' entries in a group of churn labels."""
    return (churn_flags == 'Yes').sum() / len(churn_flags) * 100


churn_by_stage = df.groupby('lifecycle_stage')['Churn'].apply(_percent_churned)
lifecycle_churn = churn_by_stage.sort_values(ascending=False)

# Stage sizes, looked up while printing the ranked churn rates
stage_sizes = df['lifecycle_stage'].value_counts()

print("\nChurn Rate by Lifecycle Stage:")
for stage, rate in lifecycle_churn.items():
    count = stage_sizes[stage]
    print(f"  {stage:10s}: {rate:5.1f}% ({count:4,} customers)")

# Visualization: grouped customer counts per stage, split by churn outcome
stage_order = ['New', 'Growing', 'Mature', 'Loyal']
churn_palette = {'Yes': '#ff6b6b', 'No': '#51cf66'}
fig = px.histogram(
    df,
    x='lifecycle_stage',
    color='Churn',
    category_orders={'lifecycle_stage': stage_order},
    title='Lifecycle Stage vs Churn Rate',
    color_discrete_map=churn_palette,
    barmode='group',
)
fig.update_layout(height=400)
fig.show()

if lifecycle_churn['New'] > lifecycle_churn['Loyal']:
    print("\nINSIGHT: Clear lifecycle pattern!")
    print("   → New customers need different retention strategy than Loyal!")

FEATURE ENGINEERING: lifecycle_stage
Feature created!

Distribution:
lifecycle_stage
Growing    1024
Loyal      2239
Mature     1594
New        2186
Name: count, dtype: int64

BUSINESS VALIDATION:

Churn Rate by Lifecycle Stage:
  New       :  47.4% (2,186 customers)
  Growing   :  28.7% (1,024 customers)
  Mature    :  20.4% (1,594 customers)
  Loyal     :   9.5% (2,239 customers)



INSIGHT: Clear lifecycle pattern!
   → New customers need different retention strategy than Loyal!


## Combined Feature Analysis

In [None]:
print("MULTI-FEATURE ANALYSIS", "=" * 60, sep="\n")

# Names of the features engineered in this notebook
engineered_features = ['charges_per_service', 'is_month_to_month', 'lifecycle_stage']

m2m_flag = df['is_month_to_month']
stage_distribution = df['lifecycle_stage'].value_counts().to_dict()

print("Summary of engineered features:")
print(f"1. charges_per_service:  €{df['charges_per_service'].mean():.2f} avg")
print(f"2. is_month_to_month:    {m2m_flag.sum():,} customers ({m2m_flag.mean()*100:.1f}%)")
print(f"3. lifecycle_stage:      {stage_distribution}")

# Correlation check between features
print("\nFeature Correlation Check:")
print("(Checking if features are redundant)")

# Ordinal encoding of the lifecycle stage so it can enter a correlation matrix
stage_rank = {'New': 1, 'Growing': 2, 'Mature': 3, 'Loyal': 4}
lifecycle_numeric = df['lifecycle_stage'].map(stage_rank)

# Column order here determines the row/column order of the printed matrix
correlation_inputs = {}
correlation_inputs['charges_per_service'] = df['charges_per_service']
correlation_inputs['is_month_to_month'] = df['is_month_to_month']
correlation_inputs['lifecycle_numeric'] = lifecycle_numeric
correlation_inputs['tenure'] = df['tenure']  # Original feature
correlation_data = pd.DataFrame(correlation_inputs)

corr_matrix = correlation_data.corr()
print("\nCorrelation Matrix:")
print(corr_matrix.round(3))

# Check: is lifecycle too strongly correlated with tenure?
lifecycle_tenure_corr = corr_matrix.at['lifecycle_numeric', 'tenure']
print(f"\nlifecycle_stage ↔ tenure correlation: {lifecycle_tenure_corr:.3f}")

corr_strength = abs(lifecycle_tenure_corr)
if corr_strength > 0.95:
    print("WARNING: Very high correlation - might be redundant!")
elif corr_strength > 0.80:
    print("High correlation but categorical encoding adds value")
else:
    print("Reasonable correlation - not redundant")