# Churn Prediction & Revenue Impact
Synthetic SaaS dataset

In [None]:
import pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

# Load the raw customer table and the per-customer event log.
customers = pd.read_csv('../data/raw/customers.csv')
events = pd.read_csv('../data/raw/events.csv')

# Feature: total events per customer
event_counts = events.groupby('customer_id').size().reset_index(name='event_count')
# Left join keeps customers that have no events; their missing count becomes 0.
df = customers.merge(event_counts, on='customer_id', how='left').fillna({'event_count': 0})

# Encode plan
# The dummies are built separately so the feature list can use whichever
# columns were actually produced. drop_first removes the first plan level,
# so hard-coding 'plan_Pro' / 'plan_Enterprise' raises KeyError whenever one
# of them happens to be the dropped baseline, and silently ignores any
# additional plan levels.
plan_dummies = pd.get_dummies(df['plan'], prefix='plan', drop_first=True)
# Same frame layout as get_dummies(df, columns=['plan']): original column
# removed, indicator columns appended at the end.
df = pd.concat([df.drop(columns='plan'), plan_dummies], axis=1)

X = df[['event_count', *plan_dummies.columns]]
y = df['is_churned']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (the churned class, assuming is_churned is coded 0/1).
test_probabilities = model.predict_proba(X_test)
y_pred = test_probabilities[:, 1]

auc_score = roc_auc_score(y_test, y_pred)
print('AUC:', auc_score)

# Convert probabilities to hard labels at a 0.5 cut-off for the per-class report.
predicted_labels = (y_pred > 0.5).astype(int)
print(classification_report(y_test, predicted_labels))

# Revenue impact curve
# y_pred holds one probability per held-out row, in X_test order. Indexing the
# full df with a mask of that length raises IndexError (boolean index has the
# wrong length), so restrict df to the same held-out rows first.
test_df = df.loc[X_test.index]

thresholds = [0.2, 0.3, 0.4, 0.5, 0.6]
impact = []
for t in thresholds:
    # NOTE(review): this selects customers scored *below* the threshold, as in
    # the original code. If the curve is meant to show revenue from customers
    # targeted for retention, the comparison may need to be `y_pred >= t`.
    retained = test_df.loc[y_pred < t]
    saved_rev = retained['event_count'].sum()*1  # placeholder
    impact.append(saved_rev)

plt.plot(thresholds, impact)
plt.xlabel('Churn Probability Threshold')
plt.ylabel('Potential Revenue Saved')
plt.title('Revenue Impact vs Retention Strategy')
plt.show()