In [108]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix


In [26]:
df = pd.read_csv('../data/customer_churn_dataset-testing-master.csv')

In [27]:
df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         64374 non-null  int64 
 1   Age                64374 non-null  int64 
 2   Gender             64374 non-null  object
 3   Tenure             64374 non-null  int64 
 4   Usage Frequency    64374 non-null  int64 
 5   Support Calls      64374 non-null  int64 
 6   Payment Delay      64374 non-null  int64 
 7   Subscription Type  64374 non-null  object
 8   Contract Length    64374 non-null  object
 9   Total Spend        64374 non-null  int64 
 10  Last Interaction   64374 non-null  int64 
 11  Churn              64374 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 5.9+ MB


In [29]:
df.isna().sum()

CustomerID           0
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

In [33]:
X = df.drop(columns=['Churn', 'CustomerID'])
y = df['Churn']

In [48]:
categorial_features = [
    'Gender',
    'Subscription Type',
    'Contract Length'
]
numerical_features = [
    'Age',
    'Tenure',
    'Usage Frequency',
    'Support Calls',
    'Payment Delay',
    'Total Spend',
    'Last Interaction'
]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [52]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorial_features),
        ('num', 'passthrough', numerical_features)
    ]
)

In [53]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [54]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

In [55]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [58]:
# предсказания на тесте
y_pred = pipeline.predict(X_test)

# матрица ошибок
cm = confusion_matrix(y_test, y_pred)
cm

array([[5625, 1151],
       [1075, 5024]])

In [94]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.83      0.83      6776
           1       0.81      0.82      0.82      6099

    accuracy                           0.83     12875
   macro avg       0.83      0.83      0.83     12875
weighted avg       0.83      0.83      0.83     12875



In [61]:
y_proba = pipeline.predict_proba(X_test)[:, 1]  # вероятность churn

### Baseline модель

В качестве baseline использованя Logistic Regression.
При пороге 0.5 recall по churn = 0.82.
Для бизнеса важнее минимизировать FN, поэтому рассматривается снижение порога.

In [104]:
y_pred_04 = (y_proba >= 0.4).astype(int)

In [105]:
cm_new = confusion_matrix(y_test, y_pred_04)

In [106]:
cm_new

array([[5217, 1559],
       [ 755, 5344]])

In [107]:
print(classification_report(y_test, y_pred_04))

              precision    recall  f1-score   support

           0       0.87      0.77      0.82      6776
           1       0.77      0.88      0.82      6099

    accuracy                           0.82     12875
   macro avg       0.82      0.82      0.82     12875
weighted avg       0.83      0.82      0.82     12875



In [112]:
thresholds = np.arange(0.1, 0.9, 0.05)
results = []
for t in thresholds:
    y_pred_t = (y_proba >= t).astype(int)
    recall = recall_score(y_test, y_pred_t)
    precision = precision_score(y_test, y_pred_t)
    results.append((t, recall, precision))
results

[(np.float64(0.1), 0.9742580750942778, 0.6115056087269733),
 (np.float64(0.15000000000000002), 0.9619609772093786, 0.6472859664607238),
 (np.float64(0.20000000000000004), 0.9480242662731595, 0.6786384976525821),
 (np.float64(0.25000000000000006), 0.9349073618626004, 0.7030826140567201),
 (np.float64(0.30000000000000004), 0.9191670765699295, 0.7292832054117341),
 (np.float64(0.3500000000000001), 0.897360222987375, 0.7495206792659546),
 (np.float64(0.40000000000000013), 0.8762092146253484, 0.7741561639866724),
 (np.float64(0.45000000000000007), 0.8512870962452861, 0.7961969023155958),
 (np.float64(0.5000000000000001), 0.823741596983112, 0.8136032388663967),
 (np.float64(0.5500000000000002), 0.7873421872438104, 0.8296475466482377),
 (np.float64(0.6000000000000002), 0.749303164453189, 0.8477091448710814),
 (np.float64(0.6500000000000001), 0.6991310050828005, 0.8610662358642972),
 (np.float64(0.7000000000000002), 0.6561731431382194, 0.8776315789473684),
 (np.float64(0.7500000000000002), 0.5

In [113]:
from sklearn.metrics import roc_auc_score, roc_curve

In [114]:
roc_auc = roc_auc_score(y_test, y_proba)
roc_auc

0.903000845165358

### Threshold selection

Для задачи churn приоритетом является минимизация False Negative (пропущенных уходящих клиентов).
При снижении порога классификации наблюдается рост recall за счёт снижения precision.

| Threshold | Recall (churn) | Precision (churn) |
|----------|----------------|-------------------|
| 0.10     | ~0.97          | ~0.61             |
| 0.35     | ~0.90          | ~0.75             |
| 0.50     | ~0.82          | ~0.81             |

В качестве рабочего выбран threshold = 0.35, обеспечивающий баланс между высокой полнотой и приемлемой точностью.