In [136]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

In [137]:
df = pd.read_csv("data/telecom_churn.csv")

## EDA


In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        7032 non-null   object 
 1   Gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Tenure            7032 non-null   int64  
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [139]:
df.head()

Unnamed: 0,CustomerID,Gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,No,No phone service,DSL,No,Yes,...,No,No,No,1,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,Yes,No,DSL,Yes,No,...,No,No,No,34,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,Yes,No,DSL,Yes,Yes,...,No,No,No,2,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,No,No phone service,DSL,Yes,No,...,Yes,No,No,45,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,2,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [140]:
# In this dataset, outliers are the churned customers
df.Churn.value_counts()

Churn
No     5163
Yes    1869
Name: count, dtype: int64

In [141]:
df.Churn.value_counts(normalize=True)

Churn
No     0.734215
Yes    0.265785
Name: proportion, dtype: float64

## Preparing the data


In [142]:
X = df.drop(columns=["Churn", "CustomerID"], axis=1)
y = df.Churn

In [143]:
# Create a transformer that will encode the binary columns
def binary_transformer(column: pd.Series):
    return column.map(lambda x: 1 if x == "Yes" else 0)

In [144]:
numerical_features = ["Tenure", "MonthlyCharges", "TotalCharges"]
categorial_features = [
    "Gender",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaymentMethod",
]
binary_features = ["Partner", "Dependents", "PhoneService", "PaperlessBilling"]
no_transform_features = ["SeniorCitizen"]

# Create transformers
numerical_transformer = StandardScaler()
categorial_transformer = OneHotEncoder()
binary_transformer = FunctionTransformer(binary_transformer)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorial_transformer, categorial_features),
        ("bin", binary_transformer, binary_features),
        ("pass", "passthrough", no_transform_features),
    ]
)

# Fit and transform the data
X_transformed = preprocessor.fit_transform(X)

In [145]:
X_transformed.shape

(7032, 41)

## Training the model


In [146]:
lof = LocalOutlierFactor(n_neighbors=30, contamination=0.26)

In [147]:
# Fit the model and predict the outliers
y_pred = lof.fit_predict(X_transformed)
y_pred[0:10]

array([ 1,  1,  1,  1,  1,  1, -1,  1,  1,  1])

In [148]:
# Show the calculated LOF scores
-lof.negative_outlier_factor_[0:10]

array([1.03763484, 1.05796292, 1.03679697, 1.00049398, 0.98895361,
       1.00224663, 1.11797261, 1.02458449, 1.02656525, 1.00451802])

## Visualizing the results


In [149]:
inliers = y_pred == 1
outliers = y_pred == -1

print(f"Inliers: {inliers.sum()}")
print(f"Outliers: {outliers.sum()}")

Inliers: 5203
Outliers: 1829


In [150]:
# Convert y to -1 for churned customers and 1 for non-churned customers
y_true = y.map(lambda x: -1 if x == "Yes" else 1)
y_true

0       1
1       1
2      -1
3       1
4      -1
       ..
7027    1
7028    1
7029    1
7030   -1
7031    1
Name: Churn, Length: 7032, dtype: int64

In [151]:
# Calculate the recall score
recall_score(y_true, y_pred)

np.float64(0.7553747821034282)