In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report
from scipy import stats
import xgboost as xgb
import shap
import warnings

In [4]:
warnings.filterwarnings('ignore')

# Utility function

In [6]:
def analyze_outliers(df, column_names):
    """
    Analyze and print the outlier rows for the given numeric columns using the IQR method.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    column_names (list of str): List of numeric column names to analyze.

    Returns:
    None (prints the outliers directly).
    """

    df_analyze = df.copy()

    for column in column_names:
        # Define lower and upper bound
        Q1 = df_analyze[column].quantile(0.25)
        Q3 = df_analyze[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Select outliers
        outliers = df_analyze[(df_analyze[column] < lower_bound) | (df_analyze[column] > upper_bound)]

        # Print output
        print(f"Outliers for column '{column}':")
        if outliers.empty:
            print("No outliers found.")
        else:
            print(outliers[[column]])  # Prints only the column; add more columns if needed
        print("-" * 50)

# Example usage (assuming 'df' is your DataFrame):
# analyze_outliers(df, ['MonthlyCharges', 'tenure', 'TotalCharges'])


In [7]:
def remove_outliers(df, column_names):
    """
    Remove outliers in numeric columns of the dataframe by using IQR method
    outliers defined as laying outside 1.5 IQR lower or upper than Q1 & Q3
    """
    df_clean = df.copy()

    for column in column_names:
        # define lower and upper bound
        Q1 = df_clean[column].quantile(0.25)
        Q3 = df_clean[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5*IQR
        upper_bound = Q3 + 1.5*IQR

        # filter out outliers
        df_clean = df_clean[(df_clean[column] >= lower_bound) & (df_clean[column] <= upper_bound)]

    return df_clean

In [8]:
def winsorize_outliers(df, column_names, limits = [0.05, 0.05]):
    """
    instead of completely remove the extreme values of the dataset,
    winsorize the outliers to cap the max values at 0.05 and 0.95

    variables:
    - columns_names: provide a list of column names
    """
    df_winsorize = df.copy()

    for column in column_names:
        df_winsorize[column] = stats.mstats.winsorize(
            df_winsorize[column],
            limits = limits
        )

    return df_winsorize

# Workflow

In [9]:
df = pd.read_csv('../data/customer_churn_telecom_services.csv')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [11]:
# fill the missing data in 'TotalCharges'ArithmeticError# fill missing values in 'TotalCharges'
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'])

In [12]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [13]:
# define target and variables
categorical_variable = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numeric_variables = ['tenure', 'MonthlyCharges', 'TotalCharges']

variables = df[categorical_variable + numeric_variables]

target = df['Churn']

In [14]:
# train validation test split of the dataset
X_train_val, X_test, y_train_val, y_test = train_test_split(variables, target, test_size = 0.2, random_state = 42, stratify = target)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = 0.25, random_state = 42, stratify = y_train_val)

In [15]:
# label encoder for target 
le = LabelEncoder()
y_train_en = le.fit_transform(y_train)
y_val_en = le.transform(y_val)
y_test_en = le.transform(y_test)

In [16]:
# categorical encoding
ohe = OneHotEncoder(handle_unknown='ignore', drop = 'first')
X_train_ohe = pd.DataFrame(ohe.fit_transform(X_train[categorical_variable]).toarray(), columns = ohe.get_feature_names_out(), index = X_train.index)
X_val_ohe = pd.DataFrame(ohe.fit_transform(X_val[categorical_variable]).toarray(), columns = ohe.get_feature_names_out(), index = X_val.index)
X_test_ohe = pd.DataFrame(ohe.fit_transform(X_test[categorical_variable]).toarray(), columns = ohe.get_feature_names_out(), index = X_test.index)

In [17]:
# combine nueric and encoded variables
X_train = pd.concat([X_train[numeric_variables], X_train_ohe], axis = 1)
X_val = pd.concat([X_val[numeric_variables], X_val_ohe], axis = 1)
X_test = pd.concat([X_train[numeric_variables], X_train_ohe], axis = 1)

In [18]:
# identify outliers
analyze_outliers(X_train, numeric_variables)

Outliers for column 'tenure':
No outliers found.
--------------------------------------------------
Outliers for column 'MonthlyCharges':
No outliers found.
--------------------------------------------------
Outliers for column 'TotalCharges':
No outliers found.
--------------------------------------------------


Since there is no outlier, we do not need to remove or winsorize the data

In [19]:
# scale the data with standardscaler
scaler = StandardScaler()

X_train[numeric_variables] = scaler.fit_transform(X_train[numeric_variables])
X_val[numeric_variables] = scaler.transform(X_val[numeric_variables])
X_test[numeric_variables] = scaler.transform(X_test[numeric_variables])

In [20]:
# Imbalance handling
scale_pos_weight = sum(y_train_en == 0) / sum(y_train_en == 1)

# Base model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='aucpr'
)

# Hyperparameter tuning on val (simple grid; expand as needed)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01]
}
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train_en)

best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
y_test_pred = best_model.predict(X_test)
print("Test AUC-ROC:", roc_auc_score(y_test_en, y_test_pred))
print(classification_report(y_test_en, y_test_pred))

# SHAP explanations
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, plot_type="bar")


ValueError: Found input variables with inconsistent numbers of samples: [1409, 4225]