In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings

In [3]:
# NOTE(review): with no category/module filter this hides every warning raised
# from here on (including deprecation notices from the imported libraries);
# consider narrowing it to the specific category that is known to be noisy.
warnings.filterwarnings('ignore')

# Utility functions

In [37]:
def analyze_outliers(df, column_names):
    """
    Print, column by column, the rows whose value lies outside the IQR fences.

    For every requested column the fences are Q1 - 1.5 * IQR and
    Q3 + 1.5 * IQR, with the quartiles taken over the whole input frame.
    A value is reported only when it is strictly below the lower fence or
    strictly above the upper fence, so missing values are never reported.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    column_names (list of str): Numeric columns to inspect, in print order.

    Returns:
    None (the report is written to standard output).
    """
    snapshot = df.copy()
    separator = "-" * 50

    for column in column_names:
        values = snapshot[column]

        # Quartiles and the derived fences for this column
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        # Rows strictly outside the fences
        is_outlier = (values < low_fence) | (values > high_fence)
        flagged = snapshot[is_outlier]

        # Report: header, then either a notice or the offending values
        print(f"Outliers for column '{column}':")
        # Only the inspected column is shown; widen the selection if more context is needed
        print("No outliers found." if flagged.empty else flagged[[column]])
        print(separator)

# Example usage (assuming 'df' is your DataFrame):
# analyze_outliers(df, ['MonthlyCharges', 'tenure', 'TotalCharges'])


In [14]:
def remove_outliers(df, column_names):
    """
    Return a copy of the dataframe without the rows flagged as IQR outliers.

    A row is kept for a column when its value lies inside the closed interval
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. Columns are handled in the order given,
    and each column's quartiles are computed on the rows that survived the
    filters of the columns before it. Rows holding a missing value in a
    listed column fail the interval check and are therefore dropped as well.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    column_names (list of str): Numeric columns to filter on.

    Returns:
    pandas.DataFrame: The filtered copy, keeping the original index labels.
    """
    kept = df.copy()

    for column in column_names:
        values = kept[column]

        # Quartiles of the rows still in play
        q_low = values.quantile(0.25)
        q_high = values.quantile(0.75)
        spread = q_high - q_low
        floor = q_low - 1.5*spread
        ceiling = q_high + 1.5*spread

        # Keep only rows inside the fences (both ends inclusive)
        kept = kept[values.between(floor, ceiling)]

    return kept

In [24]:
def winsorize_outliers(df, column_names, limits = (0.05, 0.05)):
    """
    Return a copy of the dataframe with the given numeric columns winsorized.

    Instead of removing rows with extreme values, the lowest and highest
    fractions of each column (set by ``limits``) are replaced by the nearest
    value that is kept, using ``scipy.stats.mstats.winsorize``.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    column_names (list of str): Numeric columns to winsorize.
    limits (pair of float): (lower, upper) fractions of values to cap on each
        side. The default caps the lowest 5% and the highest 5%. Lists are
        accepted as well; the default is a tuple so that it is immutable.

    Returns:
    pandas.DataFrame: A copy of ``df`` with the listed columns capped.
    """
    df_winsorize = df.copy()

    for column in column_names:
        # winsorize works on its own copy of the values; the result replaces the column
        df_winsorize[column] = stats.mstats.winsorize(
            df_winsorize[column],
            limits = limits
        )

    return df_winsorize

# Workflow

In [25]:
# load the customer churn CSV; the path is relative to the notebook's working directory
df = pd.read_csv('../data/customer_churn_telecom_services.csv')

In [26]:
# inspect column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [27]:
# fill missing values in 'TotalCharges' with the product MonthlyCharges * tenure
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'])

In [28]:
# list the available columns before choosing features and target
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [29]:
# define target and variables
# note: 'SeniorCitizen' is stored as int64 but is grouped with the categorical features here
categorical_variable = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numeric_variables = ['tenure', 'MonthlyCharges', 'TotalCharges']

# feature frame: categorical columns first, followed by the numeric ones
variables = df[categorical_variable + numeric_variables]

# label column to predict
target = df['Churn']

In [30]:
# train validation test split of the dataset
# step 1 holds out 20% of all rows for test; step 2 takes 25% of the remaining 80%
# for validation, i.e. a 60/20/20 train/validation/test split overall.
# both steps stratify on the target and use the same fixed random_state.
X_train_val, X_test, y_train_val, y_test = train_test_split(variables, target, test_size = 0.2, random_state = 42, stratify = target)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = 0.25, random_state = 42, stratify = y_train_val)

In [38]:
# identify outliers
# only the training split is inspected; validation and test rows are not looked at here
analyze_outliers(X_train, numeric_variables)

Outliers for column 'tenure':
No outliers found.
--------------------------------------------------
Outliers for column 'MonthlyCharges':
No outliers found.
--------------------------------------------------
Outliers for column 'TotalCharges':
No outliers found.
--------------------------------------------------


Since no outliers were found in the numeric columns of the training set, there is no need to remove or winsorize any values.

In [42]:
# scale the data with standardscaler
scaler = StandardScaler()

# the scaler is fitted on the training split only and then reused, already fitted,
# for validation and test, so the held-out splits do not influence the scaling.
# note: the numeric columns of X_train / X_val / X_test are overwritten in place.
X_train[numeric_variables] = scaler.fit_transform(X_train[numeric_variables])
X_val[numeric_variables] = scaler.transform(X_val[numeric_variables])
X_test[numeric_variables] = scaler.transform(X_test[numeric_variables])