In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

: 

In [None]:
# reading data
# NOTE: despite its name, DATA_URL holds a relative filesystem path, so the
# CSV is resolved against the notebook's current working directory.
DATA_URL = './customer_churn.csv'
df = pd.read_csv(DATA_URL)

: 

In [None]:
# checking the data
# Snapshot of the first three rows, transposed so that every original column
# becomes a row. It is taken before the columns are renamed; a later cell
# compares it against the same view of the renamed frame.
df_original_cols = df.head(3).transpose()

: 

In [None]:
# Dimensions of the raw frame as (rows, columns).
display(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is
# called on its own: wrapping it in display() would add a stray "None"
# to the cell output.
df.info()

: 

In [None]:
def split_text_on_uppercase(s, keep_contiguous=False):
    """Turn a CamelCase / PascalCase string into lower snake_case.

    The string is cut in front of uppercase characters (never in front of
    the first character), the pieces are joined with "_" and the result is
    lower-cased.

    Args:
        s (str): string to convert.
        keep_contiguous (bool): when False, every uppercase character after
            the first position starts a new piece. When True, an uppercase
            character only starts a new piece if the character before it is
            lowercase or the character after it is lowercase, so runs of
            capitals (e.g. "ID", "TV") stay together.

    Returns:
        str: the lower-cased pieces joined by underscores.
    """
    length = len(s)

    # Indices at which a new piece begins; position 0 always opens one.
    cut_points = [0]
    for idx in range(1, length):
        if not s[idx].isupper():
            continue
        if keep_contiguous:
            prev_is_lower = s[idx - 1].islower()
            next_is_lower = idx + 1 < length and s[idx + 1].islower()
            if not (prev_is_lower or next_is_lower):
                # Inside (or at the end of) a run of capitals: do not cut.
                continue
        cut_points.append(idx)
    cut_points.append(length)

    pieces = [s[begin:end] for begin, end in zip(cut_points, cut_points[1:])]
    return "_".join(pieces).lower()

def standardize_columns(cols):
    """Return the snake_case version of every column name in `cols`.

    Each name is converted with split_text_on_uppercase, keeping runs of
    contiguous uppercase characters together.

    Args:
        cols (iterable of str): column names to convert.

    Returns:
        list of str: converted names, in the same order as `cols`.
    """
    return [split_text_on_uppercase(name, True) for name in cols]

: 

In [None]:
# renaming columns
# Replaces the column labels of `df` in place with the snake_case names
# produced by standardize_columns; later cells select columns by these
# names (e.g. 'monthly_charges').
df.columns = standardize_columns(df.columns)

: 

In [None]:
# checking if std process was done correctly:
# the renamed frame must hold exactly the same values as the snapshot taken
# before renaming. The row labels (the column names) are dropped because they
# differ by design, and DataFrame.equals is used because it treats NaNs in
# the same position as equal, whereas an elementwise `==` reports NaN != NaN
# and would flag a correct rename as broken.
df_std_cols = df.head(3).transpose()
display(
    df_original_cols.reset_index(drop=True).equals(
        df_std_cols.reset_index(drop=True)
    )
)

# checking nans: True when the frame contains no missing value at all
display(not df.isna().any().any())

: 

In [None]:
# 2. We will try to predict variable `Churn` using a 
# logistic regression on variables `tenure`,
# `SeniorCitizen`,`MonthlyCharges`.
# NOTE: DEPENDANT_VAR lists the predictor (independent) variables; the name
# is kept as-is because other cells reference it.
DEPENDANT_VAR = ['tenure', 'senior_citizen', 'monthly_charges']
TARGET_VAR = 'churn'
# .copy() gives df_model its own data: the target column is overwritten in a
# later cell, and assigning into a plain slice of `df` is a chained
# assignment (SettingWithCopyWarning, ambiguous write-through to `df`).
df_model = df[DEPENDANT_VAR + [TARGET_VAR]].copy()
df_model.head().transpose()

: 

In [None]:
# Class balance of the target: number of rows for each distinct value.
df_model[TARGET_VAR].value_counts()

: 

In [None]:
# target column to a binary integer: "No" -> 0, anything else -> 1.
# The dtype guard makes the cell idempotent: once the column is numeric a
# second run must not re-encode it (0 and 1 are both different from "No", so
# re-applying the mapping would silently turn every row into 1).
if not pd.api.types.is_numeric_dtype(df_model[TARGET_VAR]):
    df_model[TARGET_VAR] = (df_model[TARGET_VAR] != "No").astype(int)

: 

In [None]:
# Preview the first rows (transposed) after the target conversion above.
df_model.head().transpose()

: 

In [None]:
# Correlation matrix of the model columns (predictors and target), drawn as
# an annotated heatmap whose ticks are labelled with the column names.
var_corr = df_model.corr()
sns.heatmap(
    var_corr,
    annot=True,
    xticklabels=var_corr.columns,
    yticklabels=var_corr.columns,
)

: 

In [None]:
# Grid of pairwise plots over the columns of df_model (predictors and target).
sns.pairplot(data=df_model)

: 

In [None]:
# One histogram per model column. sns.distplot is deprecated in seaborn, and
# histplot is its documented replacement for a plain histogram (kde=False).
for col in df_model.columns:
    sns.histplot(df_model[col], kde=False)
    # Title each figure so it can be read on its own when skimming.
    plt.title(f"Distribution of {col}")
    plt.show()

: 

In [None]:
# Split predictors and target into train and test parts, holding out 33% of
# the rows for evaluation; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_model[DEPENDANT_VAR],
    df_model[TARGET_VAR],
    random_state=42,
    test_size=0.33,
)

: 

In [None]:
# scaling X (train and test) sets
def scale_sets(sets, scaler=StandardScaler):
    """Scale several feature sets with ONE scaler fitted on the first set.

    The scaler is fitted on the first element of `sets` only (the training
    set) and then applied unchanged to every set. Fitting a fresh scaler on
    each set would standardise the test data with its own mean and variance,
    so train and test features would no longer be on the same scale as the
    one the model was trained on.

    Args:
        sets (iterable): array-likes / DataFrames to scale; the first one is
            the reference the scaler is fitted on.
        scaler (class): scaler class exposing fit() and transform(); it is
            instantiated without arguments.

    Returns:
        list: the scaled sets, in the same order as `sets` (empty list when
        `sets` is empty).
    """
    sets = list(sets)
    if not sets:
        return []

    fitted_scaler = scaler().fit(sets[0])
    return [fitted_scaler.transform(data) for data in sets]

    
# Scale both feature matrices; the results are unpacked in the same order the
# sets are passed (train first, then test).
X_train_scaled, X_test_scaled = scale_sets([X_train, X_test])



: 

In [None]:
# applying logistic regression model:
# fit on the scaled training features, predict on the scaled test features.
logistic_regressor = LogisticRegression()
logistic_regressor.fit(X_train_scaled, y_train)
y_pred = logistic_regressor.predict(X_test_scaled)
class_report = classification_report(y_test, y_pred)

display("Accuracy Score: " + str(round(accuracy_score(y_test, y_pred), 3)))
display("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
# Print the caption before the report; it used to be appended after the
# report's last line, where it reads as part of the table.
print("Classification Report:")
print(class_report)

: 

In [None]:
# Null accuracy: the accuracy of a baseline that always predicts the most
# frequent class. The target is encoded as 0/1, so its mean is the share of
# 1's in the test set.
perc_ones = round(y_test.mean(), 4)
perc_zeros = round(1 - perc_ones, 4)
display(y_test.value_counts(), perc_ones, perc_zeros)

# Unrounded share of the majority class.
share_of_ones = y_test.mean()
max(share_of_ones, 1 - share_of_ones)

: 

In [None]:
# SMOTE: oversample the training data, then refit the logistic regression.
from imblearn.over_sampling import SMOTE
# SMOTE generates its synthetic samples randomly; seeding it keeps the
# resampled training set, and therefore the report below, identical across
# runs (the split and the model are already seeded with 42).
smote = SMOTE(random_state=42)

# split dataset in train-test subdataset
# (redone here because X_train / y_train are overwritten by the resampled
# data below, so the cell must start from a fresh split to be re-runnable)
X_train, X_test, y_train, y_test = train_test_split(
                                            df_model[DEPENDANT_VAR],
                                            df_model[TARGET_VAR],
                                            test_size=0.33,
                                            random_state=42)

# Only the training part is resampled; the test set is left untouched.
X_train, y_train = smote.fit_resample(X_train, y_train)
logreg = LogisticRegression(random_state=42, max_iter=10000)
logreg.fit(X_train, y_train)

predictions = logreg.predict(X_test)
print(classification_report(y_test, predictions))

: 

In [None]:
# TOMEK: under-sample the training data by removing Tomek links, then refit.
from imblearn.under_sampling import TomekLinks
tomek = TomekLinks()

# Split FIRST, resample afterwards. Resampling the whole dataset before the
# split also removes samples from what becomes the test set, so the model
# would be scored on cleaned data instead of the original distribution.
# NOTE(review): this cell holds out 30% with stratification while the earlier
# cells use 33% without it - confirm whether the reports are meant to be
# compared on the same split.
X_train, X_test, y_train, y_test = train_test_split(
    df_model[DEPENDANT_VAR],
    df_model[TARGET_VAR],
    test_size=0.3,
    random_state=42,
    stratify=df_model[TARGET_VAR],
)

# Only the training part is resampled; the test set is left untouched.
X_tl, y_tl = tomek.fit_resample(X_train, y_train)
logreg = LogisticRegression(random_state=42, max_iter=10000)
logreg.fit(X_tl, y_tl)

predictions = logreg.predict(X_test)
print(classification_report(y_test, predictions))

: 