'telecom.csv'

In [4]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
df = pd.read_csv("data/telecom.csv")

del df["customerID"]
df = df.replace({"TotalCharges": {" ": 0},
                "Churn": {"Yes": 1, "No": 0}})

y = df["Churn"]
del df["Churn"]

df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], downcast="float")

df = df.replace({"gender": {"Male": 1, "Female": 0}, 
                "Partner": {"Yes": 1, "No": 0},
                "Dependents": {"Yes": 1, "No": 0},
                "PhoneService": {"Yes": 1, "No": 0},
                "MultipleLines": {"Yes": 2, "No": 1, "No phone service": 0},
                "InternetService": {"Fiber optic": 2, "DSL": 1, "No": 0},
                "OnlineSecurity": {"Yes": 2, "No": 1, "No internet service": 0},
                "OnlineBackup": {"Yes": 2, "No": 1, "No internet service": 0},
                "DeviceProtection": {"Yes": 2, "No": 1, "No internet service": 0},
                "TechSupport": {"Yes": 2, "No": 1, "No internet service": 0},
                "StreamingTV": {"Yes": 2, "No": 1, "No internet service": 0},
                "StreamingMovies": {"Yes": 2, "No": 1, "No internet service": 0},
                "Contract": {"Month-to-month": 2, "Two year": 1, "One year": 0},
                "PaperlessBilling": {"Yes": 1, "No": 0},
                "PaymentMethod": {"Electronic check": 3, "Mailed check": 2, "Credit card (automatic)": 1, "Bank transfer (automatic)": 0}})

In [3]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,2,1,1,20.15,20.150000
1,1,0,0,0,50,1,1,2,2,2,2,2,1,1,0,1,1,90.10,4549.450195
2,1,0,1,0,55,1,2,2,2,2,1,1,2,1,2,1,3,96.75,5238.899902
3,0,1,1,0,7,1,2,2,1,1,1,1,1,2,2,1,3,84.55,646.849976
4,0,1,0,0,2,1,1,2,1,1,1,1,1,2,2,1,0,79.75,159.399994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,1,1,0,0,2,1,1,2,1,2,1,1,1,1,2,1,3,74.70,165.399994
5996,1,0,1,1,72,1,2,2,2,2,2,2,1,2,1,0,1,103.95,7517.700195
5997,0,1,0,0,7,1,1,1,1,2,2,1,2,1,2,1,3,64.95,493.649994
5998,1,0,1,0,70,1,2,0,0,0,0,0,0,0,1,0,0,25.15,1940.849976


Split data into 2 parts, take the small one as the final test. You will be using for each model's final evaluation

In [3]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(df, y, test_size=1/6,random_state=109) 

### Creating a dictionary for kepping the metrics from each model and the necessary functions

In [4]:
models = ["Logistic Regression", "Decision Tree", "SVM", "KNN", "Random Forest", "Ensemble Learning", "Neural Networks"]
coefs = dict()

for model in models:
    coefs[model] = {
        "Accuracy": 0,
        "Precision": 0,
        "Recall | Sensitivity": 0,
        "Specificity": 0,
        "Negative predictive value": 0
    }


def perf_measure(y_actual, y_hat):   
    TP, FP, TN, FN = 0, 0, 0, 0

    for i in range(len(y_actual)):
        if y_hat[i] == 1 and y_actual[i] == 1:
            TP += 1
        elif y_hat[i] == 1 and y_actual[i] == 0:
            FP += 1
        elif y_hat[i] == 0 and y_actual[i] == 0:
            TN += 1
        elif y_hat[i] == 0 and y_actual[i] == 1:
            FN += 1
        
    return TP, FP, TN, FN


def return_metrics(tp, fp, tn, fn):
    metrics = list()

    metrics.append(round(((tp + tn) / (tp + tn + fp + fn)), 2))
    metrics.append(round((tp / (tp + fp)), 2))
    metrics.append(round((tp / (tp + fn)), 2))
    metrics.append(round((tn / (tn + fp)), 2))
    metrics.append(round((tn / (tn + fn)), 2))

    return metrics


def update_metrics(metrics, model):
    coefs[model]["Accuracy"] = metrics[0]
    coefs[model]["Precision"] = metrics[1]
    coefs[model]["Recall | Sensitivity"] = metrics[2]
    coefs[model]["Specificity"] = metrics[3]
    coefs[model]["Negative predictive value"] = metrics[4]


def print_metrics(metrics):
    print("Accuracy:", metrics[0])
    print("Precision:", metrics[1])
    print("Recall | Sensitivity:", metrics[2])
    print("Specificity:", metrics[3])
    print("Negative predictive value:", metrics[4])

# Logistic regression

In [5]:
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2,random_state=109) 

param_grid = {"penalty": ["l1", "l2", "elasticnet", "none"],
            "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
            "multi_class": ["auto", "ovr", "multinomial"]}

grid_search = GridSearchCV(LogisticRegression(random_state=0), param_grid, cv=6, n_jobs=2)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)

tp, fp, tn, fn = perf_measure(y_test.tolist(), y_pred.tolist())
metrics = return_metrics(tp, fp, tn, fn)
update_metrics(metrics, "Logistic Regression")
print_metrics(metrics)

grid_search.best_estimator_

Accuracy: 0.8
Precision: 0.68
Recall | Sensitivity: 0.56
Specificity: 0.9
Negative predictive value: 0.84


LogisticRegression(random_state=0, solver='newton-cg')

# Decision tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2,random_state=109) 

param_grid = {"max_depth": range(3, 7),
            "min_samples_split": range(2, 7),
            "min_samples_leaf": range(1, 4),
            "max_features": range(10, 15)}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=6, n_jobs=2)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)

tp, fp, tn, fn = perf_measure(y_test.tolist(), y_pred.tolist())
metrics = return_metrics(tp, fp, tn, fn)
update_metrics(metrics, "Decision Tree")
print_metrics(metrics)

Accuracy: 0.79
Precision: 0.71
Recall | Sensitivity: 0.43
Specificity: 0.93
Negative predictive value: 0.81


# SVM

In [7]:
from sklearn import svm

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2,random_state=109) 

param_grid = {"kernel": ["sigmoid"]}

grid_search = GridSearchCV(svm.SVC(), param_grid, cv=6, n_jobs=2)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)

tp, fp, tn, fn = perf_measure(y_test.tolist(), y_pred.tolist())
metrics = return_metrics(tp, fp, tn, fn)
update_metrics(metrics, "SVM")
print_metrics(metrics)

Accuracy: 0.59
Precision: 0.23
Recall | Sensitivity: 0.21
Specificity: 0.73
Negative predictive value: 0.7


# KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2,random_state=109) 

param_grid = {"n_neighbors": range(3, 6),
            "weights": ["uniform", "distance"],
            "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=6, n_jobs=2)
grid_search.fit(X_train, y_train)

y_pred = grid_search.best_estimator_.predict(X_test)

tp, fp, tn, fn = perf_measure(y_test.tolist(), y_pred.tolist())
metrics = return_metrics(tp, fp, tn, fn)
update_metrics(metrics, "KNN")
print_metrics(metrics)

grid_search.best_estimator_

Accuracy: 0.77
Precision: 0.68
Recall | Sensitivity: 0.32
Specificity: 0.94
Negative predictive value: 0.78


KNeighborsClassifier(n_neighbors=4)

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2,random_state=109) 

param_grid = {"n_estimators": range(2, 20)}

grid_search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=6, n_jobs=2)
grid_search.fit(X_train, y_train)

y_pred = grid_search.best_estimator_.predict(X_test)

tp, fp, tn, fn = perf_measure(y_test.tolist(), y_pred.tolist())
metrics = return_metrics(tp, fp, tn, fn)
update_metrics(metrics, "Random Forest")
print_metrics(metrics)

Accuracy: 0.81
Precision: 0.73
Recall | Sensitivity: 0.5
Specificity: 0.93
Negative predictive value: 0.83


# Ensemble Learning

In [10]:
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier

warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.2,random_state=109) 

estimators = [
    ('rf', RandomForestClassifier(n_estimators=16, random_state=1)),
    ('lr', LogisticRegression(random_state=0, solver='newton-cg'))
]

clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(random_state=0, solver='newton-cg')
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

tp, fp, tn, fn = perf_measure(y_test.tolist(), y_pred.tolist())
metrics = return_metrics(tp, fp, tn, fn)
update_metrics(metrics, "Ensemble Learning")
print_metrics(metrics)

Accuracy: 0.81
Precision: 0.71
Recall | Sensitivity: 0.54
Specificity: 0.91
Negative predictive value: 0.84


# Neural Networks

In [11]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

X = train_X.iloc[:,:].values
y = pd.DataFrame(train_y).iloc[:,:].values

X = StandardScaler().fit_transform(X)
y = OneHotEncoder().fit_transform(y).toarray()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=109) 

model = Sequential()
model.add(Dense(16, input_dim=19, activation = "sigmoid"))
model.add(Dense(12, activation="sigmoid"))
model.add(Dense(2, activation="softmax"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=50, batch_size=64)

y_pred = np.around(model.predict(X_test))

def compiler(list):
    if list[0] == 1 and list[1] == 0:
        return 1
    elif list[0] == 0 and list[1] == 1:
        return 0
    elif list[0] == list[1]:
        return np.random.randint(0, 2)

y_test = list(map(compiler, y_test))
y_pred = list(map(compiler, y_pred))

tp, fp, tn, fn = perf_measure(y_test, y_pred)
metrics = return_metrics(tp, fp, tn, fn)
update_metrics(metrics, "Neural Networks")
print_metrics(metrics)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.81
Precision: 0.84
Recall | Sensitivity: 0.91
Specificity: 0.55
Negative predictive value: 0.7


# The table of metrics

In [12]:
table = pd.DataFrame(coefs).transpose()
table

Unnamed: 0,Accuracy,Precision,Recall | Sensitivity,Specificity,Negative predictive value
Logistic Regression,0.8,0.68,0.56,0.9,0.84
Decision Tree,0.79,0.71,0.43,0.93,0.81
SVM,0.59,0.23,0.21,0.73,0.7
KNN,0.77,0.68,0.32,0.94,0.78
Random Forest,0.81,0.73,0.5,0.93,0.83
Ensemble Learning,0.81,0.71,0.54,0.91,0.84
Neural Networks,0.81,0.84,0.91,0.55,0.7


### As we can see Neural Networks give the best results in 3 most important metrics, so we will use them for our final evaluation.

In [13]:
X_train = train_X.iloc[:,:].values
y_train = pd.DataFrame(train_y).iloc[:,:].values
X_test = test_X.iloc[:,:].values
y_test = pd.DataFrame(test_y).iloc[:,:].values

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)
y_train = OneHotEncoder().fit_transform(y_train).toarray()
y_test = OneHotEncoder().fit_transform(y_test).toarray()

model = Sequential()
model.add(Dense(16, input_dim=19, activation = "sigmoid"))
model.add(Dense(12, activation="sigmoid"))
model.add(Dense(2, activation="softmax"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=50, batch_size=64)

y_pred = np.around(model.predict(X_test))

def compiler(list):
    if list[0] == 1 and list[1] == 0:
        return 1
    elif list[0] == 0 and list[1] == 1:
        return 0
    elif list[0] == list[1]:
        return np.random.randint(0, 2)

y_test = list(map(compiler, y_test))
y_pred = list(map(compiler, y_pred))

tp, fp, tn, fn = perf_measure(y_test, y_pred)
metrics = return_metrics(tp, fp, tn, fn)
print_metrics(metrics)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.8
Precision: 0.86
Recall | Sensitivity: 0.88
Specificity: 0.56
Negative predictive value: 0.6
