### Setup

In [2]:
import sklearn as sk
import numpy as np
import pandas as pd
import os

## Import Data

In [3]:
# Prepared churn extract on the mounted data volume.
dataFile = os.path.join("/mnt/data", "churn-data", "smallPrepared.csv")

# Row 0 of the CSV supplies the column names; column 0 becomes the index.
df = pd.read_csv(dataFile, index_col=0, header=0)

# Report the frame's dimensions, then preview its first rows.
print(df.shape[0], " rows")
print(df.shape[1], " cols")
df.head()

7939  rows
5  cols


Unnamed: 0_level_0,dropperc,mins,consecmonths,income,churn_Y
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
844336,0.016364,550,28,89.2,0
146041,0.018349,545,33,54.2,0
847745,0.018519,378,41,55.3,0
285565,0.014493,552,32,66.8,0
754611,0.012132,577,4,87.2,0


### Begin Model Building

In [4]:
# Every column except the churn flag is used as a predictor.
columns = df.columns.tolist()
columns.remove("churn_Y")

# The estimators below are trained on plain arrays: feature matrix and labels.
X = df.loc[:, columns].values
y = df.loc[:, "churn_Y"].values

In [5]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with learning_rate=1. random_state is pinned so this fit, and the
# cross-validation scores computed for this estimator later on, come out the
# same on every run of the notebook.
ad1 = AdaBoostClassifier(learning_rate=1, random_state=42)
ad1 = ad1.fit(X, y)
# Class probabilities for the same rows the model was trained on (in-sample).
ad1prb = ad1.predict_proba(X)

In [6]:
from sklearn.ensemble import AdaBoostClassifier

# Same as ad1 but with the learning rate halved to 0.5. random_state is
# pinned so the fit and its later cross-validation scores are repeatable.
ad2 = AdaBoostClassifier(learning_rate=0.5, random_state=42)
ad2 = ad2.fit(X, y)
# Class probabilities for the same rows the model was trained on (in-sample).
ad2prb = ad2.predict_proba(X)

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with exponential loss and trees of depth 3.
# random_state is pinned so the fitted model (which is pickled further down
# if it is chosen as `best`) and its cross-validation scores are repeatable.
gb1 = GradientBoostingClassifier(loss="exponential", max_depth=3, random_state=42)
gb1 = gb1.fit(X, y)
# Class probabilities for the same rows the model was trained on (in-sample).
gb1prb = gb1.predict_proba(X)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with exponential loss and much deeper trees (depth 10).
# random_state is pinned so the fit and its later cross-validation scores
# are repeatable.
gb2 = GradientBoostingClassifier(loss="exponential", max_depth=10, random_state=42)
gb2 = gb2.fit(X, y)
# Class probabilities for the same rows the model was trained on (in-sample).
gb2prb = gb2.predict_proba(X)

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with no depth limit on the trees. The forest draws bootstrap
# samples, so random_state is pinned to make the fit and its later
# cross-validation scores repeatable.
rf1 = RandomForestClassifier(max_depth=None, random_state=42)
rf1 = rf1.fit(X, y)
# In-sample class probabilities and hard class predictions.
rf1prb = rf1.predict_proba(X)
rf1pclass = rf1.predict(X)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with shallow trees (depth capped at 5). The forest draws
# bootstrap samples, so random_state is pinned to make the fit and its later
# cross-validation scores repeatable.
rf2 = RandomForestClassifier(max_depth=5, random_state=42)
rf2 = rf2.fit(X, y)
# In-sample class probabilities and hard class predictions.
rf2prb = rf2.predict_proba(X)
rf2pclass = rf2.predict(X)

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with tree depth capped at 30. The forest draws bootstrap
# samples, so random_state is pinned to make the fit and its later
# cross-validation scores repeatable.
rf3 = RandomForestClassifier(max_depth=30, random_state=42)
rf3 = rf3.fit(X, y)
# In-sample class probabilities and hard class predictions.
rf3prb = rf3.predict_proba(X)
rf3pclass = rf3.predict(X)

In [12]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; fit() hands back the estimator,
# so construction and training are chained into one statement.
logreg = LogisticRegression().fit(X, y)

# In-sample outputs: hard class labels and class probabilities.
logregpclass = logreg.predict(X)
logregprb = logreg.predict_proba(X)

In [13]:
# Show the fitted intercept first, then the coefficient row.
for fitted_param in (logreg.intercept_, logreg.coef_):
    print(fitted_param)

[-3.04138236]
[[ 2.66057905 -0.0086364   0.09553533  0.01732229]]


### Build a Cross Validation Function

In [14]:
from sklearn import model_selection

# takes a list of models, the input np.array, the target np.array, the type of score to be used with cv, and k
# each element in the list of models should have two items: the model object and the name you want to use for that 
# model object
# returns a dataframe with the names you entered and the mean of the cv scores across all k folds

def cv_fun(models, inputs, target, score, k):
    """Cross-validate several models and tabulate each one's mean score.

    Parameters
    ----------
    models : iterable of [estimator, name] entries
        Position 0 of each entry is the model object, position 1 the label
        to show for it in the result.
    inputs : array-like
        Feature matrix handed to ``cross_val_score``.
    target : array-like
        Target values handed to ``cross_val_score``.
    score
        Passed to ``cross_val_score`` as ``scoring`` and reused as the name
        of the result column (the notebook passes strings such as
        ``'roc_auc'``).
    k
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by the supplied names, with a single
        column named after ``score`` that holds the mean of the fold scores
        rounded to 3 decimals. An empty ``models`` yields an empty frame
        that still carries that column.
    """
    # Both accumulators exist before the loop, so an empty `models` no longer
    # reaches the return statement with undefined names.
    names = []
    mean_scores = []

    for entry in models:
        # Index rather than unpack so entries longer than two items still work.
        estimator, name = entry[0], entry[1]
        fold_scores = model_selection.cross_val_score(
            estimator, inputs, target, scoring=score, cv=k
        )
        mean_scores.append(round(fold_scores.mean(), 3))
        names.append(name)

    return pd.DataFrame(mean_scores, index=names, columns=[score])

### Cross Validation on the Training Data

In [15]:
# Score every candidate with cv_fun (defined in the previous section).
# Each entry pairs a model object with the label it gets in the results table.
input_models = [
    [ad1, 'ad1'],
    [ad2, 'ad2'],
    [gb1, 'gb1'],
    [gb2, 'gb2'],
    [rf1, 'rf1'],
    [rf2, 'rf2'],
    [rf3, 'rf3'],
    [logreg, 'logreg'],
]

# 5-fold cross-validation, once per metric.
cv_auc = cv_fun(input_models, X, y, 'roc_auc', 5)
cv_acc = cv_fun(input_models, X, y, 'accuracy', 5)

# Put both metrics side by side, best ROC AUC first.
cv_results = cv_auc.join(cv_acc).sort_values(by='roc_auc', ascending=False)

In [16]:
# The table is sorted by roc_auc in descending order, so the top row is the
# strongest model on that metric. The winner is still picked by hand in a
# later cell; automating that choice would be a nice improvement.
cv_results

Unnamed: 0,roc_auc,accuracy
gb1,0.944,0.959
ad2,0.942,0.959
ad1,0.936,0.958
rf2,0.936,0.956
gb2,0.927,0.954
rf1,0.914,0.957
rf3,0.909,0.956
logreg,0.804,0.95


### Save Model to File

In [17]:
import pickle

# Model chosen by hand from the cross-validation table.
best = gb1

modelSaveFile = os.path.join(
    "/mnt/artifacts",
    "results",
    "gb1.pkl"
)

# "wb" = open for writing in binary mode. The context manager closes (and
# therefore flushes) the file even if pickling raises, instead of leaving
# the handle open as a bare open() call would.
with open(modelSaveFile, "wb") as model_file:
    pickle.dump(best, model_file)

In [18]:

#make confusion matrix plot
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          save_path=None):
    """
    Given a sklearn confusion matrix (cm), draw it as an annotated heat map,
    save the figure as a PNG and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to leave the default tick labels

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues' when None)

    normalize:    If False, annotate cells with the raw numbers
                  If True, annotate cells with row-wise proportions

    save_path:    where to write the PNG; when None the figure is written to
                  /mnt/artifacts/results/ConfMatx_Best.png

    Returns
    -------
    None

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import itertools
    import os

    import matplotlib.pyplot as plt
    import numpy as np

    cm = np.asarray(cm)

    # Overall accuracy is the diagonal (correct predictions) over all samples.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if save_path is None:
        save_path = os.path.join(
            "/mnt/artifacts",
            "results",
            "ConfMatx_Best.png"
        )

    # Keep a handle on the figure so it can be closed explicitly at the end.
    fig = plt.figure(figsize=(8, 6))
    # The heat map colours are taken from the matrix as passed in; any
    # normalisation below only affects the text annotations.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has a zero row total; divide by 1 there
        # so the row reads 0.0000 instead of NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)

    # Cells above the threshold get white text, the rest black.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Leave room at the bottom for the two-line x label.
    fig.subplots_adjust(bottom=0.25)

    fig.savefig(save_path, format="png")
    plt.show()
    # Release this specific figure. Clearing plt.gcf() after show() would act
    # on whichever figure happens to be current and never close this one.
    plt.close(fig)
    


In [19]:
from sklearn import metrics

# Confusion matrix of the chosen model on the rows it was trained on,
# annotated with raw counts rather than proportions.
plot_confusion_matrix(
    metrics.confusion_matrix(y, best.predict(X)),
    ['no churn', 'churn'],
    title="Confusion Matrix for Best Model",
    normalize=False,
)
                      

<Figure size 800x600 with 2 Axes>

In [20]:
# Work on a copy: `df2 = df` would only alias the frame, so adding "prob"
# would also add it to `df`, and re-running the feature-selection cell would
# then pick the prediction up as a model input.
df2 = df.copy()
# Second column of predict_proba; with 0/1 labels this is the probability of
# churn_Y == 1.
df2["prob"] = best.predict_proba(X)[:,1]

In [21]:
# Preview the first rows, now including the "prob" column.
df2.head()

Unnamed: 0_level_0,dropperc,mins,consecmonths,income,churn_Y,prob
custid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
844336,0.016364,550,28,89.2,0,0.005922
146041,0.018349,545,33,54.2,0,0.000947
847745,0.018519,378,41,55.3,0,0.00045
285565,0.014493,552,32,66.8,0,0.002602
754611,0.012132,577,4,87.2,0,0.000102


In [22]:
# The scored rows go to the scratch area of the data mount.
modelOutputFile = os.path.join("/mnt/data", "scratch", "gb1-output.csv")

# Default to_csv settings: the index is written out as the first column.
df2.to_csv(modelOutputFile)