In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
def hist_by_contract(df, contract, agg_type):
    """
    Draw, save and show a 2d histogram of aggregated revenue per tenure for one contract type

    Rows of df whose 'contract_type' equals contract are grouped by 'months_customer' and
    their 'total_fees' are aggregated; the resulting (tenure, aggregated fee) pairs are binned.

    Args:
        df - input dataframe with 'contract_type', 'months_customer' and 'total_fees' columns
        contract - contract type value used to filter the rows
        agg_type - 'mean' selects the average; any other value falls back to the sum

    Returns:
        None - the figure is written to plots/<contract>_<agg_type>_histogram.png and displayed
    """
    use_mean = agg_type == 'mean'
    if use_mean:
        print("{} contract: Average revenue".format(contract))
    else:
        print("{} contract: Total revenue".format(contract))

    # Filter and group once; only the reducer differs between the two modes.
    contract_rows = df[df['contract_type'] == contract]
    fees_by_tenure = contract_rows.groupby(['months_customer'])['total_fees']
    aggregated = fees_by_tenure.mean() if use_mean else fees_by_tenure.sum()
    agg_revenue_df = pd.DataFrame(aggregated).reset_index()

    tenure = agg_revenue_df['months_customer'].dropna()
    revenue = agg_revenue_df['total_fees'].fillna(0)

    plt.hist2d(tenure, revenue, bins=2, cmap='Blues')
    colorbar = plt.colorbar()
    colorbar.set_label('counts in bin')

    fig_filename = contract + '_' + agg_type + "_histogram.png"
    plt.savefig('plots/' + fig_filename)
    plt.show()

In [None]:
def convert_cat_numeric(df, column):
    """
    This function converts categorical strings into integer labels to allow for quantitative analysis of the data

    'No phone service' and 'No internet service' are treated as 'No'. 'No' and 'Yes' are
    renamed to '0No' and '1Yes' so that they sort ahead of alphabetic labels; every distinct
    remaining label then receives its rank in sorted order as integer code.

    Args:
        df - input dataframe with features characterizing analyzed customers (not modified)
        column - categorical data column to be converted to integer data type

    Returns:
        series with the integer codes of df[column]; missing values stay missing
    """
    # Work on the requested column only: the rest of the frame is never returned,
    # so rewriting it would be wasted work.
    normalized = df[column].replace({
        'No phone service': '0No',
        'No internet service': '0No',
        'No': '0No',
        'Yes': '1Yes',
    })

    # Build the labels AFTER normalization so the mapping holds exactly the values
    # that occur in the output (no stale keys, no gaps in the codes). NaN is dropped
    # first because it cannot be ordered against strings.
    cat_labels = sorted(normalized.dropna().unique())
    d = {label: code for code, label in enumerate(cat_labels)}

    print(column)
    print(d)

    return normalized.map(d, na_action='ignore')

In [None]:
def make_classification(model, X_train, y_train, X_test, y_test, method_string):
    """
    Fit the given model on the training split, predict the test split and print a metrics report

    Args:
        model (model object): estimator exposing fit() and predict(); it is fitted in place
        X_train (dataframe): training feature data subset
        y_train (dataframe): training target data series
        X_test (dataframe): test feature data subset
        y_test (dataframe): test target data series
        method_string (string): name of ML method applied, used as report heading

    Returns:
        predictions of the fitted model for X_test

    Side effects:
        prints "<method_string> Classification" followed by the classification report
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(method_string + ' Classification')
    report = classification_report(y_test, predictions)
    print(report)

    return predictions

In [None]:
def make_confusion_matrix(y_test, y_predict, ml_method, method_string):
    """
    This function plots classification metrics data in a confusion matrix graph

    Args:
        y_test (dataframe): test target data series
        y_predict (dataframe): predicted values for the target values in the test subset
        ml_method (string): name of ML method, 'LR', 'RF', 'KNN', 'NB', 'SVM'
            (currently unused; kept so existing calls keep working)
        method_string (string): name of ML method applied, used in the title and file name

    Returns:
        None - the heatmap is saved to plots/<method_string>_confmatrix.png and displayed
    """
    cm = confusion_matrix(y_test, y_predict)

    # A fresh figure per call prevents a second heatmap (and colorbar) from being
    # drawn on top of whatever figure happens to be current.
    fig, ax = plt.subplots()

    # fmt='d' prints the integer counts in full; the default format ('.2g') would
    # show counts of 100 and above in scientific notation.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    matrix_title = method_string + ': Confusion Matrix'
    ax.set_title(matrix_title)

    fig_filename = method_string + "_confmatrix.png"
    fig.savefig('plots/' + fig_filename)

    plt.show()

In [None]:
def ml_train_test(X_train, y_train, X_test, y_test, ml_method):
    """
    This function applies an ML method to train and test its predictive power on the input data

    Args:
        X_train (dataframe): train feature subset
        y_train (dataframe): train target subset
        X_test (dataframe): test feature data subset
        y_test (dataframe): test target subset
        ml_method (string): name of ML method, 'LR', 'RF', 'KNN', 'NB', 'SVM'

    Returns:
        model object (model object): the fitted model

    Raises:
        ValueError: if ml_method is not one of the supported method names

    Side effects:
        prints the classification metrics summary table and shows/saves the
        confusion matrix plot (via make_classification and make_confusion_matrix)
    """
    if ml_method == 'LR':
        print(ml_method)
        method_string = 'Logistic Regression'
        model = LogisticRegression(max_iter=100000)
    elif ml_method == 'RF':
        # NOTE: the spelling is kept as is because it is part of the saved plot's file name.
        method_string = 'Random Forrest'
        model = RandomForestClassifier()
    elif ml_method == 'KNN':
        method_string = 'K-Nearest Neighbors'
        model = KNeighborsClassifier()
    elif ml_method == 'NB':
        method_string = 'Naive Bayes'
        model = GaussianNB()
    elif ml_method == 'SVM':
        method_string = 'Support Vector Machine'
        # LinearSVC has no predict_proba; the calibration wrapper supplies it.
        model_svc = LinearSVC(max_iter=100000)
        model = CalibratedClassifierCV(model_svc)
    else:
        # Fail loudly here instead of hitting an unbound `model` further down.
        raise ValueError(
            "Please, enter valid ML method argument: 'LR', 'RF', 'KNN', 'NB', 'SVM' "
            "(got {!r})".format(ml_method)
        )

    y_predict = make_classification(model, X_train, y_train, X_test, y_test, method_string)

    make_confusion_matrix(y_test, y_predict, ml_method, method_string)

    return model

In [None]:
def make_aucroc_curve(X_test, y_test, model_holder):
    """
    This function computes the ROC curve and the AUC score for the 5 ML predictions,
    prints the scores and plots all curves in one figure

    Args:
        X_test (dataframe): test feature data subset
        y_test (dataframe): test target data series
        model_holder (dictionary): holder dictionary of trained models with the keys
            'model_lr', 'model_rf', 'model_knn', 'model_nb' and 'model_svm'; every
            model must provide predict_proba
    Returns:
        None - the AUC ROC curve plot is saved to plots/AUC_ROC_curve.png and displayed
    """
    # (model_holder key, name used in the printout, legend label, line color)
    curve_specs = [
        ('model_lr', "Logistic Regression", "Logistic Regression", "orange"),
        ('model_rf', "Random Forest", "Random Forest", "red"),
        ('model_knn', "K-Nearest Neighbors", "K-Nearest Neighbors", "green"),
        ('model_nb', "Naive Bayes", "Naive Bayes", "yellow"),
        ('model_svm', "Support Vector Machine", "SVM", "blue"),
    ]

    # Evaluate every model before printing or plotting, so a missing model fails
    # before any partial output is produced.
    evaluated = []
    for holder_key, printed_name, legend_label, color in curve_specs:
        # One predict_proba call per model feeds both the curve and the score.
        positive_probs = model_holder[holder_key].predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, positive_probs, pos_label=1)
        auc_score = roc_auc_score(y_test, positive_probs)
        evaluated.append((printed_name, legend_label, color, fpr, tpr, auc_score))

    for printed_name, _, _, _, _, auc_score in evaluated:
        print(printed_name + ": ", auc_score)

    for _, legend_label, color, fpr, tpr, _ in evaluated:
        plt.plot(fpr, tpr, linestyle="--", color=color, label=legend_label)

    plt.title('Receiver Operator Characteristics (ROC)')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive rate')

    plt.legend(loc='best')
    plt.savefig('plots/AUC_ROC_curve.png', dpi=300)
    plt.show()

In [None]:
def make_pred_output(churn_df, X, y, model_holder, ml_method):
    """
    This function creates a dataframe which includes both input data and computed churn probabilities

    Args:
        churn_df (dataframe): input data; rows containing missing values are dropped
        X (dataframe): input feature data the selected model predicts on
        y (dataframe): input target data, stored in the 'churned_bool' column
        model_holder (dictionary): holder dictionary of trained models
        ml_method (string): name of ML method, 'LR', 'RF', 'KNN', 'NB', 'SVM'

    Returns:
        churn_pred_df (dataframe): output dataframe with the added columns
            'churned_bool' and 'churn_predprob'

    Side effects:
        writes output/churn_predicted_probs.csv (the directory is created if missing)
    """
    import os  # local import keeps this cell runnable on its own

    churn_pred_df = churn_df.copy().dropna()
    churn_pred_df['churned_bool'] = y

    model = model_holder['model_' + ml_method.lower()]
    # Column 1 holds the probability of the second class in the model's class order.
    churn_probs = np.asarray(model.predict_proba(X))[:, 1]

    # Column assignment aligns on index labels. After dropna() the frame keeps its
    # original (possibly gappy) labels, so the probabilities must carry the labels
    # of the rows they were computed for; a fresh 0..n-1 index would shift them.
    if isinstance(X, (pd.DataFrame, pd.Series)) and X.index.is_unique:
        prob_index = X.index
    elif len(churn_probs) == len(churn_pred_df):
        # Unlabelled input (e.g. a numpy array): assume the same row order as the frame.
        prob_index = churn_pred_df.index
    else:
        prob_index = None
    churn_pred_df['churn_predprob'] = pd.Series(churn_probs, index=prob_index)

    # Build the path portably; the previous literal 'output\churn...' contained an
    # invalid escape sequence and was not a directory path outside Windows.
    os.makedirs('output', exist_ok=True)
    churn_pred_df.to_csv(os.path.join('output', 'churn_predicted_probs.csv'), index=False)

    return churn_pred_df