In [None]:
import time
start0 = time.time()

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, scale
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score lives in
# sklearn.model_selection, which this notebook already imports from below.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import learning_curve, StratifiedShuffleSplit, GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.utils import resample

upsample_flag = False
function_flag = True

%matplotlib inline

# Data Preprocessing 

Data for this project was obtained from IBM.

In [None]:
#https://www.ibm.com/communities/analytics/watson-analytics-blog/guide-to-sample-datasets/
url = 'http://bit.ly/gta-mlnd-capstone'
df = pd.read_csv(url)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
#By setting errors to 'coerce' a 'NaN will be inserted when there is an error.
#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.to_numeric.html

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors = 'coerce')

In [None]:
df['MonthlyCharges'].isnull().sum()

In [None]:
df['TotalCharges'].isnull().sum()

In [None]:
df[df['TotalCharges'].isnull()]

In [None]:
# Back-fill every missing TotalCharges (values that to_numeric coerced to NaN above)
# with the same row's MonthlyCharges.
# A boolean mask with .loc replaces the row-by-row DataFrame.set_value() call, which was
# removed in pandas 1.0; under the old bare `except:` the resulting AttributeError was
# swallowed and the NaNs were silently left in place.
missing_total = df['TotalCharges'].isnull()
for index in df.index[missing_total]:
    print(index)  # same per-row log of the patched index labels as before
df.loc[missing_total, 'TotalCharges'] = df.loc[missing_total, 'MonthlyCharges']

# Kept so the following `len(index_row)` cell still runs: it used to collect the rows
# whose update raised, and the vectorised assignment has no per-row failures.
index_row = []

In [None]:
len(index_row)

In [None]:
df[df['tenure'] == 1].head(3)

In [None]:
df['ratio'] = df['TotalCharges'] / df['MonthlyCharges']

In [None]:
df_tenure_1 = df[df['tenure'] == 1]
df_tenure_1.head(10)

In [None]:
df[df['tenure'] == 2].head(3)

In [None]:
df.iloc[4]['MonthlyCharges'] * df.iloc[4]['tenure'] 

In [None]:
df.describe()

In [None]:
# 'Churn' still holds the raw 'No'/'Yes' strings here (it is mapped to 0/1 in a later
# cell), so look the counts up by label instead of relying on the positional fallback
# of Series[int] on a string index, which recent pandas versions deprecate.
churn_counts = df['Churn'].value_counts()
print('The total number of customers who did not churn: {}' .format(churn_counts['No']))
print('The total number of customers who did churn: {}' .format(churn_counts['Yes']))

In [None]:
# Quartiles of MonthlyCharges and the 1.5 * IQR fences used to look for outliers.
Q1 = np.percentile(df['MonthlyCharges'], 25)
Q3 = np.percentile(df['MonthlyCharges'], 75)

step = 1.5 * (Q3 - Q1)

print('The 25% quartile: {}'.format(Q1))
print('The 75% quartile: {}'.format(Q3))
print("1.5 * IQR: {}" .format(step))

print(Q1 - step)  # lower fence
print(Q3 + step)  # upper fence
print(df['MonthlyCharges'].max())
# Previously printed TotalCharges.min(); this cell analyses MonthlyCharges, so compare
# the fences against the extremes of the same column.
print(df['MonthlyCharges'].min())

In [None]:
Q1 = np.percentile(df['TotalCharges'], 25)
Q3 = np.percentile(df['TotalCharges'], 75)

step = 1.5 * (Q3 - Q1)
    
print('25%: {}'.format(Q1))
print('75%: {}'.format(Q3))
print("1.5 * IQR: {}" .format(step))

print(Q1 - step)
print(Q3 + step)
print(df['TotalCharges'].max())
print(df['TotalCharges'].min())

In [None]:
df.columns.tolist()

In [None]:
features = df[[
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']]

In [None]:
features.info()

In [None]:
columns_list = df.columns.tolist()

In [None]:
df.head()

In [None]:
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [None]:
sns.catplot(y="Churn", kind="count", data=df, height=2.6, aspect=2.5, orient='h')
plt.xlabel('Number of Customers')
plt.title('Distribution of Customer Churn', fontsize=12)
plt.show()

In [None]:
sns.catplot(y='InternetService', kind='count', data=df, height=2.5, aspect=2.5, orient='h')
plt.show()

In [None]:
sns.catplot(y='PhoneService', kind='count', data=df, height=2.5, aspect=2.5, orient='h')
plt.show()

In [None]:
sns.catplot(y='PaperlessBilling', kind='count', data=df, height=2.5, aspect=2.5, orient='h')
plt.show()

In [None]:
g = sns.PairGrid(df, y_vars=["tenure"], 
                 x_vars=["MonthlyCharges","TotalCharges"], 
                 height=4.5, 
                 hue="Churn", 
                 aspect=1.5)
g.map(plt.scatter, alpha=0.15)
plt.ylim(0, 80)
plt.show()

In [None]:
g = sns.FacetGrid(df, col="PaperlessBilling", height=5, aspect=.9)
g.map(sns.barplot, "Contract", "Churn", order = ['Month-to-month', 'One year', 'Two year'])
plt.show()

In [None]:
g = sns.FacetGrid(df, col="SeniorCitizen", height=5, aspect=.9)
g.map(sns.barplot, "Partner", "Churn", order= ['Yes', 'No'])
plt.show()

In [None]:
g = sns.FacetGrid(df, col='SeniorCitizen', height=5, aspect=.9)
g.map(sns.barplot, 'PaymentMethod', 'Churn', order = ['Electronic check', 
                                                     'Mailed check',
                                                     'Bank transfer (automatic)',
                                                     'Credit card (automatic)'])
plt.show()

In [None]:
g = sns.FacetGrid(df, col='PhoneService', height=5, aspect=1.0)
g.map(sns.barplot, 'PaymentMethod', 'Churn', order = ['Electronic check', 
                                                     'Mailed check',
                                                     'Bank transfer (automatic)',
                                                     'Credit card (automatic)'])
plt.show()

In [None]:
g = sns.FacetGrid(df, col='InternetService', height=5, aspect=.9)
g.map(sns.barplot, 'Contract', 'Churn', order = ['Month-to-month', 'One year', 'Two year'])
plt.show()

In [None]:
g = sns.FacetGrid(df, col='InternetService', height=5, aspect=.70)
g.map(sns.barplot, 'Contract', 'Churn', order = ['Month-to-month', 'One year', 'Two year'])
plt.show()

In [None]:
g = sns.FacetGrid(df, col='InternetService', height=5, aspect=.50)
g.map(sns.barplot, 'PaperlessBilling', 'Churn', order = ['Yes', 'No'])
plt.show()

In [None]:
sns.catplot(x="Contract", y="MonthlyCharges", hue="Churn", kind="box", data=df, height=4.2, aspect=1.4)
plt.ylabel('Total Monthly Charges ($)', fontsize=12)
plt.show()

In [None]:
# Box plot of TotalCharges per contract type, split by churn.
sns.catplot(x="Contract", y="TotalCharges", hue="Churn", kind="box", data=df, height=4.2, aspect=1.4)
# The y axis shows TotalCharges; the label was copied from the MonthlyCharges plot.
plt.ylabel('Total Charges ($)', fontsize=12)
plt.show()

In [None]:
sns.catplot(x="Contract", y="tenure", hue="Churn", kind="box", data=df, height=4.2, aspect=1.4)
plt.ylabel('Tenure (months)', fontsize=12)
plt.show()

In [None]:
sns.catplot(y="Churn", 
            x="TotalCharges", 
            row="PaymentMethod", 
            kind="box", data=df, height=1.5, aspect=4, orient='h')
plt.show()

In [None]:
sub_index = [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20]
sub_list = [columns_list[x] for x in sub_index]

In [None]:
# Distinct values of every column named in sub_list, in the same order as sub_list.
unique_ = [df[column_name].unique() for column_name in sub_list]
type(unique_)

In [None]:
df_col_summary = pd.DataFrame({'Column_Header' : sub_list, 'Initial_Index': sub_index, 'Unique_Fields' : unique_})   
df_col_summary

In [None]:
df_col_summary.iloc[15]['Unique_Fields'].tolist()

In [None]:
# Integer-encode the categorical columns of df in place.
# Each entry maps a column name to the {label: code} dictionary passed to Series.map.
# 'Churn' is not listed because it was already mapped to 0/1 in an earlier cell.
no_yes = {'No': 0, 'Yes': 1}
phone_levels = {'No phone service': 0, 'No': 1, 'Yes': 2}
internet_levels = {'No internet service': 0, 'No': 1, 'Yes': 2}

category_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'Partner': no_yes,
    'Dependents': no_yes,
    'PhoneService': no_yes,
    'MultipleLines': phone_levels,
    'InternetService': {'DSL': 0, 'Fiber optic': 1, 'No': 2},
    'OnlineSecurity': internet_levels,
    'OnlineBackup': internet_levels,
    'DeviceProtection': internet_levels,
    'TechSupport': internet_levels,
    'StreamingTV': internet_levels,
    'StreamingMovies': internet_levels,
    'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    'PaperlessBilling': no_yes,
    'PaymentMethod': {'Electronic check': 0,
                      'Mailed check': 1,
                      'Bank transfer (automatic)': 2,
                      'Credit card (automatic)': 3},
}

for column_name, codes in category_codes.items():
    df[column_name] = df[column_name].map(codes)

In [None]:
df.head()

In [None]:
df_vect = df.drop(['customerID', 'ratio', 'MonthlyCharges'], axis=1)

In [None]:
df_vect.head()

In [None]:
df_vect.shape

In [None]:
df_vect['Churn'].value_counts()

In [None]:
df_vect_major = df_vect[df_vect['Churn'] == 0]
df_vect_minor = df_vect[df_vect['Churn'] == 1]

In [None]:
print("Major Class Shape: {}" .format(df_vect_major.shape))
print("Minor Class Shape: {}" .format(df_vect_minor.shape))

In [None]:
# Up-sample the minority class (sampling with replacement) to the size of the majority
# class. The target size is taken from df_vect_major instead of the hard-coded 5174 so
# the classes stay balanced if the data set changes.
df_minor_upsample = resample(df_vect_minor, replace=True, n_samples=len(df_vect_major), random_state=42)

In [None]:
df_minor_upsample.shape

In [None]:
df_upsampled = pd.concat([df_vect_major, df_minor_upsample])

In [None]:
df_upsampled.shape

In [None]:
df_upsampled[df_upsampled['Churn'] == 0].shape

In [None]:
df_upsampled[df_upsampled['Churn'] == 1].shape

In [None]:
if upsample_flag:
    X = df_upsampled.drop('Churn', axis=1)
    y = df_upsampled['Churn']
else:
    X = df_vect.drop('Churn', axis=1)
    y = df_vect['Churn']

In [None]:
X.shape

In [None]:
y.shape

In [None]:
X.keys()

In [None]:
X.columns.tolist()

In [None]:
def learning_curves(model, X_training_data, y_training_data, model_name, num_k):
    '''
    Draw the training and cross-validation learning curves of a model.

    The scores come from learning_curve(); for each of the two curves the mean over
    axis 1 of the score array is drawn as a line, with a band of +/- one standard
    deviation around it.

    Inputs:
    model: The estimator object to evaluate.
    X_training_data: Feature data handed to learning_curve().
    y_training_data: Response values that go with X_training_data.
    model_name: Name of the model, as a text string (used in the plot title).
    num_k: Number of cross-validation folds; passed to learning_curve() as `cv`.

    Returns the value returned by plt.show().
    '''
    plt.figure()
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(model, X_training_data, y_training_data, cv=num_k)

    plt.title("Learning Curves: " + model_name + " With Standard Scaler")

    # One (mean, std, colour, legend label) tuple per curve.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "b", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +/- 1 std bands first, then the mean lines on top.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, legend_label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=legend_label)

    plt.ylim([0.0, 1.25])
    plt.legend(loc="best")
    return plt.show()

In [None]:
def auc_roc_curves(model, X_test_data_, y_test_data_, model_name):
    '''
    Plot the ROC curve of a fitted binary classifier and show its AUC in the legend.

    Inputs:
    model: Fitted classifier exposing predict_proba() or decision_function().
    X_test_data_: Feature data to score.
    y_test_data_: True labels that go with X_test_data_.
    model_name: Name of the model, as a text string (used in the legend and title).

    Returns the value returned by plt.show().
    '''
    # Continuous scores for the positive class. Estimators without predict_proba
    # (e.g. an SVC built with probability=False) fall back to decision_function,
    # which roc_curve / roc_auc_score accept just as well.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test_data_)[:, 1]
    else:
        y_score = model.decision_function(X_test_data_)

    # Both metrics use the labels passed in (not the notebook-level y_test) and the
    # same scores, so the area printed in the legend is the area under the plotted
    # curve rather than an AUC computed from hard class predictions.
    roc_auc = roc_auc_score(y_test_data_, y_score)
    fpr, tpr, _ = roc_curve(y_test_data_, y_score)

    plt.figure()

    plt.plot(fpr, tpr, label=model_name + ' (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: ' + model_name)
    plt.legend(loc="lower right")
    return plt.show()

In [None]:
def lr_coef_curves(model, X_data):
    '''
    Bar chart of a fitted linear model's coefficients, one bar per feature.

    Inputs:
    model: Fitted estimator; the first row of its coef_ attribute is plotted,
           rounded to one decimal place.
    X_data: DataFrame whose column names label the bars.

    Returns the value returned by plt.show().
    '''
    coefficients = np.round(model.coef_[0], decimals=1)
    feature_names = X_data.columns.tolist()
    bar_positions = range(1, len(coefficients) + 1)

    plt.bar(x=bar_positions, height=coefficients, tick_label=feature_names)
    plt.title('LR Coefficient Values', fontsize=14)
    plt.xlabel('Feature', fontsize=14)
    plt.ylabel('LR Coefficient Value', fontsize=14)
    plt.xticks(rotation=90)
    plt.grid(True)
    return plt.show()

In [None]:
def pca_results_(good_data, pca):
    '''
    Summarise a fitted PCA as a DataFrame and draw its feature weights.

    good_data: data whose keys() name the original features (used as column labels)
    pca: fitted PCA object; components_ and explained_variance_ratio_ are read
    return: DataFrame with one row per component, holding the explained variance
            ratio followed by the weight of every feature (rounded to 4 decimals)

    A grouped bar chart of the feature weights per component is drawn as a side
    effect, with each component's explained variance ratio written above it.
    '''
    n_dims = len(pca.components_)

    # Row labels shared by both tables: 'Dimension 1', 'Dimension 2', ...
    dimensions = ['Dimension {}'.format(k) for k in range(1, n_dims + 1)]

    # Feature weights: one row per component, one column per original feature
    components = pd.DataFrame(np.round(pca.components_, 4),
                              index=dimensions,
                              columns=list(good_data.keys()))

    # Explained variance ratios as a single-column table with the same row labels
    variance_ratios = pd.DataFrame(
        np.round(pca.explained_variance_ratio_.reshape(n_dims, 1), 4),
        index=dimensions,
        columns=['Explained Variance'])

    # Bar chart of the feature weights, grouped by component
    _fig, ax = plt.subplots(figsize=(14, 8))
    components.plot(ax=ax, kind='bar')
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dimensions, rotation=0)

    # Annotate each group of bars with its explained variance ratio
    for position, ratio in enumerate(pca.explained_variance_ratio_):
        ax.text(position - 0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n          %.4f" % ratio)

    return pd.concat([variance_ratios, components], axis=1)

def biplot(good_data, reduced_data, pca):
    '''
    Draw a biplot: a scatter of the reduced data overlaid with arrows showing how
    each original feature projects onto the first two principal components.

    good_data: original data, before transformation.
               Needs to be a pandas dataframe with valid column names
    reduced_data: the reduced data; its 'Dimension_1' and 'Dimension_2' columns
                  are plotted
    pca: pca object that contains the components_ attribute
    return: the matplotlib axes the plot was drawn on (for any additional
            customization)

    This procedure is inspired by the script:
    https://github.com/teddyroland/python-biplot
    '''
    # Scaling factors that stretch the arrows and push the labels just past their tips
    arrow_scale = 7.0
    label_scale = 8.0

    _fig, ax = plt.subplots(figsize=(14, 8))

    # Scatter of the reduced data
    ax.scatter(x=reduced_data['Dimension_1'],
               y=reduced_data['Dimension_2'],
               facecolors='b', edgecolors='b', s=70, alpha=0.005)

    # One arrow and one text label per original feature; each row of the transposed
    # component matrix holds that feature's weight on every component.
    for position, weights in enumerate(pca.components_.T):
        first, second = weights[0], weights[1]
        ax.arrow(0, 0, arrow_scale*first, arrow_scale*second,
                 head_width=0.2, head_length=0.2, linewidth=2, color='red')
        ax.text(first*label_scale, second*label_scale, good_data.columns[position],
                color='black', ha='center', va='center', fontsize=18)

    ax.set_xlabel("Dimension 1", fontsize=14)
    ax.set_ylabel("Dimension 2", fontsize=14)
    ax.set_title("Principal Component Plane With Original Feature Projections.", fontsize=16)
    return ax

# Data Scaling Using `StandardScaler`.

In [None]:
standard_scaler = StandardScaler()

In [None]:
print(standard_scaler.fit(X))
X_std = standard_scaler.transform(X)

In [None]:
#https://chrisalbon.com/machine_learning/model_evaluation/split_data_into_training_and_test_sets/
#Tl;dr: Split the data set using train_test_split first then apply standard scaler. 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42, 
                                                    shuffle=True, 
                                                    stratify=y)
#http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
#This means that we don't need to run Stratified_Shuffle_Split.

In [None]:
print(standard_scaler.fit(X_train))
X_train_std_df = standard_scaler.transform(X_train)
X_test_std_df = standard_scaler.transform(X_test)

In [None]:
print("Shape of X_train: {}".format(X_train.shape))
print("Shape of X_test: {}".format(X_test.shape))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of y_test: {}".format(y_test.shape))

# Part 1: Model development and tuning using `LogisticRegression`.

In [None]:
#LR base with standard scaler
lr = LogisticRegression(random_state=42)
lr.fit(X_train_std_df, y_train)
y_pred_class = lr.predict(X_test_std_df)

print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('LR coef: {}' .format(lr.coef_[0]))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
lr_coef_curves(lr, X)

In [None]:
auc_roc_curves(lr, X_test_std_df, y_test, 'Logistic Regression')

In [None]:
learning_curves(lr, X_train_std_df, y_train, 'Logistic Regression', 10)

In [None]:
#http://scikit-learn.org/0.15/modules/generated/sklearn.cross_validation.cross_val_score.html#sklearn.cross_validation.cross_val_score
lr_std_scores = cross_val_score(lr, X_std, y, cv=10, scoring='roc_auc')
lr_std_scores

In [None]:
if function_flag:
    # liblinear supports both the 'l1' and 'l2' penalties searched below. Recent
    # scikit-learn releases default to lbfgs, which rejects 'l1', so the solver is
    # named explicitly (it is also the solver used for lr_tuned further down).
    lr_grid = LogisticRegression(random_state=42, solver='liblinear')
    penalty = ['l1', 'l2']
    # Ten values of C, log-spaced from 10**0 to 10**1. The former fourth positional
    # argument (100) was bound to `endpoint`, not `base`, so the grid is unchanged.
    C = np.logspace(0, 1, num=10)
    hyperparameters = dict(C=C, penalty=penalty)
    grid_results = GridSearchCV(lr_grid,
                                hyperparameters,
                                verbose=3,
                                cv=10,
                                scoring='roc_auc').fit(X_train_std_df, y_train) #Will output the probability.

In [None]:
try:
    print(grid_results.best_params_)
except NameError:
    # grid_results only exists when the grid-search cell ran (function_flag is True).
    # Any other error is no longer hidden.
    print('Grid search was skipped; no best parameters to show.')

In [None]:
try:
    print(grid_results.best_score_) #It is assumed that this combination will yield the best performing model.
except NameError:
    # grid_results only exists when the grid-search cell ran (function_flag is True).
    # Any other error is no longer hidden.
    print('Grid search was skipped; no best score to show.')

In [None]:
y_pred_class = grid_results.predict(X_test_std_df)

In [None]:
lr_ = grid_results.best_estimator_
print(lr_)

In [None]:
# Tuned logistic regression with hard-coded hyper-parameters.
# NOTE(review): presumably copied from the best_estimator_ printed above -- confirm.
# Only the non-default settings are spelled out: the other keyword arguments repeated
# library defaults, and multi_class='ovr' is deprecated in recent scikit-learn releases
# (for this binary problem with liblinear it does not change the fitted model).
lr_tuned = LogisticRegression(C=3.5938136638046276, penalty='l2', solver='liblinear',
                              random_state=42)

In [None]:
#https://stackoverflow.com/questions/49061575/why-when-i-use-gridsearchcv-with-roc-auc-scoring-the-score-is-different-for-gri
lr_tuned.fit(X_train_std_df, y_train)
y_pred_class = lr_tuned.predict(X_test_std_df) #Outputs the class type and not the probabilities.

print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('LR coef: {}' .format(lr_tuned.coef_[0]))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
auc_roc_curves(lr_tuned, X_test_std_df, y_test, 'Logistic Regression')

In [None]:
learning_curves(lr_tuned, X_train_std_df, y_train, 'Logistic Regression', 10)

In [None]:
lr_coef_curves(lr_tuned, X)

In [None]:
#Classification report of tuned model.
print(classification_report(y_test, y_pred_class))

# Part 2: Model development and tuning using `XGBClassifier`.

In [None]:
#https://machinelearningmastery.com/develop-first-xgboost-model-python-scikit-learn/
#https://github.com/dmlc/xgboost/tree/master/demo/guide-python

#XGB base with standard scaler
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_std_df, y_train)

In [None]:
# predict() already returns class labels, so the former element-wise round() was a
# no-op and `y_pred_prob` never held probabilities. Keep both names, each with the
# content its name promises.
y_pred_prob = xgb_model.predict_proba(X_test_std_df)[:, 1]  # P(churn) per test row
y_pred_class = xgb_model.predict(X_test_std_df)             # hard 0/1 labels

In [None]:
print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
learning_curves(xgb_model, X_train_std_df, y_train, "XGB Classifier", 10)

In [None]:
auc_roc_curves(xgb_model, X_test_std_df, y_test, 'XGB Classifier')

In [None]:
# https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e
if function_flag:
    xgb_grid = XGBClassifier(random_state=42)
    learning_rate = [0.01, 0.1, 1]
    max_depth = [1, 3, 5, 7]
    n_estimators = [100, 1000]
    hyperparameters = dict(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators)
    grid_results = GridSearchCV(xgb_grid, 
                                hyperparameters, 
                                verbose=3, 
                                cv=10, 
                                scoring='roc_auc').fit(X_train_std_df, y_train) #Will output the probability.
else:
    pass

In [None]:
print(grid_results.best_params_)

In [None]:
print(grid_results.best_score_) #It is assumed that this combination will yield the best performing model.

In [None]:
xgb_ = grid_results.best_estimator_
print(xgb_)

In [None]:
xgb_tuned = grid_results.best_estimator_

In [None]:
xgb_tuned.fit(X_train_std_df, y_train)
y_pred_prob = xgb_tuned.predict(X_test_std_df)
y_pred_class = [round(each) for each in y_pred_prob]

In [None]:
print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
learning_curves(xgb_tuned, X_train_std_df, y_train, "XGB Classifier", 10)

In [None]:
auc_roc_curves(xgb_tuned, X_test_std_df, y_test, 'XGB Classifier')

# Part 3: Model development and tuning using `SVC`.

In [None]:
svc_model = SVC(random_state=42, probability=True)
svc_model.fit(X_train_std_df, y_train)
y_pred_class = svc_model.predict(X_test_std_df)
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

In [None]:
print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
learning_curves(svc_model, X_train_std_df, y_train, "SVC", 10)

In [None]:
auc_roc_curves(svc_model, X_test_std_df, y_test, 'SVC')

In [None]:
if function_flag:
    svc_grid = SVC(random_state=42)
    # The original list contained 0.1 twice (written as 0.10 and 0.1), which repeated
    # the same fits for every gamma/kernel combination without changing the result.
    # NOTE(review): the duplicate may have been meant as 0.01 or 1 -- confirm before
    # widening the grid.
    C = [0.001, 0.1, 10, 25, 50, 100, 1000]
    gamma = [1e-2, 1e-3, 1e-4, 1e-5]
    kernel = ['rbf', 'poly']
    hyperparameters = dict(C=C, gamma=gamma, kernel=kernel)
    grid_results = GridSearchCV(svc_grid,
                                hyperparameters,
                                verbose=3,
                                cv=10,
                                scoring='roc_auc').fit(X_train_std_df, y_train) #Will output the probability.

In [None]:
print(grid_results.best_params_)

In [None]:
print(grid_results.best_score_) #It is assumed that this combination will yield the best performing model.

In [None]:
svc_ = grid_results.best_estimator_
print(svc_)

In [None]:
# Rebuild the winning configuration with probability=True. The grid-search estimator
# is created without it, so its best_estimator_ has no predict_proba, which the ROC
# plotting helper calls on svc_tuned further down. The estimator is fitted in the
# next cell.
svc_tuned = SVC(random_state=42, probability=True, **grid_results.best_params_)

# SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
#                 decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
#                 max_iter=-1, probability=True, random_state=42, shrinking=True,
#                 tol=0.001, verbose=False)

In [None]:
svc_tuned.fit(X_train_std_df, y_train)

In [None]:
# svc_tuned was fitted on the training split in the previous cell; fitting it a second
# time here only repeated the same (slow) training run.
y_pred_class = svc_tuned.predict(X_test_std_df) #Outputs the class type and not the probabilities.

In [None]:
print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
start1 = time.time()

learning_curves(svc_tuned, X_train_std_df, y_train, "SVC", 10)

end1 = time.time()
print('Elapsed time: {0:.2f} min' .format((end1 - start1)/60))

In [None]:
start2 = time.time()

auc_roc_curves(svc_tuned, X_test_std_df, y_test, 'SVC')

end2 = time.time()
print('Elapsed time: {0:.2f} min' .format((end2 - start2)/60))

# Step 4: `PCA` Analysis.

In [None]:
pca = PCA(n_components=10).fit(X_std)
pca_results = pca.transform(X_std)

In [None]:
plt.plot(pca.explained_variance_ratio_)
plt.xlabel('Number of Principal Components', fontsize=14)
plt.ylabel('Explained Variance', fontsize=14)
plt.title('PCA Scree Plot', fontsize = 14)
plt.grid(True)
plt.show()

In [None]:
pca = PCA(n_components=2).fit(X_std)
pca_results = pca.transform(X_std)

pca_reduced_df = pd.DataFrame(pca_results, columns=["Dimension_1", "Dimension_2"])
pca_reduced_df.head()

In [None]:
pca_results_(X, pca)

In [None]:
biplot(X, pca_reduced_df, pca)

In [None]:
#https://stats.stackexchange.com/questions/244677/how-to-decide-between-pca-and-logistic-regression

In [None]:
lr_coef_curves(lr_tuned, X)

# Step 5: Run new model with the most important features (i.e., reduced feature set).

In [None]:
df_reduced = df[['tenure','MultipleLines', 'InternetService', 'StreamingTV', 'StreamingMovies', 'Contract','PaperlessBilling','TotalCharges']]

In [None]:
df_reduced.head()

In [None]:
# Feature matrix for the reduced-feature model.
# `X = df_upsampled` used every column, including the 'Churn' target itself, and its
# rows only line up with y when upsample_flag is True. Select the reduced feature
# columns from the same frame y was taken from instead.
reduced_columns = df_reduced.columns.tolist()
X = df_upsampled[reduced_columns] if upsample_flag else df_vect[reduced_columns]
X.shape

In [None]:
y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42, 
                                                    shuffle=True,
                                                    stratify=y)

In [None]:
print(standard_scaler.fit(X_train))
X_train_std_red = standard_scaler.transform(X_train)
X_test_std_red = standard_scaler.transform(X_test)

In [None]:
#XGB base with standard scaler and a reduced feature set. 
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_std_red, y_train)

In [None]:
y_pred_prob = xgb_model.predict(X_test_std_red)
y_pred_class = [round(each) for each in y_pred_prob]
print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
learning_curves(xgb_model, X_train_std_red, y_train, "XGB Classifier", 10)

In [None]:
auc_roc_curves(xgb_model, X_test_std_red, y_test, 'XGB Classifier')

In [None]:
if function_flag:
    xgb_grid = XGBClassifier(random_state=42)
    learning_rate = [0.01, 0.1, 1]
    max_depth = [1, 3, 5, 7]
    n_estimators = [100, 1000]
    hyperparameters = dict(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators)
    # Tune on the reduced-feature training matrix built in this section; the search
    # previously fitted on X_train_std_df, the full-feature matrix from Part 1.
    grid_results = GridSearchCV(xgb_grid,
                                hyperparameters,
                                verbose=3,
                                cv=10,
                                scoring='roc_auc').fit(X_train_std_red, y_train) #Will output the probability.

In [None]:
print(grid_results.best_params_)

In [None]:
print(grid_results.best_score_) #It is assumed that this combination will yield the best performing model.

In [None]:
xgb_ = grid_results.best_estimator_
print(xgb_)

In [None]:
xgb_reduced = grid_results.best_estimator_

# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#                             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
#                             max_depth=1, min_child_weight=1, missing=None, n_estimators=1000,
#                             n_jobs=1, nthread=None, objective='binary:logistic',
#                             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
#                             seed=None, silent=True, subsample=1)

In [None]:
# Evaluate the estimator tuned in this section (xgb_reduced) on the reduced-feature
# matrices. The cell previously refitted xgb_tuned, the Part 2 model, on the
# full-feature data, so the reduced model was never scored.
xgb_reduced.fit(X_train_std_red, y_train)
y_pred_prob = xgb_reduced.predict_proba(X_test_std_red)[:, 1]  # P(churn) per test row
y_pred_class = xgb_reduced.predict(X_test_std_red)             # hard 0/1 labels

In [None]:
print('ROC_AUC Score: {}'. format(roc_auc_score(y_test, y_pred_class)))
print('---')
print('Confusion Matrix:')
print('{}'.format(confusion_matrix(y_test, y_pred_class)))

In [None]:
def xgb_boost_function(X_data, y_data):
    '''
    Train a default XGBClassifier on a standardised 75/25 stratified split of the
    given data and score it on the held-out part.

    Inputs:
    X_data: Feature data.
    y_data: Response values that go with X_data.

    Returns the roc_auc_score of the predicted class labels (the output of
    predict(), not probabilities) against the held-out labels.
    '''
    # Stratify on the labels that were passed in; the split previously used the
    # notebook-level `y`, which only worked while it happened to match y_data.
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,
                                                        test_size=0.25,
                                                        random_state=42,
                                                        shuffle=True,
                                                        stratify=y_data)

    # Fit the scaler on the training part only, then apply it to both parts.
    standard_scaler = StandardScaler()
    standard_scaler.fit(X_train)
    X_train_std_red_ = standard_scaler.transform(X_train)
    X_test_std_red_ = standard_scaler.transform(X_test)

    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_std_red_, y_train)

    # predict() already yields class labels, so no rounding is needed.
    y_pred_class = xgb_model.predict(X_test_std_red_)
    return roc_auc_score(y_test, y_pred_class)

In [None]:
feature_list = ['tenure','MultipleLines', 'InternetService', 'StreamingTV', 'StreamingMovies', 'Contract','PaperlessBilling','TotalCharges']

In [None]:
X.shape

In [None]:
# Score the XGB model on a growing feature set: first feature_list[0] alone, then the
# first two features, and so on. Only the source frame depends on upsample_flag.
source_frame = df_upsampled if upsample_flag else df_vect

initial_feature_list, seq_feature_list, roc_scores = [], [], []
for each in feature_list:
    # Build a new list each round so every stored snapshot stays independent.
    initial_feature_list = initial_feature_list + [each]
    X = source_frame[initial_feature_list]
    roc_scores.append(xgb_boost_function(X, y))
    seq_feature_list.append(initial_feature_list)

In [None]:
scores_ = np.round(roc_scores, decimals = 3)
plt.bar(x = range(1, len(scores_)+1), height=scores_, tick_label = ['/ '.join(i) for i in seq_feature_list])
plt.xlabel('Feature', fontsize = 14)
plt.ylabel('XGB AUC Score', fontsize = 14)
plt.title('AUC Score as a Function of Feature Inclusion', fontsize = 14)
plt.xticks(rotation=90)
plt.grid(True)
plt.ylim([0.0, 0.8])
plt.show()

In [None]:
end0 = time.time()
print('Elapsed time: {0:.2f} min' .format((end0 - start0)/60))