In [1]:

import random as rand                     # random number gen
import pandas as pd                       # data science essentials
import numpy as np
import matplotlib.pyplot as plt                      # data visualization
import seaborn as sns                      # enhanced data viz
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz             # exports graphics
from six import StringIO           # saves objects in memory
from IPython.display import Image                    # displays on frontend
import pydotplus   # interprets dot objects
from sklearn.model_selection import train_test_split   # train-test split
from sklearn.metrics import roc_auc_score              # auc score
from sklearn.model_selection import RandomizedSearchCV # hyperparameter tuning
from sklearn.metrics import make_scorer                # customizable scorer
from sklearn.metrics import confusion_matrix           # confusion matrix
from sklearn.ensemble import GradientBoostingClassifier # gbm
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


########################################
# display_tree
########################################
def display_tree(tree, feature_df, height=500, width=800):
    """
    PARAMETERS
    ----------
    tree       : fitted tree model object
        fitted CART model to visualized
    feature_df : DataFrame
        DataFrame of explanatory features (used to generate labels)
    height     : int, default 500
        height in pixels to which to constrain image in html
    width      : int, default 800
        width in pixels to which to constrain image in html
    """

    # visualizing the tree
    dot_data = StringIO()

    # exporting tree to graphviz
    export_graphviz(decision_tree=tree,
                    out_file=dot_data,
                    filled=True,
                    rounded=True,
                    special_characters=True,
                    feature_names=feature_df.columns)

    # declaring a graph object
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

    # creating image
    img = Image(graph.create_png(),
                height=height,
                width=width)

    return img


########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export=False):
    """
    Plots the importance of features from a CART model.

    PARAMETERS
    ----------
    model  : CART model
    train  : explanatory variable training data
    export : whether or not to export as a .png image, default False
    """

    # declaring the number
    n_features = x_train.shape[1]

    # setting plot window
    fig, ax = plt.subplots(figsize=(12, 9))

    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(pd.np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

    if export == True:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')


chef = pd.read_excel('Apprentice_Chef_Dataset.xlsx')
def text_split_feature(col, df, sep=' ', new_col_name='number_of_names'):
    """
Splits values in a string Series (as part of a DataFrame) and sums the number
of resulting items. Automatically appends summed column to original DataFrame.

PARAMETERS
----------
col          : column to split
df           : DataFrame where column is located
sep          : string sequence to split by, default ' '
new_col_name : name of new column after summing split, default
               'number_of_names'
"""

    df[new_col_name] = 0

    for index, val in df.iterrows():
        df.loc[index, new_col_name] = len(df.loc[index, col].split(sep=' '))

text_split_feature(col='NAME',df=chef)

chef['number_of_names'].value_counts().sort_index()
one_hot_num_name = pd.get_dummies(chef['number_of_names'])

#set the log functions
chef = chef.drop(['REVENUE', 'NAME', 'EMAIL', 'FIRST_NAME', 'FAMILY_NAME', 
                        'UNIQUE_MEALS_PURCH', 'PRODUCT_CATEGORIES_VIEWED'], axis=1)

chef_data = chef.drop(['CROSS_SELL_SUCCESS'], axis=1)
chef_target = chef['CROSS_SELL_SUCCESS']

# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(chef_data, chef_target, test_size=0.25, random_state=219,
                                                    stratify=chef_target)


# merging training data for statsmodels
chef_train = pd.concat([x_train, y_train], axis=1)

# instantiating a logistic regression model object
logistic_small = smf.logit(formula = """CROSS_SELL_SUCCESS ~
MOBILE_NUMBER +
CANCELLATIONS_BEFORE_NOON +
CANCELLATIONS_AFTER_NOON +
TASTES_AND_PREFERENCES +
PC_LOGINS +
EARLY_DELIVERIES +
CONTACTS_W_CUSTOMER_SERVICE +
MOBILE_LOGINS +
WEEKLY_PLAN +
LATE_DELIVERIES +
PACKAGE_LOCKER +
REFRIGERATED_LOCKER +
AVG_PREP_VID_TIME +
LARGEST_ORDER_SIZE +
MASTER_CLASSES_ATTENDED +
MEDIAN_MEAL_RATING +
AVG_CLICKS_PER_VISIT +
TOTAL_PHOTOS_VIEWED +
TOTAL_MEALS_ORDERED +
AVG_TIME_PER_SITE_VISIT
""", data=chef_train)


# fitting the model object
results_logistic = logistic_small.fit()

# train/test split with the full model
chef_data_sig = chef.loc[:, ['CANCELLATIONS_BEFORE_NOON', 'TASTES_AND_PREFERENCES', 'PC_LOGINS',
                             'EARLY_DELIVERIES']]
chef_target_sig = chef['CROSS_SELL_SUCCESS']


# this is the exact code we were using before
x_train_sig, x_test_sig, y_train_sig, y_test_sig = train_test_split(
            chef_data_sig,
            chef_target_sig,
            random_state=219,
            test_size=0.25,
            stratify=chef_target_sig)

# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(x_train_sig, y_train_sig)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(x_test_sig)

# saving scoring data for future use
logreg_train_score = logreg_fit.score(x_train_sig, y_train_sig).round(4) # accuracy
logreg_test_score  = logreg_fit.score(x_test_sig, y_test_sig).round(4)   # accuracy


# displaying and saving the gap between training and testing
logreg_test_gap = abs(logreg_train_score - logreg_test_score).round(4)

# unpacking the confusion matrix
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test_sig, y_pred = logreg_pred).ravel()


# saving AUC score for future use
logreg_auc_score = roc_auc_score(y_true  = y_test_sig,
                                 y_score = logreg_pred).round(decimals = 4)

# zipping each feature name to its coefficient
logreg_model_values = zip(['MOBILE_NUMBER', 'CANCELLATIONS_BEFORE_NOON', 'TASTES_AND_PREFERENCES', 'PC_LOGINS',
                           'EARLY_DELIVERIES'],
                          logreg_fit.coef_.ravel().round(decimals=2))

# setting up a placeholder list to store model features
logreg_model_lst = [('intercept', logreg_fit.intercept_[0].round(decimals=2))]

# printing out each feature-coefficient pair one by one
for val in logreg_model_values:
    logreg_model_lst.append(val)

#Tree
########################################

# INSTANTIATING a classification tree object
full_tree = DecisionTreeClassifier()


# FITTING the training data
full_tree_fit = full_tree.fit(x_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(x_test)

# saving scoring data for future use
full_tree_train_score = full_tree_fit.score(x_train, y_train).round(4) # accuracy
full_tree_test_score  = full_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC
full_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                      y_score = full_tree_pred).round(4) # auc

# unpacking the confusion matrix
full_tree_tn, \
full_tree_fp, \
full_tree_fn, \
full_tree_tp = confusion_matrix(y_true = y_test, y_pred = full_tree_pred).ravel()


# calling display_tree
display_tree(tree       = full_tree_fit,
             feature_df = x_train)


# INSTANTIATING a classification tree object
pruned_tree = DecisionTreeClassifier(max_depth = 4,
                                     min_samples_leaf = 25,
                                     random_state = 219)


################
# FITTING the training data
pruned_tree_fit  = pruned_tree.fit(x_train, y_train)


# PREDICTING on new data
pruned_tree_pred = pruned_tree_fit.predict(x_test)


# saving scoring data for future use
pruned_tree_train_score = pruned_tree_fit.score(x_train, y_train).round(4) # accuracy
pruned_tree_test_score  = pruned_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving auc score
pruned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = pruned_tree_pred).round(4) # auc

pruned_tree_tn, \
pruned_tree_fp, \
pruned_tree_fn, \
pruned_tree_tp = confusion_matrix(y_true = y_test, y_pred = pruned_tree_pred).ravel()


# creating a dictionary for model results
model_performance = {

    'Model Name': ['Logistic', 'Full Tree', 'Pruned Tree'],

    'AUC Score': [logreg_auc_score, full_tree_auc_score, pruned_tree_auc_score],

    'Training Accuracy': [logreg_train_score, full_tree_train_score,
                          pruned_tree_train_score],

    'Testing Accuracy': [logreg_test_score, full_tree_test_score,
                         pruned_tree_test_score],

    'Confusion Matrix': [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                         (full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp),
                         (pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp)]}

# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)






#######################################
#单独改良的GBM模型
#!pip3 install imblearn
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# read data from excel file
df = pd.read_excel('Apprentice_Chef_Dataset.xlsx')
data = df.drop(columns = ['REVENUE', 'NAME', 'EMAIL', 'FIRST_NAME', 'FAMILY_NAME', 
                        'UNIQUE_MEALS_PURCH', 'PRODUCT_CATEGORIES_VIEWED'])
# split x and y
df_y = chef['CROSS_SELL_SUCCESS']
df_X = chef.drop(['CROSS_SELL_SUCCESS'], axis=1)

ros = RandomOverSampler(random_state=219)
X_resampled, y_resampled  =  ros.fit_sample(df_X, df_y)
# Normalize features
X_resampled = (X_resampled - np.min(X_resampled)) / (np.max(X_resampled) - np.min(X_resampled)).values
# target variable is stratified
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.25, random_state=219, stratify=y_resampled)
# max_depth for classification tree, random forest, and gradient boosted machine (GBM) models is less than or equal to 8
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(random_state=219, max_depth=7)
# train
model.fit(x_train ,y_train)
# predict
ygbm_pred = model.predict(x_test)
# Training Accuracy
traingbm_acc = model.score(x_train, y_train)
# Testing Accuracy
testgbm_acc = model.score(x_test, y_test)
# Confusion Matrix
cmgbm = confusion_matrix(y_test, ygbm_pred)
# AUC Score
# roc_auc_score(y_test, model.decision_function(x_test))
aucgbm = roc_auc_score(y_test, model.predict_proba(x_test)[:,1])

# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'        : 'GBM - !CHOOSE THIS! ',
                          'Training Accuracy'  : traingbm_acc,
                          'Testing Accuracy'   : testgbm_acc,
                          'AUC Score'          : aucgbm,
                          'Confusion Matrix'   : cmgbm},
                          ignore_index = True)
############################################
# checking the results
model_performance
model_performance.sort_values(by = 'AUC Score',
                              ascending = False)

#the result may spend some time, please wait for it!



Optimization terminated successfully.
         Current function value: 0.595707
         Iterations 5


Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
3,GBM - !CHOOSE THIS!,0.907654,0.998486,0.826021,"[[300, 30], [85, 246]]"
2,Pruned Tree,0.6419,0.7272,0.7228,"(65, 91, 44, 287)"
1,Full Tree,0.6042,1.0,0.6509,"(74, 82, 88, 243)"
0,Logistic,0.5017,0.6813,0.6797,"(1, 155, 1, 330)"


The second task is to do classification analysis, which is a more comprehensive way to analyze data by building different models. We chose CROSS_SELL_SUCCESS and other related data so that we can make better predictions and choose a better development plan for the company. The highest AUC score is the GBM model, which has 0.907.