In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, scale
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn import preprocessing
# non-standard package: http://contrib.scikit-learn.org/imbalanced-learn/index.html
# https://www.jair.org/media/953/live-953-2037-jair.pdf
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

%matplotlib inline

In [12]:
def loadDataset(df):
    """Prepare a cleaned order table for modelling.

    The frame passed in is left untouched; all changes are made on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the date columns ``order_date``,
        ``account_creation_date``, ``deliverydate_estimated`` and
        ``deliverydate_actual`` (values formatted as ``YYYY-MM-DD``) and the
        columns ``form_of_address``, ``email_domain``, ``payment``,
        ``advertising_code``, ``x_advertising_code_bin`` and
        ``x_order_date_yearweek``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * missing values of every non-date column are replaced by the
          sentinel -99999,
        * the four date columns are parsed to datetimes (a missing date
          becomes ``NaT``),
        * the six columns listed above have ``category`` dtype.
    """
    date_columns = [
        'order_date',
        'account_creation_date',
        'deliverydate_estimated',
        'deliverydate_actual',
    ]
    category_columns = [
        'form_of_address',
        'email_domain',
        'payment',
        'advertising_code',
        'x_advertising_code_bin',
        'x_order_date_yearweek',
    ]

    # remove NA -- but only outside the date columns: a -99999 sentinel in a
    # date column cannot be parsed with the '%Y-%m-%d' format below.
    # fillna() with a dict returns a new frame, so the caller's data is not
    # modified.
    sentinel_by_column = {col: -99999 for col in df.columns if col not in date_columns}
    df = df.fillna(value=sentinel_by_column)

    # Convert Dates
    for col in date_columns:
        df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')

    # Convert Categories
    for col in category_columns:
        df[col] = df[col].astype('category')

    return df

def bads_costs(y_t, yhat, m = np.array([[3., 0.], [-10., 0.]])):
    """Average payoff per observation of binary (0/1) predictions.

    Parameters
    ----------
    y_t : array-like
        True labels, coded 0 / 1.
    yhat : array-like
        Predicted labels, coded 0 / 1, same length as ``y_t``.
    m : array of shape (2, 2)
        Payoff applied element-wise to the confusion matrix. The confusion
        matrix is the one returned by ``sklearn.metrics.confusion_matrix``
        for ``labels=[0, 1]``.

    Returns
    -------
    float
        Sum of ``confusion_matrix * m`` divided by the number of predictions.
    """
    N = len(yhat)
    # Pin the label set: without it the matrix shrinks to 1x1 whenever only
    # one class occurs in both vectors, and the product with the 2x2 payoff
    # matrix would then silently broadcast to a wrong value.
    C = confusion_matrix(y_t, yhat, labels=[0, 1])
    return np.multiply(C, m).sum() / N

In [22]:
use_full_feature_set = False
random_seed = 12345  # shared by the split and the samplers so the cell is reproducible

features_to_use = [
       'x_order_date_num',
       #'form_of_address', 'title', 'email_domain',
       'newsletter', 'model', 'payment',
       'delivery', 'coupon',
       #'postcode_invoice', 'postcode_delivery',
       #'x_advertising_code_bin',  'goods_value', 'giftwrapping',
       'referrer', 'cost_shipping', 
       'x_delivery_time_est', 'x_delivery_time', 'x_created_account', 
       #'weight', 
       'remitted_items', 'canceled_items',
       #'used_items', 
       #'book_count', 'paperback_count', 'schoolbook_count', 'ebook_count', 'audiobook_count', 'audiobook_download_count', 'film_count', 'musical_count', 'hardware_count', 'imported_count', 'other_count'
       ]


def _oversample_training_set(X, Y, method, seed):
    """Balance the classes of (X, Y) and return the resampled pair.

    method: "simple" duplicates randomly drawn class-1 rows (with
    replacement) until both classes have the same size; "SMOTE" and
    "SMOTETomek" use the imbalanced-learn samplers of the same name; any
    other value returns X and Y unchanged.
    """
    if method == "simple":
        n_class0 = (Y == 0).sum()
        n_class1 = (Y == 1).sum()
        sampled_class1_idx = (Y.loc[Y == 1]
                              .sample(frac=n_class0 / n_class1, replace=True, random_state=seed)
                              .index)
        keep_idx = list(Y.loc[Y == 0].index) + list(sampled_class1_idx)
        return X.loc[keep_idx, :].reset_index(drop=True), Y.loc[keep_idx].reset_index(drop=True)
    if method == "SMOTE":
        # regular SMOTE is the default variant; the old `kind` argument no
        # longer exists in current imbalanced-learn releases
        sampler = SMOTE(random_state=seed)
    elif method == "SMOTETomek":
        sampler = SMOTETomek(random_state=seed)
    else:
        return X, Y
    # `fit_sample` was renamed to `fit_resample`; use whichever is available
    resample = getattr(sampler, "fit_resample", None) or sampler.fit_sample
    return resample(X, Y)


# Load and split training  and testing data
train = pd.read_csv("output/train_cleaned.csv", sep=";", index_col="ID")
train = loadDataset(train)
if use_full_feature_set:
    # every column except the target and the raw date columns
    features_to_use = [
        col for col in train.columns
        if col != "return_customer"
        and not pd.api.types.is_datetime64_any_dtype(train[col])
    ]
train = train[features_to_use + ["return_customer"]]
train = pd.get_dummies(train)
X_train, Y_train = train.drop("return_customer", axis=1), train["return_customer"]

test = pd.read_csv("output/test_cleaned.csv", sep=";", index_col="ID")
test = loadDataset(test)
test = test[features_to_use]
test = pd.get_dummies(test)
# Same columns, same order as the training features; dummy levels that do
# not occur in the test file are filled with 0.
test = test.reindex(columns = X_train.columns, fill_value=0)
X_test = test

# Create Cross-Validation Splits
# The hold-out rows are taken BEFORE oversampling: otherwise synthetic or
# duplicated copies of validation rows end up in the training part and the
# validation scores are too optimistic.
X_train_cv, X_valid_cv, Y_train_cv, Y_valid_cv = train_test_split(X_train, Y_train, 
                                                      test_size = 0.15, random_state = random_seed)

# Oversample
oversample = "SMOTETomek"
# training part of the split (used for model selection / validation) ...
X_train_cv, Y_train_cv = _oversample_training_set(X_train_cv, Y_train_cv, oversample, random_seed)
# ... and the complete training set (used for the final fit before predicting X_test)
X_train, Y_train = _oversample_training_set(X_train, Y_train, oversample, random_seed)

X_train.shape, X_test.shape

((83918, 16), (12971, 16))

In [None]:
# Grid/Randomized Search Cross-Validation
# Searches random-forest hyper-parameters on the training part of the split,
# validates the best setting on the hold-out part, then refits on the full
# training data and writes the test-set predictions to disk.

cost_func = bads_costs # bads_costs, roc_auc_score
rf_seed = 12345  # fixes both the sampled candidates and the forests, so a re-run gives the same result

# Cross Validation
rf_params = {'n_estimators':[100, 250], 'min_samples_split':[2, 4, 8], 'min_samples_leaf': [1, 3, 9]}
clf_rf_cv = RandomizedSearchCV(RandomForestClassifier(random_state = rf_seed), 
                         rf_params, 
                         scoring = make_scorer(cost_func), 
                         cv = 4,
                         random_state = rf_seed)
clf_rf_cv.fit(X_train_cv, Y_train_cv)
#clf_rf_cv.cv_results_
joblib.dump(clf_rf_cv.cv_results_, 'output/clf_rf_cv.results.pkl')
print("Cross Valdiation Report:")
print(clf_rf_cv.best_params_)

# Train and Validate a random forest classifier with the best parameters
params_star = clf_rf_cv.best_params_
clf_rf_star = RandomForestClassifier(random_state = rf_seed).set_params(**params_star)
clf_rf_star.fit(X_train_cv, Y_train_cv)
yhat_valid = clf_rf_star.predict(X_valid_cv)
print("Validation Summary:")
print(cost_func(Y_valid_cv, yhat_valid))
# number of rows predicted as class 1 out of all validation rows
print("Validation: {} of {}".format(np.sum(yhat_valid), len(yhat_valid)))
print(confusion_matrix(Y_valid_cv, yhat_valid))

# Train model with all data and use on the Test set
# (this refits the same estimator object, so clf_rf_star is the full-data model from here on)
clf_rf_star.fit(X_train, Y_train)
yhat_test = clf_rf_star.predict(X_test)
np.savetxt("output/test_return_customer.csv", yhat_test.astype(int), fmt='%i', delimiter=";")
print("Testing: {} of {}".format(np.sum(yhat_test), len(yhat_test)))

In [None]:
# Predict on the training rows as loaded (before any oversampling).
# `train` is the dummy-encoded frame, so the model's feature matrix is
# "everything except the target"; selecting `features_to_use` would ask for
# the pre-encoding column names (e.g. 'payment') that no longer exist.
train_features = train.drop("return_customer", axis=1)
yhat_train = clf_rf_star.predict(train_features)
print("Training: {} of {}".format(np.sum(yhat_train), len(yhat_train)))
print(confusion_matrix(train["return_customer"], yhat_train))

In [None]:
# Single Random Forest
# Fits one large forest on the training part of the split, reports the
# validation score and the number of test rows predicted as class 1.

cost_func = roc_auc_score # bads_costs roc_auc_score

clf_rf = RandomForestClassifier(n_estimators=4000, random_state=12345)  # seeded for reproducibility

clf_rf.fit(X_train_cv, Y_train_cv)
yhat_valid = clf_rf.predict(X_valid_cv)
# ROC AUC ranks observations, so it needs the predicted probability of
# class 1 rather than hard 0/1 labels; every other cost function keeps
# receiving the hard labels.
if cost_func is roc_auc_score:
    class1_column = list(clf_rf.classes_).index(1)
    valid_score = cost_func(Y_valid_cv, clf_rf.predict_proba(X_valid_cv)[:, class1_column])
else:
    valid_score = cost_func(Y_valid_cv, yhat_valid)
print("Validation Summary:")
print(valid_score)
print("Validation: {} of {}".format(np.sum(yhat_valid), len(yhat_valid)))
print(confusion_matrix(Y_valid_cv, yhat_valid))

#clf_rf.fit(X_train, Y_train)
print("Test Results")
yhat = clf_rf.predict(X_test)
print("{} of {}".format(np.sum(yhat), len(yhat)))

#clf_rf.fit(train[features_to_use], train["return_customer"])
#yhat = clf_rf.predict(test)
#print("{} of {}".format(np.sum(yhat), len(yhat)))
#joblib.dump(clf_rf, 'output/model_rf.pkl')
#clf_rf = joblib.load('output/model_rf.pkl')
#cross_val_score(clf_rf, X_train, Y_train, scoring=make_scorer(cost_func), cv = 3)

In [None]:
# PCA Analysis
# Projects the standardised training features on the first principal
# components and plots them pairwise, coloured by class; the test set is
# then pushed through the SAME standardisation and rotation for comparison.

num_PC = 5

# Standardisation is fitted on the training data only and reused for the
# test data. Scaling the test set with its own mean/std would put it in a
# different coordinate system than the one the PCA was fitted in.
scaler = preprocessing.StandardScaler()
train_scaled = scaler.fit_transform(X_train)
pca = PCA(n_components=num_PC)
pca.fit(train_scaled)
train_rotated = pca.transform(train_scaled)
df_train = pd.DataFrame(train_rotated)
# assign by position: Y_train may carry an index that does not match df_train's 0..n-1 index
df_train["colors"] = np.asarray(Y_train)
sns.pairplot(df_train, hue = "colors", diag_kind="kde", vars=list(range(num_PC)))
plt.show()
test_scaled = scaler.transform(X_test)
test_rotated = pca.transform(test_scaled)
df_test = pd.DataFrame(test_rotated)
sns.pairplot(df_test, diag_kind="kde", vars=list(range(num_PC)))
plt.show()

In [None]:
# Gradient Boost Classifier (testing)
# Fits a default gradient boosting model on the training part of the split,
# reports the validation score, then refits on the full training data and
# counts the test rows predicted as class 1.

cost_func = roc_auc_score # bads_costs roc_auc_score

clf_gbc = GradientBoostingClassifier(random_state=12345)  # seeded for reproducibility
clf_gbc.fit(X_train_cv, Y_train_cv)
yhat_valid = clf_gbc.predict(X_valid_cv)
# ROC AUC ranks observations, so it needs the predicted probability of
# class 1 rather than hard 0/1 labels; every other cost function keeps
# receiving the hard labels.
if cost_func is roc_auc_score:
    class1_column = list(clf_gbc.classes_).index(1)
    valid_score = cost_func(Y_valid_cv, clf_gbc.predict_proba(X_valid_cv)[:, class1_column])
else:
    valid_score = cost_func(Y_valid_cv, yhat_valid)
print("Validation Summary:")
print(valid_score)
print("Validation: {} of {}".format(np.sum(yhat_valid), len(yhat_valid)))
print(confusion_matrix(Y_valid_cv, yhat_valid))

clf_gbc.fit(X_train, Y_train)
print("Test Results")
yhat = clf_gbc.predict(X_test)
print("{} of {}".format(np.sum(yhat), len(yhat)))

#cross_val_score(clf_gbc, X_train, Y_train, scoring=make_scorer(cost_func), cv = 3)

In [18]:
# Reload the cleaned training file and list the columns that loadDataset
# parsed as datetimes.
train = loadDataset(pd.read_csv("output/train_cleaned.csv", sep=";", index_col="ID"))
datetime_columns = []
for column_name, column_dtype in train.dtypes.items():
    if column_dtype == "datetime64[ns]":
        datetime_columns.append(column_name)
datetime_columns

['order_date',
 'account_creation_date',
 'deliverydate_estimated',
 'deliverydate_actual']