In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Loading Data

In [7]:
# Read the avocado dataset into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv('avocado.csv')

# Data exploration

In [8]:
# simple way to check the contents of categorical features
def feature_check(X, max_listed=10):
    """Print a cardinality summary for every categorical-like column of ``X``.

    A column is reported when its dtype is ``object``, a pandas string dtype
    or ``category``. For each reported column one line is printed with the
    number of distinct values (as counted by ``Series.unique``, so a missing
    value counts as its own category). When that number does not exceed
    ``max_listed`` the distinct values themselves are printed as well.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are inspected.
    max_listed : int, default 10
        Largest number of distinct values for which the values are listed.

    Returns
    -------
    numpy.ndarray
        1-D array with the names of the reported columns. The most recently
        visited column comes first (i.e. reverse column order), which is the
        order existing callers already receive. When no column qualifies an
        empty *string* array is returned, so the dtype does not silently
        become float.
    """
    reported = []
    for col_name in X.columns:
        column = X[col_name]
        dtype = column.dtypes
        is_categorical_like = (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if not is_categorical_like:
            continue

        categories = column.unique()
        unique_cat = len(categories)
        reported.append(col_name)

        if unique_cat > max_listed:
            print("Feature '{col_name}' has {unique_cat} unique categories".format(
                col_name=col_name, unique_cat=unique_cat))
        else:
            print("Feature '{col_name}' has {unique_cat} unique categories, which are {c}".format(
                col_name=col_name, unique_cat=unique_cat, c=categories))

    if not reported:
        # An untyped empty list would become a float64 array.
        return np.array([], dtype=str)
    # Build the array once instead of re-stacking it for every column.
    return np.array(reported[::-1])
  
# Print the per-column summary and keep the array of reported column names.
cat_cols = feature_check(df)

Feature 'Date' has 169 unique categories
Feature 'type' has 2 unique categories, which are ['conventional' 'organic']
Feature 'region' has 54 unique categories


# Pre-processing

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Modeling

In [3]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Class-membership probabilities for the held-out samples.
# predict_proba requires the feature matrix as its argument; calling it with
# no arguments raises a TypeError.
# NOTE(review): `clf` (a fitted classifier) and `X_test` are not defined in the
# cells shown here. `X_test` is assumed to be the feature split that matches
# the `y_test` used by the evaluation cells below -- confirm both names once
# the training cell exists.
y_score = clf.predict_proba(X_test)

In [None]:
def plot_ROC(y_test, y_score, pos_class=1):
    """Plot the one-vs-rest ROC curve of a single class and return the figure.

    ``y_test`` is one-hot encoded with ``pd.get_dummies`` and column
    ``pos_class`` of that encoding is scored against column ``pos_class`` of
    ``y_score``. Only the curve that is actually drawn is computed.

    Parameters
    ----------
    y_test : array-like
        True class labels, one per sample.
    y_score : 2-D array
        Per-class scores, indexed as ``y_score[:, k]``.
        NOTE(review): the columns are assumed to be in the same order as the
        columns produced by ``pd.get_dummies(y_test)`` -- confirm against the
        classifier's class order.
    pos_class : int, default 1
        Column index of the class whose curve is plotted. The default keeps
        the previous behaviour of always plotting column 1.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn (it is also shown via ``plt.show()``).

    Raises
    ------
    IndexError
        If ``pos_class`` is not a valid column of the one-hot labels or of
        ``y_score``.
    """
    from sklearn.metrics import roc_curve, auc

    y_onehot = pd.get_dummies(y_test).values
    fpr, tpr, _ = roc_curve(y_onehot[:, pos_class], y_score[:, pos_class])
    roc_auc = auc(fpr, tpr)

    # Explicit figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots()
    lw = 2
    ax.plot(fpr, tpr, color='darkorange',
            lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()
    return fig

In [None]:
# plot_ROC already shows the figure; the trailing semicolon keeps the returned
# Figure from being rendered a second time as the cell's output.
plot_ROC(y_test, y_score);

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall of score column 1 as a function of the decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_score[:, 1])

# precision/recall hold one more entry than thresholds: element i belongs to
# thresholds[i] and the final entry (precision 1, recall 0) has no threshold.
# Drop that final entry; slicing with [1:] would shift every point by one.
plt.plot(thresholds, precision[:-1], label='precision')
plt.plot(thresholds, recall[:-1], label='recall')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.legend()
plt.show()

# A/B test

In [4]:
from math import lgamma
from numba import jit

#defining the functions used
@jit
def h(a, b, c, d):
    """Return B(a + c, b + d) / (B(a, b) * B(c, d)), where B is the Beta function.

    The ratio is evaluated as the exponential of a difference of log-gamma
    sums rather than from the gamma values themselves.
    """
    log_numerator = lgamma(a + c) + lgamma(b + d) + lgamma(a + b) + lgamma(c + d)
    log_denominator = (
        lgamma(a) + lgamma(b) + lgamma(c) + lgamma(d) + lgamma(a + b + c + d)
    )
    log_ratio = log_numerator - log_denominator
    return np.exp(log_ratio)

@jit
def g0(a, b, c):
    """Return B(a + c, b) / B(a, b), where B is the Beta function.

    Computed in log space: exp of (lgamma sums of the numerator minus those
    of the denominator).
    """
    log_numerator = lgamma(a + b) + lgamma(a + c)
    log_denominator = lgamma(a + b + c) + lgamma(a)
    return np.exp(log_numerator - log_denominator)

@jit
def hiter(a, b, c, d):
    """Yield ``h(a, b, c, k) / k`` for a counter ``k`` stepping down from ``d``.

    The counter is decremented by one before each term is produced, and the
    loop runs while the counter is still greater than 1. For an integer ``d``
    this yields the terms for k = d - 1, d - 2, ..., 1; nothing is yielded
    when ``d <= 1``.
    """
    k = d
    while k > 1:
        k = k - 1
        yield h(a, b, c, k) / k

def g(a, b, c, d):
    """Return ``g0(a, b, c)`` plus the sum of all terms produced by ``hiter(a, b, c, d)``."""
    base_term = g0(a, b, c)
    series_total = sum(hiter(a, b, c, d))
    return base_term + series_total

def calc_prob_between(beta1, beta2):
    """Evaluate ``g`` on the shape parameters of two Beta distributions.

    Parameters
    ----------
    beta1, beta2 : objects exposing ``.args``
        The first two entries of ``.args`` are read as the (a, b) parameters.
        In this notebook they are frozen ``scipy.stats.beta`` distributions
        created with positional arguments.

    Returns
    -------
    The value of ``g(a1, b1, a2, b2)``. The notebook uses it as the
    probability that the first distribution's rate is better than the
    second's.
    """
    a1, b1 = beta1.args[0], beta1.args[1]
    a2, b2 = beta2.args[0], beta2.args[1]
    return g(a1, b1, a2, b2)

In [5]:
from scipy.stats import beta
import numpy as np
# calc_prob_between is defined in the cell above, so it does not need to be
# imported from a separate calc_prob module.

# Observed data: impressions and conversions for the Control and Test groups.
imps_ctrl, convs_ctrl = 17000, 27 
imps_test, convs_test = 17000, 30

# Beta distribution of each group's conversion rate:
# Beta(conversions + 1, non-conversions + 1), i.e. the observed counts added
# to a Beta(1, 1) starting point.
a_C, b_C = convs_ctrl + 1, imps_ctrl-convs_ctrl+1
beta_C = beta(a_C, b_C)
a_T, b_T = convs_test + 1, imps_test-convs_test+1
beta_T = beta(a_T, b_T)

# Relative lift: difference of the two distribution means, divided by the
# Control mean.
lift=(beta_T.mean()-beta_C.mean())/beta_C.mean()

# Probability for Test to be better than Control.
prob=calc_prob_between(beta_T, beta_C)

print (f"Test option lift Conversion Rates by {lift*100:2.2f}% with {prob*100:2.1f}% probability.")
# With the counts above this prints a 10.71% lift with 65.3% probability
# (see the recorded cell output).

Test option lift Conversion Rates by 10.71% with 65.3% probability.
