In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


import datetime
from s3fs.core import S3FileSystem
from pandas.api.types import is_string_dtype
from collections import OrderedDict

In [0]:
# Load the raw extract as a delimited text file (pd.read_table defaults to tab-separated input).
# NOTE(review): the path is a hard-coded absolute /tmp location - confirm the file is present before running.
df = pd.read_table("/tmp/rank2_customers.tmp")

# Size summary: frame shape, then the number of distinct customer ids and distinct ASINs.
print("Shape of  the dataset:" , df.shape)
print("Number of unique customers:", df['encrypted_customer_id'].nunique(), "number of unique ASINs: ", df['asin'].nunique())

In [0]:
def get_proper_shape(df):
    """Parse the date columns, add recency and a promo flag, and drop duplicate rows.

    Steps:
      1. Parse ``transaction_datetime_local`` and ``week_ending`` with ``pd.to_datetime``.
      2. Attach each customer's latest transaction timestamp as ``last_purchase``.
      3. Add ``recency``: days between the latest transaction in the whole frame
         and the customer's ``last_purchase``.
      4. Add ``is_promo``: ``'non_promo'`` where ``promotion_title`` is null,
         ``'promo'`` otherwise.
      5. Drop ``promotion_title`` and remove the rows that are exact duplicates
         once that column is gone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``encrypted_customer_id``, ``transaction_datetime_local``,
        ``week_ending`` and ``promotion_title``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the extra columns ``last_purchase``,
        ``recency`` and ``is_promo`` and without ``promotion_title``.
    """
    # Work on a new frame so re-running the cell never depends on earlier in-place edits.
    parsed = df.assign(
        transaction_datetime_local=pd.to_datetime(df['transaction_datetime_local']),
        week_ending=pd.to_datetime(df['week_ending']),
    )

    last_purchase = (
        parsed.groupby('encrypted_customer_id')['transaction_datetime_local']
        .max()
        .rename('last_purchase')
        .reset_index()
    )
    merged_df = parsed.merge(last_purchase, on='encrypted_customer_id', how='left')

    # Recency is measured against the most recent transaction in the data set.
    dataset_end = merged_df['transaction_datetime_local'].max()
    merged_df['recency'] = (dataset_end - merged_df['last_purchase']) / np.timedelta64(1, "D")

    merged_df['is_promo'] = np.where(merged_df['promotion_title'].isnull(), 'non_promo', 'promo')

    # Rows that differed only by promotion title collapse into one row here.
    # .copy() returns an independent frame so later column assignments do not
    # raise SettingWithCopyWarning.
    deduplicated = merged_df.drop(['promotion_title'], axis=1).drop_duplicates()
    return deduplicated.copy()

In [0]:
# Build the cleaned, de-duplicated transaction frame used by the EDA cells below.
df_new = get_proper_shape(df)

In [0]:
# Same size summary as after loading, now for the cleaned frame, followed by a preview of its first rows.
print("Shape of  the dataset:" , df_new.shape)
print("Number of unique customers:", df_new['encrypted_customer_id'].nunique(), "number of unique ASINs: ", df_new['asin'].nunique())
df_new.head()

In [0]:
# Distinct customers per max_rank value: the printed number is the total over all values,
# the displayed Series is the per-value breakdown.
print(df_new.groupby("max_rank")['encrypted_customer_id'].nunique().sum())
df_new.groupby("max_rank")['encrypted_customer_id'].nunique()

# Some EDA

### What did first-time customers (those who did not make a second purchase for at least 30 days) purchase on their first transaction?

In [0]:
# Rows with max_rank == 1 and rank_by_datetime == 1
# (presumably the only purchase of single-purchase customers - confirm the rank semantics).
single_purchase_rows = df_new[(df_new['max_rank'] == 1) & (df_new['rank_by_datetime'] == 1)]

# Row/column count first, then the number of distinct customers among those rows.
print(single_purchase_rows.shape)
print(single_purchase_rows['encrypted_customer_id'].nunique())

In [0]:
# Same filter as the previous cell, additionally requiring recency above 30 days.
single_purchase_30d_rows = df_new[
    (df_new['max_rank'] == 1)
    & (df_new['rank_by_datetime'] == 1)
    & (df_new['recency'] > 30)
]

# Shape, number of distinct customers, and total units for that subset.
print(single_purchase_30d_rows.shape)
print(single_purchase_30d_rows['encrypted_customer_id'].nunique())
print(single_purchase_30d_rows['units'].sum())

In [0]:
def overall_stat(df, core, metric):
    """Total ``metric`` per ``core`` value, largest first, with share and running share.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    core : str
        Column to group by.
    metric : str
        Column that is summed within each group.

    Returns
    -------
    pandas.DataFrame
        Columns ``core``, ``metric``, ``share`` (the group's fraction of the
        overall ``metric`` total) and ``cumsum`` (cumulative ``share`` in the
        sorted order), sorted by ``metric`` descending.
    """
    totals = (
        df.groupby([core])[metric]
        .sum()
        .reset_index()
        .sort_values(by=metric, ascending=False)
    )
    grand_total = totals[metric].sum()
    with_share = totals.assign(share=totals[metric] / grand_total)
    return with_share.assign(cumsum=lambda ranked: ranked['share'].cumsum())

In [0]:
# Units per title for rows with max_rank == 1, rank_by_datetime == 1 and recency > 30;
# the slice shows the first 25 rows of the result, which overall_stat sorts by units descending.
b = overall_stat(df_new[(df_new['max_rank'] == 1) & (df_new['rank_by_datetime'] == 1) & (df_new['recency'] > 30)], 'title_name','units')
b[0:25]

# Setup for Classification Problem

## Idea: define a cutoff window after the second-to-last ("pre-last") purchase, for example 2 weeks; the target is whether the last purchase falls inside that window

In [0]:
def incorporate_pre_last_purchase(df):
    """Attach each customer's second-to-last purchase day and its gap to the last purchase.

    Timestamps are truncated to calendar days first, so several transactions
    on the same day count as one purchase day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``encrypted_customer_id``, ``transaction_datetime_local``
        (datetime dtype) and ``last_purchase``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``df`` plus three columns:
        ``datetime`` (the transaction day), ``pre_last_purchase`` (second most
        recent purchase day of the customer, ``NaT`` when the customer has
        fewer than two distinct purchase days) and
        ``difference_btw_pre_last_and_last`` (``last_purchase`` minus
        ``pre_last_purchase`` in days, ``NaN`` where there is no pre-last day).
        ``last_purchase`` is used as given, so any time-of-day it carries is
        included in the difference.
    """
    # Truncate to the calendar day without touching the caller's frame.
    with_day = df.assign(datetime=pd.to_datetime(df['transaction_datetime_local'].dt.date))

    # One row per (customer, purchase day), ordered chronologically so that
    # "second to last" means second most recent, whatever the input row order.
    purchase_days = (
        with_day[['encrypted_customer_id', 'datetime']]
        .dropna()
        .drop_duplicates()
        .sort_values(['encrypted_customer_id', 'datetime'])
    )

    # Position of each purchase day counted back from the customer's most
    # recent one: 0 = last, 1 = pre-last. cumcount keeps the row alignment,
    # unlike groupby.nth, whose result shape differs between pandas versions.
    days_from_end = purchase_days.groupby('encrypted_customer_id').cumcount(ascending=False)
    pre_last = purchase_days.loc[days_from_end == 1].rename(columns={'datetime': 'pre_last_purchase'})

    # pre_last holds at most one row per customer, so the left merge keeps the row count.
    merged = pd.merge(left=with_day, right=pre_last, on='encrypted_customer_id', how='left')

    # Gap between the last and the pre-last purchase, in days.
    merged['difference_btw_pre_last_and_last'] = (
        (merged['last_purchase'] - merged['pre_last_purchase']) / np.timedelta64(1, "D")
    )
    return merged
    
# Add the pre-last purchase columns, then print both shapes to compare the frame before and after the merge.
df_newds = incorporate_pre_last_purchase(df_new)
print(df_new.shape)
print(df_newds.shape)

In [0]:
# Quick check: display every row of one hard-coded customer id to eyeball the new columns.
df_newds[df_newds['encrypted_customer_id'] == 'A23PH2G2990QEF']

In [0]:
# Histogram of the gap (in days) between pre-last and last purchase, restricted to the 0-100 range.
df_newds['difference_btw_pre_last_and_last'].hist(range = (0,100), figsize = (15,9)) 

In [0]:
# Another check: how many rows have no pre-last/last gap, split by max_rank.

gap_is_missing = df_newds['difference_btw_pre_last_and_last'].isnull()

# Rows without a gap where max_rank > 1, where max_rank == 1, and overall.
print(df_newds[gap_is_missing & (df_newds['max_rank'] > 1)].shape)
print(df_newds[gap_is_missing & (df_newds['max_rank'] == 1)].shape)
print(df_newds[gap_is_missing].shape)

In [0]:
def add_target_value(df, threshold):
    """Add a binary ``target`` column based on the pre-last/last purchase gap.

    ``target`` is 1 when ``difference_btw_pre_last_and_last`` is present and
    strictly below ``threshold``; it is 0 otherwise, including rows where the
    gap is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``difference_btw_pre_last_and_last``. The ``target``
        column is added to this frame in place.
    threshold : number
        Upper bound (exclusive) for the gap, in the same unit as the gap column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the ``target`` column.
    """
    gap = df['difference_btw_pre_last_and_last']
    # A single assignment: the earlier null-based default was always overwritten.
    df['target'] = np.where(gap.notnull() & (gap < threshold), 1, 0)

    return df

In [0]:
# Label rows with a threshold of 30 on the pre-last/last gap, then print the resulting shape.
df_newds = add_target_value(df_newds, 30)
print(df_newds.shape)

In [0]:
#Overall stat
def get_sizes(df):
    """Count distinct customers per ``target`` value and their share of the total.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``target`` and ``encrypted_customer_id``.

    Returns
    -------
    pandas.DataFrame
        One row per ``target`` value with ``encrypted_customer_id`` holding the
        number of distinct customers and ``share`` holding that number divided
        by the sum over all ``target`` values.
    """
    bucket_counts = df.groupby('target')['encrypted_customer_id'].nunique().reset_index()
    return bucket_counts.assign(
        share=lambda counts: counts['encrypted_customer_id'] / counts['encrypted_customer_id'].sum()
    )
    
# Customer counts and shares per target value; the bare name displays the table.
get_size_of_buckets = get_sizes(df_newds)
get_size_of_buckets



In [0]:
# Preview the last rows of the labelled frame.
df_newds.tail()

In [0]:
def compile_date_set_for_ml(df):
    """Aggregate the transaction rows to one modelling row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``encrypted_customer_id``, ``units``,
        ``display_price_wo_tax``, ``recency`` and ``target``.

    Returns
    -------
    pandas.DataFrame
        One row per ``encrypted_customer_id`` with the summed ``units`` and
        ``display_price_wo_tax`` and the maximum ``recency`` and ``target``,
        in that column order.
    """
    # A single aggregation replaces two groupbys plus a merge, and avoids the
    # tuple-style column selection that recent pandas versions reject.
    per_customer = df.groupby('encrypted_customer_id').agg({
        'units': 'sum',
        'display_price_wo_tax': 'sum',
        'recency': 'max',
        'target': 'max',
    })

    return per_customer.reset_index()

In [0]:
# Collapse the labelled transactions to one row per customer for modelling.
df_ml = compile_date_set_for_ml(df_newds)

In [0]:
# Sanity check: distinct customers in the source frame, shape of df_ml, and distinct customers in df_ml.
# Since df_ml is grouped by customer, the customer counts and df_ml's row count are expected to agree.
print(df_newds['encrypted_customer_id'].nunique())
print(df_ml.shape)
df_ml['encrypted_customer_id'].nunique()

In [0]:
# Preview the first rows of the per-customer modelling frame.
df_ml.head()

In [0]:
# Average feature values per target class. numeric_only=True skips the string
# customer-id column, which recent pandas versions refuse to average.
df_ml.groupby("target").mean(numeric_only=True)

In [0]:
# Feature matrix (units, price without tax, recency) and the binary label.
X = df_ml[['units','display_price_wo_tax','recency']]
Y = df_ml['target']

In [0]:
# Confirm features and labels have the same number of rows.
print(X.shape, Y.shape)

# Import libraries and make logistic regression

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [0]:
# Hold out 30% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)

In [0]:
# Shapes of the training features and labels.
print(X_train.shape,y_train.shape)

In [0]:
# Fit a logistic regression with the estimator's default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

## Predict the classes 

In [0]:
# Predicted class labels for the held-out rows, then the score on the same rows (formatted to two decimals).
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [0]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
# KFold only accepts random_state together with shuffle=True (recent
# scikit-learn versions raise ValueError otherwise). The folds were never
# shuffled here, so the seed is dropped and the fold assignment stays as before.
kfold = model_selection.KFold(n_splits=10)
modelCV = LogisticRegression()
scoring = 'accuracy'
# 10-fold cross-validated accuracy on the training split only.
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

In [0]:
# Confusion matrix of true vs. predicted labels on the test split.

from sklearn.metrics import confusion_matrix
# Store the result under its own name so the imported confusion_matrix
# function is not overwritten by its output.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [0]:
# Per-class precision, recall, F1 and support on the test split.

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [0]:
# ROC curve and its area under the curve on the test split.

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Probability of the positive class (second column of predict_proba).
positive_scores = logreg.predict_proba(X_test)[:, 1]
# The AUC must come from the same scores as the curve. Computing it from hard
# class predictions reports the area of a single-threshold curve instead.
logit_roc_auc = roc_auc_score(y_test, positive_scores)
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
plt.figure(figsize = (15,8))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: a classifier with no discriminative power.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

plt.show()

In [0]:
# Class probabilities for every test row (one column per class); print the shapes
# next to X_test's to confirm the row counts match, then display the array.
predicted_probabilities = logreg.predict_proba(X_test)
print(predicted_probabilities.shape, X_test.shape)
predicted_probabilities