In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' display limits to 101 columns / 101 rows so wide feature
# frames are shown in full in cell output instead of being elided.
pd.set_option("display.max_columns", 101)
pd.set_option("display.max_rows", 101)

### Picking up where we left off...

In [None]:
# read in pickled DFs generated by query_dfs.py
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced locally by a trusted run of query_dfs.py.

df_orders = pd.read_pickle("./pickle/df_orders.pickle")
df_train = pd.read_pickle("./pickle/df_train.pickle")
df_prior = pd.read_pickle("./pickle/df_prior.pickle")
df_prod_detail = pd.read_pickle("./pickle/df_prod_detail.pickle")

In [None]:
# read in pickled feature DF generated by feature_engineering_1.ipynb
# Later cells rely on X carrying `user_id`, `product_id`, `cart`, `in_cart`
# and `last_cart` columns (merge keys and the columns split_users drops).
X = pd.read_pickle("./pickle/X_7.pickle")

In [None]:
# Baseline (rows, columns) of the feature frame before any new features are merged in.
X.shape


In [None]:
# Peek at the first rows to see which feature columns already exist.
X.head(3)

In [None]:
# Show one random row of df_orders (unseeded, so the row differs between runs).
df_orders.sample()

In [None]:
# Number of rows in df_train.
len(df_train)

In [None]:
# Count the distinct order ids flagged as belonging to the 'train' eval set.
mask = df_orders.eval_set == 'train'
df_orders.loc[mask, 'order_id'].nunique(dropna=False)

### Features to Add:

**Product features**
* `prod_prior_sales`: the number of times the product has been purchased in the past (by all users)
* `prod_pct_reorders`: percentage of the product's prior transactions that were reorders
* `prod_avg_atco`: product average add-to-cart order


In [None]:
# Prior purchase count per product, most-purchased first.
# The result is a Series named `product_id`, indexed by product_id.
prod_sales = (
    df_prior
    .groupby('product_id')['product_id']
    .count()
    .sort_values(ascending=False)
)

prod_sales

In [None]:
# get total sales for each product & add to X
#
# `prod_sales` is a Series that is itself named `product_id`, which collides
# with X's `product_id` key column. Giving the Series its final column name
# *before* the merge avoids the `_x`/`_y` suffixed columns entirely, so the
# key column is never dropped or re-created as a side effect of the merge.
#
# The merge is an inner join: rows of X whose product never appears in
# df_prior are dropped (check X.shape afterwards).
X = X.merge(prod_sales.rename("prod_prior_sales"),
            left_on='product_id', right_index=True)

X.head(3)

In [None]:
# Sanity check: the preceding merge is an inner join, so a lower row count
# here means some products had no match in prod_sales.
X.shape

In [None]:
# Step 1 of prod_pct_reorders: sum the `reordered` column of df_prior for
# each product (one-column frame indexed by product_id).
reorders = df_prior.groupby("product_id")[["reordered"]].sum()
reorders.head(3)

In [None]:
# Attach the per-product reorder totals (inner join on product_id), then give
# the new column its feature name.
X = X.merge(reorders, left_on='product_id', right_index=True)
X = X.rename(columns={"reordered": "prod_prior_reorders"})
# Free the helper frame; re-running this cell requires re-running the cell
# that builds `reorders` first.
del reorders
X.head(3)

In [None]:
# Share of a product's prior purchases that were reorders. `pop` removes the
# intermediate reorder-count column from X and hands it to the division in
# one step, so only the ratio is kept as a feature.
X["prod_pct_reorders"] = X.pop("prod_prior_reorders") / X["prod_prior_sales"]
X.head(3)

In [None]:
# Sanity check on (rows, columns) after adding prod_pct_reorders.
X.shape

In [None]:
# Product average add-to-cart order (prod_avg_atco): the mean position at
# which each product was added to the cart across df_prior.
atcos = df_prior.groupby('product_id', as_index=False)['add_to_cart_order'].mean()

# Inner join on product_id, then rename the averaged column to its feature name.
X = X.merge(atcos, on="product_id")
X = X.rename(columns={'add_to_cart_order': 'prod_avg_atco'})
del atcos
X.head(3)

### Features to Add

**User features**
* `user_total_orders`: total number of orders for the user
* `user_total_reorders`
* `user_pct_reorders`
* `user_avg_ippo`: user average items per order

In [None]:
# Total orders per user, taken as the highest order_number seen for that user
# in df_orders. Result: one-column frame indexed by user_id.
user_order_counts = (
    df_orders
    .groupby("user_id")["order_number"]
    .max()
    .rename("user_total_orders")
    .to_frame()
)

user_order_counts.head(3)

In [None]:
# add user_total_orders col to X
# `user_order_counts` is indexed by user_id, so a left join keyed on X's
# `user_id` column keeps every row of X and looks up each user's count.
X = X.join(user_order_counts, on='user_id')

del user_order_counts

X.head(3)

In [None]:
# Total number of items each user has purchased across df_prior
# (numerator of the average-items-per-order feature computed next).
user_total_items = (
    df_prior
    .groupby('user_id', as_index=False)['product_id']
    .count()
    .rename(columns={"product_id": "user_total_items_purchased"})
)

user_total_items.head(3)

In [None]:
# Number of prior orders per user, taken as the highest order_number that
# appears for the user in df_prior.
user_num_prior_orders = (
    df_prior
    .groupby('user_id', as_index=False)['order_number']
    .max()
    .rename(columns={"order_number": "user_num_prior_orders"})
)

# user_avg_ippo = items purchased / number of prior orders, per user.
user_avg_ipos = user_num_prior_orders.merge(user_total_items, on='user_id')
user_avg_ipos['user_avg_ippo'] = (
    user_avg_ipos['user_total_items_purchased']
    / user_avg_ipos['user_num_prior_orders']
)
user_avg_ipos

In [None]:
# Attach user_avg_ippo (inner join on user_id) and remove user_total_orders.
# NOTE(review): user_total_orders is listed among the planned user features
# but is dropped here without having been used — confirm this is intentional.
X = (
    X.merge(user_avg_ipos[['user_id', 'user_avg_ippo']], on='user_id')
     .drop(columns='user_total_orders')
)
X.head(3)

### Top Ten Products (reorder percentage)

In [None]:
# Product details joined with each product's reorder percentage, highest
# first. prod_pct_reorders is computed per product, so `min` just collapses
# X's repeated rows to one value per product_id.
(
    df_prod_detail
    .merge(X.groupby('product_id', as_index=False)['prod_pct_reorders'].min(),
           how='left', on='product_id')
    .sort_values(by="prod_pct_reorders", ascending=False)
    .reset_index()
    .head(10)
)

In [None]:
# Two random rows of X (unseeded, so the rows differ between runs).
X.sample(2)

In [None]:
# List the feature columns accumulated so far.
X.columns

In [None]:
# Number of distinct users in df_prior.
df_prior.user_id.nunique(dropna=False)

In [None]:
# Number of distinct users in df_train.
df_train.user_id.nunique(dropna=False)

In [None]:
# Number of distinct users in df_orders.
df_orders.user_id.nunique(dropna=False)

In [None]:
# First rows of the orders flagged as belonging to the 'test' eval set.
mask = df_orders.eval_set == 'test'
df_orders.loc[mask].head(3)

In [None]:
# (rows, columns) of the df_prior slice belonging to user_id 6.
mask = df_prior.user_id == 6
df_prior.loc[mask].shape

In [None]:
# Check which columns df_train offers before pulling features from it.
df_train.columns

### More features...
* `order_hour_of_day`: already provided in df_train
* `days_since_prior_order`: already provided in df_train


In [None]:
# Per-user order timing taken from df_train, collapsed to one row per user.
# presumably each user has a single train order, so `min` simply picks that
# order's values — TODO confirm.
train_feats = (
    df_train
    .groupby('user_id', as_index=False)[['days_since_prior_order', 'order_hour_of_day']]
    .min()
)

# Inner join: users absent from df_train are dropped from X.
X = X.merge(train_feats, on='user_id')
X.head(3)

In [None]:
# Peek at the first rows of df_train.
df_train.head(2)

In [None]:
# add user_avg_spacing and user_avg_time
#
# X already carries `order_hour_of_day` / `days_since_prior_order` taken from
# df_train. The per-user means computed from df_prior are therefore renamed to
# their feature names *before* the merge, so that:
#   * the df_train values keep their original column names, and
#   * `user_avg_time` / `user_avg_spacing` really hold the prior-history means
#     (renaming after a suffixed merge would relabel the df_train columns
#     instead and leave the means under `*_avg` names).
#
# NOTE(review): the mean is taken over df_prior rows. df_prior appears to have
# one row per purchased item (it has product_id / add_to_cart_order), so
# orders with more items weigh more in these averages — confirm this is wanted.

train_merge_cols = ['user_id', 'order_hour_of_day', 'days_since_prior_order']
prior_user_stats = (df_prior[train_merge_cols]
                    .groupby('user_id', as_index=False)
                    .agg('mean')
                    .rename(columns={"order_hour_of_day": "user_avg_time",
                                     "days_since_prior_order": "user_avg_spacing"}))

# Inner join on user_id; no overlapping non-key columns remain, so no suffixes are needed.
X = X.merge(prior_user_stats, on='user_id')

X.head(3)

In [None]:
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def split_users(df, test_size=.2, seed=36):
    """Split a feature frame into train/test sets by user rather than by row.

    A random subset of the distinct `user_id` values is held out, and every
    row belonging to a held-out user goes to the test set, so no user appears
    on both sides of the split.

    Parameters
    ----------
    df : DataFrame
        Must contain `user_id`, `product_id`, `cart`, `last_cart` and the
        target column `in_cart`; all remaining columns are used as features.
    test_size : float
        Fraction of distinct users (not rows) to hold out; the user count is
        truncated to an integer.
    seed : int
        Seed for the `RandomState` used to pick the held-out users.

    Returns
    -------
    X_tr, X_te, y_tr, y_te
        Feature frames and `in_cart` targets for the train and test users.

    Also prints the fraction of rows that landed in the test set.
    """
    rng = np.random.RandomState(seed)

    # Draw the held-out users without replacement from the distinct user ids.
    user_ids = df['user_id'].unique()
    n_holdout = int(user_ids.shape[0] * test_size)
    holdout_users = rng.choice(user_ids, size=n_holdout, replace=False)

    in_holdout = df['user_id'].isin(holdout_users)
    train_rows, test_rows = df[~in_holdout], df[in_holdout]

    # Identifier, target and cart-content columns are not model inputs.
    non_feature_cols = ['product_id', 'user_id', 'cart', 'in_cart', 'last_cart']
    X_tr = train_rows.drop(columns=non_feature_cols)
    X_te = test_rows.drop(columns=non_feature_cols)
    y_tr = train_rows['in_cart']
    y_te = test_rows['in_cart']

    print(f"Actual Test Size: {y_te.shape[0] / df.shape[0]:0.4}")

    return X_tr, X_te, y_tr, y_te

In [None]:
# check results
# Split by user so that no user contributes rows to both train and test.
X_tr, X_te, y_tr, y_te = split_users(X)

# Baseline model: logistic regression fitted on every feature column of X.
lr = LogisticRegression(solver='lbfgs', max_iter=200)
lr.fit(X_tr, y_tr)
# f1_score's signature is (y_true, y_pred): ground truth first, predictions second.
f1_score(y_te, lr.predict(X_te))