In [34]:
%pip install category-encoders


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [35]:
import numpy as np
import pandas as pd
import category_encoders as ce

In [36]:
def optimize_dataframe_for_memory(df):
 print("current memeor usage: ", df.memory_usage().sum() / 1024**2, "MB")

 for col in df.columns:
  _max = df[col].max()
  _min = df[col].min()
  if df[col].dtype  == 'int64':
   if _min > np.iinfo(np.int8).min and _max < np.iinfo(np.int8).max:
    df[col] = df[col].astype(np.int8)
   elif _min > np.iinfo(np.int16).min and _max < np.iinfo(np.int16).max:
    df[col] = df[col].astype(np.int16)
   elif _min > np.iinfo(np.int32).min and _max < np.iinfo(np.int32).max:
    df[col] = df[col].astype(np.int32)
  elif df[col].dtype == 'float64':
   if _min > np.finfo(np.float16).min and _max < np.finfo(np.float16).max:
    df[col] = df[col].astype(np.float16)
   elif _min > np.finfo(np.float32).min and _max < np.finfo(np.float32).max:
    df[col] = df[col].astype(np.float32)
  else:
   continue
 
 print("new memory usage: ", df.memory_usage().sum() / 1024**2, "MB")
 return df


In [37]:
data_set_location  = 'data/'

In [38]:
departments_df = pd.read_csv(data_set_location + 'departments.csv')
aisles_df = pd.read_csv(data_set_location + 'aisles.csv')


orders_df = pd.read_csv(data_set_location + 'orders.csv')
product_df = pd.read_csv(data_set_location + 'products.csv')

order_products_train_df = pd.read_csv(data_set_location + 'order_products__train.csv')
order_products_prior_df = pd.read_csv(data_set_location + 'order_products__prior.csv')

In [39]:
departments_df = optimize_dataframe_for_memory(departments_df)
aisles_df = optimize_dataframe_for_memory(aisles_df)

orders_df = optimize_dataframe_for_memory(orders_df)
product_df = optimize_dataframe_for_memory(product_df)

order_products_train_df = optimize_dataframe_for_memory(order_products_train_df)
order_products_prior_df = optimize_dataframe_for_memory(order_products_prior_df)

current memeor usage:  0.0004425048828125 MB
new memory usage:  0.00030231475830078125 MB
current memeor usage:  0.002166748046875 MB
new memory usage:  0.001399993896484375 MB
current memeor usage:  182.7056655883789 MB
new memory usage:  68.5147008895874 MB
current memeor usage:  1.5164794921875 MB
new memory usage:  0.7109146118164062 MB
current memeor usage:  42.255279541015625 MB
new memory usage:  13.204858779907227 MB
current memeor usage:  989.8221740722656 MB
new memory usage:  340.2514524459839 MB


In [40]:
df_by_product_order = order_products_prior_df.copy() #TODO consider concat with train set

df_by_product_order = pd.merge(df_by_product_order , orders_df, on='order_id', how='left')
df_by_product_order = pd.merge(df_by_product_order , product_df, on='product_id', how='left')

In [41]:
df_by_product_order.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16
1,2,28985,2,1,202279,prior,3,5,9,8.0,Michigan Organic Kale,83,4
2,2,9327,3,0,202279,prior,3,5,9,8.0,Garlic Powder,104,13
3,2,45918,4,1,202279,prior,3,5,9,8.0,Coconut Butter,19,13
4,2,30035,5,0,202279,prior,3,5,9,8.0,Natural Sweetener,17,13


In [42]:

df_by_product_order['products_by_user_count'] = df_by_product_order.groupby(['user_id' ,  'product_id']).cumcount() + 1

df_by_product_order[ ['user_id' , 'product_id' , 'products_by_user_count' ]].head(2)

Unnamed: 0,user_id,product_id,products_by_user_count
0,202279,33120,1
1,202279,28985,1


In [43]:
"""

PRODUCT LEVEL FEATURES


"""

'\n\nPRODUCT LEVEL FEATURES\n\n\n'

In [44]:

agg_dict = {
    'add_to_cart_order': 'mean',
    'reordered': [
        ('total_orders', 'count'),
        ('total_reorders', 'sum'),
        ('reorder_percentage' ,'mean')
    ],
    'user_id': [ ( 'product_unique_users'  ,lambda x: x.nunique() ) ],
    'products_by_user_count': [
        ('order_first_time_total_cnt' , lambda x: (x == 1).sum() ),
        ('order_second_time_total_cnt' , lambda x: (x == 2).sum())
    ],
}

df_product_feats = df_by_product_order.groupby('product_id').agg(agg_dict)

df_product_feats.columns = ['_'.join(col).strip() for col in df_product_feats.columns.values]

In [45]:
df_product_feats.reset_index(inplace=True)

df_product_feats['second_time_percent'] = df_product_feats['products_by_user_count_order_second_time_total_cnt'] / df_product_feats['products_by_user_count_order_first_time_total_cnt']

df_product_feats.head()

Unnamed: 0,product_id,add_to_cart_order_mean,reordered_total_orders,reordered_total_reorders,reordered_reorder_percentage,user_id_product_unique_users,products_by_user_count_order_first_time_total_cnt,products_by_user_count_order_second_time_total_cnt,second_time_percent
0,1,5.801836,1852,1136.0,0.613391,716,716,276,0.385475
1,2,9.888889,90,12.0,0.133333,78,78,8,0.102564
2,3,6.415162,277,203.0,0.732852,74,74,36,0.486486
3,4,9.507599,329,147.0,0.446809,182,182,64,0.351648
4,5,6.466667,15,9.0,0.6,6,6,4,0.666667


In [46]:
print( df_product_feats.columns )

"""
product features added:
 - average position in cart - add_to_cart_order_mean
 - total orders - reordered_total_orders
 - total reorders - reordered_total_reorders
 - reorder percentage - reordered_reorder_percentage
 - unique users - user_id_unique_users

 - order first time total count - products_by_user_count_order_first_time_total_cnt
 - order second time total count - products_by_user_count_order_second_time_total_cnt

"""

Index(['product_id', 'add_to_cart_order_mean', 'reordered_total_orders',
       'reordered_total_reorders', 'reordered_reorder_percentage',
       'user_id_product_unique_users',
       'products_by_user_count_order_first_time_total_cnt',
       'products_by_user_count_order_second_time_total_cnt',
       'second_time_percent'],
      dtype='object')


'\nproduct features added:\n - average position in cart - add_to_cart_order_mean\n - total orders - reordered_total_orders\n - total reorders - reordered_total_reorders\n - reorder percentage - reordered_reorder_percentage\n - unique users - user_id_unique_users\n\n - order first time total count - products_by_user_count_order_first_time_total_cnt\n - order second time total count - products_by_user_count_order_second_time_total_cnt\n\n'

In [47]:
"""

AISLE LEVEL FEATURES



"""

'\n\nAISLE LEVEL FEATURES\n\n\n\n'

In [48]:
"""

Reorder percentage, Total orders and reorders of a product aisle

(9) Mean and std of aisle add-to-cart-order

(10) Aisle unique users

"""


'\n\nReorder percentage, Total orders and reorders of a product aisle\n\n(9) Mean and std of aisle add-to-cart-order\n\n(10) Aisle unique users\n\n'

In [49]:
agg_dict_aisle = {
 'add_to_cart_order' : [
  ('aisle_mean_order' , 'mean'),
  ('aisle_std_order' , 'std')
 ],
  'reordered' : {
   ('aisle_total_orders','count'), 
   ('aisle_total_reorders','sum'), 
   ('aisle_reorder_percentage','mean')
   },
   'user_id': [
    ('aisle_unique_users' , lambda x: x.nunique())
   ]
}


df_aisle_feats = df_by_product_order.groupby('aisle_id').agg(agg_dict_aisle)


df_aisle_feats.columns = ['_'.join(col).strip() for col in df_aisle_feats.columns.values]


df_aisle_feats.reset_index(inplace=True)

df_aisle_feats.head()

Unnamed: 0,aisle_id,add_to_cart_order_aisle_mean_order,add_to_cart_order_aisle_std_order,reordered_aisle_total_orders,reordered_aisle_reorder_percentage,reordered_aisle_total_reorders,user_id_aisle_unique_users
0,1,8.16764,7.104166,71928,0.596597,42912.0,20711
1,2,9.275497,7.473802,82491,0.489326,40365.0,31222
2,3,9.571935,7.899672,456386,0.598007,272922.0,63592
3,4,10.16145,7.745705,200687,0.489533,98243.0,53892
4,5,10.2976,8.187047,62510,0.280627,17542.0,32312


In [50]:
"""
Aisle features added:
  Mean and std of aisle position in card - add_to_cart_order_aisle_mean_order, add_to_cart_order_aisle_std_order
  Aisle unique users - user_id_unique_users
  count of total orders - reordered_aisle_total_orders
  sum of total reorders - reordered_aisle_total_reorders
  mean of reorder percentage - reordered_aisle_reorder_percentage

"""

print( df_aisle_feats.columns )



Index(['aisle_id', 'add_to_cart_order_aisle_mean_order',
       'add_to_cart_order_aisle_std_order', 'reordered_aisle_total_orders',
       'reordered_aisle_reorder_percentage', 'reordered_aisle_total_reorders',
       'user_id_aisle_unique_users'],
      dtype='object')


In [51]:
"""

Department Level Features

"""

'\n\nDepartment Level Features\n\n'

In [52]:
agg_dict_department =  {
 'add_to_cart_order' : [
  ('department_mean_add_to_cart_order','mean'),
  ('department_std_add_to_cart_order','std')
  ],
  'reordered' : [
   ('department_total_orders','count'), 
   ('department_total_reorders','sum'),
   ('department_reorder_percentage','mean')
   ],
   'user_id': [
    ('department_unique_users'  ,lambda x: x.nunique())
   ]
}



df_department_feats  = df_by_product_order.groupby('department_id').agg(agg_dict_department)

df_department_feats.columns = ['_'.join(col).strip() for col in df_department_feats.columns.values]

df_department_feats.reset_index(inplace=True)

df_department_feats.head()

Unnamed: 0,department_id,add_to_cart_order_department_mean_add_to_cart_order,add_to_cart_order_department_std_add_to_cart_order,reordered_department_total_orders,reordered_department_total_reorders,reordered_department_reorder_percentage,user_id_department_unique_users
0,1,8.996414,7.393502,2236432,1211890.0,0.541885,163233
1,2,8.277645,7.526272,36291,14806.0,0.40798,17875
2,3,8.084397,6.904849,1176787,739188.0,0.628141,140612
3,4,8.022875,6.658899,9479291,6160710.0,0.649913,193237
4,5,5.428346,5.778253,153696,87595.0,0.569924,15798


In [53]:
"""
Deparment features added:
  Mean and std of department position in card - add_to_cart_order_department_mean_add_to_cart_order, add_to_cart_order_department_std_add_to_cart_order
  Department unique users - user_id_department_unique_users
  count of total orders - reordered_department_total_orders
  sum of total reorders - reordered_department_total_reorders
  mean of reorder percentage - reordered_department_reorder_percentage
"""

'\nDeparment features added:\n  Mean and std of department position in card - add_to_cart_order_department_mean_add_to_cart_order, add_to_cart_order_department_std_add_to_cart_order\n  Department unique users - user_id_department_unique_users\n  count of total orders - reordered_department_total_orders\n  sum of total reorders - reordered_department_total_reorders\n  mean of reorder percentage - reordered_department_reorder_percentage\n'

In [54]:
df_product_feats.head()

Unnamed: 0,product_id,add_to_cart_order_mean,reordered_total_orders,reordered_total_reorders,reordered_reorder_percentage,user_id_product_unique_users,products_by_user_count_order_first_time_total_cnt,products_by_user_count_order_second_time_total_cnt,second_time_percent
0,1,5.801836,1852,1136.0,0.613391,716,716,276,0.385475
1,2,9.888889,90,12.0,0.133333,78,78,8,0.102564
2,3,6.415162,277,203.0,0.732852,74,74,36,0.486486
3,4,9.507599,329,147.0,0.446809,182,182,64,0.351648
4,5,6.466667,15,9.0,0.6,6,6,4,0.666667


In [55]:
df_product_feats = pd.merge(df_product_feats, product_df, on='product_id', how='left')

df_product_feats = pd.merge(df_product_feats, aisles_df, on='aisle_id', how='left')
df_product_feats = pd.merge(df_product_feats, departments_df, on='department_id', how='left')



df_product_feats = pd.merge(df_product_feats, df_aisle_feats, on='aisle_id', how='left')
df_product_feats = pd.merge(df_product_feats, df_department_feats, on='department_id', how='left')


print( df_product_feats.columns )

Index(['product_id', 'add_to_cart_order_mean', 'reordered_total_orders',
       'reordered_total_reorders', 'reordered_reorder_percentage',
       'user_id_product_unique_users',
       'products_by_user_count_order_first_time_total_cnt',
       'products_by_user_count_order_second_time_total_cnt',
       'second_time_percent', 'product_name', 'aisle_id', 'department_id',
       'aisle', 'department', 'add_to_cart_order_aisle_mean_order',
       'add_to_cart_order_aisle_std_order', 'reordered_aisle_total_orders',
       'reordered_aisle_reorder_percentage', 'reordered_aisle_total_reorders',
       'user_id_aisle_unique_users',
       'add_to_cart_order_department_mean_add_to_cart_order',
       'add_to_cart_order_department_std_add_to_cart_order',
       'reordered_department_total_orders',
       'reordered_department_total_reorders',
       'reordered_department_reorder_percentage',
       'user_id_department_unique_users'],
      dtype='object')


In [56]:
df_product_feats =  df_product_feats.drop(['product_name', 'aisle_id', 'department_id'], axis=1)

In [57]:
df_product_feats.shape 

(49677, 23)

In [58]:
binnary_encoder = ce.BinaryEncoder(cols=['aisle', 'department'] , return_df=True)

df_product_feats = binnary_encoder.fit_transform(df_product_feats)

df_product_feats.head(2)

Unnamed: 0,product_id,add_to_cart_order_mean,reordered_total_orders,reordered_total_reorders,reordered_reorder_percentage,user_id_product_unique_users,products_by_user_count_order_first_time_total_cnt,products_by_user_count_order_second_time_total_cnt,second_time_percent,aisle_0,...,reordered_aisle_total_orders,reordered_aisle_reorder_percentage,reordered_aisle_total_reorders,user_id_aisle_unique_users,add_to_cart_order_department_mean_add_to_cart_order,add_to_cart_order_department_std_add_to_cart_order,reordered_department_total_orders,reordered_department_total_reorders,reordered_department_reorder_percentage,user_id_department_unique_users
0,1,5.801836,1852,1136.0,0.613391,716,716,276,0.385475,0,...,234065,0.548698,128431.0,54202,9.187743,7.692492,2887550,1657973.0,0.57418,174219
1,2,9.888889,90,12.0,0.133333,78,78,8,0.102564,0,...,212092,0.152391,32321.0,76402,9.593425,7.875241,1875577,650301.0,0.346721,172755


In [59]:
df_product_feats.isnull().sum()

product_id                                             0
add_to_cart_order_mean                                 0
reordered_total_orders                                 0
reordered_total_reorders                               0
reordered_reorder_percentage                           0
user_id_product_unique_users                           0
products_by_user_count_order_first_time_total_cnt      0
products_by_user_count_order_second_time_total_cnt     0
second_time_percent                                    0
aisle_0                                                0
aisle_1                                                0
aisle_2                                                0
aisle_3                                                0
aisle_4                                                0
aisle_5                                                0
aisle_6                                                0
aisle_7                                                0
department_0                   

In [60]:
"""

USER LEVEL FEATURES

"""

'\n\nUSER LEVEL FEATURES\n\n'

In [62]:
df_by_product_order.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,products_by_user_count
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,1
1,2,28985,2,1,202279,prior,3,5,9,8.0,Michigan Organic Kale,83,4,1


In [63]:
df_by_product_order.isnull().sum()

order_id                        0
product_id                      0
add_to_cart_order               0
reordered                       0
user_id                         0
eval_set                        0
order_number                    0
order_dow                       0
order_hour_of_day               0
days_since_prior_order    2078068
product_name                    0
aisle_id                        0
department_id                   0
products_by_user_count          0
dtype: int64

In [64]:
df_by_product_order['days_since_prior_order'] = df_by_product_order['days_since_prior_order'].fillna(0)

In [65]:
agg_dict_user = {
 'order_dow': [
  ('avg_dow','mean'), 
  ('std_dow','std')
  ],
  'order_hour_of_day': [
   ('avg_doh','mean'), 
   ('std_doh','std')
   ],
   'days_since_prior_order': [
    ('avg_since_order','mean'), 
    ('std_since_order','std')
    ],
    'order_number': [
     ('total_orders_by_user',  lambda x: x.nunique())
     ],
     'product_id': [
      ('total_products_by_user', 'count'),
      ('total_unique_product_by_user', lambda x: x.nunique())
      ],
      'reordered': [
       ('total_reorders_by_user','sum'), 
       ('reorder_propotion_by_user','mean')
       ] 
       }

df_user_feats = df_by_product_order.groupby('user_id').agg(agg_dict_user)
df_user_feats.columns =  ['_'.join(col).strip() for col in df_user_feats.columns.values]

df_user_feats.reset_index(inplace=True)

df_user_feats.head(2)

Unnamed: 0,user_id,order_dow_avg_dow,order_dow_std_dow,order_hour_of_day_avg_doh,order_hour_of_day_std_doh,days_since_prior_order_avg_since_order,days_since_prior_order_std_since_order,order_number_total_orders_by_user,product_id_total_products_by_user,product_id_total_unique_product_by_user,reordered_total_reorders_by_user,reordered_reorder_propotion_by_user
0,1,2.644068,1.256194,10.542373,3.500355,18.546875,10.559065,10,59,18,41.0,0.694915
1,2,2.005128,0.971222,10.441026,1.649854,14.90625,9.671712,14,195,102,93.0,0.476923


In [66]:
"""
user features added:
  average and std of order day of week - order_dow_avg_dow, order_dow_std_dow
  average and std of order hour of day - order_hour_of_day_avg_doh, order_hour_of_day_std_doh
  average and std of days since prior order - days_since_prior_order_avg_since_order, days_since_prior_order_std_since_order
  total orders by user - order_number_total_orders_by_user
  total products by user - product_id_total_products_by_user
  total unique products by user - product_id_total_unique_product_by_user
  total reorders by user - reordered_total_reorders_by_user
  reorder proportion by user - reordered_reorder_propotion_by_user

"""

'\nuser features added:\n  average and std of order day of week - order_dow_avg_dow, order_dow_std_dow\n  average and std of order hour of day - order_hour_of_day_avg_doh, order_hour_of_day_std_doh\n  average and std of days since prior order - days_since_prior_order_avg_since_order, days_since_prior_order_std_since_order\n  total orders by user - order_number_total_orders_by_user\n  total products by user - product_id_total_products_by_user\n  total unique products by user - product_id_total_unique_product_by_user\n  total reorders by user - reordered_total_reorders_by_user\n  reorder proportion by user - reordered_reorder_propotion_by_user\n\n'

In [69]:
agg_dict_user_2 = {
 'reordered': {
  ('average_order_size','count'), 
  ('reorder_in_order','mean')
  }}

df_user_feats_2 = df_by_product_order.groupby(['user_id', 'order_number']).agg(agg_dict_user_2)

df_user_feats_2.columns = ['_'.join(col).strip() for col in df_user_feats_2.columns.values]

df_user_feats_2.reset_index(inplace=True)

df_user_feats_2.head(2)

Unnamed: 0,user_id,order_number,reordered_reorder_in_order,reordered_average_order_size
0,1,1,0.0,5
1,1,2,0.5,6


In [71]:
agg_dict_user_3  = {
 'reordered_average_order_size' : 'mean', 
 'reordered_reorder_in_order':'mean'
 }

df_user_feats_3 = df_user_feats_2.groupby('user_id').agg(agg_dict_user_3)

df_user_feats_3.reset_index(inplace=True)

df_user_feats_3.head(2)

Unnamed: 0,user_id,reordered_average_order_size,reordered_reorder_in_order
0,1,5.9,0.705833
1,2,13.928571,0.447961


In [73]:
#merge user features
df_user_feats = pd.merge(df_user_feats, df_user_feats_3, on='user_id', how='left')
df_user_feats.head(2)

Unnamed: 0,user_id,order_dow_avg_dow,order_dow_std_dow,order_hour_of_day_avg_doh,order_hour_of_day_std_doh,days_since_prior_order_avg_since_order,days_since_prior_order_std_since_order,order_number_total_orders_by_user,product_id_total_products_by_user,product_id_total_unique_product_by_user,reordered_total_reorders_by_user,reordered_reorder_propotion_by_user,reordered_average_order_size_x,reordered_reorder_in_order_x,reordered_average_order_size_y,reordered_reorder_in_order_y
0,1,2.644068,1.256194,10.542373,3.500355,18.546875,10.559065,10,59,18,41.0,0.694915,5.9,0.705833,5.9,0.705833
1,2,2.005128,0.971222,10.441026,1.649854,14.90625,9.671712,14,195,102,93.0,0.476923,13.928571,0.447961,13.928571,0.447961


In [75]:
df_user_feats_2.head(2)

Unnamed: 0,user_id,order_number,reordered_reorder_in_order,reordered_average_order_size
0,1,1,0.0,5
1,1,2,0.5,6


In [78]:
# get top 3 ordernumber from a user

last_3_orders = df_user_feats_2.groupby('user_id')["order_number"].nlargest(3).reset_index()

last_3_orders.head(3)

Unnamed: 0,user_id,level_1,order_number
0,1,9,10
1,1,8,9
2,1,7,8


In [79]:
last_3_orders = pd.merge(last_3_orders, df_user_feats_2, on=['user_id', 'order_number'], how='inner')

last_3_orders.head()

Unnamed: 0,user_id,level_1,order_number,reordered_reorder_in_order,reordered_average_order_size
0,1,9,10,0.666667,9
1,1,8,9,1.0,6
2,1,7,8,0.666667,6
3,2,23,14,0.625,16
4,2,22,13,0.0,9


In [80]:
last_3_orders['rank'] = last_3_orders.groupby("user_id")["order_number"].rank("dense", ascending=True)

In [81]:
last_order_feats = last_3_orders.pivot_table(index = 'user_id', columns = ['rank'], 
                                             values=['reordered_average_order_size', 'reordered_reorder_in_order']).reset_index(drop = False)

last_order_feats.columns = ['user_id','orders_3', 'orders_2', 'orders_1', 'reorder_3', 'reorder_2', 'reorder_1']
last_order_feats.head(2)

Unnamed: 0,user_id,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,6,6,9,0.666667,1.0,0.666667
1,2,19,9,16,0.578947,0.0,0.625


In [82]:
df_user_feats  = df_user_feats .merge(last_order_feats, on = 'user_id', how = 'left')
df_user_feats .head(2)

Unnamed: 0,user_id,order_dow_avg_dow,order_dow_std_dow,order_hour_of_day_avg_doh,order_hour_of_day_std_doh,days_since_prior_order_avg_since_order,days_since_prior_order_std_since_order,order_number_total_orders_by_user,product_id_total_products_by_user,product_id_total_unique_product_by_user,...,reordered_average_order_size_x,reordered_reorder_in_order_x,reordered_average_order_size_y,reordered_reorder_in_order_y,orders_3,orders_2,orders_1,reorder_3,reorder_2,reorder_1
0,1,2.644068,1.256194,10.542373,3.500355,18.546875,10.559065,10,59,18,...,5.9,0.705833,5.9,0.705833,6,6,9,0.666667,1.0,0.666667
1,2,2.005128,0.971222,10.441026,1.649854,14.90625,9.671712,14,195,102,...,13.928571,0.447961,13.928571,0.447961,19,9,16,0.578947,0.0,0.625


In [83]:
"""

USER AND PRODUCT LEVEL FEATURES ( MERGED )

"""

'\n\nUSER AND PRODUCT LEVEL FEATURES ( MERGED )\n\n'

In [84]:
agg_dict_user_product = {
 'reordered': [
  ('total_product_orders_by_user','count'),
  ('total_product_reorders_by_user','sum'),
  ('user_product_reorder_percentage', 'mean')
  ],
  'add_to_cart_order': [
   ('avg_add_to_cart_by_user','mean')
  ],
   'days_since_prior_order': [
    ('avg_days_since_last_bought' ,'mean')
    ],
    'order_number': [
     ('last_ordered_in' , 'max')
     
     ]}

df_user_product_feat = df_by_product_order.groupby(['user_id', 'product_id']).agg(agg_dict_user_product)
df_user_product_feat.columns = df_user_product_feat.columns.droplevel(0)
df_user_product_feat.reset_index(inplace = True)
df_user_product_feat.head(2)

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in
0,1,196,10,9,0.9,1.4,17.59375,10
1,1,10258,9,8,0.888889,3.333333,19.5625,10


In [85]:
last_set_of_orders = pd.merge( df_by_product_order ,  last_3_orders , on = ['user_id', 'order_number'], how = 'inner')

last_set_of_orders.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,products_by_user_count,level_1,reordered_reorder_in_order,reordered_average_order_size,rank
0,7,34050,1,0,142903,prior,11,2,14,30.0,Orange Juice,31,7,1,2231251,0.0,2,2.0
1,7,46802,2,0,142903,prior,11,2,14,30.0,Pineapple Chunks,116,1,1,2231251,0.0,2,2.0


In [86]:
last_set_of_orders['rank'] = last_set_of_orders.groupby(['user_id', 'product_id'])['order_number'].rank("dense", ascending=True)

In [87]:
product_purchase_history = last_set_of_orders.pivot_table(index = ['user_id', 'product_id'],
                                                          columns='rank', values = 'reordered').reset_index()
product_purchase_history.columns = ['user_id', 'product_id', 'is_reorder_3', 'is_reorder_2', 'is_reorder_1']
product_purchase_history.fillna(0, inplace = True)
product_purchase_history.head(3)

Unnamed: 0,user_id,product_id,is_reorder_3,is_reorder_2,is_reorder_1
0,1,196,1.0,1.0,1.0
1,1,10258,1.0,1.0,1.0
2,1,12427,1.0,1.0,1.0


In [88]:
df_user_product_feat =  pd.merge(df_user_product_feat, product_purchase_history, on = ['user_id', 'product_id'], how = 'left')
df_user_product_feat.head()

Unnamed: 0,user_id,product_id,total_product_orders_by_user,total_product_reorders_by_user,user_product_reorder_percentage,avg_add_to_cart_by_user,avg_days_since_last_bought,last_ordered_in,is_reorder_3,is_reorder_2,is_reorder_1
0,1,196,10,9,0.9,1.4,17.59375,10,1.0,1.0,1.0
1,1,10258,9,8,0.888889,3.333333,19.5625,10,1.0,1.0,1.0
2,1,10326,1,0,0.0,5.0,28.0,5,,,
3,1,12427,10,9,0.9,3.3,17.59375,10,1.0,1.0,1.0
4,1,13032,3,2,0.666667,6.333333,21.671875,10,1.0,0.0,0.0


In [89]:
df_user_product_feat.fillna(0, inplace = True)

In [95]:
df_product_feats.to_pickle(f'{data_set_location}product_features.pkl')
df_user_feats.to_pickle(f'{data_set_location}user_features.pkl')
df_user_product_feat.to_pickle(f'{data_set_location}user_product_features.pkl')

In [93]:
print( df_user_product_feat.shape )
print( df_user_feats.shape )
print( df_product_feats.shape )


(13307953, 11)
(206209, 22)
(49677, 34)


In [None]:
import pandas as pd



Null values in DataFrame:
col1    1
col2    1
dtype: int64


KMeans(n_clusters=2, random_state=0)