## Import libraries and read data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
path = '/home/dan1dr/data/feature_frame.csv'
data = pd.read_csv(path)

### Understanding the problem

Develop a ML model that. given a user and product, predicts if the user would purchase it at that moment. Here will explore and select the model we will apply to the PoC. This model will be used to target users and send them a push notification. Relevant info:

- Current push notificiations have an open rate of 5%.
- Focus only on purchases of at least 5 items (shipping cost).
- Use only linear models to speed up the development.
- The result should allow Sales team to select an item from a list and segment the users for triggering that notification.
- Target: expected increase on monthly sales by 2% and uplift of 25% on selected items.

### Filtering and data preparation

In [24]:
data.head()

Unnamed: 0,variant_id,product_type,order_id,user_id,created_at,order_date,user_order_seq,outcome,ordered_before,abandoned_before,...,count_children,count_babies,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,std_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,std_days_to_buy_product_type
0,33826472919172,ricepastapulses,2807985930372,3482464092292,2020-10-05 16:46:19,2020-10-05 00:00:00,3,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
1,33826472919172,ricepastapulses,2808027644036,3466586718340,2020-10-05 17:59:51,2020-10-05 00:00:00,2,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
2,33826472919172,ricepastapulses,2808099078276,3481384026244,2020-10-05 20:08:53,2020-10-05 00:00:00,4,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
3,33826472919172,ricepastapulses,2808393957508,3291363377284,2020-10-06 08:57:59,2020-10-06 00:00:00,2,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
4,33826472919172,ricepastapulses,2808429314180,3537167515780,2020-10-06 10:37:05,2020-10-06 00:00:00,3,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618


In [38]:
num_items_ordered = data.groupby('order_id')['outcome'].sum()
filter = num_items_ordered[num_items_ordered >= 5].index

filtered_data = data[data['order_id'].isin(filter)]

print(f"Length initial data: {len(data)}")
print(f"Length filtered data: {len(filtered_data)}\n")

print(f"Unique orders initially: {data['order_id'].nunique()}")
print(f"Unique orders >= 5 items: {filtered_data['order_id'].nunique()}")

Length initial data: 2880549
Length filtered data: 2163953

Unique orders initially: 3446
Unique orders >= 5 items: 2603


In [45]:
print(num_items_ordered[num_items_ordered > 5].mean())
print(num_items_ordered[num_items_ordered > 5].median())

12.527332511302918
11.0


## Feature Engineering

According with previous assignments, we will select only the features that we think are more relevant for our prediction. Will make a few adjustments here and will be iterating along the notebook.

We will create a logistic regression model for the model. From there, we know this model may sensitive to feature scale (keep in mind potential feature scaling if model is poor). Additionaly, for feature selection, we will need to select relevant features to simplify it. The multicollinearity might play an important role, so we will be discarding highly correlated features. To sum up, we might need to create new feature that groups others.

1. First, we will try to do some manual feature engineering.
2. Later, we will apply Lasso to force some coefficients to be zero and compare to our manual approach
3. Additionally, we might apply Ridge to see also the coefficients obtained (less extreme selection).

### 1. Manual

In [49]:
pd.set_option('display.max_columns', None)
filtered_data.head()

Unnamed: 0,variant_id,product_type,order_id,user_id,created_at,order_date,user_order_seq,outcome,ordered_before,abandoned_before,active_snoozed,set_as_regular,normalised_price,discount_pct,vendor,global_popularity,count_adults,count_children,count_babies,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,std_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,std_days_to_buy_product_type
0,33826472919172,ricepastapulses,2807985930372,3482464092292,2020-10-05 16:46:19,2020-10-05 00:00:00,3,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.0,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
1,33826472919172,ricepastapulses,2808027644036,3466586718340,2020-10-05 17:59:51,2020-10-05 00:00:00,2,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.0,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
2,33826472919172,ricepastapulses,2808099078276,3481384026244,2020-10-05 20:08:53,2020-10-05 00:00:00,4,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.0,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
3,33826472919172,ricepastapulses,2808393957508,3291363377284,2020-10-06 08:57:59,2020-10-06 00:00:00,2,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618
5,33826472919172,ricepastapulses,2808434524292,3479090790532,2020-10-06 10:50:23,2020-10-06 00:00:00,3,0.0,0.0,0.0,0.0,0.0,0.081052,0.053512,clearspring,0.038462,2.0,0.0,0.0,0.0,2.0,33.0,42.0,31.134053,30.0,30.0,24.27618


In [82]:
# We will remeber the classification we did in previous notebook:

predicted = ['outcome']
information = ['variant_id', 'order_id', 'user_id', 'created_at', 'order_date']
numerical = ['user_order_seq', 'normalised_price', 'discount_pct', 'global_popularity',
            'count_adults', 'count_children', 'count_babies', 'count_pets', 
            'people_ex_baby', 'days_since_purchase_variant_id', 
            'avg_days_to_buy_variant_id', 'std_days_to_buy_variant_id',
            'days_since_purchase_product_type', 'avg_days_to_buy_product_type',
                'std_days_to_buy_product_type']

categorical = ['product_type', 'vendor']
binary = ['ordered_before', 'abandoned_before', 'active_snoozed', 'set_as_regular']

From numerical: 
- We will remove count_adults, count_children, count_pets and keep only count_adults, which seems to be highly representative. We will maintain count_babies as correlation it is not that high (0.15) and may provide info.
- We will remove std_days_to_buy_product_type and keep avg_days_to_buy_product_type (highly correlated between themselves. We may do it reversely also)
- We will remove std_days_to_buy_variant_id and keep avg_days_to_buy_variant_id (same thing)

From categorical:
- We will remove vendor and keep product_type (the former has too different values)

From binary: 
- We would try to use some resampling technique for grouping the 4 into just 1. We will create a column 'any_event' to input value '1' if any of the four cols has a value of 1. In this sense, we address the unbalanced distribution (a bit), we simplify it and also keep the info if any of these events occured


In [83]:
# Remove numericals
numerical_remove = ['count_adults', 'count_children', 'count_babies',
                    'std_days_to_buy_product_type', 'std_days_to_buy_variant_id']
numerical = [col for col in numerical if col not in numerical_remove]

# Remove categoricals
categorical.remove('vendor')

# Create the binary one
filtered_data['any_event'] = filtered_data[['ordered_before', 'abandoned_before', 'active_snoozed', 'set_as_regular']].any(axis=1).astype(int)
binary = ['any_event']
# I'm thinking if maybe would be interesting to perform a sume here. If any_event = 2 would be stronger than 1 and so, while also keeping info about events.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['any_event'] = filtered_data[['ordered_before', 'abandoned_before', 'active_snoozed', 'set_as_regular']].any(axis=1).astype(int)


Additionally, let's check if order_date is always equal to created_at (maybe some orders are created but not ordered until X days). If so, let's remove order_date (created_at is more has hour and minut info)

In [76]:
if len(filtered_data[filtered_data['order_date'] == filtered_data['created_at']]) == len(filtered_data):
    print("ofc")

ofc


In [84]:
information.remove('order_date')

In [86]:
cols = numerical + categorical + binary + information + predicted
final_data = filtered_data [cols]
print(final_data.shape)
final_data.head()

(2163953, 17)


Unnamed: 0,user_order_seq,normalised_price,discount_pct,global_popularity,count_pets,people_ex_baby,days_since_purchase_variant_id,avg_days_to_buy_variant_id,days_since_purchase_product_type,avg_days_to_buy_product_type,product_type,any_event,variant_id,order_id,user_id,created_at,outcome
0,3,0.081052,0.053512,0.0,0.0,2.0,33.0,42.0,30.0,30.0,ricepastapulses,0,33826472919172,2807985930372,3482464092292,2020-10-05 16:46:19,0.0
1,2,0.081052,0.053512,0.0,0.0,2.0,33.0,42.0,30.0,30.0,ricepastapulses,0,33826472919172,2808027644036,3466586718340,2020-10-05 17:59:51,0.0
2,4,0.081052,0.053512,0.0,0.0,2.0,33.0,42.0,30.0,30.0,ricepastapulses,0,33826472919172,2808099078276,3481384026244,2020-10-05 20:08:53,0.0
3,2,0.081052,0.053512,0.038462,0.0,2.0,33.0,42.0,30.0,30.0,ricepastapulses,0,33826472919172,2808393957508,3291363377284,2020-10-06 08:57:59,0.0
5,3,0.081052,0.053512,0.038462,0.0,2.0,33.0,42.0,30.0,30.0,ricepastapulses,0,33826472919172,2808434524292,3479090790532,2020-10-06 10:50:23,0.0
