# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Libraries

In [1]:
import os

import pandas as pd

from feature_engineering import DataStore
from feature_engineering.features import TipHistory, ReorderedRatio, DynamicFeatureTest1, DynamicFeatureTest2, \
    OrderSize, ModeDepartment, PrevTippedProductsRatio, SumDaysSincePriorOrder, AvgSizePrevOrders

### Data Preparation

In [2]:
# Folder holding the raw Instacart files, resolved against the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _read_instacart_csv(file_name):
    """Load a single file from DATA_DIR with pd.read_csv."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Order/product line items for the prior and train splits.
op_prior = _read_instacart_csv('order_products__prior.csv.zip')
op_train = _read_instacart_csv('order_products__train.csv.zip')

# Tip data: only the order id and the tip column are kept from the training file.
tip_train = _read_instacart_csv('tip_trainingsdaten1_.csv')[['order_id', 'tip']]
tip_test = _read_instacart_csv('tip_testdaten1_template.csv')

# Order headers and product lookup tables.
orders = _read_instacart_csv('orders.csv.zip')
aisles = _read_instacart_csv('aisles.csv.zip')
departments = _read_instacart_csv('departments.csv.zip')
products = _read_instacart_csv('products.csv.zip')

data_store = DataStore(op_prior, op_train, tip_train, tip_test, orders, products, aisles, departments)

# Row count of the orders/tip table before any feature is computed; compared again in the validation cell.
order_amount = len(data_store.get_orders_tip())

### Feature Engineering

In [3]:
# Names of the columns intended as model inputs.
# NOTE(review): 'mode_dept' is listed here, but its computation is disabled below —
# reconcile before using `features` to select columns.
features = [
    'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order',
    'tip_history', 'reordered_ratio', 'order_size', 'mode_dept',
    'prev_tipped_products_ratio', 'sum_days_since_prior_order', 'avg_size_prev_orders',
]

# One feature object per engineered column, all bound to the shared data store.
tip_history = TipHistory(data_store)
reordered_rate = ReorderedRatio(data_store)
order_size = OrderSize(data_store)
mode_dept = ModeDepartment(data_store)
prev_tipped_products_ratio = PrevTippedProductsRatio(data_store)
sum_days_since_prior_order = SumDaysSincePriorOrder(data_store)
avg_size_prev_orders = AvgSizePrevOrders(data_store)

# Static Features
# Computed in this fixed order; `mode_dept` is deliberately left out (its compute_feature() call is disabled).
for static_feature in (
    tip_history,
    reordered_rate,
    order_size,
    sum_days_since_prior_order,
    avg_size_prev_orders,
    prev_tipped_products_ratio,
):
    static_feature.compute_feature()



In [5]:
# Preview the first 25 rows of the full orders/tip table to eyeball the feature columns computed above.
data_store.get_orders_tip().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,order_size,sum_days_since_prior_order,avg_size_prev_orders,prev_tipped_products_ratio
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,5,1248.0,7.0,0.0
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,6,1248.0,7.0,0.0
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6,5,1248.0,7.0,0.0
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0,5,1248.0,7.0,0.0
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625,8,1248.0,7.0,0.0
5,3367565,1,prior,6,2,7,19,0.0,0.0,1.0,4,1248.0,7.0,0.0
6,550135,1,prior,7,1,9,20,0.0,0.0,1.0,5,1248.0,7.0,0.0
7,3108588,1,prior,8,1,14,14,0.0,0.0,0.666667,6,1248.0,7.0,0.0
8,2295261,1,prior,9,1,16,0,0.0,0.0,1.0,6,1248.0,7.0,0.0
9,2550362,1,prior,10,4,8,30,0.0,0.0,0.666667,9,1248.0,7.0,0.0


In [20]:
# Preview the first 25 rows of the subset table.
# NOTE(review): data_store.set_data_subset(...) is first called in the cell below, so on a
# fresh kernel (Restart & Run All) this cell runs before any subset was selected — confirm
# what get_orders_tip_subset() returns in that state, or move this preview after the selection.
data_store.get_orders_tip_subset().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,order_size,prev_tipped_products_ratio
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,5,0.0
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,6,0.0
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6,5,0.0
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0,5,0.0
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625,8,0.0
5,3367565,1,prior,6,2,7,19,0.0,0.0,1.0,4,0.0
6,550135,1,prior,7,1,9,20,0.0,0.0,1.0,5,0.0
7,3108588,1,prior,8,1,14,14,0.0,0.0,0.666667,6,0.0
8,2295261,1,prior,9,1,16,0,0.0,0.0,1.0,6,0.0
9,2550362,1,prior,10,4,8,30,0.0,0.0,0.666667,9,0.0


In [21]:
# Restrict the working subset to the first five rows per user of the orders/tip table
# (groupby(...).head(5) keeps the first 5 rows of every user_id group in their existing order).
order_ids = (
    data_store.get_orders_tip()
    .groupby('user_id')['order_id']
    .head(5)
)
data_store.set_data_subset(order_ids)

# Dynamic Features (none are computed in this cell yet)




In [22]:
# Preview the subset after the selection above: at most five rows per user are expected.
data_store.get_orders_tip_subset().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,order_size,prev_tipped_products_ratio
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,5,0.0
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,6,0.0
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6,5,0.0
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0,5,0.0
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625,8,0.0
11,2168274,2,prior,1,2,11,-1,0.0,-1.0,0.0,13,0.0
12,1501582,2,prior,2,5,10,10,0.0,0.0,0.166667,6,0.0
13,1901567,2,prior,3,1,10,3,1.0,0.0,0.6,5,0.0
14,738281,2,prior,4,2,10,8,0.0,0.333333,0.076923,13,0.076923
15,1673511,2,prior,5,3,11,8,1.0,0.25,0.076923,13,0.076923


### Temporary Validation

In [23]:
# Sanity checks after feature computation (results are printed, nothing is asserted).
# Sizes of the full table and of the current subset.
print(f"Number of orders: {len(data_store.get_orders_tip())}")
print(f"Number of subset orders: {len(data_store.get_orders_tip_subset())}\n")
# The full table must still have as many rows as recorded in `order_amount` during data preparation.
print(f"Number of orders did not change: {order_amount == len(data_store.get_orders_tip())}")
# The subset must contain exactly one row per selected order id.
print(f"Number of subset orders did not change: {len(order_ids) == len(data_store.get_orders_tip_subset())}\n")
# Per-column count of missing values in the full table.
print(f"Number of NaN or null values in each column:\n{data_store.get_orders_tip().isnull().sum()}")

Number of orders: 3346083
Number of subset orders: 991222

Number of orders did not change: True
Number of subset orders did not change: True

Number of NaN or null values in each column:
order_id                           0
user_id                            0
eval_set                           0
order_number                       0
order_dow                          0
order_hour_of_day                  0
days_since_prior_order             0
tip                           131209
tip_history                        0
reordered_ratio                    0
order_size                         0
prev_tipped_products_ratio         0
dtype: int64


### Temporary Manual Validation

In [24]:
# Manual cross-check of the order-size feature: count the rows of the joined table per order.
# NOTE: this rebinds `order_size`, which held the OrderSize feature object in the feature-engineering cell.
orders_joined = data_store.get_orders_joined()
order_size = (
    orders_joined
    .groupby('order_id')['order_number']
    .size()
    .reset_index(name='order_size')
)
order_size.head(25)

Unnamed: 0,order_id,order_size
0,1,8
1,2,9
2,3,8
3,4,13
4,5,26
5,6,3
6,7,2
7,8,1
8,9,15
9,10,15


In [25]:
# Spot-check a single order (id 2168274): print its manually counted size, then display
# its rows from the joined table so the count can be verified by hand.
print(f"Order size example: {order_size[order_size['order_id'] == 2168274]['order_size'].values[0]}")
orders_joined[orders_joined['order_id'] == 2168274]

Order size example: 13


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
70,2168274,2,prior,1,2,11,,0.0,32792,1,0,Chipotle Beef & Pork Realstick,23,19,popcorn jerky,snacks
71,2168274,2,prior,1,2,11,,0.0,47766,2,0,Organic Avocado,24,4,fresh fruits,produce
72,2168274,2,prior,1,2,11,,0.0,20574,3,0,Roasted Turkey,96,20,lunch meat,deli
73,2168274,2,prior,1,2,11,,0.0,12000,4,0,Baked Organic Sea Salt Crunchy Pea Snack,72,13,condiments,pantry
74,2168274,2,prior,1,2,11,,0.0,48110,5,0,Thin Stackers Brown Rice Lightly Salted,78,19,crackers,snacks
75,2168274,2,prior,1,2,11,,0.0,22474,6,0,Cheddar Bunnies Snack Crackers,78,19,crackers,snacks
76,2168274,2,prior,1,2,11,,0.0,16589,7,0,Plantain Chips,107,19,chips pretzels,snacks
77,2168274,2,prior,1,2,11,,0.0,35917,8,0,Organic Just Concord Grape Juice,98,7,juice nectars,beverages
78,2168274,2,prior,1,2,11,,0.0,27344,9,0,Uncured Genoa Salami,96,20,lunch meat,deli
79,2168274,2,prior,1,2,11,,0.0,30489,10,0,Original Hummus,67,20,fresh dips tapenades,deli


In [28]:
# Inputs for the manual re-computation of 'prev_tipped_products_ratio'.
orders_joined = data_store.get_orders_joined()
# Work on the first 1000 rows of the joined table only, to keep the check fast.
# NOTE(review): a positional cut may split the last order in the slice — confirm that only
# fully contained orders are compared afterwards.
test = orders_joined.iloc[:1000]
# Copy of the orders/tip table, so the comparison frame is independent of the data store.
orders_tip = data_store.get_orders_tip().copy()


def cumulative_union_1(user_orders):
    """Compute, per order of one user, the share of products seen in earlier tipped orders.

    Rows are processed in their existing order. For each row the ratio is
    ``len(products & products_of_previously_tipped_orders) / len(products)``;
    afterwards the row's products are added to the running set if its ``tip``
    equals 1.0 (a missing tip never counts as tipped).

    Args:
        user_orders: DataFrame with the orders of a single user. It needs a
            ``products`` column (a set of product ids per row) and a ``tip`` column.

    Returns:
        A copy of ``user_orders`` with an added float column
        ``prev_tipped_products_ratio``. The input frame is left untouched, because
        mutating the group handed in by ``groupby(...).apply`` is not supported by
        pandas. Rows with an empty ``products`` set get a ratio of 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    tipped_products = set()
    ratios = []
    # Iterate positionally over the two columns: this avoids the slow iterrows() and
    # label-based writes, which would hit several rows at once on duplicate index labels.
    for products, tip in zip(user_orders['products'], user_orders['tip']):
        n_products = len(products)
        overlap = tipped_products.intersection(products)
        ratios.append(len(overlap) / n_products if n_products else 0.0)
        # Only orders that were tipped extend the set used for the following orders.
        if tip == 1.0:
            tipped_products.update(products)
    return user_orders.assign(prev_tipped_products_ratio=ratios).astype(
        {'prev_tipped_products_ratio': 'float64'})


# One row per (user, order): the set of product ids in the order and the order's tip value.
grouped = (
    test.groupby(['user_id', 'order_number', 'order_id'])
    .agg(products=('product_id', lambda ids: set(ids)), tip=('tip', 'first'))
    .reset_index()
)

# Recompute the ratio user by user, then flatten the (user_id, row) index back into columns.
grouped = (
    grouped.groupby('user_id')
    .apply(cumulative_union_1, include_groups=False)
    .reset_index(drop=False)
    .drop(columns='level_1')
)

# Swap the stored feature column for the manually recomputed one (matched on user and order number).
final = orders_tip.drop(columns='prev_tipped_products_ratio').merge(
    grouped[['user_id', 'order_number', 'prev_tipped_products_ratio']],
    on=['user_id', 'order_number'],
    how='left',
)
final.head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,order_size,prev_tipped_products_ratio
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,5,0.0
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,6,0.0
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6,5,0.0
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0,5,0.0
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625,8,0.0
5,3367565,1,prior,6,2,7,19,0.0,0.0,1.0,4,0.0
6,550135,1,prior,7,1,9,20,0.0,0.0,1.0,5,0.0
7,3108588,1,prior,8,1,14,14,0.0,0.0,0.666667,6,0.0
8,2295261,1,prior,9,1,16,0,0.0,0.0,1.0,6,0.0
9,2550362,1,prior,10,4,8,30,0.0,0.0,0.666667,9,0.0


In [31]:
# Check that the manual recomputation reproduces the stored feature on the first 87 rows.
# NOTE(review): 87 is presumably the number of leading orders fully covered by the 1000-row
# `test` slice — confirm; orders outside that slice have no match in the left merge.
final.head(87).equals(orders_tip.head(87))

True

### Temporary Tests

In [32]:
# Instantiate both dynamic test features first, then compute them in declaration order.
dynamic_feature_test_1 = DynamicFeatureTest1(data_store)
dynamic_feature_test_2 = DynamicFeatureTest2(data_store)
for dynamic_feature in (dynamic_feature_test_1, dynamic_feature_test_2):
    dynamic_feature.compute_feature()

# Show the subset so the two new columns can be inspected.
data_store.get_orders_tip_subset().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,order_size,prev_tipped_products_ratio,dynamic_feature_test_1,dynamic_feature_test_2
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,5,0.0,1,1
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,6,0.0,0,2
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6,5,0.0,1,0
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0,5,0.0,0,1
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625,8,0.0,1,2
5,2168274,2,prior,1,2,11,-1,0.0,-1.0,0.0,13,0.0,1,1
6,1501582,2,prior,2,5,10,10,0.0,0.0,0.166667,6,0.0,0,2
7,1901567,2,prior,3,1,10,3,1.0,0.0,0.6,5,0.0,1,0
8,738281,2,prior,4,2,10,8,0.0,0.333333,0.076923,13,0.076923,0,1
9,1673511,2,prior,5,3,11,8,1.0,0.25,0.076923,13,0.076923,1,2


In [33]:
# Shrink the working subset to the first two rows per user of the orders/tip table.
order_ids = (
    data_store.get_orders_tip()
    .groupby('user_id')['order_id']
    .head(2)
)
data_store.set_data_subset(order_ids)

# Recompute only the first test feature on the new subset and inspect the result.
dynamic_feature_test_1.compute_feature()
data_store.get_orders_tip_subset().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,order_size,prev_tipped_products_ratio,dynamic_feature_test_1
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,5,0.0,1
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,6,0.0,0
11,2168274,2,prior,1,2,11,-1,0.0,-1.0,0.0,13,0.0,1
12,1501582,2,prior,2,5,10,10,0.0,0.0,0.166667,6,0.0,0
26,1374495,3,prior,1,1,14,-1,1.0,-1.0,0.0,10,0.0,1
27,444309,3,prior,2,3,19,9,1.0,1.0,0.333333,9,0.333333,0
38,3343014,4,prior,1,6,11,-1,0.0,-1.0,0.0,4,0.0,1
39,2030307,4,prior,2,4,11,19,0.0,0.0,0.0,2,0.0,0
43,2717275,5,prior,1,3,12,-1,0.0,-1.0,0.0,11,0.0,1
44,1909121,5,prior,2,0,16,11,1.0,0.0,0.444444,9,0.0,0


### Analysis

### Model Training & Evaluation