# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Libraries

In [1]:
import os

import pandas as pd

from feature_engineering import DataStore
from feature_engineering.features import TipHistory, ReorderedRatio, DynamicFeatureTest1, DynamicFeatureTest2, \
    LastTipSequence, \
    OrderSize, ModeDepartment, PrevTippedProductsRatio, CustomerLifetime, PrevOrderTipped, ProductTipRate, \
    SimOrdersTipRatio

### Data Preparation

In [2]:
# All input files are looked up under <current working directory>/data/Instacart.
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _load_csv(file_name):
    """Read one file located in DATA_DIR with pandas.read_csv and return the DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# order_products tables (prior / train files).
op_prior = _load_csv('order_products__prior.csv.zip')
op_train = _load_csv('order_products__train.csv.zip')

# Tip tables: only the 'order_id' and 'tip' columns are kept from the training
# file, while the test template is loaded with all of its columns.
tip_train = _load_csv('tip_trainingsdaten1_.csv')[['order_id', 'tip']]
tip_test = _load_csv('tip_testdaten1_template.csv')

# Remaining Instacart tables.
orders = _load_csv('orders.csv.zip')
aisles = _load_csv('aisles.csv.zip')
departments = _load_csv('departments.csv.zip')
products = _load_csv('products.csv.zip')

# Hand every table to the project's DataStore and remember the initial row count
# of the order/tip table; the validation cell compares against it later.
data_store = DataStore(op_prior, op_train, tip_train, tip_test, orders, products, aisles, departments)
order_amount = len(data_store.get_orders_tip())

### Feature Engineering

In [3]:
# Column names collected for later use (presumably the model inputs — TODO confirm).
# NOTE(review): 'mode_dept' and 'prev_tipped_products_ratio' are listed here, but
# their compute_feature() calls are disabled further down — confirm this is intended.
features = [
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'tip_history',
    'reordered_ratio',
    'order_size',
    'mode_dept',
    'prev_tipped_products_ratio',
]

# Create one feature object per feature, all bound to the same data store.
tip_history = TipHistory(data_store)
reordered_rate = ReorderedRatio(data_store)
order_size = OrderSize(data_store)
mode_dept = ModeDepartment(data_store)
prev_tipped_products_ratio = PrevTippedProductsRatio(data_store)
customer_lifetime = CustomerLifetime(data_store)
prev_order_tipped = PrevOrderTipped(data_store)

# Static Features
# Computed one after another in this fixed order.
for static_feature in (
    tip_history,
    reordered_rate,
    order_size,
    customer_lifetime,
    prev_order_tipped,
    # mode_dept,                    (currently disabled)
    # prev_tipped_products_ratio,   (currently disabled)
):
    static_feature.compute_feature()



In [4]:
# Display the first 100 rows of the order/tip table held by the data store.
data_store.get_orders_tip().head(100)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,sim_orders_tip_ratio
0,2539329,1,prior,1,2,8,-1,0.0,0.000000
1,2398795,1,prior,2,3,7,15,0.0,-0.375000
2,473747,1,prior,3,3,12,21,0.0,-0.625000
3,2254736,1,prior,4,4,7,29,0.0,-1.470238
4,431534,1,prior,5,4,15,28,0.0,-1.470707
...,...,...,...,...,...,...,...,...,...
95,1916106,12,prior,3,5,8,14,0.0,-0.115385
96,1057378,12,prior,4,3,9,26,0.0,-0.169264
97,221248,12,prior,5,1,9,30,0.0,-0.259559
98,2618231,13,prior,1,6,12,-1,0.0,0.000000


In [None]:
# Display the first 25 rows of the data store's current subset.
# NOTE(review): in file order this cell comes before the first visible
# set_data_subset() call — confirm a subset already exists at this point.
data_store.get_orders_tip_subset().head(25)

In [None]:
# Take the first five rows of every user's orders (in the table's row order)
# and register their order ids as the working subset.
order_ids_by_user = data_store.get_orders_tip().groupby('user_id')['order_id']
order_ids = order_ids_by_user.head(5)
data_store.set_data_subset(order_ids)

# Dynamic Features




In [None]:
# Display the first 25 rows of the subset selected above.
data_store.get_orders_tip_subset().head(25)

### Temporary Validation

In [None]:
# Sanity checks: feature computation should neither add nor drop rows, and the
# subset should contain exactly the selected order ids.
total_orders = len(data_store.get_orders_tip())
subset_orders = len(data_store.get_orders_tip_subset())

print(f"Number of orders: {total_orders}")
print(f"Number of subset orders: {subset_orders}\n")
# order_amount was recorded right after the data store was created.
print(f"Number of orders did not change: {order_amount == total_orders}")
print(f"Number of subset orders did not change: {len(order_ids) == subset_orders}\n")
# Per-column count of missing values in the full order/tip table.
print(f"Number of NaN or null values in each column:\n{data_store.get_orders_tip().isnull().sum()}")

### Temporary Manual Validation

In [None]:
# Manual cross-check of the order size: count the joined rows per order_id.
# NOTE(review): this rebinds `order_size`, which previously referred to the
# OrderSize feature object created in the feature-engineering cell.
orders_joined = data_store.get_orders_joined()
order_size = (
    orders_joined
    .groupby('order_id')['order_number']
    .size()
    .reset_index(name='order_size')
)
order_size.head(25)

In [None]:
# Spot-check a single order: print its aggregated size, then show its joined
# rows so the two can be compared by eye.
example_order_id = 2168274
example_size = order_size.loc[order_size['order_id'] == example_order_id, 'order_size'].values[0]
print(f"Order size example: {example_size}")
orders_joined[orders_joined['order_id'] == example_order_id]

In [None]:
# Inputs for the manual re-computation of 'prev_tipped_products_ratio' that is
# commented out below: the joined order table, its first 1000 rows as a small
# sample, and an independent copy of the order/tip table.
orders_joined = data_store.get_orders_joined()
test = orders_joined.iloc[:1000]
orders_tip = data_store.get_orders_tip().copy()


# def cumulative_union_1(user_orders):
#     cumulative_products = set()
#     for idx, order in user_orders.iterrows():
#         prev_tipped_products = cumulative_products.intersection(order['products'])
#         user_orders.at[idx, 'prev_tipped_products_ratio'] = len(prev_tipped_products) / len(order['products'])
#         if order['tip'] == 1.0:
#             cumulative_products.update(order['products'])
#     return user_orders
# 
# 
# grouped = (test.groupby(['user_id', 'order_number', 'order_id']).agg(
#     products=('product_id', lambda x: set(x)), tip=('tip', 'first'))).reset_index()
# 
# grouped = grouped.groupby('user_id').apply(cumulative_union_1, include_groups=False).reset_index(
#     drop=False).drop(columns='level_1')
# 
# final = pd.merge(orders_tip.drop('prev_tipped_products_ratio', axis=1),
#                  grouped[['user_id', 'order_number', 'prev_tipped_products_ratio']],
#                  on=['user_id', 'order_number'],
#                  how='left')
# # grouped_1.head(25)
# final.head(25)

In [None]:
# final.head(87).equals(orders_tip.head(87))

### Temporary Tests

In [None]:
# Create both dynamic test features on the shared data store, compute them in
# order, then show the first 25 rows of the current subset.
dynamic_feature_test_1 = DynamicFeatureTest1(data_store)
dynamic_feature_test_2 = DynamicFeatureTest2(data_store)
for dynamic_feature in (dynamic_feature_test_1, dynamic_feature_test_2):
    dynamic_feature.compute_feature()

data_store.get_orders_tip_subset().head(25)

In [None]:
# Shrink the working subset to the first two rows per user, recompute the first
# dynamic test feature on it and show the result.
# NOTE(review): only dynamic_feature_test_1 is recomputed here, not
# dynamic_feature_test_2 — confirm that is intended.
order_ids = data_store.get_orders_tip().groupby('user_id').head(2)['order_id']
data_store.set_data_subset(order_ids)

dynamic_feature_test_1.compute_feature()
data_store.get_orders_tip_subset().head(25)

### Analysis

### Model Training & Evaluation