# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Libraries

In [1]:
import os

import pandas as pd

from feature_engineering import DataManager
from feature_engineering.features import TipHistory, ReorderedRatio, DynamicFeatureTest1, DynamicFeatureTest2, \
    OrderSize, ModeDepartment, PrevTippedProductsRatio, CustomerLifetime, PrevOrderTipped, OrderFrequency, \
    SimOrdersTipRatio, ProductTipRate, DepartmentTipRate, AisleTipRate, OrderExcessLikability, AvgSizePrevOrders, \
    MeanOrderedRate, LastTipSequence, RelDaysSinceTip, DaysSinceTip

### Data Preparation

In [2]:
# All raw input files are looked up under <current working directory>/data/Instacart.
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _read_instacart_csv(filename):
    """Read the file named `filename` inside DATA_DIR with pandas.read_csv and return the frame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename))


# Order/product tables (file names: "prior" and "train").
op_prior = _read_instacart_csv('order_products__prior.csv.zip')
op_train = _read_instacart_csv('order_products__train.csv.zip')

# Tip tables: the training file is reduced to its order_id and tip columns,
# the test template is loaded unchanged.
tip_train = _read_instacart_csv('tip_trainingsdaten1_.csv')[['order_id', 'tip']]
tip_test = _read_instacart_csv('tip_testdaten1_template.csv')

# Orders plus the aisle / department / product tables.
orders = _read_instacart_csv('orders.csv.zip')
aisles = _read_instacart_csv('aisles.csv.zip')
departments = _read_instacart_csv('departments.csv.zip')
products = _read_instacart_csv('products.csv.zip')

# Note the argument order: products comes before aisles and departments.
data_manager = DataManager(op_prior, op_train, tip_train, tip_test, orders, products, aisles, departments)

# Row count of the orders/tip frame right after construction; later cells compare against it.
order_amount = len(data_manager.get_orders_tip())

### Feature Engineering

In [3]:
# Commented-out list of column names; it is not used by any code in this cell.
# features = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'tip_history',
#             'reordered_ratio', 'order_size', 'mode_dept', 'prev_tipped_products_ratio']

# Create one instance per feature class imported from feature_engineering.features.
# This cell only builds the objects; the register_feature calls happen in the next cell.
# The groups are labelled with first names (presumably the responsible team member).

# Kilian
tip_history = TipHistory()
# NOTE(review): the variable is called *_rate while the class is ReorderedRatio.
reordered_rate = ReorderedRatio()
order_size = OrderSize()
prev_tipped_products_ratio = PrevTippedProductsRatio()
customer_lifetime = CustomerLifetime()
prev_order_tipped = PrevOrderTipped()

# Daniel
mode_dept = ModeDepartment()
order_frequency = OrderFrequency()
mean_ordered_rate = MeanOrderedRate()
rel_days_since_tip = RelDaysSinceTip()
days_since_tip = DaysSinceTip()

# Max
sim_orders_tip_ratio = SimOrdersTipRatio()
product_tip_rate = ProductTipRate()
department_tip_rate = DepartmentTipRate()
aisle_tip_rate = AisleTipRate()
# NOTE(review): the register_feature call for last_tip_sequence is commented out in the
# registration cell, so this instance appears unused in the visible part of the notebook.
last_tip_sequence = LastTipSequence()

# Tom
order_excess_likability = OrderExcessLikability()  # TODO
avg_size_prev_orders = AvgSizePrevOrders()  # TODO

# Test
# Presumably placeholder features for exercising the dynamic-feature mechanism — TODO confirm
# whether they should stay in the submission.
dynamic_feature_test_1 = DynamicFeatureTest1()
dynamic_feature_test_2 = DynamicFeatureTest2()

In [4]:
# Static Features
data_manager.register_feature(tip_history)
data_manager.register_feature(reordered_rate)
data_manager.register_feature(order_size)
data_manager.register_feature(customer_lifetime)
data_manager.register_feature(prev_order_tipped)
data_manager.register_feature(prev_tipped_products_ratio)

data_manager.register_feature(mode_dept)
data_manager.register_feature(order_frequency)
data_manager.register_feature(sim_orders_tip_ratio)
data_manager.register_feature(order_excess_likability)
data_manager.register_feature(avg_size_prev_orders)
data_manager.register_feature(mean_ordered_rate)
# data_manager.register_feature(last_tip_sequence)
data_manager.register_feature(rel_days_since_tip)
data_manager.register_feature(days_since_tip)

# Dynamic Features
data_manager.register_feature(dynamic_feature_test_1)
data_manager.register_feature(dynamic_feature_test_2)
data_manager.register_feature(product_tip_rate)
data_manager.register_feature(department_tip_rate)
data_manager.register_feature(aisle_tip_rate)

In [5]:
# Trigger feature computation on the data manager
# (presumably for every feature registered in the previous cell — confirm in DataManager).
data_manager.compute_features()

  orders_tip_copy['tip_temp'] = orders_tip_copy['tip'].fillna(0).astype(int)


In [6]:
# Show the first 100 rows of the frame returned by get_orders_tip() as the cell output.
data_manager.get_orders_tip().head(100)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,prev_tipped_products_ratio,customer_lifetime,...,prev_order_tipped,reordered_ratio,sim_orders_tip_ratio,mode_dept,order_excess_likability,aisle_tip_rate,dynamic_feature_test_1,dynamic_feature_test_2,dept_tip_rate,product_tip_rate
0,2539329,1,prior,1,2,8,,0.0,0.0,0,...,-1,0.000000,0.000000,19,0.000000,0.422794,0.090909,0.090909,0.43166,0.392608
1,2398795,1,prior,2,3,7,15.0,0.0,0.0,15,...,0.0,0.500000,-0.187500,19,0.500000,0.435643,0.181818,0.181818,0.435526,0.436999
2,473747,1,prior,3,3,12,21.0,0.0,0.0,36,...,0.0,0.600000,-0.208333,19,0.600000,0.418555,0.272727,0.272727,0.423082,0.432942
3,2254736,1,prior,4,4,7,29.0,0.0,0.0,65,...,0.0,1.000000,-0.367560,19,1.000000,0.423573,0.363636,0.363636,0.43166,0.407192
4,431534,1,prior,5,4,15,28.0,0.0,0.0,93,...,0.0,0.625000,-0.294141,4,0.625000,0.463958,0.454545,0.454545,0.454249,0.468226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1916106,12,prior,3,5,8,14.0,0.0,0.0,44,...,0.0,0.250000,-0.038462,19,0.250000,0.406478,0.600000,0.600000,0.415053,0.419411
96,1057378,12,prior,4,3,9,26.0,0.0,0.0,70,...,0.0,0.250000,-0.042316,4,0.250000,0.448863,0.800000,0.800000,0.442395,0.453567
97,221248,12,prior,5,1,9,30.0,0.0,0.0,100,...,0.0,0.227273,-0.051912,4,0.227273,0.437352,1.000000,1.000000,0.442676,0.454669
98,2618231,13,prior,1,6,12,,0.0,0.0,0,...,-1,0.000000,0.000000,16,0.000000,0.468568,0.076923,0.076923,0.466173,0.451617


In [7]:
# Sanity report: current row count, comparison with the stored order_amount,
# and the number of null entries per column.
current_order_count = len(data_manager.get_orders_tip())
print(f"Number of orders: {current_order_count}")

count_unchanged = order_amount == len(data_manager.get_orders_tip())
print(f"Number of orders did not change: {count_unchanged}\n")

null_counts = data_manager.get_orders_tip().isnull().sum()
print(f"Number of NaN or null values in each column:\n{null_counts}")

Number of orders: 3346083
Number of orders did not change: True
Number of NaN or null values in each column:
order_id                            0
user_id                             0
eval_set                            0
order_number                        0
order_dow                           0
order_hour_of_day                   0
days_since_prior_order         206209
tip                            131209
prev_tipped_products_ratio          0
customer_lifetime                   0
rel_days_since_tip            1273798
order_frequency                     0
order_size                          0
avg_size_prev_orders                0
mean_ordered_rate                   0
days_since_tip                      0
tip_history                         0
prev_order_tipped                   0
reordered_ratio                     0
sim_orders_tip_ratio                0
mode_dept                           0
order_excess_likability             0
aisle_tip_rate                      0
dynamic_feature_t

In [8]:
# From the full orders/tip frame, take the order_id of the first five rows of each user_id group.
order_ids = (
    data_manager
    .get_orders_tip(full=True)
    .groupby('user_id')['order_id']
    .head(5)
)
# Rebind order_amount to the size of this selection; the order-count check compares against it.
order_amount = len(order_ids)

In [9]:
# Pass the selected order IDs to the data manager via set_subset
# (presumably restricts what get_orders_tip() returns afterwards — confirm in DataManager).
data_manager.set_subset(order_ids)

In [10]:
# Show the first 25 rows of the frame returned by get_orders_tip() as the cell output.
data_manager.get_orders_tip().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,prev_tipped_products_ratio,customer_lifetime,...,prev_order_tipped,reordered_ratio,sim_orders_tip_ratio,mode_dept,order_excess_likability,aisle_tip_rate,dynamic_feature_test_1,dynamic_feature_test_2,dept_tip_rate,product_tip_rate
0,2539329,1,prior,1,2,8,-1.0,0.0,0.0,0,...,-1.0,0.0,0.0,19,0.0,0.339618,0.2,0.2,0.346443,0.305662
1,2398795,1,prior,2,3,7,15.0,0.0,0.0,15,...,0.0,0.5,-0.1875,19,0.5,0.350406,0.4,0.4,0.34929,0.342736
2,473747,1,prior,3,3,12,21.0,0.0,0.0,36,...,0.0,0.6,-0.208333,19,0.6,0.331775,0.6,0.6,0.336445,0.328557
3,2254736,1,prior,4,4,7,29.0,0.0,0.0,65,...,0.0,1.0,-0.36756,19,1.0,0.339673,0.8,0.8,0.346443,0.304496
4,431534,1,prior,5,4,15,28.0,0.0,0.0,93,...,0.0,0.625,-0.294141,4,0.625,0.37799,1.0,1.0,0.367828,0.372122
5,2168274,2,prior,1,2,11,-1.0,0.0,0.0,0,...,-1.0,0.0,0.0,4,0.0,0.353337,0.2,0.2,0.349107,0.388366
6,1501582,2,prior,2,5,10,10.0,0.0,0.0,10,...,0.0,0.166667,-0.027778,4,0.166667,0.36929,0.4,0.4,0.36711,0.362496
7,1901567,2,prior,3,1,10,3.0,1.0,0.0,13,...,0.0,0.6,-0.1,20,0.6,0.364262,0.6,0.6,0.364443,0.394544
8,738281,2,prior,4,2,10,8.0,0.0,0.076923,21,...,1.0,0.076923,0.048824,20,0.076923,0.338992,0.8,0.8,0.333335,0.345744
9,1673511,2,prior,5,3,11,8.0,1.0,0.076923,29,...,0.0,0.076923,0.039712,16,0.076923,0.391448,1.0,1.0,0.378839,0.419061


In [11]:
# Sanity report: current row count, comparison with the stored order_amount,
# and the number of null entries per column.
current_order_count = len(data_manager.get_orders_tip())
print(f"Number of orders: {current_order_count}")

count_unchanged = order_amount == len(data_manager.get_orders_tip())
print(f"Number of orders did not change: {count_unchanged}\n")

null_counts = data_manager.get_orders_tip().isnull().sum()
print(f"Number of NaN or null values in each column:\n{null_counts}")

Number of orders: 991222
Number of orders did not change: True

Number of NaN or null values in each column:
order_id                           0
user_id                            0
eval_set                           0
order_number                       0
order_dow                          0
order_hour_of_day                  0
days_since_prior_order             0
tip                            27739
prev_tipped_products_ratio         0
customer_lifetime                  0
rel_days_since_tip            799043
order_frequency                    0
order_size                         0
avg_size_prev_orders               0
mean_ordered_rate                  0
days_since_tip                     0
tip_history                        0
prev_order_tipped                  0
reordered_ratio                    0
sim_orders_tip_ratio               0
mode_dept                          0
order_excess_likability            0
aisle_tip_rate                     0
dynamic_feature_test_1             0
dyn

In [12]:
# From the full orders/tip frame, take the order_id of the first four rows of each user_id group.
order_ids = (
    data_manager
    .get_orders_tip(full=True)
    .groupby('user_id')['order_id']
    .head(4)
)
# Rebind order_amount to the size of this selection; the order-count check compares against it.
order_amount = len(order_ids)

# Hand the selection to the data manager, then display the first 25 rows it returns.
data_manager.set_subset(order_ids)
data_manager.get_orders_tip().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,prev_tipped_products_ratio,customer_lifetime,...,prev_order_tipped,reordered_ratio,sim_orders_tip_ratio,mode_dept,order_excess_likability,aisle_tip_rate,dynamic_feature_test_1,dynamic_feature_test_2,dept_tip_rate,product_tip_rate
0,2539329,1,prior,1,2,8,-1.0,0.0,0.0,0,...,-1.0,0.0,0.0,19,0.0,0.337544,0.25,0.25,0.343805,0.306204
1,2398795,1,prior,2,3,7,15.0,0.0,0.0,15,...,0.0,0.5,-0.1875,19,0.5,0.347567,0.5,0.5,0.346357,0.340457
2,473747,1,prior,3,3,12,21.0,0.0,0.0,36,...,0.0,0.6,-0.208333,19,0.6,0.328416,0.75,0.75,0.333293,0.325764
3,2254736,1,prior,4,4,7,29.0,0.0,0.0,65,...,0.0,1.0,-0.36756,19,1.0,0.337082,1.0,1.0,0.343805,0.302408
4,2168274,2,prior,1,2,11,-1.0,0.0,0.0,0,...,-1.0,0.0,0.0,4,0.0,0.350058,0.25,0.25,0.345703,0.385917
5,1501582,2,prior,2,5,10,10.0,0.0,0.0,10,...,0.0,0.166667,-0.027778,4,0.166667,0.366224,0.5,0.5,0.363978,0.358395
6,1901567,2,prior,3,1,10,3.0,1.0,0.0,13,...,0.0,0.6,-0.1,20,0.6,0.361089,0.75,0.75,0.361157,0.393484
7,738281,2,prior,4,2,10,8.0,0.0,0.076923,21,...,1.0,0.076923,0.048824,20,0.076923,0.335374,1.0,1.0,0.329717,0.348398
8,1374495,3,prior,1,1,14,-1.0,1.0,0.0,0,...,-1.0,0.0,0.0,4,0.0,0.389935,0.25,0.25,0.379997,0.377808
9,444309,3,prior,2,3,19,9.0,1.0,0.333333,9,...,1.0,0.333333,0.1875,16,0.333333,0.364977,0.5,0.5,0.367403,0.376597


In [13]:
# Sanity report: current row count, comparison with the stored order_amount,
# and the number of null entries per column.
current_order_count = len(data_manager.get_orders_tip())
print(f"Number of orders: {current_order_count}")

count_unchanged = order_amount == len(data_manager.get_orders_tip())
print(f"Number of orders did not change: {count_unchanged}\n")

null_counts = data_manager.get_orders_tip().isnull().sum()
print(f"Number of NaN or null values in each column:\n{null_counts}")

Number of orders: 816150
Number of orders did not change: True

Number of NaN or null values in each column:
order_id                           0
user_id                            0
eval_set                           0
order_number                       0
order_dow                          0
order_hour_of_day                  0
days_since_prior_order             0
tip                            15300
prev_tipped_products_ratio         0
customer_lifetime                  0
rel_days_since_tip            699015
order_frequency                    0
order_size                         0
avg_size_prev_orders               0
mean_ordered_rate                  0
days_since_tip                     0
tip_history                        0
prev_order_tipped                  0
reordered_ratio                    0
sim_orders_tip_ratio               0
mode_dept                          0
order_excess_likability            0
aisle_tip_rate                     0
dynamic_feature_test_1             0
dyn

### Analysis

In [ ]:
# NOTE(review): this import sits outside the "Libraries" cell at the top of the notebook,
# and the cell has no execution count (it has not been run in this saved state).
from analysis import ExampleAnalysis

# Construct the analysis with the shared data manager, then call analyze() followed by plot().
analysis = ExampleAnalysis(data_manager)
analysis.analyze()
analysis.plot()

### Model Training & Evaluation