# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Libraries

In [8]:
import os

import pandas as pd

from feature_engineering import DataManager
from feature_engineering.features import TipHistory, ReorderedRatio, DynamicFeatureTest1, DynamicFeatureTest2, \
    OrderSize, ModeDepartment, PrevTippedProductsRatio, CustomerLifetime, PrevOrderTipped, OrderFrequency, \
    SimOrdersTipRatio, ProductTipRate, DepartmentTipRate, AisleTipRate, OrderExcessLikability, AvgSizePrevOrders, \
    MeanOrderedRate, LastTipSequence, RelDaysSinceTip, DaysSinceTip

### Data Preparation

In [9]:
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')

op_prior = pd.read_csv(os.path.join(DATA_DIR, 'order_products__prior.csv.zip'))
op_train = pd.read_csv(os.path.join(DATA_DIR, 'order_products__train.csv.zip'))

tip_train = pd.read_csv(os.path.join(DATA_DIR, 'tip_trainingsdaten1_.csv'))[['order_id', 'tip']]
tip_test = pd.read_csv(os.path.join(DATA_DIR, 'tip_testdaten1_template.csv'))

orders = pd.read_csv(os.path.join(DATA_DIR, 'orders.csv.zip'))
aisles = pd.read_csv(os.path.join(DATA_DIR, 'aisles.csv.zip'))
departments = pd.read_csv(os.path.join(DATA_DIR, 'departments.csv.zip'))
products = pd.read_csv(os.path.join(DATA_DIR, 'products.csv.zip'))

data_manager = DataManager(op_prior, op_train, tip_train, tip_test, orders, products, aisles, departments)
order_amount = len(data_manager.get_orders_tip())

### Feature Engineering

In [10]:
# features = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'tip_history',
#             'reordered_ratio', 'order_size', 'mode_dept', 'prev_tipped_products_ratio']

# Kilian
tip_history = TipHistory()
reordered_rate = ReorderedRatio()
order_size = OrderSize()
prev_tipped_products_ratio = PrevTippedProductsRatio()
customer_lifetime = CustomerLifetime()
prev_order_tipped = PrevOrderTipped()

# Daniel
mode_dept = ModeDepartment()
order_frequency = OrderFrequency()
mean_ordered_rate = MeanOrderedRate()
rel_days_since_tip = RelDaysSinceTip()
days_since_tip = DaysSinceTip()

# Max
sim_orders_tip_ratio = SimOrdersTipRatio()
product_tip_rate = ProductTipRate()
department_tip_rate = DepartmentTipRate()
aisle_tip_rate = AisleTipRate()
last_tip_sequence = LastTipSequence()

# Tom
order_excess_likability = OrderExcessLikability()  # TODO
avg_size_prev_orders = AvgSizePrevOrders()  # TODO

# Test
dynamic_feature_test_1 = DynamicFeatureTest1()
dynamic_feature_test_2 = DynamicFeatureTest2()

In [11]:
# Static Features
data_manager.register_feature(tip_history)
data_manager.register_feature(reordered_rate)
data_manager.register_feature(order_size)
data_manager.register_feature(customer_lifetime)
data_manager.register_feature(prev_order_tipped)
# data_manager.register_feature(prev_tipped_products_ratio)

# data_manager.register_feature(mode_dept)
# data_manager.register_feature(order_frequency)
# data_manager.register_feature(sim_orders_tip_ratio)
# data_manager.register_feature(order_excess_likability)
# data_manager.register_feature(avg_size_prev_orders)
# data_manager.register_feature(mean_ordered_rate)
# data_manager.register_feature(last_tip_sequence)
# data_manager.register_feature(rel_days_since_tip)
# data_manager.register_feature(days_since_tip)
# 
# # Dynamic Features
data_manager.register_feature(dynamic_feature_test_1)
data_manager.register_feature(dynamic_feature_test_2)
# data_manager.register_feature(product_tip_rate)
# data_manager.register_feature(department_tip_rate)
# data_manager.register_feature(aisle_tip_rate)

In [12]:
data_manager.compute_features()

In [13]:
data_manager.get_orders_tip().head(100)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,order_size,customer_lifetime,tip_history,reordered_ratio,prev_order_tipped,dynamic_feature_test_1,dynamic_feature_test_2
0,2539329,1,prior,1,2,8,,0.0,5,0,-1.0,0.000000,-1,0.090909,0.090909
1,2398795,1,prior,2,3,7,15.0,0.0,6,15,0.0,0.500000,0.0,0.181818,0.181818
2,473747,1,prior,3,3,12,21.0,0.0,5,36,0.0,0.600000,0.0,0.272727,0.272727
3,2254736,1,prior,4,4,7,29.0,0.0,5,65,0.0,1.000000,0.0,0.363636,0.363636
4,431534,1,prior,5,4,15,28.0,0.0,8,93,0.0,0.625000,0.0,0.454545,0.454545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1916106,12,prior,3,5,8,14.0,0.0,12,44,0.0,0.250000,0.0,0.600000,0.600000
96,1057378,12,prior,4,3,9,26.0,0.0,20,70,0.0,0.250000,0.0,0.800000,0.800000
97,221248,12,prior,5,1,9,30.0,0.0,22,100,0.0,0.227273,0.0,1.000000,1.000000
98,2618231,13,prior,1,6,12,,0.0,5,0,-1.0,0.000000,-1,0.076923,0.076923


In [14]:
data_manager.export_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [15]:
data_manager.import_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [16]:
data_manager.get_orders_tip().head(100)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,order_size,customer_lifetime,tip_history,reordered_ratio,prev_order_tipped,dynamic_feature_test_1,dynamic_feature_test_2
0,2539329,1,prior,1,2,8,,0.0,5,0,-1.0,0.000000,-1.0,0.090909,0.090909
1,2398795,1,prior,2,3,7,15.0,0.0,6,15,0.0,0.500000,0.0,0.181818,0.181818
2,473747,1,prior,3,3,12,21.0,0.0,5,36,0.0,0.600000,0.0,0.272727,0.272727
3,2254736,1,prior,4,4,7,29.0,0.0,5,65,0.0,1.000000,0.0,0.363636,0.363636
4,431534,1,prior,5,4,15,28.0,0.0,8,93,0.0,0.625000,0.0,0.454545,0.454545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1916106,12,prior,3,5,8,14.0,0.0,12,44,0.0,0.250000,0.0,0.600000,0.600000
96,1057378,12,prior,4,3,9,26.0,0.0,20,70,0.0,0.250000,0.0,0.800000,0.800000
97,221248,12,prior,5,1,9,30.0,0.0,22,100,0.0,0.227273,0.0,1.000000,1.000000
98,2618231,13,prior,1,6,12,,0.0,5,0,-1.0,0.000000,-1.0,0.076923,0.076923


In [17]:
print(f"Number of orders: {len(data_manager.get_orders_tip())}")
print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

Number of orders: 3346083
Number of orders did not change: True

Number of NaN or null values in each column:
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
tip                       131209
order_size                     0
customer_lifetime              0
tip_history                    0
reordered_ratio                0
prev_order_tipped              0
dynamic_feature_test_1         0
dynamic_feature_test_2         0
dtype: int64


In [18]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(5)
order_ids = data_manager.get_orders_tip(complete=True)['order_id']
order_amount = len(order_ids)

In [19]:
data_manager.set_subset(order_ids)

In [20]:
data_manager.get_orders_tip().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,order_size,customer_lifetime,tip_history,reordered_ratio,prev_order_tipped,dynamic_feature_test_1,dynamic_feature_test_2
0,2539329,1,prior,1,2,8,,0.0,5,0,-1.0,0.0,-1.0,0.090909,0.090909
1,2398795,1,prior,2,3,7,15.0,0.0,6,15,0.0,0.5,0.0,0.181818,0.181818
2,473747,1,prior,3,3,12,21.0,0.0,5,36,0.0,0.6,0.0,0.272727,0.272727
3,2254736,1,prior,4,4,7,29.0,0.0,5,65,0.0,1.0,0.0,0.363636,0.363636
4,431534,1,prior,5,4,15,28.0,0.0,8,93,0.0,0.625,0.0,0.454545,0.454545
5,3367565,1,prior,6,2,7,19.0,0.0,4,112,0.0,1.0,0.0,0.545455,0.545455
6,550135,1,prior,7,1,9,20.0,0.0,5,132,0.0,1.0,0.0,0.636364,0.636364
7,3108588,1,prior,8,1,14,14.0,0.0,6,146,0.0,0.666667,0.0,0.727273,0.727273
8,2295261,1,prior,9,1,16,0.0,0.0,6,146,0.0,1.0,0.0,0.818182,0.818182
9,2550362,1,prior,10,4,8,30.0,0.0,9,176,0.0,0.666667,0.0,0.909091,0.909091


In [21]:
print(f"Number of orders: {len(data_manager.get_orders_tip())}")
print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

Number of orders: 3346083
Number of orders did not change: True

Number of NaN or null values in each column:
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
tip                       131209
order_size                     0
customer_lifetime              0
tip_history                    0
reordered_ratio                0
prev_order_tipped              0
dynamic_feature_test_1         0
dynamic_feature_test_2         0
dtype: int64


In [22]:
order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(4)
order_amount = len(order_ids)

data_manager.set_subset(order_ids)
data_manager.get_orders_tip().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,order_size,customer_lifetime,tip_history,reordered_ratio,prev_order_tipped,dynamic_feature_test_1,dynamic_feature_test_2
0,2539329,1,prior,1,2,8,,0.0,5,0,-1.0,0.0,-1.0,0.25,0.25
1,2398795,1,prior,2,3,7,15.0,0.0,6,15,0.0,0.5,0.0,0.5,0.5
2,473747,1,prior,3,3,12,21.0,0.0,5,36,0.0,0.6,0.0,0.75,0.75
3,2254736,1,prior,4,4,7,29.0,0.0,5,65,0.0,1.0,0.0,1.0,1.0
4,2168274,2,prior,1,2,11,,0.0,13,0,-1.0,0.0,-1.0,0.25,0.25
5,1501582,2,prior,2,5,10,10.0,0.0,6,10,0.0,0.166667,0.0,0.5,0.5
6,1901567,2,prior,3,1,10,3.0,1.0,5,13,0.0,0.6,0.0,0.75,0.75
7,738281,2,prior,4,2,10,8.0,0.0,13,21,0.333333,0.076923,1.0,1.0,1.0
8,1374495,3,prior,1,1,14,,1.0,10,0,-1.0,0.0,-1.0,0.25,0.25
9,444309,3,prior,2,3,19,9.0,1.0,9,9,1.0,0.333333,1.0,0.5,0.5


In [23]:
print(f"Number of orders: {len(data_manager.get_orders_tip())}")
print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

Number of orders: 816150
Number of orders did not change: True

Number of NaN or null values in each column:
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
tip                        15300
order_size                     0
customer_lifetime              0
tip_history                    0
reordered_ratio                0
prev_order_tipped              0
dynamic_feature_test_1         0
dynamic_feature_test_2         0
dtype: int64


### Analysis

In [24]:
from analysis import ExampleAnalysis

analysis = ExampleAnalysis(data_manager)
analysis.analyze()
analysis.plot()

AttributeError: 'ExampleAnalysis' object has no attribute 'plot'

### Model Training & Evaluation