# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Libraries

In [1]:
import os

import pandas as pd

from feature_engineering import DataStore
from feature_engineering.features import TipHistory, ReorderedRatio, DynamicFeatureTest1, DynamicFeatureTest2

### Data Preparation

In [2]:
# Folder holding the Instacart CSV files, resolved against the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _load_csv(file_name):
    """Read the CSV called *file_name* from DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Order/product tables (prior and train files).
op_prior = _load_csv('order_products__prior.csv.zip')
op_train = _load_csv('order_products__train.csv.zip')

# Tip tables: only the 'order_id' and 'tip' columns are kept from the training file.
tip_train_raw = _load_csv('tip_trainingsdaten1_.csv')
tip_train = tip_train_raw[['order_id', 'tip']]
tip_test = _load_csv('tip_testdaten1_template.csv')

# Remaining lookup tables.
orders = _load_csv('orders.csv.zip')
aisles = _load_csv('aisles.csv.zip')
departments = _load_csv('departments.csv.zip')
products = _load_csv('products.csv.zip')

# Hand every table to the shared DataStore.
data_store = DataStore(
    op_prior,
    op_train,
    tip_train,
    tip_test,
    orders,
    products,
    aisles,
    departments,
)

# Row count right after loading; compared again in the validation cell.
order_amount = len(data_store.get_orders_tip())

### Feature Engineering

In [3]:
# Column names gathered under `features` (presumably the model inputs -- confirm where the list is consumed).
features = [
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'tip_history',
    'reordered_ratio',
]

# Static Features
tip_history = TipHistory(data_store)
reordered_rate = ReorderedRatio(data_store)

# Both objects are constructed first; compute_feature() then runs on tip_history, followed by reordered_rate.
for static_feature in (tip_history, reordered_rate):
    static_feature.compute_feature()

In [4]:
# Display up to the first 25 rows of the table returned by get_orders_tip().
data_store.get_orders_tip().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625
5,3367565,1,prior,6,2,7,19,0.0,0.0,1.0
6,550135,1,prior,7,1,9,20,0.0,0.0,1.0
7,3108588,1,prior,8,1,14,14,0.0,0.0,0.666667
8,2295261,1,prior,9,1,16,0,0.0,0.0,1.0
9,2550362,1,prior,10,4,8,30,0.0,0.0,0.666667


In [5]:
# Take at most the first five order ids per user (in table order) and register them as the data subset.
orders_by_user = data_store.get_orders_tip().groupby('user_id')
order_ids = orders_by_user['order_id'].head(5)
data_store.set_data_subset(order_ids)

# Dynamic Features

In [6]:
# Display up to the first 25 rows of the subset table returned by get_orders_tip_subset().
data_store.get_orders_tip_subset().head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625
11,2168274,2,prior,1,2,11,-1,0.0,-1.0,0.0
12,1501582,2,prior,2,5,10,10,0.0,0.0,0.166667
13,1901567,2,prior,3,1,10,3,1.0,0.0,0.6
14,738281,2,prior,4,2,10,8,0.0,0.333333,0.076923
15,1673511,2,prior,5,3,11,8,1.0,0.25,0.076923


### Temporary Validation

In [11]:
# Sanity checks after feature computation.
# 1) The full table still has as many rows as were counted right after loading (order_amount).
print(f"Number of orders did not change: {order_amount == len(data_store.get_orders_tip())}")
# 2) The subset table has exactly one row per selected order id.
print(f"Number of subset orders did not change: {len(order_ids) == len(data_store.get_orders_tip_subset())}\n")
# 3) Per-column count of missing values in the full table.
print(f"Number of NaN or null values in each column:\n{data_store.get_orders_tip().isnull().sum()}")

Number of orders did not change: True
Number of subset orders did not change: True

Number of NaN or null values in each column:
order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order         0
tip                       131209
tip_history                    0
reordered_ratio                0
dtype: int64


### Temporary Tests

In [8]:
# Build both test features against the shared data store (test 1 first, then test 2).
dynamic_feature_test_1, dynamic_feature_test_2 = (
    DynamicFeatureTest1(data_store),
    DynamicFeatureTest2(data_store),
)

# Compute them in the same order they were created.
for dynamic_feature in (dynamic_feature_test_1, dynamic_feature_test_2):
    dynamic_feature.compute_feature()

# Display up to the first 25 rows of the subset table.
data_store.get_orders_tip_subset().head(n=25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,dynamic_feature_test_1,dynamic_feature_test_2
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,1,1
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,0,2
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6,1,0
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0,0,1
4,431534,1,prior,5,4,15,28,0.0,0.0,0.625,1,2
5,2168274,2,prior,1,2,11,-1,0.0,-1.0,0.0,1,1
6,1501582,2,prior,2,5,10,10,0.0,0.0,0.166667,0,2
7,1901567,2,prior,3,1,10,3,1.0,0.0,0.6,1,0
8,738281,2,prior,4,2,10,8,0.0,0.333333,0.076923,0,1
9,1673511,2,prior,5,3,11,8,1.0,0.25,0.076923,1,2


In [9]:
# Replace the subset with at most the first four order ids per user (in table order).
orders_by_user = data_store.get_orders_tip().groupby('user_id')
order_ids = orders_by_user['order_id'].head(4)
data_store.set_data_subset(order_ids)

# Only test feature 1 is recomputed for the new subset; then up to 25 rows are displayed.
dynamic_feature_test_1.compute_feature()
data_store.get_orders_tip_subset().head(n=25)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,reordered_ratio,dynamic_feature_test_1
0,2539329,1,prior,1,2,8,-1,0.0,-1.0,0.0,1
1,2398795,1,prior,2,3,7,15,0.0,0.0,0.5,0
2,473747,1,prior,3,3,12,21,0.0,0.0,0.6,1
3,2254736,1,prior,4,4,7,29,0.0,0.0,1.0,0
11,2168274,2,prior,1,2,11,-1,0.0,-1.0,0.0,1
12,1501582,2,prior,2,5,10,10,0.0,0.0,0.166667,0
13,1901567,2,prior,3,1,10,3,1.0,0.0,0.6,1
14,738281,2,prior,4,2,10,8,0.0,0.333333,0.076923,0
26,1374495,3,prior,1,1,14,-1,1.0,-1.0,0.0,1
27,444309,3,prior,2,3,19,9,1.0,1.0,0.333333,0


### Analysis

### Model Training & Evaluation