# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Data Initialization

In [1]:
import os

import pandas as pd

from data_management import DataManager

# Folder holding the raw Instacart files, resolved against the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _read_instacart_csv(file_name):
    """Read a single file located in DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Order/product line items for the "prior" and "train" evaluation sets.
op_prior = _read_instacart_csv('order_products__prior.csv.zip')
op_train = _read_instacart_csv('order_products__train.csv.zip')

# Tip labels: only order_id and tip are kept from the training file; the test template is loaded as-is.
tip_train = _read_instacart_csv('tip_trainingsdaten1_.csv')[['order_id', 'tip']]
tip_test = _read_instacart_csv('tip_testdaten1_template.csv')

# Order headers and the product catalogue lookups.
orders = _read_instacart_csv('orders.csv.zip')
aisles = _read_instacart_csv('aisles.csv.zip')
departments = _read_instacart_csv('departments.csv.zip')
products = _read_instacart_csv('products.csv.zip')

data_manager = DataManager(op_prior, op_train, tip_train, tip_test, orders, products, aisles, departments)

# Row count of the initial orders/tip table; the commented-out sanity checks further down compare against it.
order_amount = len(data_manager.get_orders_tip())

### Feature Engineering

In [2]:
from feature_engineering.static_features import TipHistory, ReorderedRatio, OrderSize, PrevTippedProductsRatio, \
    CustomerLifetime, PrevOrderTipped, OrderFrequency, SimOrdersTipRatio, AvgSizePrevOrders, MeanOrderedRate, \
    LastTipSequence, RelDaysSinceTip, DaysSinceTip

from feature_engineering.dynamic_features import ProductTipRate, DepartmentTipRate, AisleTipRate, DynamicFeatureTest1, \
    DynamicFeatureTest2

# Instantiate one object per imported feature class. Nothing is registered with
# the DataManager in this cell; registration is done by separate
# data_manager.register_feature(...) calls.
# The name headers below match the team members listed at the top of the notebook
# (presumably the author of each feature group -- confirm with the team).

# Kilian
tip_history = TipHistory()
reordered_rate = ReorderedRatio()
order_size = OrderSize()
prev_tipped_products_ratio = PrevTippedProductsRatio()
customer_lifetime = CustomerLifetime()
prev_order_tipped = PrevOrderTipped()

# Daniel
order_frequency = OrderFrequency()
mean_ordered_rate = MeanOrderedRate()
rel_days_since_tip = RelDaysSinceTip()
days_since_tip = DaysSinceTip()

# Max
sim_orders_tip_ratio = SimOrdersTipRatio()
product_tip_rate = ProductTipRate()
department_tip_rate = DepartmentTipRate()
aisle_tip_rate = AisleTipRate()
last_tip_sequence = LastTipSequence()  # TODO: Maybe remove weighting or add an additional feature without weighting

# Tom
avg_size_prev_orders = AvgSizePrevOrders()

# Test (DynamicFeatureTest1 / DynamicFeatureTest2 from the dynamic_features module)
dynamic_feature_test_1 = DynamicFeatureTest1()
dynamic_feature_test_2 = DynamicFeatureTest2()

In [3]:
# Static Features
# Only tip_history and order_size are currently registered; every other
# registration below is commented out and therefore inactive.
data_manager.register_feature(tip_history)
# data_manager.register_feature(reordered_rate)
data_manager.register_feature(order_size)
# data_manager.register_feature(customer_lifetime)
# data_manager.register_feature(prev_order_tipped)
# data_manager.register_feature(prev_tipped_products_ratio)

# data_manager.register_feature(order_frequency)
# data_manager.register_feature(sim_orders_tip_ratio)
# NOTE(review): order_excess_likability is not defined in any visible cell;
# uncommenting the next line as-is would raise a NameError.
# data_manager.register_feature(order_excess_likability)
# data_manager.register_feature(avg_size_prev_orders)
# data_manager.register_feature(mean_ordered_rate)
# data_manager.register_feature(last_tip_sequence)
# data_manager.register_feature(rel_days_since_tip)
# data_manager.register_feature(days_since_tip)

# # Dynamic Features
# data_manager.register_feature(dynamic_feature_test_1)
# data_manager.register_feature(dynamic_feature_test_2)
# data_manager.register_feature(product_tip_rate)
# data_manager.register_feature(department_tip_rate)
# data_manager.register_feature(aisle_tip_rate)

In [4]:
# Compute the registered features; the preview in the following cell shows the
# resulting order_size and tip_history columns.
data_manager.compute_features()

In [5]:
# Preview the first 100 rows of the orders/tip table, including the computed feature columns.
data_manager.get_orders_tip().head(100)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,order_size,tip_history
0,2539329,1,prior,1,2,8,,0.0,5,
1,2398795,1,prior,2,3,7,15.0,0.0,6,0.0
2,473747,1,prior,3,3,12,21.0,0.0,5,0.0
3,2254736,1,prior,4,4,7,29.0,0.0,5,0.0
4,431534,1,prior,5,4,15,28.0,0.0,8,0.0
...,...,...,...,...,...,...,...,...,...,...
95,1916106,12,prior,3,5,8,14.0,0.0,12,0.0
96,1057378,12,prior,4,3,9,26.0,0.0,20,0.0
97,221248,12,prior,5,1,9,30.0,0.0,22,0.0
98,2618231,13,prior,1,6,12,,0.0,5,


In [6]:
# data_manager.export_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [7]:
# data_manager.import_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [8]:
# data_manager.get_orders_tip().head(100)

In [9]:
# test = data_manager.get_orders_tip()

In [10]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [11]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(5)
# order_amount = len(order_ids)

In [12]:
# data_manager.set_subset(order_ids)

In [13]:
# data_manager.get_orders_tip().head(25)

In [14]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [15]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(4)
# order_amount = len(order_ids)
# 
# data_manager.set_subset(order_ids)
# data_manager.get_orders_tip().head(25)

In [16]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

### Analysis
- Only orders from the training set (orders with tip information) are considered

In [17]:
from analysis import DayOfWeek, HourOfDay, DaysSincePriorOrder, OrderNumber, Department, Aisle, Product

In [18]:
# Build one analysis object per examined dimension, each bound to the shared data_manager.
# The corresponding execute_analysis() calls in the following cells are currently commented out.
day_of_week = DayOfWeek(data_manager)
hour_of_day = HourOfDay(data_manager)
days_since_prior_order = DaysSincePriorOrder(data_manager)
order_number = OrderNumber(data_manager)
department = Department(data_manager)
aisle = Aisle(data_manager)
product = Product(data_manager)
# NOTE(review): NumberOrderUser is not among the names imported from `analysis` above.
# number_order_user = NumberOrderUser(data_manager)

In [19]:
# day_of_week.execute_analysis()

In [20]:
# hour_of_day.execute_analysis()

In [21]:
# days_since_prior_order.execute_analysis()

In [22]:
# order_number.execute_analysis()

In [23]:
# department.execute_analysis()

In [24]:
# aisle.execute_analysis()

In [25]:
# product.execute_analysis()

In [26]:
# number_order_user.execute_analysis()

### Data Preparation

In [27]:
from feature_engineering.static_features import DowHighTipProbability, HodHighTipProbability, OrderNumberSquared, \
    ContainsAlcohol

# Register the four data-preparation features. Each class is instantiated and
# registered in turn, in the same order as before.
for feature_cls in (DowHighTipProbability, HodHighTipProbability, OrderNumberSquared, ContainsAlcohol):
    data_manager.register_feature(feature_cls())

In [28]:
# Compute features again so that the four features registered in the previous cell
# become available as columns (they appear in the feature matrix preview further below).
data_manager.compute_features()

### Model Training & Evaluation

In [29]:
from data_management import LastOrderUserTSCVSplitter

In [30]:
# Fetch the current orders/tip table and display its (rows, columns) shape.
orders_tip = data_manager.get_orders_tip()
orders_tip.shape

(3346083, 14)

In [31]:
# Drop each user's first order (order_number == 1): those rows have no history,
# e.g. NaN tip_history / days_since_prior_order in the preview earlier in the notebook.
has_prior_order = orders_tip['order_number'] > 1
orders_ids = orders_tip.loc[has_prior_order, 'order_id']

# Restrict the data manager to the remaining order ids.
data_manager.set_subset(orders_ids)

In [32]:
# Training portion of the subset selected above, with its shape shown as a sanity check.
# Presumably these are only the orders with tip information -- confirm in DataManager.
orders_tip_train = data_manager.get_orders_tip_train()
orders_tip_train.shape

(3008665, 14)

In [33]:
# cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [34]:
# Project-specific cross-validation splitter configured for 5 splits. The fold sizes
# printed further below show each successive fold training on fewer rows.
last_order_user_tscv_splitter = LastOrderUserTSCVSplitter(data_manager, n_splits=5)

In [35]:
# Obtain the (train_index, test_index) pairs for the training table.
# NOTE(review): the "Iteration N" lines appear interleaved with the per-fold prints of the
# next cell, which suggests split() is a lazy generator that can be consumed only once -- confirm.
splits = last_order_user_tscv_splitter.split(orders_tip_train)

In [36]:
# Materialise every CV fold as a pair of DataFrames and print a few sanity checks per fold.
folds = {}
prev_test = 0  # rows already used as test data by earlier folds
total_orders = len(orders_tip_train)

for i, (train_index, test_index) in enumerate(splits):
    n_train = len(train_index)
    n_test = len(test_index)

    print(f'Fold {i + 1}')
    print(f'Train: {n_train}')
    print(f'Test: {n_test}')
    print(f'Orders: {total_orders}')
    # Train rows + test rows + test rows of all previous folds should cover the whole table.
    print(f'Sum equal: {(n_train + n_test + prev_test) == total_orders}')
    print(f'Ratio: {n_test / (n_train + n_test)}')

    # The splitter yields positional indices, hence .iloc.
    train_orders = orders_tip_train.iloc[train_index]
    test_orders = orders_tip_train.iloc[test_index]

    # Dictionary keys are zero-based (fold_0, fold_1, ...) while the printed label is one-based.
    folds[f'fold_{i}'] = {'train': train_orders, 'test': test_orders}
    prev_test += n_test

Iteration 1: Train size: 2802456, Test size: 206209
Fold 1
Train: 2802456
Test: 206209
Orders: 3008665
Sum equal: True
Ratio: 0.06853837166982699
Iteration 2: Train size: 2596247, Test size: 206209
Fold 2
Train: 2596247
Test: 206209
Orders: 3008665
Sum equal: True
Ratio: 0.07358152991518868
Iteration 3: Train size: 2414024, Test size: 182223
Fold 3
Train: 2414024
Test: 182223
Orders: 3008665
Sum equal: True
Ratio: 0.07018708158353192
Iteration 4: Train size: 2251391, Test size: 162633
Fold 4
Train: 2251391
Test: 162633
Orders: 3008665
Sum equal: True
Ratio: 0.06737008414166554
Iteration 5: Train size: 2104923, Test size: 146468
Fold 5
Train: 2104923
Test: 146468
Orders: 3008665
Sum equal: True
Ratio: 0.06505666941015577


In [37]:
# user_tscv_splitter = UserTSCVSplitter(data_manager, n_splits=5, validation_set_ratio=0.2)

In [38]:
# splits = user_tscv_splitter.split(orders_tip_train)

In [39]:
# folds = {}
# prev_test = 0
# for i, (train_index, test_index) in enumerate(splits):
#     orders_tip_current = data_manager.get_orders_tip()
#     print(f'Fold {i + 1}')
#     print(f'Train: {len(train_index)}')
#     print(f'Test: {len(test_index)}')
#     print(f'Orders: {len(orders_tip_current)}')
#     print(f'Sum equal: {(len(train_index) + len(test_index) + prev_test) == len(orders_tip_current)}')
#     print(f'Ratio: {len(test_index) / (len(train_index) + len(test_index))}')
# 
#     train_orders = orders_tip_current.loc[train_index]
#     test_orders = orders_tip_current.loc[test_index]
#     current_fold = {
#         'train': train_orders,
#         'test': test_orders
#     }
#     folds[f'fold_{i}'] = current_fold
#     prev_test += len(test_index)

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree. Every list currently holds a single value,
# so the "search" evaluates exactly one configuration across the CV folds.
param_grid = {
    'criterion': ['entropy'],
    'max_depth': [9],
    'min_samples_leaf': [256],
}

# random_state is fixed because scikit-learn randomly permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken differently
# from run to run, so the reported CV scores would not be exactly reproducible.
grid_search_clf = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    n_jobs=-1,  # use all available cores
    cv=last_order_user_tscv_splitter,  # custom splitter instead of a plain K-fold
    scoring='accuracy',
)

In [49]:
# Raw order columns that are used as predictors alongside the engineered features.
base_columns = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
features = data_manager.get_registered_features() + base_columns

# Feature matrix and boolean target (tip given or not) taken from the training table.
X = orders_tip_train.loc[:, features]
y = orders_tip_train['tip'].astype(bool)

In [50]:
# Inspect the first rows of the feature matrix.
X.head(25)

Unnamed: 0,contains_alcohol,hod_high_tip_probability,order_size,order_number_squared,dow_high_tip_probability,tip_history,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,0,0,6,4,0,0.0,2,3,7,15.0
1,0,0,5,9,0,0.0,3,3,12,21.0
2,0,0,5,16,0,0.0,4,4,7,29.0
3,0,0,8,25,0,0.0,5,4,15,28.0
4,0,0,4,36,0,0.0,6,2,7,19.0
5,0,0,5,49,1,0.0,7,1,9,20.0
6,0,0,6,64,1,0.0,8,1,14,14.0
7,0,0,6,81,1,0.0,9,1,16,0.0
8,0,0,9,100,0,0.0,10,4,8,30.0
9,0,0,6,4,0,0.0,2,5,10,10.0


In [51]:
# Fit the grid search; the custom splitter prints one "Iteration" line per fold while it runs.
grid_search_clf.fit(X, y)

Iteration 1: Train size: 2802456, Test size: 206209
Iteration 2: Train size: 2596247, Test size: 206209
Iteration 3: Train size: 2414024, Test size: 182223
Iteration 4: Train size: 2251391, Test size: 162633
Iteration 5: Train size: 2104923, Test size: 146468


In [57]:
# Full cross-validation report: fit/score timings and the per-split test accuracy
# for the single parameter combination in param_grid.
grid_search_clf.cv_results_

{'mean_fit_time': array([9.62144513]),
 'std_fit_time': array([0.91506883]),
 'mean_score_time': array([0.03104668]),
 'std_score_time': array([0.00257137]),
 'param_criterion': masked_array(data=['entropy'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[9],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[256],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 256}],
 'split0_test_score': array([0.80144902]),
 'split1_test_score': array([0.79661896]),
 'split2_test_score': array([0.78959297]),
 'split3_test_score': array([0.78268863]),
 'split4_test_score': array([0.77742579]),
 'mean_test_score': array([0.78955507]),
 'std_test_score': array([0.00878218]),
 'rank_test_score': array([1])}

In [44]:
# import numpy as np
# from sklearn.datasets import load_iris
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.base import BaseEstimator, TransformerMixin
# 
# 
# # Example custom transformer that could modify data based on some condition per fold
# class CustomTransformer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         # Implement fitting logic if necessary
#         return self
# 
#     def transform(self, X):
#         # Modify X based on some condition or computation
#         return X  # Return modified X
# 
# 
# class CustomSplit:
#     def __init__(self, n_splits=3):
#         self.n_splits = n_splits
# 
#     def split(self, X, y=None, groups=None):
#         n_samples = len(X)
#         fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype=int)
#         fold_sizes[:n_samples % self.n_splits] += 1
#         current = 0
#         for fold_size in fold_sizes:
#             start, stop = current, current + fold_size
#             test_indices = np.arange(start, stop)
#             train_indices = np.setdiff1d(np.arange(n_samples), test_indices)
#             yield train_indices, test_indices
#             current = stop  # advance to the next fold; otherwise every fold reuses the first test slice
# 
#     def get_n_splits(self, X=None, y=None, groups=None):
#         return self.n_splits
# 
# 
# # Load data
# data = load_iris()
# X, y = data.data, data.target
# 
# # Create a pipeline with a custom transformer and a classifier
# pipeline = Pipeline([
#     ('custom_transform', CustomTransformer()),
#     ('scaler', StandardScaler()),  # Example of another preprocessing step
#     ('classifier', RandomForestClassifier())
# ])
# 
# # Define parameter grid (note: include pipeline step names)
# param_grid = {
#     'classifier__n_estimators': [100, 200],
#     'classifier__max_features': ['auto', 'sqrt']
# }
# 
# # Setup GridSearchCV with custom cross-validator
# cv = CustomSplit(n_splits=5)
# grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy')
# 
# # Perform grid search
# grid_search.fit(X, y)
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
