# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Libraries

In [1]:
import os

import pandas as pd

from data_management import DataManager

### Data Initialization

In [2]:
# Folder with the Instacart input files, resolved against the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _read_instacart_csv(file_name):
    """Read one CSV file located in DATA_DIR and return it as a DataFrame.

    Zipped files are passed to pandas unchanged; pandas infers the compression
    from the file extension.
    """
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Order/product relations.
op_prior = _read_instacart_csv('order_products__prior.csv.zip')
op_train = _read_instacart_csv('order_products__train.csv.zip')

# Tip labels: only the order id and the tip column are kept from the training file.
tip_train = _read_instacart_csv('tip_trainingsdaten1_.csv')[['order_id', 'tip']]
tip_test = _read_instacart_csv('tip_testdaten1_template.csv')

# Order table and product catalogue tables.
orders = _read_instacart_csv('orders.csv.zip')
aisles = _read_instacart_csv('aisles.csv.zip')
departments = _read_instacart_csv('departments.csv.zip')
products = _read_instacart_csv('products.csv.zip')

# Note the argument order: products comes before aisles and departments.
data_manager = DataManager(op_prior, op_train, tip_train, tip_test, orders, products, aisles, departments)

# Row count before any subset is applied; used by the (commented-out) sanity checks further down.
order_amount = len(data_manager.get_orders_tip())

### Feature Engineering

In [3]:
from feature_engineering.static_features import TipHistory, ReorderedRatio, OrderSize, PrevTippedProductsRatio, \
    CustomerLifetime, PrevOrderTipped, OrderFrequency, SimOrdersTipRatio, AvgSizePrevOrders, MeanOrderedRate, \
    LastTipSequence, RelDaysSinceTip, DaysSinceTip

from feature_engineering.dynamic_features import ProductTipRate, DepartmentTipRate, AisleTipRate, DynamicFeatureTest1, \
    DynamicFeatureTest2

# Instantiate one object per feature class. Nothing is registered with the
# DataManager here; registration happens in a separate cell.

# Features owned by Kilian
tip_history = TipHistory()
reordered_rate = ReorderedRatio()
order_size = OrderSize()
prev_tipped_products_ratio = PrevTippedProductsRatio()
customer_lifetime = CustomerLifetime()
prev_order_tipped = PrevOrderTipped()

# Features owned by Daniel
order_frequency = OrderFrequency()
mean_ordered_rate = MeanOrderedRate()
rel_days_since_tip = RelDaysSinceTip()
days_since_tip = DaysSinceTip()

# Features owned by Max
sim_orders_tip_ratio = SimOrdersTipRatio()
product_tip_rate = ProductTipRate()
department_tip_rate = DepartmentTipRate()
aisle_tip_rate = AisleTipRate()
last_tip_sequence = LastTipSequence()  # TODO: Maybe remove weighting or add an additional feature without weighting

# Features owned by Tom
avg_size_prev_orders = AvgSizePrevOrders()

# Test features (imported from feature_engineering.dynamic_features)
dynamic_feature_test_1 = DynamicFeatureTest1()
dynamic_feature_test_2 = DynamicFeatureTest2()

In [4]:
# Static Features
# data_manager.register_feature(tip_history)
# data_manager.register_feature(reordered_rate)
# data_manager.register_feature(order_size)
# data_manager.register_feature(customer_lifetime)
# data_manager.register_feature(prev_order_tipped)
# data_manager.register_feature(prev_tipped_products_ratio)

# data_manager.register_feature(order_frequency)
# data_manager.register_feature(sim_orders_tip_ratio)
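# data_manager.register_feature(order_excess_likability)  # NOTE(review): order_excess_likability is not instantiated in the cell above - define it before enabling this line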
# data_manager.register_feature(avg_size_prev_orders)
# data_manager.register_feature(mean_ordered_rate)
# data_manager.register_feature(last_tip_sequence)
# data_manager.register_feature(rel_days_since_tip)
# data_manager.register_feature(days_since_tip)

# # Dynamic Features
# data_manager.register_feature(dynamic_feature_test_1)
# data_manager.register_feature(dynamic_feature_test_2)
# data_manager.register_feature(product_tip_rate)
# data_manager.register_feature(department_tip_rate)
# data_manager.register_feature(aisle_tip_rate)

In [5]:
# Run the feature computation on the DataManager. Every register_feature call in the
# preceding cell is commented out, so no feature from that cell is registered at this point.
data_manager.compute_features()

In [6]:
# Preview the first 100 rows of the orders/tip table held by the DataManager.
data_manager.get_orders_tip().head(100)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip
0,2539329,1,prior,1,2,8,,0.0
1,2398795,1,prior,2,3,7,15.0,0.0
2,473747,1,prior,3,3,12,21.0,0.0
3,2254736,1,prior,4,4,7,29.0,0.0
4,431534,1,prior,5,4,15,28.0,0.0
...,...,...,...,...,...,...,...,...
95,1916106,12,prior,3,5,8,14.0,0.0
96,1057378,12,prior,4,3,9,26.0,0.0
97,221248,12,prior,5,1,9,30.0,0.0
98,2618231,13,prior,1,6,12,,0.0


In [7]:
# data_manager.export_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [8]:
# data_manager.import_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [9]:
# data_manager.get_orders_tip().head(100)

In [10]:
# test = data_manager.get_orders_tip()

In [11]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [12]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(5)
# order_amount = len(order_ids)

In [13]:
# data_manager.set_subset(order_ids)

In [14]:
# data_manager.get_orders_tip().head(25)

In [15]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [16]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(4)
# order_amount = len(order_ids)
# 
# data_manager.set_subset(order_ids)
# data_manager.get_orders_tip().head(25)

In [17]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

### Analysis
- Only orders from the training set (orders with tip information) are considered

In [18]:
from analysis import DaysSincePriorOrder, DayOfWeek, HourOfDay, Department, OrderNumber, Aisle, Product

In [19]:
# Build one analysis object per examined attribute; each one receives the shared
# DataManager. The analyses are only run when execute_analysis() is called in the
# cells below (currently commented out).
day_of_week = DayOfWeek(data_manager)
hour_of_day = HourOfDay(data_manager)
days_since_prior_order = DaysSincePriorOrder(data_manager)
order_number = OrderNumber(data_manager)
department = Department(data_manager)
aisle = Aisle(data_manager)
product = Product(data_manager)
# NumberOrderUser is not imported above, so this analysis stays disabled.
# number_order_user = NumberOrderUser(data_manager)

# Frequency of items with tip probability > 0.9, 0.8, 0.7, 0.6, 0.5 ... 0.1 
# Group into percentiles and average tip probability
# Plot bar chart with tip probability and mean tip probability (limit to top 10?)

# TODO:
# - Decide on departments grouping
# - Product/Department/Aisle Tip Rate

In [20]:
# day_of_week.execute_analysis()

In [21]:
# hour_of_day.execute_analysis()

In [22]:
# days_since_prior_order.execute_analysis()

In [23]:
# order_number.execute_analysis()

In [24]:
# department.execute_analysis()

In [25]:
# aisle.execute_analysis()

In [26]:
# product.execute_analysis()

In [27]:
# number_order_user.execute_analysis()

### Data Preparation

In [28]:
from feature_engineering.static_features import DowHighTipProbability, HodHighTipProbability, OrderNumberSquared, \
    ContainsAlcohol

# Register the data-preparation features one after another; each class is
# instantiated immediately before its registration, in the listed order.
for feature_cls in (DowHighTipProbability, HodHighTipProbability, OrderNumberSquared, ContainsAlcohol):
    data_manager.register_feature(feature_cls())

In [29]:
# Run the feature computation again, now that the four data-preparation features
# from the previous cell are registered.
data_manager.compute_features()

### Model Training & Evaluation

In [30]:
from data_management import LastOrderUserTSCVSplitter, UserTSCVSplitter

In [31]:
# Fetch the current orders/tip table and show its (rows, columns) shape.
orders_tip = data_manager.get_orders_tip()
orders_tip.shape

(3346083, 12)

In [32]:
# Keep the ids of all orders whose order_number is greater than 1,
# i.e. drop every order with order_number == 1.
not_first_order = orders_tip['order_number'] > 1
orders_ids = orders_tip.loc[not_first_order, 'order_id']

In [33]:
# Restrict the DataManager to the selected order ids; the shape printed in the
# next cell shows the row count dropping from 3346083 to 3139874.
data_manager.set_subset(orders_ids)

In [34]:
# Confirm the size of the orders/tip table after the subset was applied.
data_manager.get_orders_tip().shape

(3139874, 12)

In [35]:
# cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [36]:
# Cross-validation splitter bound to the DataManager, configured for 5 splits.
# NOTE(review): the exact split strategy is defined in data_management - confirm there.
last_order_user_tscv_splitter = LastOrderUserTSCVSplitter(data_manager, n_splits=5)

In [37]:
# Obtain the (train_index, test_index) pairs for the current orders/tip table.
# NOTE(review): the interleaved "Iteration ..." lines in the next cell's output suggest
# the splits are produced lazily while iterating - confirm in data_management.
splits = last_order_user_tscv_splitter.split(data_manager.get_orders_tip())

In [38]:
# Turn every split into a pair of train/test DataFrames and print sanity checks per fold.
folds = {}
prev_test = 0  # number of test rows of all folds handled so far
for i, (train_index, test_index) in enumerate(splits):
    orders_tip_current = data_manager.get_orders_tip()
    n_train, n_test = len(train_index), len(test_index)
    n_orders = len(orders_tip_current)

    print(f'Fold {i + 1}')
    print(f'Train: {n_train}')
    print(f'Test: {n_test}')
    print(f'Orders: {n_orders}')
    # Train rows + test rows + the test rows of earlier folds should cover every order.
    print(f'Sum equal: {(n_train + n_test + prev_test) == n_orders}')
    print(f'Ratio: {n_test / (n_train + n_test)}')

    # Dictionary keys are zero-based (fold_0, fold_1, ...) although the printed fold number is one-based.
    folds[f'fold_{i}'] = {
        'train': orders_tip_current.loc[train_index],
        'test': orders_tip_current.loc[test_index],
    }
    prev_test += n_test

Iteration 1: Train size: 2933665, Test size: 206209
Fold 1
Train: 2933665
Test: 206209
Orders: 3139874
Sum equal: True
Ratio: 0.06567429138876274
Iteration 2: Train size: 2727456, Test size: 206209
Fold 2
Train: 2727456
Test: 206209
Orders: 3139874
Sum equal: True
Ratio: 0.07029057509974725
Iteration 3: Train size: 2529933, Test size: 197523
Fold 3
Train: 2529933
Test: 197523
Orders: 3139874
Sum equal: True
Ratio: 0.07242023336031819
Iteration 4: Train size: 2354861, Test size: 175072
Fold 4
Train: 2354861
Test: 175072
Orders: 3139874
Sum equal: True
Ratio: 0.06920025154816353
Iteration 5: Train size: 2198056, Test size: 156805
Fold 5
Train: 2198056
Test: 156805
Orders: 3139874
Sum equal: True
Ratio: 0.0665877943538918


In [39]:
# Second splitter variant: 5 splits with validation_set_ratio=0.2. The fold output
# below shows a test share of roughly 0.2 in every fold.
user_tscv_splitter = UserTSCVSplitter(data_manager, n_splits=5, validation_set_ratio=0.2)

In [40]:
# Obtain the (train_index, test_index) pairs of the user-based splitter for the current table.
splits = user_tscv_splitter.split(data_manager.get_orders_tip())

In [41]:
# Collect the train/test DataFrames of each user-based split and report per-fold checks.
folds = {}
prev_test = 0  # running total of test rows from the folds already processed
for i, (train_index, test_index) in enumerate(splits):
    orders_tip_current = data_manager.get_orders_tip()
    n_train = len(train_index)
    n_test = len(test_index)
    n_orders = len(orders_tip_current)

    print(f'Fold {i + 1}')
    print(f'Train: {n_train}')
    print(f'Test: {n_test}')
    print(f'Orders: {n_orders}')
    # The current split plus all earlier test sets is expected to add up to the full table.
    print(f'Sum equal: {(n_train + n_test + prev_test) == n_orders}')
    print(f'Ratio: {n_test / (n_train + n_test)}')

    # Keys start at fold_0 while the printed fold counter starts at 1.
    folds[f'fold_{i}'] = {
        'train': orders_tip_current.loc[train_index],
        'test': orders_tip_current.loc[test_index],
    }
    prev_test += n_test

Fold 1
Train: 2512221
Test: 627653
Orders: 3139874
Sum equal: True
Ratio: 0.1998975118109835
Fold 2
Train: 2009582
Test: 502639
Orders: 3139874
Sum equal: True
Ratio: 0.20007754094882577
Fold 3
Train: 1607449
Test: 402133
Orders: 3139874
Sum equal: True
Ratio: 0.20010778360873058
Fold 4
Train: 1285714
Test: 321735
Orders: 3139874
Sum equal: True
Ratio: 0.20015253983174583
Fold 5
Train: 1028401
Test: 257313
Orders: 3139874
Sum equal: True
Ratio: 0.20013237780719506


In [42]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.model_selection import GridSearchCV
# 
# param_grid = {
#     'max_depth': [3, 5],
#     'min_samples_leaf': [1, 2]
# }
# 
# grid_search_clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=last_order_user_tscv_splitter, n_jobs=-1,
#                                scoring='accuracy')

In [44]:
# features = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'tip_history']
# X = orders_tip[features]
# y = orders_tip['tip']

In [45]:
# grid_search_clf.fit(X, y)

In [46]:
# import numpy as np
# from sklearn.datasets import load_iris
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.base import BaseEstimator, TransformerMixin
# 
# 
# # Example custom transformer that could modify data based on some condition per fold
# class CustomTransformer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         # Implement fitting logic if necessary
#         return self
# 
#     def transform(self, X):
#         # Modify X based on some condition or computation
#         return X  # Return modified X
# 
# 
# class CustomSplit:
#     def __init__(self, n_splits=3):
#         self.n_splits = n_splits
# 
#     def split(self, X, y=None, groups=None):
#         n_samples = len(X)
#         fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype=int)
#         fold_sizes[:n_samples % self.n_splits] += 1
#         current = 0
#         for fold_size in fold_sizes:
#             start, stop = current, current + fold_size
#             test_indices = np.arange(start, stop)
#             train_indices = np.setdiff1d(np.arange(n_samples), test_indices)
#             yield train_indices, test_indices
#             current = stop  # advance to the next fold; otherwise every fold starts at index 0
# 
#     def get_n_splits(self, X=None, y=None, groups=None):
#         return self.n_splits
# 
# 
# # Load data
# data = load_iris()
# X, y = data.data, data.target
# 
# # Create a pipeline with a custom transformer and a classifier
# pipeline = Pipeline([
#     ('custom_transform', CustomTransformer()),
#     ('scaler', StandardScaler()),  # Example of another preprocessing step
#     ('classifier', RandomForestClassifier())
# ])
# 
# # Define parameter grid (note: include pipeline step names)
# param_grid = {
#     'classifier__n_estimators': [100, 200],
#     'classifier__max_features': ['auto', 'sqrt']
# }
# 
# # Setup GridSearchCV with custom cross-validator
# cv = CustomSplit(n_splits=5)
# grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy')
# 
# # Perform grid search
# grid_search.fit(X, y)
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
