# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Data Initialization

In [1]:
import os

import pandas as pd

from data_management import DataManager

DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _read_instacart_csv(file_name):
    """Read one CSV file (plain or zipped) from DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# The two order_products files (prior / train part)
op_prior = _read_instacart_csv('order_products__prior.csv.zip')
op_train = _read_instacart_csv('order_products__train.csv.zip')

# Tip data: only order_id and tip are kept from the training file,
# the test template is loaded unchanged.
tip_train = _read_instacart_csv('tip_trainingsdaten1_.csv')[['order_id', 'tip']]
tip_test = _read_instacart_csv('tip_testdaten1_template.csv')

# Remaining lookup tables
orders = _read_instacart_csv('orders.csv.zip')
aisles = _read_instacart_csv('aisles.csv.zip')
departments = _read_instacart_csv('departments.csv.zip')
products = _read_instacart_csv('products.csv.zip')

# Central project object that receives all raw tables
data_manager = DataManager(op_prior, op_train, tip_train, tip_test, orders, products, aisles, departments)

# Number of rows returned by get_orders_tip() right after initialisation;
# only the (currently commented-out) sanity-check cells compare against it.
order_amount = len(data_manager.get_orders_tip())

### Feature Engineering

In [2]:
from feature_engineering.static_features import TipHistory, ReorderedRatio, OrderSize, PrevTippedProductsRatio, \
    CustomerLifetime, PrevOrderTipped, OrderFrequency, SimOrdersTipRatio, AvgSizePrevOrders, MeanOrderedRate, \
    LastTipSequence, RelDaysSinceTip, DaysSinceTip, OrderNumberSquared, HodHighTipProbability, DowHighTipProbability, \
    ContainsAlcohol

from feature_engineering.dynamic_features import ProductTipRate, DepartmentTipRate, AisleTipRate, DynamicFeatureTest1, \
    DynamicFeatureTest2, AssocRulesAisles, AssocRulesDepartments

# Create one instance of every feature class imported above. The instances
# are handed to data_manager.register_feature(...) in a separate cell.
# The name comments below match the team members listed at the top of the
# notebook (presumably the author of each feature group).

# Kilian
tip_history = TipHistory()
# NOTE: the variable is called *_rate while the class is ReorderedRatio
reordered_rate = ReorderedRatio()
order_size = OrderSize()
prev_tipped_products_ratio = PrevTippedProductsRatio()
customer_lifetime = CustomerLifetime()
prev_order_tipped = PrevOrderTipped()

# Daniel
order_frequency = OrderFrequency()
mean_ordered_rate = MeanOrderedRate()
rel_days_since_tip = RelDaysSinceTip()
days_since_tip = DaysSinceTip()

# Max
sim_orders_tip_ratio = SimOrdersTipRatio()
product_tip_rate = ProductTipRate()
department_tip_rate = DepartmentTipRate()
aisle_tip_rate = AisleTipRate()
last_tip_sequence = LastTipSequence()

# Tom
avg_size_prev_orders = AvgSizePrevOrders()

# Features analysis
order_number_squared = OrderNumberSquared()
hod_high_tip_probability = HodHighTipProbability()
dow_high_tip_probability = DowHighTipProbability()
contains_alcohol = ContainsAlcohol()

# Association-rule features: instantiated here, but their registration is
# commented out in the registration cell.
assoc_rules_departments = AssocRulesDepartments()
assoc_rules_aisles = AssocRulesAisles()

# Test
dynamic_feature_test_1 = DynamicFeatureTest1()
dynamic_feature_test_2 = DynamicFeatureTest2()

In [3]:
# Static features, listed in registration order
static_features = (
    tip_history,
    reordered_rate,
    order_size,
    customer_lifetime,
    prev_order_tipped,
    prev_tipped_products_ratio,
    order_frequency,
    sim_orders_tip_ratio,
    avg_size_prev_orders,
    mean_ordered_rate,
    last_tip_sequence,
    rel_days_since_tip,
    days_since_tip,
    order_number_squared,
    hod_high_tip_probability,
    dow_high_tip_probability,
    contains_alcohol,
)

# Dynamic features, listed in registration order
dynamic_features = (
    dynamic_feature_test_1,
    dynamic_feature_test_2,
    product_tip_rate,
    department_tip_rate,
    aisle_tip_rate,
    # assoc_rules_departments,  (registration disabled)
    # assoc_rules_aisles,       (registration disabled)
)

# Register everything with the data manager: static first, then dynamic
for feature in static_features + dynamic_features:
    data_manager.register_feature(feature)

In [None]:
data_manager.compute_features()

In [None]:
data_manager.get_orders_tip().head(100)

In [None]:
# data_manager.export_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [4]:
data_manager.import_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [None]:
data_manager.calculate_feature_correlations(only_static=False)

In [None]:
data_manager.visualize_feature_analysis(only_static=False)

In [None]:
# data_manager.get_orders_tip().head(100)

In [None]:
# test = data_manager.get_orders_tip()

In [None]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [None]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(5)
# order_amount = len(order_ids)

In [None]:
# data_manager.set_subset(order_ids)

In [None]:
# data_manager.get_orders_tip().head(25)

In [None]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [None]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(4)
# order_amount = len(order_ids)
# 
# data_manager.set_subset(order_ids)
# data_manager.get_orders_tip().head(25)

In [None]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

### Analysis
- Only orders from the training set (orders with tip information) are considered

In [None]:
# from analysis import DayOfWeek, HourOfDay, DaysSincePriorOrder, OrderNumber, Department, Aisle, Product, GeneralAnalysis

In [None]:
# day_of_week = DayOfWeek(data_manager)
# hour_of_day = HourOfDay(data_manager)
# days_since_prior_order = DaysSincePriorOrder(data_manager)
# order_number = OrderNumber(data_manager)
# department = Department(data_manager)
# aisle = Aisle(data_manager)
# product = Product(data_manager)
# number_order_user = NumberOrderUser(data_manager)
# general_analysis = GeneralAnalysis(data_manager)

In [None]:
# day_of_week.execute_analysis()

In [None]:
# hour_of_day.execute_analysis()

In [None]:
# days_since_prior_order.execute_analysis()

In [None]:
# order_number.execute_analysis()

In [None]:
# department.execute_analysis()

In [None]:
# aisle.execute_analysis()

In [None]:
# product.execute_analysis()

In [None]:
# number_order_user.execute_analysis()

### Data Preparation

In [None]:
from feature_engineering.static_features import DowHighTipProbability, HodHighTipProbability, OrderNumberSquared, \
    ContainsAlcohol

# Register fresh instances of the four analysis-derived feature classes.
# NOTE(review): instances of the same classes are already registered in the
# feature-engineering cell above; whether register_feature() tolerates
# duplicates cannot be seen from this notebook - confirm.
for feature_cls in (DowHighTipProbability, HodHighTipProbability, OrderNumberSquared, ContainsAlcohol):
    data_manager.register_feature(feature_cls())

In [None]:
data_manager.compute_features()

### Model Training & Evaluation

The order of the following steps is important:
1. Remove the first order of each user and assign the dataset for parameter tuning
2. Initialize the cross-validation splitter
3. Export the splits
4. Tune the parameters of the model with the dataset from step 1

In [5]:
from data_management.cross_validation import LastOrderUserTSCVSplitter

In [6]:
orders_tip = data_manager.get_orders_tip()
orders_tip.shape

(3346083, 30)

In [7]:
data_manager.remove_first_orders()

In [8]:
orders_tip_train = data_manager.get_orders_tip_train().copy()
orders_tip_train.shape

(3008665, 30)

In [9]:
# cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
last_order_user_tscv_splitter = LastOrderUserTSCVSplitter(data_manager, n_splits=5)

In [11]:
splits = last_order_user_tscv_splitter.split(orders_tip_train)

In [None]:
# Print size diagnostics for every split and keep the train/test rows of
# each fold in `folds` (keys 'fold_0', 'fold_1', ...).
# NOTE(review): .loc is label-based, so this assumes the splitter yields
# index labels of orders_tip_train rather than positions - confirm.
folds = {}
prev_test = 0  # running total of test rows seen in earlier folds
n_orders = len(orders_tip_train)

for fold_no, (train_idx, test_idx) in enumerate(splits, start=1):
    n_train, n_test = len(train_idx), len(test_idx)

    print(f'Fold {fold_no}')
    print(f'Train: {n_train}')
    print(f'Test: {n_test}')
    print(f'Orders: {n_orders}')
    print(f'Sum equal: {n_train + n_test + prev_test == n_orders}')
    print(f'Ratio: {n_test / (n_train + n_test)}')

    folds[f'fold_{fold_no - 1}'] = {
        'train': orders_tip_train.loc[train_idx],
        'test': orders_tip_train.loc[test_idx],
    }
    prev_test += n_test

In [None]:
last_order_user_tscv_splitter.export_splits('data/prepared_data/')

In [None]:
# user_tscv_splitter = UserTSCVSplitter(data_manager, n_splits=5, validation_set_ratio=0.2, seed=42)

In [None]:
# splits = user_tscv_splitter.split(orders_tip_train)

In [None]:
# folds = {}
# prev_test = 0
# for i, (train_index, test_index) in enumerate(splits):
#     print(f'Fold {i + 1}')
#     print(f'Train: {len(train_index)}')
#     print(f'Test: {len(test_index)}')
#     print(f'Orders: {len(orders_tip_train)}')
#     print(f'Sum equal: {(len(train_index) + len(test_index) + prev_test) == len(orders_tip_train)}')
#     print(f'Ratio: {len(test_index) / (len(train_index) + len(test_index))}')
# 
#     train_orders = orders_tip_train.loc[train_index]
#     test_orders = orders_tip_train.loc[test_index]
#     current_fold = {
#         'train': train_orders,
#         'test': test_orders
#     }
#     folds[f'fold_{i}'] = current_fold
#     prev_test += len(test_index)

In [None]:
# user_tscv_splitter.export_splits('data/prepared_data/')

In [None]:
# features = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'order_size', 'tip_history',
#             'dynamic_feature_test_1', 'dynamic_feature_test_2']
#
# prepared_splits_dict = user_tscv_splitter.import_splits('data/prepared_data/', features)

In [12]:
# Columns used as model input (18 features).
# The full candidate list additionally contained:
#   'order_size', 'avg_size_prev_orders', 'order_number_squared', 'mean_ordered_rate'
# Minimal baseline list that was also tried:
#   ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'order_size', 'tip_history']
features = [
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'tip_history',
    'customer_lifetime',
    'reordered_ratio',
    'hod_high_tip_probability',
    'order_frequency',
    'last_tip_sequence',
    'contains_alcohol',
    'prev_order_tipped',
    'dow_high_tip_probability',
    'prev_tipped_products_ratio',
    'sim_orders_tip_ratio',
    'aisle_tip_rate',
    'dept_tip_rate',
    'product_tip_rate',
]

# Load the previously exported splits, restricted to the selected columns
prepared_splits_dict = last_order_user_tscv_splitter.import_splits('data/prepared_data/', features)

In [13]:
from data_management import DatasetSelector
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from tempfile import mkdtemp
from joblib import Memory

In [None]:
# Decision-tree pipeline.
# TODO: Add scaling & feature selection
selector_step = ('selector', DatasetSelector(prepared_splits_dict))
estimator_step = ('estimator', DecisionTreeClassifier())
pipeline_steps = [selector_step, estimator_step]

# joblib cache in a fresh temporary directory so fitted transformers can be reused
# NOTE: the directory created by mkdtemp() is not removed by this notebook.
cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=0)

pipeline = Pipeline(steps=pipeline_steps, memory=memory, verbose=False)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the decision tree: only max_depth has more than one candidate
param_grid = {
    'estimator__criterion': ['entropy'],
    'estimator__max_depth': [9, 10],
    'estimator__min_samples_leaf': [256],
}

# Accuracy-scored grid search over the project's CV splitter, using all cores
grid_search_clf = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=last_order_user_tscv_splitter,
    verbose=1,
)

In [None]:
X = orders_tip_train[features]
y = orders_tip_train['tip'].astype(bool)

In [None]:
X.head(25)

In [None]:
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

In [None]:
grid_search_clf.fit(X, y)

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [15]:
# MLP pipeline: dataset selector -> standardisation -> MLP (at most 100 iterations)
pipeline_steps = [
    ('selector', DatasetSelector(prepared_splits_dict)),
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(max_iter=100)),
]

# Transformer caching is disabled for this pipeline; the cached variant was:
#   cachedir = mkdtemp()
#   memory = Memory(location=cachedir, verbose=0)
#   pipeline = Pipeline(pipeline_steps, verbose=False, memory=memory)
pipeline = Pipeline(steps=pipeline_steps, verbose=False)

In [16]:
from sklearn.model_selection import GridSearchCV

# Wider grid, currently disabled:
# param_grid = {
#     'mlp__hidden_layer_sizes': [(5,), (10,), (5, 5)],
#     'mlp__activation': ['relu', 'tanh'],
#     'mlp__alpha': [0.0001, 0.001, 0.01],
# }

# Active grid: a single candidate
param_grid = {
    'mlp__hidden_layer_sizes': [(15,)],
    'mlp__activation': ['relu'],
    'mlp__alpha': [0.01],
}

search_options = {
    'cv': last_order_user_tscv_splitter,
    'scoring': 'accuracy',
    'verbose': 1,
    'n_jobs': 2,
}
grid_search_clf = GridSearchCV(pipeline, param_grid, **search_options)

In [17]:
X = orders_tip_train[features]
y = orders_tip_train['tip'].astype(bool)

In [18]:
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (3008665, 18)
y shape: (3008665,)


In [19]:
grid_search_clf.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Transformation: (3008665, 18) -> (3008665, 18)


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# XGBoost pipeline: dataset selector -> standardisation -> gradient-boosted trees
# NOTE(review): `use_label_encoder` is marked deprecated in recent xgboost
# releases - confirm against the installed version before removing it.
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipeline_steps = [
    ('selector', DatasetSelector(prepared_splits_dict)),
    ('scaler', StandardScaler()),
    ('xgb', xgb_classifier),
]

pipeline = Pipeline(steps=pipeline_steps, verbose=False)

# Wider grid, currently disabled:
# param_grid = {
#     'xgb__n_estimators': [30, 100],
#     'xgb__max_depth': [5, 10],
#     'xgb__learning_rate': [0.01, 0.1],
#     'xgb__subsample': [0.4, 0.6, 0.8]
# }

# Active grid: a single candidate
param_grid = {
    'xgb__learning_rate': [0.1],
    'xgb__max_depth': [5],
    'xgb__n_estimators': [100],
    'xgb__subsample': [0.6],
}

In [None]:
grid_search_clf = GridSearchCV(pipeline, param_grid, cv=last_order_user_tscv_splitter, scoring='accuracy', verbose=1, n_jobs=2)

In [None]:
X = orders_tip_train[features]
y = orders_tip_train['tip'].astype(bool)

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

In [None]:
grid_search_clf.fit(X, y)

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [38]:
# Logistic-regression pipeline: dataset selector -> standardisation -> classifier
logreg_classifier = LogisticRegression(max_iter=1000)
pipeline_steps = [
    ('selector', DatasetSelector(prepared_splits_dict)),
    ('scaler', StandardScaler()),
    ('logreg', logreg_classifier),
]

# Transformer caching is disabled for this pipeline; the cached variant was:
#   cachedir = mkdtemp()
#   memory = Memory(location=cachedir, verbose=0)
#   pipeline = Pipeline(pipeline_steps, verbose=False, memory=memory)
pipeline = Pipeline(steps=pipeline_steps, verbose=False)

In [39]:
from sklearn.model_selection import GridSearchCV
# Wider grid, currently disabled:
# param_grid = {
#     'logreg__C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'logreg__solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
# }

# Active grid: a single candidate.
# Recorded result: {'logreg__C': 0.001, 'logreg__solver': 'sag'} - 0.8025
param_grid = {
    'logreg__C': [0.001],
    'logreg__solver': ['sag'],
}

grid_search_clf = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=2,
    cv=last_order_user_tscv_splitter,
    verbose=1,
)

In [40]:
X = orders_tip_train[features]
y = orders_tip_train['tip'].astype(bool)

In [41]:
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

X shape: (3008665, 18)
y shape: (3008665,)


In [42]:
grid_search_clf.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Transformation: (3008665, 18) -> (3008665, 18)


In [33]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes pipeline: dataset selector -> standardisation -> classifier.
# DatasetSelector, StandardScaler, Pipeline and prepared_splits_dict come from
# earlier cells of this notebook.
nb_classifier = GaussianNB()
pipeline_steps = [
    ('selector', DatasetSelector(prepared_splits_dict)),
    ('scaler', StandardScaler()),
    ('nb', nb_classifier),
]

pipeline = Pipeline(steps=pipeline_steps, verbose=False)

In [41]:
# GaussianNB exposes few tunable hyperparameters; only var_smoothing
# (the variance smoothing term) is searched here.
# Recorded result: {'nb__var_smoothing': 1e-09} - 0.7841381709885383
param_grid = {
    'nb__var_smoothing': [1e-9, 1e-10, 1e-11],
}

search_options = {
    'cv': last_order_user_tscv_splitter,
    'scoring': 'accuracy',
    'verbose': 1,
    'n_jobs': 2,
}
grid_search_clf = GridSearchCV(pipeline, param_grid, **search_options)

In [42]:
# Feature matrix and boolean target taken from the training orders
# (orders_tip_train and features are defined in earlier cells).
X, y = orders_tip_train[features], orders_tip_train['tip'].astype(bool)

# Report both shapes
for label, data in (('X', X), ('y', y)):
    print(f'{label} shape: {data.shape}')

X shape: (3008665, 18)
y shape: (3008665,)


In [43]:
# Perform the grid search
grid_search_clf.fit(X, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Transformation: (2251391, 18) -> (2251391, 18)
Transformation: (162633, 18) -> (162633, 18)
Transformation: (2596247, 18) -> (2596247, 18)
Transformation: (206209, 18) -> (206209, 18)
Transformation: (2414024, 18) -> (2414024, 18)
Transformation: (182223, 18) -> (182223, 18)
Transformation: (2104923, 18) -> (2104923, 18)
Transformation: (146468, 18) -> (146468, 18)
Transformation: (2596247, 18) -> (2596247, 18)
Transformation: (206209, 18) -> (206209, 18)
Transformation: (2251391, 18) -> (2251391, 18)
Transformation: (162633, 18) -> (162633, 18)
Transformation: (2802456, 18) -> (2802456, 18)
Transformation: (206209, 18) -> (206209, 18)
Transformation: (2414024, 18) -> (2414024, 18)
Transformation: (182223, 18) -> (182223, 18)
Transformation: (2104923, 18) -> (2104923, 18)
Transformation: (146468, 18) -> (146468, 18)
Transformation: (3008665, 18) -> (3008665, 18)


In [44]:
# Report the test accuracy of the best parameter combination for every fold,
# followed by the mean over all folds.
cv_results = grid_search_clf.cv_results_
best_index = grid_search_clf.best_index_

# n_splits_ is set by GridSearchCV during fit, so the number of folds is no
# longer hard-coded and follows the splitter's configuration.
result_list = [
    cv_results[f'split{i}_test_score'][best_index]
    for i in range(grid_search_clf.n_splits_)
]

for fold_no, result in enumerate(result_list, start=1):
    print(f'Accuracy (Fold {fold_no}): {result}')

print(f'\nMean Accuracy: {cv_results["mean_test_score"][best_index]}')

Accuracy (Fold 1): 0.794654937466357
Accuracy (Fold 2): 0.7907220344407858
Accuracy (Fold 3): 0.7851917705229307
Accuracy (Fold 4): 0.7780831688525699
Accuracy (Fold 5): 0.7720389436600487

Mean Accuracy: 0.7841381709885383


In [45]:
grid_search_clf.best_score_

0.7841381709885383

In [46]:
grid_search_clf.best_params_

{'nb__var_smoothing': 1e-09}

In [23]:
best_estimator = grid_search_clf.best_estimator_

In [47]:
import matplotlib.pyplot as plt

# Relevant for logistic regression only: print and plot the fitted coefficients.
# `best_estimator` is a pipeline; if the grid search it came from used another
# model (e.g. the GaussianNB pipeline), it has no 'logreg' step and the lookup
# below would raise a KeyError, so that case is reported instead.
if 'logreg' not in best_estimator.named_steps:
    print(f"best_estimator has no 'logreg' step (steps: {list(best_estimator.named_steps)}) - "
          "skipping the coefficient plot.")
else:
    logreg_model = best_estimator.named_steps['logreg']
    coefficients = logreg_model.coef_
    intercept = logreg_model.intercept_

    print("Best parameters:", grid_search_clf.best_params_)
    print("Coefficients:", coefficients)
    print("Intercept:", intercept)

    # coef_ is indexed once to get the row that is plotted, one bar per feature
    pltcoef = coefficients[0]

    plt.figure(figsize=(10, 6))
    plt.bar(features, pltcoef)
    plt.xlabel('Features')
    plt.ylabel('Coefficient Value')
    plt.title('Coefficients of Linear Model')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

Best parameters: {'logreg__C': 0.001, 'logreg__solver': 'sag'}
Coefficients: [[-0.68114537 -0.00465985 -0.05736427 -0.12053306  2.38529508  0.32146342
  -0.33154621  0.79110887 -0.9226077   2.7156059   0.59494381  0.8051806
   0.85252069  0.73677522  0.01476508  1.7547354   1.84671362  1.97335151]]
Intercept: [-3.63042402]


In [None]:
# features = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'order_size', 'tip_history',
#             'dynamic_feature_test_1', 'dynamic_feature_test_2']
#
# prepared_splits_dict = last_order_user_tscv_splitter.import_splits('data/prepared_data/', features)

In [None]:
# all_orders = data_manager.get_orders_tip(complete=True)
# all_orders.shape

In [None]:
# data_manager.set_subset(all_orders['order_id'])

In [None]:
# validation_test = data_manager.get_orders_tip_test()
# validation_test.shape

In [None]:
# validation_train = data_manager.get_orders_tip_train()
# validation_train.shape

In [None]:
# validation_train_without_first = validation_train[validation_train['order_number'] > 1]
# validation_train_without_first.shape

In [None]:
# prepared_splits_dict[-1147446606210854265].shape[0]

In [None]:
# all_orders_without_first = all_orders[all_orders['order_number'] > 1].reset_index(drop=True)
# all_orders_without_first.shape

In [None]:
# validation_train_without_first[features].reset_index(drop=True)

In [None]:
# prepared_splits_dict[-1147446606210854265]

In [None]:
# round(prepared_splits_dict[-1147446606210854265], 6).equals(round(validation_train_without_first[
#                                                                       features].reset_index(drop=True), 6))

In [None]:
# validation_test[features]

In [None]:
# prepared_splits_dict[4027820417624348460]

In [None]:
# round(prepared_splits_dict[4027820417624348460].reset_index(drop=True), 6).equals(round(
#     validation_test[features].reset_index(drop=True), 6))

### Prediction

IMPORTANT:
- Before predicting, the indices of X and y probably have to be reset
- To save the results, we must join the predictions with the template based on the ORDER_ID, because the indices don't align


In [None]:
# orders_tip_test = data_manager.get_orders_tip_test()
# orders_tip_test['tip'] = best_estimator.predict(orders_tip_test[features])

In [None]:
# orders_tip_test_csv = orders_tip_test[tip_test.columns].copy()
# orders_tip_test_csv.rename(columns={tip_test.columns[0]: ''}, inplace=True)
# orders_tip_test_csv

In [None]:
# orders_tip_test_csv.to_csv(os.path.join(DATA_DIR, 'tip_testdaten1.csv'), index=False)