# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Project Submission

### Libraries

In [None]:
import os

import pandas as pd

from data_management import DataManager

### Data Initialization

In [None]:
# Folder holding the raw Instacart files, resolved against the current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')


def _load_csv(file_name):
    """Read the file called ``file_name`` inside ``DATA_DIR`` with ``pd.read_csv``."""
    return pd.read_csv(os.path.join(DATA_DIR, file_name))


# Order/product line items from the "prior" and "train" files.
op_prior = _load_csv('order_products__prior.csv.zip')
op_train = _load_csv('order_products__train.csv.zip')

# Tip data: only the 'order_id' and 'tip' columns of the training file are kept,
# the test template is loaded unchanged.
tip_train = _load_csv('tip_trainingsdaten1_.csv')[['order_id', 'tip']]
tip_test = _load_csv('tip_testdaten1_template.csv')

# Orders plus the aisle / department / product tables.
orders = _load_csv('orders.csv.zip')
aisles = _load_csv('aisles.csv.zip')
departments = _load_csv('departments.csv.zip')
products = _load_csv('products.csv.zip')

# All loaded frames are handed to the project's DataManager in one call.
data_manager = DataManager(
    op_prior, op_train, tip_train, tip_test,
    orders, products, aisles, departments,
)

# Number of rows in the orders/tip table right after loading; the (commented-out)
# sanity checks further down compare against this value.
order_amount = len(data_manager.get_orders_tip())

### Feature Engineering

In [None]:
from feature_engineering.static_features import TipHistory, ReorderedRatio, OrderSize, PrevTippedProductsRatio, \
    CustomerLifetime, PrevOrderTipped, OrderFrequency, SimOrdersTipRatio, AvgSizePrevOrders, MeanOrderedRate, \
    LastTipSequence, RelDaysSinceTip, DaysSinceTip

from feature_engineering.dynamic_features import ProductTipRate, DepartmentTipRate, AisleTipRate, DynamicFeatureTest1, \
    DynamicFeatureTest2

# Create one instance of every candidate feature class imported above.
# The objects are only constructed here; the register_feature calls for them
# are in the following cell (currently commented out).
# The headings name the team member listed for each group.

# Kilian
tip_history = TipHistory()
reordered_rate = ReorderedRatio()
order_size = OrderSize()
prev_tipped_products_ratio = PrevTippedProductsRatio()
customer_lifetime = CustomerLifetime()
prev_order_tipped = PrevOrderTipped()

# Daniel
order_frequency = OrderFrequency()
mean_ordered_rate = MeanOrderedRate()
rel_days_since_tip = RelDaysSinceTip()
days_since_tip = DaysSinceTip()

# Max
sim_orders_tip_ratio = SimOrdersTipRatio()
product_tip_rate = ProductTipRate()
department_tip_rate = DepartmentTipRate()
aisle_tip_rate = AisleTipRate()
last_tip_sequence = LastTipSequence()  # TODO: Maybe remove the weighting or add an additional feature without weighting

# Tom
avg_size_prev_orders = AvgSizePrevOrders()

# Test (classes imported from feature_engineering.dynamic_features)
dynamic_feature_test_1 = DynamicFeatureTest1()
dynamic_feature_test_2 = DynamicFeatureTest2()

In [None]:
# Static Features
# data_manager.register_feature(tip_history)
# data_manager.register_feature(reordered_rate)
# data_manager.register_feature(order_size)
# data_manager.register_feature(customer_lifetime)
# data_manager.register_feature(prev_order_tipped)
# data_manager.register_feature(prev_tipped_products_ratio)

# data_manager.register_feature(order_frequency)
# data_manager.register_feature(sim_orders_tip_ratio)
# data_manager.register_feature(avg_size_prev_orders)
# data_manager.register_feature(mean_ordered_rate)
# data_manager.register_feature(last_tip_sequence)
# data_manager.register_feature(rel_days_since_tip)
# data_manager.register_feature(days_since_tip)

# # Dynamic Features
# data_manager.register_feature(dynamic_feature_test_1)
# data_manager.register_feature(dynamic_feature_test_2)
# data_manager.register_feature(product_tip_rate)
# data_manager.register_feature(department_tip_rate)
# data_manager.register_feature(aisle_tip_rate)

In [None]:
# Run the feature computation on the data manager.
# NOTE: every register_feature call in the cell directly above is commented out,
# so the visible cells have not registered any feature at this point.
data_manager.compute_features()

In [None]:
# Display the first 100 rows of the table returned by get_orders_tip().
data_manager.get_orders_tip().head(100)

In [None]:
# data_manager.export_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [None]:
# data_manager.import_features('data/prepared_data/computed_features.csv.zip', only_static=False)

In [None]:
# Display the first 100 rows again; presumably meant for checking the table after the
# (currently commented-out) feature export/import cells above.
data_manager.get_orders_tip().head(100)

In [None]:
# test = data_manager.get_orders_tip()

In [None]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [None]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(5)
# order_amount = len(order_ids)

In [None]:
# data_manager.set_subset(order_ids)

In [None]:
# data_manager.get_orders_tip().head(25)

In [None]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

In [None]:
# order_ids = data_manager.get_orders_tip(complete=True).groupby('user_id')['order_id'].head(4)
# order_amount = len(order_ids)
# 
# data_manager.set_subset(order_ids)
# data_manager.get_orders_tip().head(25)

In [None]:
# print(f"Number of orders: {len(data_manager.get_orders_tip())}")
# print(f"Number of orders did not change: {order_amount == len(data_manager.get_orders_tip())}\n")
# print(f"Number of NaN or null values in each column:\n{data_manager.get_orders_tip().isnull().sum()}")

### Analysis
- Only orders from the training set (orders with tip information) are considered

In [None]:
from analysis import DaysSincePriorOrder, DayOfWeek, HourOfDay, Department, OrderNumber, Aisle, Product, NumberOrderUser, GeneralAnalysis

In [None]:
# One analysis object per examined aspect, each constructed with the shared data manager.
# Their execute_analysis() calls are in the cells below (currently commented out).
day_of_week = DayOfWeek(data_manager)
hour_of_day = HourOfDay(data_manager)
days_since_prior_order = DaysSincePriorOrder(data_manager)
order_number = OrderNumber(data_manager)
department = Department(data_manager)
aisle = Aisle(data_manager)
product = Product(data_manager)
number_order_user = NumberOrderUser(data_manager)
general_analysis = GeneralAnalysis(data_manager)

# Analysis ideas:
# - Frequency of items with tip probability > 0.9, 0.8, 0.7, 0.6, 0.5, ..., 0.1
# - Group into percentiles and average the tip probability
# - Plot a bar chart with tip probability and mean tip probability (limit to top 10?)

# TODO:
# - Decide on the grouping of departments
# - Product/Department/Aisle tip rate

In [None]:
# day_of_week.execute_analysis()

In [None]:
# hour_of_day.execute_analysis()

In [None]:
# days_since_prior_order.execute_analysis()

In [None]:
# order_number.execute_analysis()

In [None]:
# department.execute_analysis()

In [None]:
# aisle.execute_analysis()

In [None]:
# product.execute_analysis()

In [None]:
# number_order_user.execute_analysis()

In [None]:
# general_analysis.execute_analysis()

### Data Preparation

In [None]:
from feature_engineering.static_features import DowHighTipProbability, HodHighTipProbability, OrderNumberSquared, \
    ContainsAlcohol

# Register the additional static features used for data preparation.
# Each class is instantiated right before it is registered, in the listed order.
for feature_cls in (
    DowHighTipProbability,
    HodHighTipProbability,
    OrderNumberSquared,
    ContainsAlcohol,
):
    data_manager.register_feature(feature_cls())

In [None]:
# Run the feature computation again, now that the four data-preparation
# features have been registered in the cell above.
data_manager.compute_features()

### Model Training & Evaluation

In [None]:
from data_management import LastOrderUserTSCVSplitter

In [None]:
# Fetch the current orders/tip table; the bare expression on the last line
# makes the notebook display its shape.
orders_tip = data_manager.get_orders_tip()
orders_tip.shape

In [None]:
# Collect the 'order_id' values of all rows whose 'order_number' is greater than 1.
order_number_above_one = orders_tip['order_number'] > 1
orders_ids = orders_tip.loc[order_number_above_one, 'order_id']

In [None]:
# Pass the selected order IDs (order_number > 1) to set_subset; presumably this limits
# what later get_orders_tip() calls return to those orders — TODO confirm in DataManager.
data_manager.set_subset(orders_ids)

In [None]:
# Display the table shape after set_subset, for comparison with the shape shown before it.
data_manager.get_orders_tip().shape

In [None]:
# cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Build the project's cross-validation splitter on top of the data manager, with 5 splits.
last_order_user_tscv_splitter = LastOrderUserTSCVSplitter(data_manager, n_splits=5)

In [None]:
# user_tscv_splitter = UserTSCVSplitter(n_splits=5, test_size=0.2, data_manager)

In [None]:
# Ask the splitter for the folds of the current orders/tip table; the result is iterated
# as (train_index, test_index) pairs by the fold loop in the next cell.
splits = last_order_user_tscv_splitter.split(data_manager.get_orders_tip())

In [None]:
# Materialise every split as a pair of train/test frames, keyed 'fold_0', 'fold_1', ...
# Note that the printed fold number is 1-based while the dictionary key is 0-based.
folds = {}
for i, (train_index, test_index) in enumerate(splits):
    # Fetched anew on every iteration (not hoisted), exactly as the table is at that moment.
    orders_tip_current = data_manager.get_orders_tip()

    # Short size report for the fold, one value per output line.
    print(
        f'Fold {i + 1}',
        f'Train: {len(train_index)}',
        f'Test: {len(test_index)}',
        f'Orders: {len(orders_tip_current)}',
        sep='\n',
    )

    # NOTE(review): .loc selects by index label — confirm that the splitter yields
    # index labels of this table rather than row positions.
    train_orders = orders_tip_current.loc[train_index]
    test_orders = orders_tip_current.loc[test_index]

    current_fold = dict(train=train_orders, test=test_orders)
    folds[f'fold_{i}'] = current_fold

In [None]:
# import numpy as np
# from sklearn.datasets import load_iris
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import GridSearchCV, Pipeline
# from sklearn.base import BaseEstimator, TransformerMixin
# 
# 
# # Example custom transformer that could modify data based on some condition per fold
# class CustomTransformer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         # Implement fitting logic if necessary
#         return self
# 
#     def transform(self, X):
#         # Modify X based on some condition or computation
#         return X  # Return modified X
# 
# 
# class CustomSplit:
#     def __init__(self, n_splits=3):
#         self.n_splits = n_splits
# 
#     def split(self, X, y=None, groups=None):
#         n_samples = len(X)
#         fold_sizes = np.full(self.n_splits, n_samples // self.n_splits, dtype=int)
#         fold_sizes[:n_samples % self.n_splits] += 1
#         current = 0
#         for fold_size in fold_sizes:
#             start, stop = current, current + fold_size
#             test_indices = np.arange(start, stop)
#             train_indices = np.setdiff1d(np.arange(n_samples), test_indices)
#             yield train_indices, test_indices
# 
#     def get_n_splits(self, X=None, y=None, groups=None):
#         return self.n_splits
# 
# 
# # Load data
# data = load_iris()
# X, y = data.data, data.target
# 
# # Create a pipeline with a custom transformer and a classifier
# pipeline = Pipeline([
#     ('custom_transform', CustomTransformer()),
#     ('scaler', StandardScaler()),  # Example of another preprocessing step
#     ('classifier', RandomForestClassifier())
# ])
# 
# # Define parameter grid (note: include pipeline step names)
# param_grid = {
#     'classifier__n_estimators': [100, 200],
#     'classifier__max_features': ['auto', 'sqrt']
# }
# 
# # Setup GridSearchCV with custom cross-validator
# cv = CustomSplit(n_splits=5)
# grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='accuracy')
# 
# # Perform grid search
# grid_search.fit(X, y)
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
