# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Submission of Task 2

### Libraries 

In [300]:
import os

import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

### Data Preparation

In [301]:
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')

op_prior = pd.read_csv(os.path.join(DATA_DIR, 'order_products__prior.csv.zip'))
op_train = pd.read_csv(os.path.join(DATA_DIR, 'order_products__train.csv.zip'))
op = pd.concat([op_prior, op_train])

tip_train = pd.read_csv(os.path.join(DATA_DIR, 'tip_trainingsdaten1_.csv'))[['order_id', 'tip']]
tip_test = pd.read_csv(os.path.join(DATA_DIR, 'tip_testdaten1_template.csv'))
tip = pd.concat(
    [tip_train, tip_test[['order_id', 'tip']]])

orders = pd.read_csv(os.path.join(DATA_DIR, 'orders.csv.zip'))
orders_tip = pd.merge(orders, tip)
orders_tip

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip
0,2539329,1,prior,1,2,8,,0.0
1,2398795,1,prior,2,3,7,15.0,0.0
2,473747,1,prior,3,3,12,21.0,0.0
3,2254736,1,prior,4,4,7,29.0,0.0
4,431534,1,prior,5,4,15,28.0,0.0
...,...,...,...,...,...,...,...,...
3346078,2266710,206209,prior,10,5,18,29.0,0.0
3346079,1854736,206209,prior,11,4,10,30.0,0.0
3346080,626363,206209,prior,12,1,12,18.0,0.0
3346081,2977660,206209,prior,13,1,12,7.0,0.0


### Feature Engineering

In [302]:
orders_tip['days_since_prior_order'] = orders_tip['days_since_prior_order'].fillna(-1).astype(int)

orders_tip['tip'] = orders_tip['tip'].astype(bool)
orders_tip['tip_history'] = orders_tip.groupby('user_id')['tip'].transform('cumsum').shift(1) / orders_tip[
    'order_number'].shift(1)
orders_tip.loc[orders_tip['order_number'] == 1, 'tip_history'] = -1

orders_tip['relative_order_number'] = orders_tip.groupby('user_id')['order_number'].rank(pct=True)
orders_tip['tip'] = orders_tip['tip'].astype(object)
orders_tip.loc[orders_tip['order_id'].isin(tip_test['order_id']), 'tip'] = np.nan
orders_tip

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,relative_order_number
0,2539329,1,prior,1,2,8,-1,False,-1.000000,0.090909
1,2398795,1,prior,2,3,7,15,False,0.000000,0.181818
2,473747,1,prior,3,3,12,21,False,0.000000,0.272727
3,2254736,1,prior,4,4,7,29,False,0.000000,0.363636
4,431534,1,prior,5,4,15,28,False,0.000000,0.454545
...,...,...,...,...,...,...,...,...,...,...
3346078,2266710,206209,prior,10,5,18,29,False,0.222222,0.714286
3346079,1854736,206209,prior,11,4,10,30,False,0.200000,0.785714
3346080,626363,206209,prior,12,1,12,18,False,0.181818,0.857143
3346081,2977660,206209,prior,13,1,12,7,False,0.166667,0.928571


### Task 2.1

In [303]:
orders_tip_train = pd.merge(orders_tip, tip_train)
orders_tip_train['tip'] = orders_tip_train['tip'].astype(bool)
orders_tip_train

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,relative_order_number
0,2539329,1,prior,1,2,8,-1,False,-1.000000,0.090909
1,2398795,1,prior,2,3,7,15,False,0.000000,0.181818
2,473747,1,prior,3,3,12,21,False,0.000000,0.272727
3,2254736,1,prior,4,4,7,29,False,0.000000,0.363636
4,431534,1,prior,5,4,15,28,False,0.000000,0.454545
...,...,...,...,...,...,...,...,...,...,...
3214869,2558525,206209,prior,9,4,15,22,False,0.250000,0.642857
3214870,2266710,206209,prior,10,5,18,29,False,0.222222,0.714286
3214871,1854736,206209,prior,11,4,10,30,False,0.200000,0.785714
3214872,626363,206209,prior,12,1,12,18,False,0.181818,0.857143


In [304]:
features = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'tip_history',
            'relative_order_number']
# param_grid = {'criterion': ['entropy'],
# 'max_depth': [depth for depth in range(1, 21)],
# 'min_samples_leaf': [2 ** i for i in range(0, 15)]}
param_grid = {'criterion': ['entropy'],
              'max_depth': [11],
              'min_samples_leaf': [512]}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=cv, n_jobs=-1, scoring='accuracy')

In [305]:
X = orders_tip_train[features]
y = orders_tip_train[['tip']]
grid_search_clf.fit(X, y)

In [306]:
decision_tree_clf = grid_search_clf.best_estimator_
grid_search_clf.best_params_

{'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 512}

### Task 2.2

##### Estimated accuracy of the decision tree classifier on the test set:

In [307]:
grid_search_clf.best_score_

0.756124501055468

### Task 2.3

In [308]:
most_freq_clf = DummyClassifier(strategy='most_frequent')
most_freq_score = cross_val_score(most_freq_clf, X, y, cv=cv, scoring='accuracy').mean()
most_freq_score

0.5591559729508828

##### Benefit of decision tree classifier over classifier that predicts most frequent class:

In [309]:
grid_search_clf.best_score_ - most_freq_score

0.1969685281045852

### Task 2.4

In [310]:
orders_tip_test = pd.merge(orders_tip.drop('tip', axis=1), tip_test)
orders_tip_test['tip'] = decision_tree_clf.predict(orders_tip_test[features])
orders_tip_test

Unnamed: 0.1,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip_history,relative_order_number,Unnamed: 0,tip
0,1187899,1,train,11,4,8,14,0.000000,1.0,10,False
1,1492625,2,train,15,1,11,30,0.571429,1.0,25,True
2,2196797,5,train,5,0,11,6,0.500000,1.0,49,True
3,525192,7,train,21,2,11,6,0.650000,1.0,74,True
4,880375,8,train,4,1,14,10,0.666667,1.0,78,True
...,...,...,...,...,...,...,...,...,...,...,...
131204,2585586,206199,train,20,2,16,30,1.000000,1.0,3420838,True
131205,943915,206200,train,24,6,19,6,0.652174,1.0,3420862,True
131206,2371631,206203,train,6,4,19,30,0.000000,1.0,3420924,False
131207,1716008,206205,train,4,1,16,10,0.000000,1.0,3420933,False


In [311]:
orders_tip_test_csv = orders_tip_test[tip_test.columns].copy()
orders_tip_test_csv.rename(columns={tip_test.columns[0]: ''}, inplace=True)
orders_tip_test_csv

Unnamed: 0,Unnamed: 1,order_id,tip
0,10,1187899,False
1,25,1492625,True
2,49,2196797,True
3,74,525192,True
4,78,880375,True
...,...,...,...
131204,3420838,2585586,True
131205,3420862,943915,True
131206,3420924,2371631,False
131207,3420933,1716008,False


In [312]:
orders_tip_test_csv.to_csv(os.path.join(DATA_DIR, 'tip_testdaten1.csv'), index=False)

### Task 2.5 a)

In [313]:
first_orders = orders[orders['order_number'] == 1]
orders_21137 = op[op['product_id'] == 21137]
first_orders_21137 = first_orders[first_orders['order_id'].isin(orders_21137['order_id'])]
first_orders_21137

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1159,2669994,77,prior,1,2,11,
1518,185725,101,prior,1,2,11,
1598,2313085,108,prior,1,0,13,
1823,1747819,126,prior,1,6,11,
1943,3204695,135,prior,1,3,19,
...,...,...,...,...,...,...,...
3420524,2962368,206178,prior,1,3,10,
3420593,31169,206183,prior,1,0,13,
3420750,3395125,206194,prior,1,1,9,
3420925,1438269,206204,prior,1,1,11,


##### Percentage of orders containing product 21137 (organic strawberries) first orders of the respective user:

In [314]:
(first_orders_21137.shape[0] / first_orders.shape[0]) * 100

7.984132603329631

### Task 2.5 b)

In [315]:
last_orders = orders[['user_id', 'order_number']].groupby('user_id').max().reset_index()
last_orders = pd.merge(orders, last_orders, on=['user_id', 'order_number'])
last_orders_21137 = last_orders[last_orders['order_id'].isin(orders_21137['order_id'])]
last_orders_21137

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
17,2461523,18,train,7,6,9,7.0
26,2614670,27,train,82,5,14,4.0
58,1651215,59,train,11,0,12,10.0
62,2989905,63,train,40,6,8,13.0
78,21708,79,train,8,0,6,30.0
...,...,...,...,...,...,...,...
206152,642502,206153,train,14,0,8,21.0
206173,1620607,206174,train,53,4,22,10.0
206187,3367945,206188,train,8,1,21,30.0
206199,943915,206200,train,24,6,19,6.0


##### Percentage of orders containing product 21137 (organic strawberries) last orders of the respective user:

In [316]:
(last_orders_21137.shape[0] / last_orders.shape[0]) * 100

5.282989588233297

##### Comparison:
Percentage of first orders containing product 21137 (organic strawberries) of the respective user: $\approx 7.984\%$\
Percentage of last orders containing product 21137 (organic strawberries) of the respective user: $\approx 5.283\%$

Therefore, the ratio of first orders containing product 21137 (organic strawberries) is $\approx 51.126\%$ larger than the ratio of last orders containing this product. This seems to be rather a systematic reason than being random due to the relatively large difference. It seems that users buy the product 21137 (organic strawberries) less often in their last order compared to their first order.

### Sanity Check

In [317]:
orders_tip_test_alt = orders_tip_train[['user_id', 'order_number']].groupby('user_id').max().reset_index()
orders_tip_test_alt = pd.merge(orders_tip_train, orders_tip_test_alt)

orders_tip_train_alt = orders_tip_train[~orders_tip_train['order_id'].isin(orders_tip_test_alt['order_id'])]

In [318]:
(len(orders_tip_train_alt) + len(orders_tip_test_alt)) == len(orders_tip_train)

True

In [323]:
features = ['order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'tip_history',
            'relative_order_number']
# param_grid = {'criterion': ['entropy'],
# 'max_depth': [depth for depth in range(1, 21)],
# 'min_samples_leaf': [2 ** i for i in range(0, 15)]}
param_grid = {'criterion': ['entropy'],
              'max_depth': [11],
              'min_samples_leaf': [512]}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=cv, n_jobs=-1, scoring='accuracy')

In [324]:
X = orders_tip_train_alt[features]
y = orders_tip_train_alt[['tip']]
grid_search_clf.fit(X, y)

In [325]:
grid_search_clf.best_score_

0.7530861694472465

In [326]:
orders_tip_test_alt['tip_pred'] = decision_tree_clf.predict(orders_tip_test_alt[features])
orders_tip_test_alt

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip,tip_history,relative_order_number,tip_pred
0,2550362,1,prior,10,4,8,30,False,0.000000,0.909091,False
1,839880,2,prior,14,3,10,13,False,0.615385,0.933333,False
2,1402502,3,prior,12,1,15,15,True,0.818182,1.000000,True
3,2557754,4,prior,5,5,13,0,False,0.000000,1.000000,False
4,157374,5,prior,4,1,18,19,True,0.333333,0.800000,False
...,...,...,...,...,...,...,...,...,...,...,...
206204,414137,206205,prior,3,5,16,10,False,0.000000,0.750000,False
206205,1904200,206206,prior,67,0,13,11,False,0.227273,1.000000,False
206206,1005822,206207,prior,16,2,7,18,False,0.200000,1.000000,False
206207,1882108,206208,prior,49,1,22,7,True,0.395833,1.000000,True


In [327]:
score = accuracy_score(orders_tip_test_alt['tip'], orders_tip_test_alt['tip_pred'])
score

0.801061059410598