# Data Science Project SoSe 2024
## Team 07
- Maximilian Hoffmann
- Kilian Kempf
- Daniel Schneider
- Tom Schuck

## Submission of Task 2

In [278]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.dummy import DummyClassifier

### Data Preparation

In [279]:
DATA_DIR = os.path.join(os.getcwd(), 'data/Instacart')

orders = pd.read_csv(os.path.join(DATA_DIR, 'orders.csv.zip'))
tip_train = pd.read_csv(os.path.join(DATA_DIR, 'tip_trainingsdaten1_.csv'))[['order_id', 'tip']]

In [280]:
orders_tip_train = pd.merge(orders, tip_train)
orders_tip_train

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,tip
0,2539329,1,prior,1,2,8,,False
1,2398795,1,prior,2,3,7,15.0,False
2,473747,1,prior,3,3,12,21.0,False
3,2254736,1,prior,4,4,7,29.0,False
4,431534,1,prior,5,4,15,28.0,False
...,...,...,...,...,...,...,...,...
3214869,2558525,206209,prior,9,4,15,22.0,False
3214870,2266710,206209,prior,10,5,18,29.0,False
3214871,1854736,206209,prior,11,4,10,30.0,False
3214872,626363,206209,prior,12,1,12,18.0,False


### Task 2.1

In [304]:
features = ['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
# to be updated
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'], 
              'max_depth': [depth for depth in range(1, 10)],
              'min_samples_leaf': [2**i for i in range(0, 10)]}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search_clf = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=cv)

In [305]:
X = orders_tip_train[features]
y = orders_tip_train[['tip']]
grid_search_clf.fit(X, y)

KeyboardInterrupt: 

In [283]:
decision_tree_clf = grid_search_clf.best_estimator_
grid_search_clf.best_params_

{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 4}

### Task 2.2

##### Estimated accuracy of the decision tree classifier on the test set:

In [284]:
grid_search_clf.best_score_

0.6067603265834369

### Task 2.3

In [285]:
most_freq_clf = DummyClassifier(strategy='most_frequent')
scores = cross_val_score(most_freq_clf, X, y, cv=cv)
scores.mean()

0.5591559729508828

##### Benefit of decision tree classifier over classifier that predicts most frequent class:

In [286]:
grid_search_clf.best_score_ - scores.mean()

0.04760435363255411

### Task 2.4

In [287]:
tip_test = pd.read_csv(os.path.join(DATA_DIR, 'tip_testdaten1_template.csv'))
orders_tip_test = pd.merge(orders, tip_test)
orders_tip_test['tip'] = decision_tree_clf.predict(orders_tip_test[features])
orders_tip_test

Unnamed: 0.1,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,Unnamed: 0,tip
0,1187899,1,train,11,4,8,14.0,10,False
1,1492625,2,train,15,1,11,30.0,25,True
2,2196797,5,train,5,0,11,6.0,49,True
3,525192,7,train,21,2,11,6.0,74,False
4,880375,8,train,4,1,14,10.0,78,True
...,...,...,...,...,...,...,...,...,...
131204,2585586,206199,train,20,2,16,30.0,3420838,False
131205,943915,206200,train,24,6,19,6.0,3420862,False
131206,2371631,206203,train,6,4,19,30.0,3420924,False
131207,1716008,206205,train,4,1,16,10.0,3420933,True


In [288]:
orders_tip_test_csv = orders_tip_test[tip_test.columns].copy()
orders_tip_test_csv.rename(columns={tip_test.columns[0]:''}, inplace=True)
orders_tip_test_csv

Unnamed: 0,Unnamed: 1,order_id,tip
0,10,1187899,False
1,25,1492625,True
2,49,2196797,True
3,74,525192,False
4,78,880375,True
...,...,...,...
131204,3420838,2585586,False
131205,3420862,943915,False
131206,3420924,2371631,False
131207,3420933,1716008,True


In [289]:
orders_tip_test_csv.to_csv(os.path.join(DATA_DIR, 'tip_testdaten1.csv'), index=False)

### Task 2.5 a)

In [290]:
op_prior = pd.read_csv(os.path.join(DATA_DIR, 'order_products__prior.csv.zip'))
op_train = pd.read_csv(os.path.join(DATA_DIR, 'order_products__train.csv.zip'))
op = pd.concat([op_prior, op_train])
op

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [291]:
first_orders = orders[orders['order_number']==1]
products_first_orders = pd.merge(op[['order_id', 'product_id']], first_orders[['order_id']])
products_first_orders

Unnamed: 0,order_id,product_id
0,20,35430
1,20,47485
2,20,7419
3,20,5112
4,20,33452
...,...,...
2078063,3421081,32299
2078064,3421081,3060
2078065,3421081,20539
2078066,3421081,35221


##### Percentage of orders containing product 21137 (organic strawberries) first orders of the respective user:

In [292]:
(first_orders_products[first_orders_products['product_id']==21137].shape[0] / first_orders_products.shape[0]) * 100

0.7922743625328912

### Task 2.5 b)

In [293]:
last_orders = orders[['user_id', 'order_number']].groupby('user_id').max().reset_index()
last_orders = pd.merge(orders, last_orders, on=['user_id', 'order_number'])
last_orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1187899,1,train,11,4,8,14.0
1,1492625,2,train,15,1,11,30.0
2,2774568,3,test,13,5,15,11.0
3,329954,4,test,6,3,12,30.0
4,2196797,5,train,5,0,11,6.0
...,...,...,...,...,...,...,...
206204,1716008,206205,train,4,1,16,10.0
206205,1043943,206206,test,68,0,20,0.0
206206,2821651,206207,test,17,2,13,14.0
206207,803273,206208,test,50,5,11,4.0


In [294]:
products_last_orders = pd.merge(op[['order_id', 'product_id']], last_orders[['order_id']])
products_last_orders

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633
...,...,...
1384612,3421063,14233
1384613,3421063,35548
1384614,3421070,35951
1384615,3421070,16953


##### Percentage of orders containing product 21137 (organic strawberries) last orders of the respective user:

In [295]:
(last_orders_products[last_orders_products['product_id']==21137].shape[0] / last_orders_products.shape[0]) * 100

0.7867879709695893

##### Comparison:
Percentage of orders containing product 21137 (organic strawberries) first orders of the respective user: $\approx 0.792\%$\
Percentage of orders containing product 21137 (organic strawberries) last orders of the respective user: $\approx 0.787\%$

The difference can probably explained as being random, since it is relatively small.