# Scrumpulicious

The goal of this notebook is to search for a model that best predicts wheather a scrumpulicious sweet comes green or golden, just looking at some production chain parameters.
The input of our model is represented by these parameters, while the output is represented by the label GREEN/NOT GREEN

In [2]:
import pandas as pd
import numpy as np
import random

np.random.seed(10)

Scrumpulicious dataset

In [2]:
# read dataset
dataset = pd.read_csv('Scrumpulicious.csv')
dataset

Unnamed: 0,TIME,PIB,MOZ,NOL,ZEN,SIP,MAZ,FOK,BIN,SIG,...,NIT,SIM,DUB,ZIR,RUP,LEM,GIS,POF,SUG,GREEN
0,2018-10-10 07:40:00,0.49185,3.80995,2.15806,25.09738,-0.01918,-17.64035,0.06206,-0.09259,0.00782,...,-2.25524,4.16916,4.27578,0.01312,2.97603,-0.77726,-0.00917,0.02302,0.00835,0
1,2018-10-10 07:42:00,-0.14336,3.86365,2.17636,24.60226,0.00093,-21.96503,0.05610,-0.08650,-0.00295,...,-2.25231,4.15313,4.27571,0.00866,4.42526,-0.86524,-0.00934,0.57344,0.00835,0
2,2018-10-10 07:44:00,-0.14263,3.64129,2.15806,26.12899,0.02104,-23.74379,0.05154,-0.08650,-0.01310,...,-2.24939,4.13736,4.27564,0.01416,4.42958,-0.97301,-0.00951,1.17828,0.00835,0
3,2018-10-10 07:46:00,-0.14813,2.95535,2.28731,29.98923,0.04114,-24.46509,0.05154,-0.08650,-0.00526,...,-2.24641,4.12160,4.27557,0.01219,4.33068,-0.76113,-0.00967,1.12441,0.00835,0
4,2018-10-10 07:48:00,-0.22327,2.93920,2.22378,31.09300,0.06125,-25.58076,0.05154,-0.07578,-0.00526,...,-2.24349,4.10557,4.27550,0.00319,4.48351,-0.97301,-0.00984,-0.02715,0.00835,0
5,2018-10-10 07:50:00,-0.18042,2.87240,2.23300,31.09656,0.08136,-25.49245,0.05154,-0.07578,-0.00526,...,-2.24051,4.08980,4.27543,0.00710,4.89647,-0.83445,-0.01001,0.10137,0.00835,0
6,2018-10-10 07:52:00,-0.25291,2.66109,2.14568,38.36607,0.09606,-28.02262,0.05154,-0.07578,-0.02044,...,-2.23759,4.07377,4.27536,0.00091,5.33277,-0.98841,-0.01018,0.73964,0.00834,0
7,2018-10-10 07:54:00,-0.21133,1.70279,2.14865,40.34637,0.10319,-30.17148,0.05154,-0.07578,-0.02113,...,-2.23461,4.05800,4.27529,0.00370,5.76908,-0.83445,-0.01041,1.49041,0.00834,1
8,2018-10-10 07:58:00,-0.27277,1.73264,2.15134,39.64694,0.11744,-30.40180,0.04417,-0.07198,0.03555,...,-2.22871,4.02620,4.27516,0.00826,6.78471,-1.10499,-0.01247,0.04794,0.00834,0
9,2018-10-10 08:00:00,-0.33429,1.73324,1.61257,42.84274,0.12457,-32.65338,0.05472,-0.07578,0.00580,...,-2.22579,4.01017,4.27508,0.00577,7.29283,-0.92829,-0.01311,0.46545,0.00834,0


In [12]:
print('Total samples: ' + str(dataset['GREEN'].shape[0]))
print('Total GREEN scrumpulicious: ' + str(np.count_nonzero(dataset['GREEN'])) + '/' + str(dataset['GREEN'].shape[0]))

Total samples: 15806
Total GREEN scrumpulicious: 102/15806


Split train and test

In [21]:
# split features from label
dataset_x = dataset.iloc[:, 1:-1].values
dataset_y = dataset.iloc[:, -1].values

In [23]:
from sklearn.utils import shuffle

# shuffle input data
dataset = shuffle(dataset)
# split dataset into train data and validation data
split = int(0.7 * dataset.shape[0])
x_train = dataset_x[:split, :]
y_train = dataset_y[:split]
x_test = dataset_x[split:, :]
y_test = dataset_y[split:]

Some statistics (on train set)

In [24]:
print('Total train samples: ' + str(x_train.shape[0]))
print('Total GREEN scrumpulicious: ' + str(np.count_nonzero(y_train == 1)) + '/' + str(x_train.shape[0]))

Total train samples: 11064
Total GREEN scrumpulicious: 75/11064


Train model (RandomForest)

In [25]:
from sklearn.ensemble import RandomForestClassifier
# model selection
from sklearn.model_selection import StratifiedKFold
from skopt.space import Real, Categorical, Integer
# Metrics
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import make_scorer
# model optimization
from skopt import BayesSearchCV

In [26]:
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [46]:
# chosen classifier: xgboost classifier
import xgboost as xgb

In [47]:
# set up input structure
data_dmatrix = xgb.DMatrix(data=x_train,label=y_train)

In [81]:
xg_reg = xgb.XGBClassifier(objective= 'binary:logistic')

Since the input dataset is a lot unbalanced (just few green sweets), an accuracy score does not give enough information about performance of the model. A better score is chosen: f_score

In [82]:
my_scorer = make_scorer(f1_score)

Searching for the best combination of hyperparameters is an important task for optimization. The best solution (in my opinion) is the bayesian optimization

In [83]:
# BayesSearchCV

search_spaces = {"learning_rate": Real(0.01, 0.1),
                 "gamma": Real(0, 10),
                 "max_depth": Integer(5, 20),
                 "colsample_bytree": Real(0.1, 1),
                 "subsample": Real(0.1, 1),
                 "min_child_weight": Real(0.1, 10),
                 "n_estimators": Integer(10, 1000),
                }

opt = BayesSearchCV(xg_reg,
                    search_spaces,
                    scoring=my_scorer,
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=True,
                    random_state=4)

# fit model
opt.fit(x_train, y_train)

best_score = opt.best_score_
best_score_std = opt.cv_results_['std_test_score'][opt.best_index_]
best_params = opt.best_params_

print('Best score: ' + str(best_score) + " std: " + str(best_score_std))
print('Best params: ' + str(best_params))
preds = opt.predict(x_test)
print('f1-score: ' + str(f1_score(y_test, preds)))


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  'precision', 'predicted', average, warn_for)
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:


Best score: 0.639507877034992 std: 0.0978958124246428
Best params: {'colsample_bytree': 0.8369522767060542, 'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 0.1, 'n_estimators': 181, 'subsample': 0.8662090988753832}
f1-score: 0.7843137254901961


  if diff:


In [7]:
# save the model
import pickle

In [86]:
# save
with open('model.pkl', 'wb') as f:
    pickle.dump(opt, f)

In [8]:
# load
with open('model.pkl', 'rb') as f:
    opt = pickle.load(f)



Now test model on validation data

In [89]:
preds = opt.predict(x_test)
print('F_score on validation data: ' + str(f1_score(y_test, preds)))

F_score on validation data: 0.7843137254901961


  if diff:


## Test dataset

In [36]:
test_dataset = pd.read_csv('Scrumpulicious_test.csv')
test_dataset

Unnamed: 0,TIME,PIB,MOZ,NOL,ZEN,SIP,MAZ,FOK,BIN,SIG,...,NIT,SIM,DUB,ZIR,RUP,LEM,GIS,POF,SUG,GREEN
0,2019-03-06 03:00:00,0.32613,4.41527,2.08047,-10.36190,0.05316,-10.78790,-0.00077,-0.06506,0.03181,...,-2.04925,0.05831,4.22538,0.01558,-2.85291,-0.83445,-0.02310,1.02393,0.00670,
1,2019-03-06 03:02:00,0.41189,4.36346,2.04116,-12.45995,0.05759,-9.79011,-0.00077,-0.06506,0.03181,...,-2.05009,0.06874,4.22601,0.02054,-3.50693,-0.64455,-0.02316,1.17140,0.00671,
2,2019-03-06 03:04:00,0.31503,4.49688,2.21122,-10.84578,0.06202,-9.30919,-0.01129,-0.06506,0.01594,...,-2.05099,0.07916,4.22665,0.02166,-4.16095,-0.84984,-0.02323,1.97708,0.00672,
3,2019-03-06 03:06:00,0.26113,4.57136,2.04383,-10.10387,0.06627,-9.66460,-0.00217,-0.06506,0.01060,...,-2.05188,0.08931,4.22729,0.02278,-3.33935,-0.95762,-0.02329,2.74441,0.00672,
4,2019-03-06 03:08:00,0.22995,4.56271,2.20106,-11.72043,0.06948,-9.13658,-0.00077,-0.06506,0.01594,...,-2.05277,0.09973,4.22793,0.02390,-3.37553,-0.86524,-0.02333,1.51513,0.00673,
5,2019-03-06 03:10:00,0.33010,4.42997,2.07504,-10.85745,0.07269,-8.60858,-0.00077,-0.06506,0.01594,...,-2.05366,0.11016,4.22856,0.02295,-3.59211,-0.88064,-0.02322,0.95241,0.00674,
6,2019-03-06 03:12:00,0.27129,4.35165,2.09326,-14.34139,0.07590,-8.00069,-0.00077,-0.06506,0.01594,...,-2.05456,0.12031,4.22920,0.02042,-3.80869,-0.98841,-0.02312,1.42087,0.00675,
7,2019-03-06 03:14:00,0.34322,4.51383,2.06952,-16.35746,0.07912,-7.27419,-0.00077,-0.05434,0.01594,...,-2.05545,0.13073,4.22984,0.01789,-4.02527,-0.98841,-0.02301,2.25481,0.00676,
8,2019-03-06 03:16:00,0.29671,4.37403,2.12778,-17.57376,0.08233,-7.11372,-0.00077,-0.05434,0.01594,...,-2.05629,0.14115,4.23048,0.02960,-4.53546,-0.82272,-0.02291,2.16831,0.00677,
9,2019-03-06 03:18:00,0.39677,4.45669,2.15707,-17.78020,0.08554,-6.21617,-0.00077,-0.05434,0.02129,...,-2.05718,0.15131,4.23111,0.02132,-5.06825,-0.91509,-0.02280,0.91297,0.00677,


Read input data and predict

In [37]:
test_dataset_x = test_dataset.iloc[:, 1:-1].values

In [38]:
predictions = opt.predict(test_dataset_x)

  if diff:


In [39]:
test_dataset['GREEN'] = predictions

In [40]:
test_dataset

Unnamed: 0,TIME,PIB,MOZ,NOL,ZEN,SIP,MAZ,FOK,BIN,SIG,...,NIT,SIM,DUB,ZIR,RUP,LEM,GIS,POF,SUG,GREEN
0,2019-03-06 03:00:00,0.32613,4.41527,2.08047,-10.36190,0.05316,-10.78790,-0.00077,-0.06506,0.03181,...,-2.04925,0.05831,4.22538,0.01558,-2.85291,-0.83445,-0.02310,1.02393,0.00670,0
1,2019-03-06 03:02:00,0.41189,4.36346,2.04116,-12.45995,0.05759,-9.79011,-0.00077,-0.06506,0.03181,...,-2.05009,0.06874,4.22601,0.02054,-3.50693,-0.64455,-0.02316,1.17140,0.00671,0
2,2019-03-06 03:04:00,0.31503,4.49688,2.21122,-10.84578,0.06202,-9.30919,-0.01129,-0.06506,0.01594,...,-2.05099,0.07916,4.22665,0.02166,-4.16095,-0.84984,-0.02323,1.97708,0.00672,0
3,2019-03-06 03:06:00,0.26113,4.57136,2.04383,-10.10387,0.06627,-9.66460,-0.00217,-0.06506,0.01060,...,-2.05188,0.08931,4.22729,0.02278,-3.33935,-0.95762,-0.02329,2.74441,0.00672,0
4,2019-03-06 03:08:00,0.22995,4.56271,2.20106,-11.72043,0.06948,-9.13658,-0.00077,-0.06506,0.01594,...,-2.05277,0.09973,4.22793,0.02390,-3.37553,-0.86524,-0.02333,1.51513,0.00673,0
5,2019-03-06 03:10:00,0.33010,4.42997,2.07504,-10.85745,0.07269,-8.60858,-0.00077,-0.06506,0.01594,...,-2.05366,0.11016,4.22856,0.02295,-3.59211,-0.88064,-0.02322,0.95241,0.00674,0
6,2019-03-06 03:12:00,0.27129,4.35165,2.09326,-14.34139,0.07590,-8.00069,-0.00077,-0.06506,0.01594,...,-2.05456,0.12031,4.22920,0.02042,-3.80869,-0.98841,-0.02312,1.42087,0.00675,0
7,2019-03-06 03:14:00,0.34322,4.51383,2.06952,-16.35746,0.07912,-7.27419,-0.00077,-0.05434,0.01594,...,-2.05545,0.13073,4.22984,0.01789,-4.02527,-0.98841,-0.02301,2.25481,0.00676,0
8,2019-03-06 03:16:00,0.29671,4.37403,2.12778,-17.57376,0.08233,-7.11372,-0.00077,-0.05434,0.01594,...,-2.05629,0.14115,4.23048,0.02960,-4.53546,-0.82272,-0.02291,2.16831,0.00677,0
9,2019-03-06 03:18:00,0.39677,4.45669,2.15707,-17.78020,0.08554,-6.21617,-0.00077,-0.05434,0.02129,...,-2.05718,0.15131,4.23111,0.02132,-5.06825,-0.91509,-0.02280,0.91297,0.00677,0


In [42]:
test_dataset.to_csv('Scrumpulicious_test_predictions.csv')