## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/brewing/

/content/drive/.shortcut-targets-by-id/1wGrW3Um-cH0PiGtcEam-_30m09KHmtTm/brewing


# Preprocessing Pipeline

In [4]:
from dataset.dataset import DataSet
import pandas as pd

In [5]:
ds = DataSet()

# Base Line Model 1

In [6]:
import scipy
import numpy as np
import pandas as pd

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class BaseLineClassifier(BaseEstimator, ClassifierMixin):

  def __init__(self):
      pass
      
  def fit(self, X, y):
      X, y = check_X_y(X, y, multi_output=True)
      self.n_marcas_ = 5
      return self
  
  def predict_proba(self, X):
      check_is_fitted(self)
      X = check_array(X)
      preds = (X[:,-self.n_marcas_:] > 0) * 1.0
      return preds

In [8]:
train_features = ds.get_features(train=True)
labels = ds.get_labels()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_features, labels, test_size=0.1)

In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
num_attributes = ['marca_2_2019_5_vol', 'marca_3_2019_5_vol', 'marca_2_2019_6_vol',
       'marca_3_2019_6_vol', 'marca_2_2019_7_vol', 'marca_3_2019_7_vol',
       'marca_2_2019_8_vol', 'marca_3_2019_8_vol', 'marca_2_2019_9_vol',
       'marca_3_2019_9_vol', 'marca_2_2019_10_vol', 'marca_3_2019_10_vol',
       'marca_2_2019_11_vol', 'marca_3_2019_11_vol', 'marca_2_2019_12_vol',
       'marca_3_2019_12_vol', 'marca_2_2020_1_vol', 'marca_3_2020_1_vol',
       'marca_2_2020_2_vol', 'marca_3_2020_2_vol', 'marca_2_2020_3_vol',
       'marca_3_2020_3_vol', 'marca_2_2020_4_vol', 'marca_3_2020_4_vol',
       'marca_2_2020_5_vol', 'marca_3_2020_5_vol', 'marca_2_2020_6_vol',
       'marca_3_2020_6_vol', 'marca_1_2020_7_vol', 'marca_2_2020_7_vol',
       'marca_3_2020_7_vol', 'marca_4_2020_7_vol', 'marca_5_2020_7_vol',
       'marca_1_2020_8_vol', 'marca_2_2020_8_vol', 'marca_3_2020_8_vol',
       'marca_4_2020_8_vol', 'marca_5_2020_8_vol']
cat_attributes = ['Gerencia2', 'SubCanal2', 'Categoria', 'Nevera']

In [10]:
preprocessing_pipeline = ColumnTransformer([
("cat", OneHotEncoder(handle_unknown = 'ignore'), cat_attributes),
("num", StandardScaler(), num_attributes)
])

In [11]:
baseline_clf = Pipeline([ 
                           ("preprocessing", preprocessing_pipeline),
                           ("model", BaseLineClassifier())                    
])
baseline_clf.fit(X_train, y_train);

In [12]:
# Evaluation
from sklearn.metrics import roc_auc_score
y_pred_train = baseline_clf.predict_proba(X_train)
y_pred_test = baseline_clf.predict_proba(X_test)
score_train = roc_auc_score(y_true=y_train.values, y_score=y_pred_train)
score_test = roc_auc_score(y_true=y_test.values, y_score=y_pred_test)
print(f"Train score: {score_test:.6f}, Test score:{score_test:.6f}")

Train score: 0.656467, Test score:0.656467


In [13]:
# CrossVal Score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(baseline_clf, train_features, labels, cv=30, scoring='roc_auc')
print('mean_accuracy: ',scores.mean())
print('accuracy_std*3:',3*scores.std())

mean_accuracy:  0.6495869076317143
accuracy_std*3: 0.08848676617325452


# Training Several Models

In [14]:
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifierCV
from xgboost import XGBClassifier

In [15]:
from sklearn.multioutput import MultiOutputClassifier

In [16]:
 classifiers =[
               MultiOutputClassifier(XGBClassifier())
               # Trees
               #DecisionTreeClassifier(),
               ##ExtraTreeClassifier(),
               # Ensemble
               #ExtraTreesClassifier(),
               #RandomForestClassifier(),
               # Neighbors
               ##KNeighborsClassifier(n_neighbors=15),
               ##RadiusNeighborsClassifier(radius=200),
               # Neural Network
               #MLPClassifier(max_iter=400),
               # Linear
               ##RidgeClassifierCV(),
               # XGBoost
               ##XGBClassifier()
]

In [17]:
training_summary = pd.DataFrame(columns=['test_score_mean','test_score_3_std','train_score_mean','train_score_3_std','model_name','model_params'])
models = [Pipeline([("preprocessing", preprocessing_pipeline),("classifier", classifier)]) for classifier in classifiers]
from sklearn.model_selection import cross_validate
for model in models:
  classifier = model.steps[1][1]
  classifier_name = classifier.__class__.__name__
  print(classifier_name)
  cv = cross_validate(model, train_features, labels, cv=10, return_train_score=True, scoring='roc_auc')
  new_line = pd.Series({
      'test_score_mean': cv['test_score'].mean(),
      'test_score_3_std': 3*cv['test_score'].std(),
      'train_score_mean': cv['train_score'].mean(),
      'train_score_3_std': 3*cv['train_score'].std(),
      'model_name': classifier_name,
      'model_params': classifier.get_params()
  })
  training_summary = training_summary.append(new_line,ignore_index=True)

MultiOutputClassifier


In [18]:
training_summary.sort_values(by='test_score_mean',ascending=False)

Unnamed: 0,test_score_mean,test_score_3_std,train_score_mean,train_score_3_std,model_name,model_params
0,0.830472,0.132373,0.865172,0.014415,MultiOutputClassifier,"{'estimator__base_score': 0.5, 'estimator__boo..."


In [19]:
model.fit(train_features, labels)

Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('cat',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='ignore',
                                                                sparse=True),
                                                  ['Gerencia2', 'SubCanal2',
                                                   'Categoria', 'Nevera']),
                                                 ('num',
                                                  StandardScaler(copy=Tru

# Fine Tunning RandomForestClassifier

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [22]:
ds = DataSet()
train_features = ds.get_features(train=True)
labels = ds.get_labels()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_features, labels, test_size=0.1)

In [23]:
model = Pipeline([("preprocessing", preprocessing_pipeline),("classifier", RandomForestClassifier())])

In [None]:
param_grid = [{"classifier__max_depth":[10, 100],
               "classifier__max_features":[20, 10, 5],
               "classifier__max_leaf_nodes":[10, 200],
               "classifier__n_estimators":[200, 400]}]
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, return_train_score=True, scoring='roc_auc')
grid_search.fit(train_features, labels)

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_params_

In [None]:
cvres = grid_search.cv_results_
print("--------------MEAN-SCORE--------------------")
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("{} {}".format(mean_score, params))
print("--------------MEAN-SCORE--------------------")
for mean_score, params in zip(cvres["mean_train_score"], cvres["params"]):
    print("{} {}".format(mean_score, params))

print("---------------3-*-STD--------------------")
for std_score, params in zip(cvres["std_test_score"], cvres["params"]):
    print("{} {}".format(3*std_score, params))

# Fine Tunning ExtraTreesClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
ds = DataSet()
train_features = ds.get_features(train=True)
labels = ds.get_labels()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_features, labels, test_size=0.1)

In [None]:
model = Pipeline([("preprocessing", preprocessing_pipeline),("classifier", ExtraTreesClassifier())])

In [None]:
param_grid = [{"classifier__max_depth":[10, 100, 150],
               "classifier__max_features":[10, 20],
               "classifier__max_leaf_nodes":[100, 10, 200],
               "classifier__n_estimators":[200, 300]}]
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, return_train_score=True, scoring='roc_auc')
grid_search.fit(train_features,labels)

In [None]:
y_pred_train = grid_search.best_estimator_.predict_proba(X_train)
y_pred_train = np.array(y_pred_train)[:,:,1].transpose()
y_pred_test = grid_search.best_estimator_.predict_proba(X_test)
y_pred_test = np.array(y_pred_test)[:,:,1].transpose()
score_train = roc_auc_score(y_true=y_train.values, y_score=y_pred_train)
score_test = roc_auc_score(y_true=y_test.values, y_score=y_pred_test)
print(f"Train score: {score_train:.6f}, Test score:{score_test:.6f}")

In [None]:
cvres = grid_search.cv_results_
print("--------------MEAN-SCORE--------------------")
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("{} {}".format(mean_score, params))
print("--------------MEAN-SCORE--------------------")
for mean_score, params in zip(cvres["mean_train_score"], cvres["params"]):
    print("{} {}".format(mean_score, params))

print("---------------3-*-STD--------------------")
for std_score, params in zip(cvres["std_test_score"], cvres["params"]):
    print("{} {}".format(3*std_score, params))

# Neural Networks

In [None]:
from dataset.dataset import DataSet
ds = DataSet()
train_features = ds.get_features(train=True)
labels = ds.get_labels()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_features, labels, test_size=0.1)

In [None]:
import tensorflow as tf

In [None]:
clf = tf.keras.models.Sequential([
                                  tf.keras.layers.Dropout(0.2),
                                  tf.keras.layers.Dense(256, activation="relu", kernel_constraint=tf.keras.constraints.max_norm(3)),
                                  tf.keras.layers.Dropout(0.2),
                                  tf.keras.layers.Dense(256, activation="relu", kernel_constraint=tf.keras.constraints.max_norm(3)),
                                  tf.keras.layers.Dropout(0.2),
                                  tf.keras.layers.Dense(128, activation="relu", kernel_constraint=tf.keras.constraints.max_norm(3)),
                                  tf.keras.layers.Dropout(0.2),
                                  tf.keras.layers.Dense(32, activation="relu", kernel_constraint=tf.keras.constraints.max_norm(3)),
                                  tf.keras.layers.Dropout(0.2),
                                  tf.keras.layers.Dense(16, activation="relu", kernel_constraint=tf.keras.constraints.max_norm(3)),
                                  tf.keras.layers.Dropout(0.2),
                                  tf.keras.layers.Dense(5, activation="sigmoid"),
])
clf.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='binary_crossentropy')

In [None]:
model = Pipeline([("preprocessing", preprocessing_pipeline),("classifier", clf)])

In [None]:
model.fit(train_features, labels, classifier__validation_split=0.1, 
          classifier__callbacks=[tf.keras.callbacks.EarlyStopping(patience=15),
                                 tf.keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.05)],
          classifier__epochs=400)

In [None]:
y_pred_train = model.predict(X_train)
y_pred_train = y_pred_train
y_pred_test = model.predict(X_test)
y_pred_test = y_pred_test
score_train = roc_auc_score(y_true=y_train.values, y_score=y_pred_train)
score_test = roc_auc_score(y_true=y_test.values, y_score=y_pred_test)
print(f"Train score: {score_train:.6f}, Test score:{score_test:.6f}")

# Submission

In [None]:
from dataset.dataset import DataSet

In [None]:
ds = DataSet()
test_features = ds.get_features(train=False)
#y_preds = grid_search.best_estimator_.predict_proba(test_features)
y_preds = model.predict_proba(test_features)
y_preds = np.array(y_preds)[:,:,1].transpose()
ds.test_df.iloc[:,1:] = y_preds

In [None]:
y_preds

array([[0.07300664, 0.00769999, 0.10316239, 0.0610882 , 0.0311062 ],
       [0.05941252, 0.05941886, 0.3093102 , 0.03826314, 0.03771249],
       [0.17872791, 0.00610686, 0.09726604, 0.07125627, 0.0311062 ],
       ...,
       [0.1065371 , 0.00499116, 0.0831932 , 0.51750594, 0.47975406],
       [0.08262488, 0.00500015, 0.06877049, 0.77198017, 0.53664905],
       [0.09075408, 0.00499116, 0.09651759, 0.04194202, 0.03633538]],
      dtype=float32)

In [None]:
from datetime import datetime
filename = ds.BASEPATH/f'predictions/{datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}.csv'
ds.test_df.to_csv(filename, index=False)