In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import _pickle as cPickle
import requests, json
import pickle

In [23]:
df = pd.read_csv('top_products.csv')

In [24]:
category = ["Café", "Massas", "Hortifrutti", "Bebidas", "Laticínios", "Padaria", "Doces", "Básicos"]

In [25]:
df['CategoryA'] = np.random.choice(category, len(df))
df['CategoryB'] = np.random.choice(category, len(df))
df['CategoryC'] = np.random.choice(category, len(df))


In [26]:
df.head()

Unnamed: 0,Top1,Top2,Top3,Top4,Preco,Quantidade,Promotion,Receita,Clima,CategoryA,CategoryB,CategoryC
0,Carbonated Water - Blackberry,Soup - Campbells Tomato Ravioli,Wine - Crozes Hermitage E.,Bread - Sour Batard,17,7,1,234,3,Padaria,Doces,Café
1,Puff Pastry - Slab,Soup - Campbells Mushroom,Gatorade - Cool Blue Raspberry,Cookie - Oreo 100x2,78,2,1,318,1,Padaria,Café,Bebidas
2,Wine - Stoneliegh Sauvignon,Momiji Oroshi Chili Sauce,Vodka - Moskovskaya,Coffee - Almond Amaretto,100,4,1,476,2,Doces,Bebidas,Padaria
3,Baking Soda,Icecream - Dstk Cml And Fdg,"Artichoke - Bottom, Canned",Bread - Hamburger Buns,40,6,1,438,2,Hortifrutti,Doces,Bebidas
4,Wine - Soave Folonari,Couscous,Bread - Pumpernickel,Wine - Casillero Del Diablo,61,2,1,492,4,Massas,Hortifrutti,Massas


In [27]:
df = df.iloc[:, 4:]

In [28]:
df.head()

Unnamed: 0,Preco,Quantidade,Promotion,Receita,Clima,CategoryA,CategoryB,CategoryC
0,17,7,1,234,3,Padaria,Doces,Café
1,78,2,1,318,1,Padaria,Café,Bebidas
2,100,4,1,476,2,Doces,Bebidas,Padaria
3,40,6,1,438,2,Hortifrutti,Doces,Bebidas
4,61,2,1,492,4,Massas,Hortifrutti,Massas


In [64]:
from sklearn.preprocessing import LabelEncoder
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [67]:
df_label = MultiColumnLabelEncoder(columns = ['CategoryA', 'CategoryB', 'CategoryC']).fit_transform(df)


In [68]:
df_label.head()

Unnamed: 0,Preco,Quantidade,Promotion,Receita,Clima,CategoryA,CategoryB,CategoryC
0,17,7,1,234,3,7,3,2
1,78,2,1,318,1,7,2,0
2,100,4,1,476,2,3,0,7
3,40,6,1,438,2,4,3,0
4,61,2,1,492,4,6,4,6


In [69]:
y = df.iloc[:, 5:]
x = df.iloc[:, :5]

In [70]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [71]:
forest = RandomForestClassifier(n_estimators=100, random_state=1)
classifier = MultiOutputClassifier(forest, n_jobs=-1)
classifier.fit(X_train, y_train)

MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
           n_jobs=-1)

In [73]:
classifier.score(X_train, y_train)

1.0

In [74]:
pickle.dump(classifier, open("multicategory_pickle.pkl", "wb"))

In [76]:
from sklearn.externals import joblib
joblib.dump(classifier, 'multicategory_joblib2.pkl', compress=9) 

['multicategory_joblib2.pkl']

In [79]:
classifier.predict(X_test)

array([['Básicos', 'Bebidas', 'Doces'],
       ['Básicos', 'Padaria', 'Bebidas'],
       ['Café', 'Doces', 'Bebidas'],
       ['Padaria', 'Massas', 'Doces'],
       ['Hortifrutti', 'Doces', 'Hortifrutti'],
       ['Café', 'Café', 'Massas'],
       ['Massas', 'Massas', 'Padaria'],
       ['Hortifrutti', 'Básicos', 'Padaria'],
       ['Massas', 'Hortifrutti', 'Padaria'],
       ['Padaria', 'Hortifrutti', 'Massas'],
       ['Básicos', 'Hortifrutti', 'Café'],
       ['Massas', 'Doces', 'Padaria'],
       ['Básicos', 'Bebidas', 'Massas'],
       ['Básicos', 'Bebidas', 'Massas'],
       ['Café', 'Laticínios', 'Hortifrutti'],
       ['Hortifrutti', 'Doces', 'Bebidas'],
       ['Padaria', 'Bebidas', 'Massas'],
       ['Padaria', 'Hortifrutti', 'Básicos'],
       ['Hortifrutti', 'Café', 'Café'],
       ['Laticínios', 'Bebidas', 'Básicos'],
       ['Café', 'Padaria', 'Laticínios'],
       ['Doces', 'Hortifrutti', 'Massas'],
       ['Massas', 'Massas', 'Massas'],
       ['Massas', 'Café', 'Café']

In [80]:
produtos = classifier.predict(X_test).tolist()

In [81]:
produtos2 = pd.DataFrame(produtos)