In [1]:
# Import Libraries

import os
import pandas as pd
import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree

In [2]:
# Resolve the dataset, metrics-report and model locations from the environment.
# A missing variable raises KeyError right here, before any data is loaded.
data_set_path, metrics_path, model_path = (
    os.environ[variable]
    for variable in ('DATASET_PATH', 'METRICS_PATH', 'MODEL_PATH')
)

# Data Extraction

Carregar os dados do arquivo CSV.

In [3]:
# Read the product CSV located at data_set_path into a DataFrame
product_data = pd.read_csv(filepath_or_buffer=data_set_path)

In [4]:
# Show the loaded frame; the rendering is abbreviated to the first and last rows.
product_data

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.890000,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.670000,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.810000,507.0,1,6,34,,Decoração
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37995,13230578,1756482,mochila personalizada galinha pintadinha,1,2,Mochila Galinha Pintadinha M,primaria 2019 1 aninho abdulzinho mochilas ani...,2016-09-17 10:49:39,18.790000,149.0,1,27,321,,Lembrancinhas
37996,6736914,9301388,tag dia dos pais,1,32,30 TAGS DIA DOS PAIS 005,dia pais,2019-06-12 17:03:52,31.680000,7.0,1,1,43,,Papel e Cia
37997,11017911,8732362,kit bolsa maternidade,5,31,Kit bolsa bebê maternidade personalizada,paula carvalho bebe,2018-08-24 11:43:00,543.170000,3006.0,1,4,515,18.0,Bebê
37998,6807331,1869417,festa 15 anos,1,8,Chaveiro Almofada 15 anos,yasmin centro mesa compras 15 anos 2020 lembra...,2017-10-21 18:49:56,10.720000,18.0,1,53,2456,138.0,Lembrancinhas


Verificar quantas categorias existem

In [5]:
# Number of products per category
product_data['category'].value_counts()

Lembrancinhas         17524
Decoração              8723
Bebê                   6930
Papel e Cia            2750
Outros                 1133
Bijuterias e Jóias      940
Name: category, dtype: int64

# Data Formatting

Remover todas as colunas irrelevantes, como, por exemplo, product_id ou creation_date.

In [6]:
# Work on a copy without the identifier and timestamp columns;
# the raw `product_data` frame is left untouched.
product_data_prepared = product_data.drop(
    ['product_id', 'creation_date'],
    axis='columns',
)

product_data_prepared

Unnamed: 0,seller_id,query,search_page,position,title,concatenated_tags,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,171.890000,1200.0,1,4,244,,Decoração
1,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,77.670000,8.0,1,5,124,,Papel e Cia
2,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,73.920006,2709.0,1,1,59,,Outros
3,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,118.770004,0.0,1,1,180,1.0,Bebê
4,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,191.810000,507.0,1,6,34,,Decoração
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37995,1756482,mochila personalizada galinha pintadinha,1,2,Mochila Galinha Pintadinha M,primaria 2019 1 aninho abdulzinho mochilas ani...,18.790000,149.0,1,27,321,,Lembrancinhas
37996,9301388,tag dia dos pais,1,32,30 TAGS DIA DOS PAIS 005,dia pais,31.680000,7.0,1,1,43,,Papel e Cia
37997,8732362,kit bolsa maternidade,5,31,Kit bolsa bebê maternidade personalizada,paula carvalho bebe,543.170000,3006.0,1,4,515,18.0,Bebê
37998,1869417,festa 15 anos,1,8,Chaveiro Almofada 15 anos,yasmin centro mesa compras 15 anos 2020 lembra...,10.720000,18.0,1,53,2456,138.0,Lembrancinhas


Substituir os valores NaN das colunas numéricas por 0.

In [7]:

# Replace missing values with 0 in the numeric columns only.
# The per-column mapping restricts fillna to exactly these columns,
# so the text columns keep their NaN values at this stage.
product_data_prepared = product_data_prepared.fillna(
    value={
        numeric_column: 0
        for numeric_column in (
            'search_page', 'position', 'price', 'weight',
            'express_delivery', 'minimum_quantity', 'view_counts', 'order_counts',
        )
    }
)

product_data_prepared

Unnamed: 0,seller_id,query,search_page,position,title,concatenated_tags,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,171.890000,1200.0,1,4,244,0.0,Decoração
1,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,77.670000,8.0,1,5,124,0.0,Papel e Cia
2,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,73.920006,2709.0,1,1,59,0.0,Outros
3,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,118.770004,0.0,1,1,180,1.0,Bebê
4,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,191.810000,507.0,1,6,34,0.0,Decoração
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37995,1756482,mochila personalizada galinha pintadinha,1,2,Mochila Galinha Pintadinha M,primaria 2019 1 aninho abdulzinho mochilas ani...,18.790000,149.0,1,27,321,0.0,Lembrancinhas
37996,9301388,tag dia dos pais,1,32,30 TAGS DIA DOS PAIS 005,dia pais,31.680000,7.0,1,1,43,0.0,Papel e Cia
37997,8732362,kit bolsa maternidade,5,31,Kit bolsa bebê maternidade personalizada,paula carvalho bebe,543.170000,3006.0,1,4,515,18.0,Bebê
37998,1869417,festa 15 anos,1,8,Chaveiro Almofada 15 anos,yasmin centro mesa compras 15 anos 2020 lembra...,10.720000,18.0,1,53,2456,138.0,Lembrancinhas


Transformar as colunas de texto (query, title e concatenated_tags) em contagens de palavras. Ou seja, se houver "box banheiro adesivo" na coluna "title", haverá colunas com os nomes *title_box*, *title_banheiro* e *title_adesivo*, cujos valores para o respectivo registro são "1".

In [8]:
# Transform query, title and concatenated_tags field

# Create a transformation function
def generate_word_count_frame(column, max_features=1000):
    """Turn a text column into a frame of per-word occurrence counts.

    A fresh ``CountVectorizer`` is fitted on ``column``; every word of the
    resulting vocabulary becomes one output column named
    ``<column.name>_<word>`` holding how often the word occurs in each row.

    Parameters
    ----------
    column : pandas.Series
        Named series of strings. Missing values are treated as empty text.
    max_features : int or None, default 1000
        Vocabulary size limit handed to ``CountVectorizer``. The default keeps
        only the 1,000 most popular words (for memory & speed reasons, since
        the counts are materialised as a dense array).

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``column``, carrying the same index, and one
        count column per retained word.
    """
    cv = CountVectorizer(max_features=max_features)

    texts = column.fillna('')
    counts = cv.fit_transform(texts)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed the older
    # get_feature_names(); prefer the new accessor and fall back to the old one.
    feature_names = getattr(cv, 'get_feature_names_out', None)
    if feature_names is None:
        feature_names = cv.get_feature_names

    # Reusing the input index keeps the counts aligned with their source rows
    # when the result is joined back with pd.concat(..., axis=1).
    return pd.DataFrame(
        counts.toarray(),
        index=column.index,
        columns=[column.name + '_' + str(word) for word in feature_names()],
    )


In [9]:
    
# Build one word-count frame per text column, keyed by the column name
word_counts = {
    text_column: generate_word_count_frame(product_data[text_column])
    for text_column in ('query', 'title', 'concatenated_tags')
}


Testar o resultado da transformação no caso da coluna *title*

In [10]:
# Inspect the word-count frame built from the "title" column
word_counts["title"]

Unnamed: 0,title_00,title_01,title_02,title_03,title_05,title_10,title_100,title_10mm,title_12,title_15,...,title_você,title_voil,title_vovó,title_vovô,title_xícara,title_água,title_álbum,title_álcool,title_árvore,title_ímã
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37998,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Substituir as colunas *query*, *title* e *concatenated_tags* pelas colunas de 'word_count'.


In [11]:
# Swap the raw text columns for their word-count frames.
# The frames are appended column-wise in the order query, title, concatenated_tags.
product_data_prepared = pd.concat(
    [product_data_prepared.drop(columns=['query', 'title', 'concatenated_tags'])]
    + [word_counts[text_column] for text_column in ('query', 'title', 'concatenated_tags')],
    axis=1,
)

product_data_prepared

Unnamed: 0,seller_id,search_page,position,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category,...,concatenated_tags_vinicius,concatenated_tags_vinil,concatenated_tags_vintage,concatenated_tags_visita,concatenated_tags_vitor,concatenated_tags_volta,concatenated_tags_vovo,concatenated_tags_wedding,concatenated_tags_xicaras,concatenated_tags_yasmin
0,8324141,2,6,171.890000,1200.0,1,4,244,0.0,Decoração,...,0,0,0,0,0,0,0,0,0,0
1,6939286,2,0,77.670000,8.0,1,5,124,0.0,Papel e Cia,...,0,0,0,1,0,0,0,0,0,0
2,9835835,1,38,73.920006,2709.0,1,1,59,0.0,Outros,...,0,0,0,0,0,0,0,0,0,0
3,8071206,1,6,118.770004,0.0,1,1,180,1.0,Bebê,...,0,0,0,0,0,0,0,0,0,0
4,7200773,3,38,191.810000,507.0,1,6,34,0.0,Decoração,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37995,1756482,1,2,18.790000,149.0,1,27,321,0.0,Lembrancinhas,...,0,0,0,0,0,0,0,0,0,0
37996,9301388,1,32,31.680000,7.0,1,1,43,0.0,Papel e Cia,...,0,0,0,0,0,0,0,0,0,0
37997,8732362,5,31,543.170000,3006.0,1,4,515,18.0,Bebê,...,0,0,0,0,0,0,0,0,0,0
37998,1869417,1,8,10.720000,18.0,1,53,2456,138.0,Lembrancinhas,...,0,0,0,0,0,0,0,0,0,1


# Modeling

Separar o conjunto de dados em treinamento (85%) e teste (15%).

In [12]:
# Hold out 15% of the rows for testing; `category` is the label,
# every other column is a feature. The seed fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    product_data_prepared.drop(columns='category'),
    product_data_prepared['category'],
    test_size=0.15,
    random_state=3884,
)


Criar um classificador de árvore de decisão e treinar o modelo.

In [13]:
# Decision tree with a fixed seed, trained on the training split
clf = DecisionTreeClassifier(random_state=2232)
clf.fit(X=X_train, y=Y_train)

DecisionTreeClassifier(random_state=2232)

# Model Validation

Medir as métricas do modelo e mostrá-las.

In [14]:
# Predict the held-out rows and build the per-category text report.
# print() is used on purpose: the report is a multi-line string.
Y_pred = clf.predict(X_test)
classification_report = metrics.classification_report(y_true=Y_test, y_pred=Y_pred)
print(classification_report)

                    precision    recall  f1-score   support

              Bebê       0.82      0.80      0.81      1050
Bijuterias e Jóias       0.87      0.79      0.83       151
         Decoração       0.82      0.85      0.83      1273
     Lembrancinhas       0.90      0.90      0.90      2630
            Outros       0.49      0.51      0.50       167
       Papel e Cia       0.68      0.62      0.65       429

          accuracy                           0.84      5700
         macro avg       0.76      0.75      0.75      5700
      weighted avg       0.84      0.84      0.84      5700



Escrever o relatório em 'metrics_path'.

In [15]:
# Persist the text report to metrics_path.
# The context manager closes the file even if the write raises, and the
# explicit UTF-8 encoding keeps the accented category names (e.g. "Bebê",
# "Decoração") writable whatever the platform's default encoding is.
with open(metrics_path, "w", encoding="utf-8") as metrics_file:
    metrics_file.write(classification_report)

# Model exportation

Exporta o modelo para 'model_path'.

In [16]:
# Serialise the fitted classifier to model_path with compression level 9
joblib.dump(value=clf, filename=model_path, compress=9)

['/usr/src/data/model.pkl']