## Projeto Parte 1 - Pipeline de Treinamento
Nome: Flávio Bezerra Pereira (flaviobp@gmail.com)

### 1. Data extraction
Loads a dataset with product data from a specified path available in the environment variable DATASET_PATH.

In [1]:
import numpy as np
import pandas as pd
import os
import sklearn

# variaveis de ambiente
!cat .env

COMPOSE_PROJECT_NAME="categorization"

DATASET_PATH="/usr/src/data/sample_products.csv"
METRICS_PATH="/usr/src/data/metrics.txt"
MODEL_PATH="/usr/src/data/model.pkl"


In [2]:
# data-frame original
DATASET_PATH=os.getenv("DATASET_PATH")
df_original = pd.read_csv(DATASET_PATH)

## 2. Data formatting
Processes the dataset to use it for training and validation.

In [3]:
# X -> concatenated_tags
# y -> category

# remove valores null e seleciona as colunas para X e y e verifica a distribuicao conforme a categoria
df = df_original[['category','concatenated_tags','price','seller_id','weight']].dropna()
print("Tamanho do data frame: "+ str(len(df)))
series = df['category'].value_counts()
print(series/len(df)*100)

Tamanho do data frame: 37940
Lembrancinhas         46.154454
Decoração             22.965208
Bebê                  18.236690
Papel e Cia            7.208751
Outros                 2.957301
Bijuterias e Jóias     2.477596
Name: category, dtype: float64


In [4]:
# extrai o target_names e y
target_names = [category for category in df.category.unique()]
y = [target_names.index(category)
        for category in df.category]
y = pd.DataFrame(y)

In [5]:
# extrai os valores para X
feature_names = ['concatenated_tags']
X = df[feature_names]

In [6]:
from sklearn.model_selection import train_test_split

# divide os dados para treino e teste de modo estratificado
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.1, #10% para teste train
                                                    stratify=y,
                                                    random_state=420) #semente 

print("Tamanho para treino: "+ str(len(y_train)))
dy_train = pd.DataFrame([target_names[v] for v in y_train[0]], columns= ['category'])
print(dy_train['category'].value_counts()/len(dy_train)*100)

print("Tamanho para teste: "+ str(len(y_test)))
dy_test = pd.DataFrame([target_names[v] for v in y_test[0]], columns= ['category'])
print(dy_test['category'].value_counts()/len(dy_test)*100)

Tamanho para treino: 34146
Lembrancinhas         46.154747
Decoração             22.966087
Bebê                  18.236397
Papel e Cia            7.207286
Outros                 2.957887
Bijuterias e Jóias     2.477596
Name: category, dtype: float64
Tamanho para teste: 3794
Lembrancinhas         46.151819
Decoração             22.957301
Bebê                  18.239325
Papel e Cia            7.221929
Outros                 2.952030
Bijuterias e Jóias     2.477596
Name: category, dtype: float64


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# bag-of-words com bigrams
count_vect = CountVectorizer(ngram_range=(1, 2),max_features=None,max_df=0.5)
X_train_counts = count_vect.fit_transform(X_train['concatenated_tags'])
X_train_counts.shape

(34146, 56764)

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# tfidf ajuste na frequencia
tfidf_transformer = TfidfTransformer(norm='l1',use_idf=True)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(34146, 56764)

## 3. Modeling
Specifies a model to handle the categorization problem.

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import ComplementNB  
from sklearn import metrics

# abrir o arquivo de metricas
METRICS_PATH=os.getenv("METRICS_PATH")
with open(METRICS_PATH, "w") as f:
    print("MÉTRICAS NO CROSS-VALIDATION (n_splits=5)", file=f)

# cross validation com n_splits = 5
skf = StratifiedKFold(n_splits=5)
acc_dt = []

# pega um pedaco para treino outro para validacao em X e Y
for tr_idx, vl_idx in skf.split(X_train_tfidf, y_train):
    X_train_f, X_valid_f = X_train_tfidf[tr_idx], X_train_tfidf[vl_idx]
    y_train_f, y_valid_f = y_train.iloc[tr_idx], y_train.iloc[vl_idx]
    
    clf = ComplementNB(alpha=0.01).fit(X_train_f, y_train_f.values.ravel())

    y_pred_f = clf.predict(X_valid_f)
    
    acc_dt.append(accuracy_score(y_valid_f, y_pred_f))
    
    with open(METRICS_PATH, "a") as f:
        print(metrics.classification_report(y_valid_f, y_pred_f,target_names=target_names), file=f)

print("Acurácia média =",np.mean(acc_dt)*100," %.")

Acurácia média = 87.11708335414787  %.


## 4. Model validation
Generates metrics about the model accuracy (precision, recall, F1, etc.) for each category and exports them to a specified path available in the environment variable METRICS_PATH.

In [10]:
# treina o modelo com o conjunto de treino
clfAll = ComplementNB(alpha=0.01).fit(X_train_tfidf, y_train.values.ravel())

# calcula o erro do classificador nos dados de teste
X_test_counts = count_vect.transform(X_test['concatenated_tags'])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

y_pred = clfAll.predict(X_test_tfidf)

print("Acurácia =",accuracy_score(y_test, y_pred)*100," %.")

Acurácia = 88.03373748023195  %.


In [11]:
from sklearn import metrics

# metricas
print(metrics.classification_report(y_test, y_pred,target_names=target_names))

                    precision    recall  f1-score   support

         Decoração       0.88      0.88      0.88       871
       Papel e Cia       0.83      0.66      0.74       274
            Outros       0.86      0.71      0.77       112
              Bebê       0.89      0.85      0.87       692
     Lembrancinhas       0.88      0.94      0.91      1751
Bijuterias e Jóias       0.89      0.95      0.92        94

          accuracy                           0.88      3794
         macro avg       0.87      0.83      0.85      3794
      weighted avg       0.88      0.88      0.88      3794



In [12]:
# exporta para o arquivo as metricas
METRICS_PATH=os.getenv("METRICS_PATH")

with open(METRICS_PATH, "a") as f:
    print("MÉTRICAS NO TESTE", file=f)
    print(metrics.classification_report(y_test, y_pred,target_names=target_names), file=f)

In [13]:
# visualiza o arquivo gerado
!cat {METRICS_PATH}

MÉTRICAS NO CROSS-VALIDATION (n_splits=5)
                    precision    recall  f1-score   support

         Decoração       0.88      0.87      0.88      1569
       Papel e Cia       0.79      0.70      0.74       492
            Outros       0.70      0.69      0.70       202
              Bebê       0.87      0.85      0.86      1245
     Lembrancinhas       0.90      0.92      0.91      3152
Bijuterias e Jóias       0.74      0.90      0.81       170

          accuracy                           0.87      6830
         macro avg       0.81      0.82      0.82      6830
      weighted avg       0.87      0.87      0.87      6830

                    precision    recall  f1-score   support

         Decoração       0.88      0.87      0.88      1569
       Papel e Cia       0.80      0.69      0.74       492
            Outros       0.71      0.74      0.72       202
              Bebê       0.88      0.86      0.87      1245
     Lembrancinhas       0.90     

## 5. Model exportation
Exports a candidate model to a specified path available in the environment variable MODEL_PATH.

In [14]:
# pipeline para exportar o modelo selecionado
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2),max_features=None,max_df=0.5)),
    ('tfidf', TfidfTransformer(norm='l1',use_idf=True)),
    ('clf', ComplementNB(alpha=0.01)),
])

# treina o modelo com todos os dados do conjunto de treino
model.fit(X_train['concatenated_tags'], y_train.values.ravel())

# predict com os dados de teste
predicted = model.predict(X_test['concatenated_tags'])
np.mean(predicted == y_test.values.ravel())

0.8803373748023194

In [15]:
import pickle

# gera o arquivo pkl
MODEL_PATH=os.getenv("MODEL_PATH")
with open(MODEL_PATH, 'wb') as file:
    pickle.dump(model, file)

In [16]:
# testa o arquivo pkl
with open(MODEL_PATH, 'rb') as file:
    pickle_model = pickle.load(file)

score = pickle_model.score(X_test['concatenated_tags'], y_test)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(X_test['concatenated_tags'])

Test score: 88.03 %


In [17]:
### OTIMIZACAO DE HIPERPARAMETROS
### TRECHO COMENTADO POR CONTA DO TEMPO DE EXECUCAO 

#from sklearn.model_selection import GridSearchCV

#model = Pipeline([
#    ('vect', CountVectorizer()),
#    ('tfidf', TfidfTransformer()),
#    ('clf', ComplementNB()),
#])

#parameters = {
#    'vect__max_df': (0.5, 0.75, 1.0),
#    'vect__max_features': (None, 5000, 10000, 50000),
#    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
#    'tfidf__use_idf': (True, False),
#    'tfidf__norm': ('l1', 'l2'),
#    'clf__alpha': (0.01, 0.1, 0.5, 1.0, 10.0)
#    #'clf__penalty': ('l2', 'elasticnet'),
#    #'clf__n_iter': (10, 50, 80),
#}

#setting up the grid search
#gs=GridSearchCV(model,parameters,n_jobs=-1,cv=5)

##fitting gs to training data
#gs.fit(X_train['concatenated_tags'], y_train.values.ravel())

#print(gs.best_params_)
##print(gs_clf.best_score_)
#print(gs.best_score_)

#print("Grid scores on development set:")
#print()
#means = gs.cv_results_['mean_test_score']
#stds = gs.cv_results_['std_test_score']
#for mean, std, params in zip(means, stds, gs.cv_results_['params']):
#    print("%0.3f (+/-%0.03f) for %r"
#          % (mean, std * 2, params))
#print()




### RESULTADO DO AJUSTE DE HIPERPARAMETROS
#{'clf__alpha': 0.01, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__max_df': 0.5, 'vect__max_features': None, 'vect__ngram_range': (1, 2)}
#0.8778290660120106