Uma breve exploração dos dados pode ser encontrada em [exploration.ipynb](./exploration.ipynb)

# Importando bibliotecas

In [29]:
%load_ext dotenv
%dotenv
!python -m spacy download pt_core_news_sm
nltk.download('stopwords')

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
Defaulting to user installation because normal site-packages is not writeable
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[nltk_data] Downloading package stopwords to /home/guiss/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
import os
import re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

# NLP libs
import nltk
from nltk.corpus import stopwords
import spacy
from gensim.models import KeyedVectors

# Data Extraction

In [4]:

# The CSV location is read from the DATASET_PATH environment variable
# (a missing variable raises KeyError here).
dataset_path = os.environ['DATASET_PATH']
# index_col=0: the first CSV column becomes the DataFrame index.
df = pd.read_csv(dataset_path, index_col=0)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0_level_0,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


# Data Formatting

### Set types

In [5]:
# Set explicit dtypes for the columns used later in the notebook.

# Free-text columns use the pandas nullable 'string' dtype.
for text_column in ('query', 'title', 'concatenated_tags'):
    df[text_column] = df[text_column].astype('string')

df['creation_date'] = pd.to_datetime(df['creation_date'])

# Missing order counts are replaced by 0. Assigning the result of fillna()
# avoids the chained-indexing write `df[col][mask] = 0`, which raises
# SettingWithCopyWarning and may silently modify a temporary copy.
df['order_counts'] = df['order_counts'].fillna(0)

df['category'] = df['category'].astype('category')

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38000 entries, 11394449 to 6866725
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   seller_id          38000 non-null  int64         
 1   query              38000 non-null  string        
 2   search_page        38000 non-null  int64         
 3   position           38000 non-null  int64         
 4   title              38000 non-null  string        
 5   concatenated_tags  37998 non-null  string        
 6   creation_date      38000 non-null  datetime64[ns]
 7   price              38000 non-null  float64       
 8   weight             37942 non-null  float64       
 9   express_delivery   38000 non-null  int64         
 10  minimum_quantity   38000 non-null  int64         
 11  view_counts        38000 non-null  int64         
 12  order_counts       38000 non-null  float64       
 13  category           38000 non-null  category      
dt

### Category to number

In [28]:
# Encode each category label as an integer code. The fitted encoder is kept
# in `le`, and the codes are stored in a new `category_key` column.
le = LabelEncoder().fit(df['category'])
df['category_key'] = le.transform(df['category'])
# Show the encoded column as the cell output.
df['category_key']

product_id
11394449    2
15534262    5
16153119    4
15877252    0
15917108    2
           ..
13230578    3
6736914     5
11017911    0
6807331     3
6866725     2
Name: category_key, Length: 38000, dtype: int64

In [33]:
# Load the small Portuguese spaCy pipeline.
nlp = spacy.load("pt_core_news_sm")
# Fetch the NLTK stopword corpus, then keep the Portuguese list as a set.
nltk.download('stopwords')
stopwords_pt = {word for word in stopwords.words('portuguese')}

[nltk_data] Downloading package stopwords to /home/guiss/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def process_string(text, stop_words=None):
    """Lowercase a text, keep only its words and drop stopwords.

    Args:
        text: Raw string to clean. Missing values (None / NaN / pd.NA) are
            treated as empty text.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``stopwords_pt`` set is used.

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    if stop_words is None:
        stop_words = stopwords_pt
    # Missing values have no .lower(); return an empty result instead of
    # raising.
    if pd.isna(text):
        return ''
    # Runs of Portuguese letters only: digits, punctuation and whitespace act
    # as separators. The class includes í, ú, ô, à and ü so that words such as
    # "família" are not split at the accented character.
    words = re.findall(r'[a-záàâãéêíóôõúüç]+', text.lower())
    return ' '.join(word for word in words if word not in stop_words)



In [8]:
# The embeddings file location is read from the CBOW_PATH environment
# variable (a missing variable raises KeyError here).
cbow_path = os.environ['CBOW_PATH']
# Load the word vectors from a word2vec-format file using gensim's default
# options. NOTE(review): presumably a pre-trained Portuguese CBOW model,
# judging by the variable name - confirm.
model = KeyedVectors.load_word2vec_format(cbow_path)

In [11]:
# Spot check: display the embedding vector stored for the word 'decorar'.
model['decorar']

array([-1.13307e-01, -1.16743e-01,  1.50455e-01, -3.58695e-01,
        1.28825e-01, -2.45107e-01,  1.33978e-01, -1.02413e-01,
        8.52740e-02,  4.79273e-01,  1.75198e-01,  2.35300e-03,
        6.47560e-02,  6.72160e-02,  3.88620e-02,  8.91740e-02,
        1.71379e-01, -1.34417e-01,  9.45060e-02,  2.44207e-01,
       -3.45270e-02, -1.72445e-01,  6.71440e-02,  4.32640e-02,
       -1.54391e-01,  2.79545e-01, -2.05307e-01, -3.04965e-01,
       -6.19200e-02,  3.59846e-01,  1.50000e-05,  4.58610e-02,
       -5.35870e-02,  4.53210e-02, -7.59300e-02,  1.43144e-01,
        1.82000e-03,  4.57890e-02, -2.18112e-01, -3.03660e-01,
        3.72200e-01, -1.91510e-02,  3.38653e-01, -1.16413e-01,
       -2.25771e-01, -4.94700e-03,  1.87180e-01, -5.17280e-02,
        8.58020e-02, -3.02280e-02, -2.68680e-02, -2.32984e-01,
       -4.27490e-02,  1.23028e-01,  2.08200e-02,  9.91100e-02,
       -5.95780e-02, -1.19358e-01, -3.98400e-02,  8.88800e-03,
       -3.24750e-02, -2.48216e-01,  6.86320e-02, -1.531