In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import *
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

In [2]:
# Load the raw dataset from the CSV file sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='elo7_recruitment_dataset.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,product_id,seller_id,query,search_page,position,title,concatenated_tags,creation_date,price,weight,express_delivery,minimum_quantity,view_counts,order_counts,category
0,11394449,8324141,espirito santo,2,6,Mandala Espírito Santo,mandala mdf,2015-11-14 19:42:12,171.89,1200.0,1,4,244,,Decoração
1,15534262,6939286,cartao de visita,2,0,Cartão de Visita,cartao visita panfletos tag adesivos copos lon...,2018-04-04 20:55:07,77.67,8.0,1,5,124,,Papel e Cia
2,16153119,9835835,expositor de esmaltes,1,38,Organizador expositor p/ 70 esmaltes,expositor,2018-10-13 20:57:07,73.920006,2709.0,1,1,59,,Outros
3,15877252,8071206,medidas lencol para berco americano,1,6,Jogo de Lençol Berço Estampado,t jogo lencol menino lencol berco,2017-02-27 13:26:03,118.770004,0.0,1,1,180,1.0,Bebê
4,15917108,7200773,adesivo box banheiro,3,38,ADESIVO BOX DE BANHEIRO,adesivo box banheiro,2017-05-09 13:18:38,191.81,507.0,1,6,34,,Decoração


In [4]:
# Number of rows per category, plus each category's share of all counted rows (in percent).
total = df.groupby('category').size().reset_index(name='n')
total['perc'] = total['n'].mul(100).div(total['n'].sum())

In [5]:
# Display the per-category counts (n) and percentage shares (perc).
total

Unnamed: 0,category,n,perc
0,Bebê,7026,18.246033
1,Bijuterias e Jóias,951,2.469681
2,Decoração,8846,22.972447
3,Lembrancinhas,17759,46.118887
4,Outros,1148,2.981276
5,Papel e Cia,2777,7.211676


In [6]:
# Feature: the product title text. Target: the product category label.
X, y = df['title'], df['category']

In [7]:
# Count missing titles; the vectorizer downstream needs actual strings.
X.isnull().sum()

0

In [8]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [11]:
# Standalone bag-of-words vectorizer.
# NOTE(review): `vect` is not referenced by any other cell visible here (the
# pipeline below builds its own CountVectorizer) — confirm whether this cell is
# still needed.
vect = CountVectorizer()

In [15]:
# Text classifier: bag-of-words token counts fed into a multiclass logistic
# regression, fitted on the training titles. The high max_iter gives lbfgs
# room to converge.
LR = make_pipeline(
    CountVectorizer(),
    LogisticRegression(max_iter=10000, solver='lbfgs'),
).fit(X_train, y_train)

In [16]:
# Mean accuracy of the fitted pipeline on the data it was trained on.
train_accuracy = LR.score(X_train, y_train)
print("Acurácia treino LR:", train_accuracy, "\n")

Acurácia treino LR: 0.9178650335284313 



In [17]:
# Mean accuracy of the fitted pipeline on the held-out test split.
test_accuracy = LR.score(X_test, y_test)
print("Acurácia teste LR:", test_accuracy, "\n")

Acurácia teste LR: 0.8738589864652188 



In [18]:
# Put the true test labels and the predicted labels side by side, aligned by
# position (the original row index of y_test is dropped first).
comparativo = pd.concat(
    [
        y_test.reset_index(drop=True),
        pd.Series(LR.predict(X_test), name='categoria_predita'),
    ],
    axis=1,
)

In [19]:
# Flag each row: 1 when the predicted category matches the true one, else 0.
comparativo['acertou'] = (
    comparativo['category'].eq(comparativo['categoria_predita']).astype(int)
)

In [20]:
# Per true category: number of correct predictions (soma) and number of
# test rows (count).
comparativo_sum_count = (
    comparativo
    .groupby('category')['acertou']
    .agg(soma='sum', count='count')
    .reset_index()
)

In [21]:
# Share of correct predictions within each true category, in percent.
comparativo_sum_count['perc'] = (
    comparativo_sum_count['soma'].mul(100).div(comparativo_sum_count['count'])
)

In [22]:
# Display correct predictions, test rows and hit rate for each true category.
comparativo_sum_count

Unnamed: 0,category,soma,count,perc
0,Bebê,1937,2355,82.250531
1,Bijuterias e Jóias,255,289,88.235294
2,Decoração,2602,2903,89.631416
3,Lembrancinhas,5518,5876,93.90742
4,Outros,194,366,53.005464
5,Papel e Cia,599,919,65.179543


In [23]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises, instead of leaking the
# handle returned by a bare open().
with open('elo7_category.sav', 'wb') as model_file:
    pickle.dump(LR, model_file)

In [24]:
# Reload the persisted pipeline. The context manager closes the file handle
# deterministically instead of leaking the one returned by a bare open().
# SECURITY: pickle.load can execute arbitrary code while unpickling — only load
# model files that were produced by a trusted source.
with open(r'elo7_category.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [25]:
# Smoke-test the reloaded model on a single raw string wrapped in a Series.
# NOTE(review): the probe text is not a product title; consider swapping it for
# a realistic title (or comparing against LR.predict on X_test) so this check
# actually validates the save/load round trip.
loaded_model.predict(pd.Series("buceta"))

array(['Lembrancinhas'], dtype=object)