In [None]:
!pip install mysql-connector-python

Collecting mysql-connector-python
[?25l  Downloading https://files.pythonhosted.org/packages/6c/1d/e666f7d43496a2315d3963a2fb7f8df84e7293b4ddbf05e46d6bdb4a8892/mysql_connector_python-8.0.22-cp36-cp36m-manylinux1_x86_64.whl (18.0MB)
[K     |████████████████████████████████| 18.0MB 225kB/s 
Installing collected packages: mysql-connector-python
Successfully installed mysql-connector-python-8.0.22


In [None]:
import os
from urllib.parse import quote_plus

import pandas as pd
import numpy as np
import nltk

from joblib import dump
from nltk.corpus import stopwords
from sklearn import model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sqlalchemy import create_engine
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn import svm

np.random.seed(123) # seed NumPy's global RNG for reproducible results
%matplotlib inline

In [None]:
# Fetch the NLTK stop-word corpus (a no-op when it is already on disk).
nltk.download('stopwords')
# Read through nltk.corpus rather than the imported `stopwords` name: this
# assignment rebinds `stopwords` to a plain set, so going through the bare name
# would raise AttributeError the second time the cell is executed.
stopwords = set(nltk.corpus.stopwords.words('portuguese'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Connection settings are read from the environment so credentials never have
# to be typed into (and saved with) the notebook. Unset variables fall back to
# the empty string.
DB_PASS = os.environ.get('DB_PASS', '')
DB_USER = os.environ.get('DB_USER', '')
DB_HOST = os.environ.get('DB_HOST', '')
DB_TABLE = os.environ.get('DB_TABLE', "")  # fills the database segment of the URL below

# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be parsed as URL delimiters and corrupt the connection string.
engine = create_engine(
    'mysql+mysqlconnector://{}:{}@{}/{}'.format(
        quote_plus(DB_USER), quote_plus(DB_PASS), DB_HOST, DB_TABLE),
    echo=False)

In [None]:
# Load one row per product name together with its top-level category.
# Uses `engine`, the only connection this notebook creates.
# NOTE(review): categories_1 is neither aggregated nor listed in GROUP BY, so
# which category is returned for a name with several rows is up to the server
# (and the query is rejected when ONLY_FULL_GROUP_BY is enabled) - confirm
# that names map to a single category.
data = pd.read_sql_query('SELECT name, categories_1 \
                        FROM products_supervisioned \
                        GROUP BY name',
                        engine)

In [None]:
# Number of products per category, most frequent first (class-balance check).
data['categories_1'].value_counts()

Mercearia              22803
Bebidas                12150
Perfumaria E Beleza    11771
Bazar E Utilidades      7062
Limpeza                 6334
Laticínios              4728
Frios E Congelados      2734
Hortifruti              2043
Carnes                  1639
Padaria                 1238
Pet Shop                 716
Flores E Plantas         132
Nao Utilizados            29
Name: categories_1, dtype: int64

In [None]:
# Every category label present in the data, ordered from most to least frequent.
selected_categories = list(data['categories_1'].value_counts().index)

In [None]:
# Display the category list for a quick visual check.
selected_categories

['Mercearia',
 'Bebidas',
 'Perfumaria E Beleza',
 'Bazar E Utilidades',
 'Limpeza',
 'Laticínios',
 'Frios E Congelados',
 'Hortifruti',
 'Carnes',
 'Padaria',
 'Pet Shop',
 'Flores E Plantas',
 'Nao Utilizados']

In [None]:
# Hold out a test split. random_state reuses the seed given to np.random.seed
# in the setup cell, so re-running this cell on its own yields the same split.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data['name'], data['categories_1'], random_state=123)

# label encode the target variable, encode labels to 0, 1, 2
# The encoder is fitted once on the complete label column and then only applied
# to each split. Re-fitting it on the test labels would give the test set its
# own label -> integer mapping whenever a class is missing from that split,
# and the lookup table below would then fail on the unseen labels.
encoder = preprocessing.LabelEncoder()
encoder.fit(data['categories_1'])
train_y = encoder.transform(train_y)
test_y = encoder.transform(test_y)

# Lookup table: encoded integer (index) -> category name.
categories_df = pd.DataFrame({"category": selected_categories}, index=encoder.transform(selected_categories))
categories_df

Unnamed: 0,category
8,Mercearia
1,Bebidas
11,Perfumaria E Beleza
0,Bazar E Utilidades
7,Limpeza
6,Laticínios
4,Frios E Congelados
5,Hortifruti
2,Carnes
10,Padaria


In [None]:
# TF-IDF over word unigrams and bigrams, vocabulary capped at 5000 terms.
# The stop words are passed as a sorted list: it is accepted by every
# scikit-learn release, whereas newer releases reject a set for this parameter.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words=sorted(stopwords),
                             max_features=5000, ngram_range=(1,2))
# Fit on the training split only, so the vocabulary and IDF weights are not
# influenced by the documents later used to measure accuracy.
tfidf_vect.fit(train_x)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele',
                            'aqueles', 'aquilo', 'as', 'até', 'com', 'como',
                            'da', 'das', 'de', 'dela', 'delas', 'dele', 'deles',
                            'depois', 'do', 'dos', 'e', 'ela', 'elas', 'ele',
                            'eles', 'em', 'entre', 'era', ...},
                strip_accents=None, sublinear_tf=False, token_pattern='\\w{1,}',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# Keep the TF-IDF matrices in the sparse form returned by transform(). As a
# dense float64 array the training matrix alone (55034 x 5000) takes about
# 2.2 GB, while the shape checks, inverse_transform, fit and predict cells
# below all accept sparse input.
xtrain_tfidf = tfidf_vect.transform(train_x)
xtest_tfidf = tfidf_vect.transform(test_x)

In [None]:
# Persist the label lookup table and the fitted vectoriser next to the model
# so they can be reloaded together with it.
dump(value=categories_df, filename='/content/drive/My Drive/Colab Notebooks/files/categories_df.pkl')
dump(value=tfidf_vect, filename='/content/drive/My Drive/Colab Notebooks/files/tfidf_vect.pkl')

['/content/drive/My Drive/Colab Notebooks/files/tfidf_vect.pkl']

In [None]:
# Sanity report: sizes of the train/test feature matrices and label vectors.
shape_report = (
    ('Number of training documents: %s', xtrain_tfidf.shape[0]),
    ('Number of testing documents: %s', xtest_tfidf.shape[0]),
    ('Number of features of each document: %s', xtrain_tfidf.shape[1]),
    ('xtrain_tfidf shape: %s', xtrain_tfidf.shape),
    ('train_y shape: %s', train_y.shape),
    ('xtest_tfidf shape: %s', xtest_tfidf.shape),
    ('test_y shape: %s', test_y.shape),
)
for template, value in shape_report:
    # str() first: a bare tuple on the right of % would be unpacked as arguments.
    print(template % str(value))

Number of training documents: 55034
Number of testing documents: 18345
Number of features of each document: 5000
xtrain_tfidf shape: (55034, 5000)
train_y shape: (55034,)
xtest_tfidf shape: (18345, 5000)
test_y shape: (18345,)


In [None]:
# Show which vocabulary terms are present in one training document (row 13).
# The row is taken as a slice so the input stays two-dimensional: a 1-D row
# from a dense array is rejected by current scikit-learn releases, while a
# one-row slice works for dense and sparse matrices alike.
tfidf_vect.inverse_transform(xtrain_tfidf[13:14])

[array(['280g', 'damasco', 'diet', 'geleia', 'geleia queensberry',
        'queensberry'], dtype='<U28')]

In [None]:
%%time
logreg = linear_model.LogisticRegression(C=1e5, 
        solver = 'sag', multi_class = 'multinomial')
logreg.fit(xtrain_tfidf, train_y)

CPU times: user 24min 46s, sys: 174 ms, total: 24min 46s
Wall time: 24min 47s




In [None]:
# Persist the trained classifier.
dump(value=logreg, filename='/content/drive/My Drive/Colab Notebooks/files/model.joblib')

['/content/drive/My Drive/Colab Notebooks/files/model.joblib']

In [None]:
# Evaluate on the held-out split: predict, then report accuracy as a percentage.
y_pred = logreg.predict(xtest_tfidf)
test_accuracy = accuracy_score(test_y, y_pred.tolist())
print("Accuracy: %.2f %%" % (100 * test_accuracy))

Accuracy: 95.70 %
