In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
fazenda = pd.read_csv("C:/Users/NB/Desktop/dados_fazenda/dados_fazenda_descricao.csv", delimiter = ',', encoding='utf-8', decimal='.' )

In [3]:
fazenda

Unnamed: 0,NAT_OPER
0,[S] VENDA DE MERCADORIA P-REVENDA
1,105 - Transf. Mercadoria Saida
2,106 - Devolucao Troca/Impropia
3,1202.058
4,1202.064
5,1-Transferencia de Mercadoria Lojas
6,300-VENDA DE PRODUTOS
7,320-BONIFICACAO
8,5102 - VENDA DE MERCADORIA ADQUIRIDA OU RECEBI...
9,5102/6102 Venda de mercadoria adquirida ou rec...


In [4]:
fazenda = [e for e in fazenda['NAT_OPER'] if isinstance(e, str)]
fazenda = pd.DataFrame(fazenda)

In [5]:
vectorizer = TfidfVectorizer() # variar parametros (L1/L2, etc)
X = vectorizer.fit_transform(fazenda[0])
X = normalize(X)

In [7]:
kmeans = KMeans(
    n_clusters = 4, # variar
    n_init = 15, 
    max_iter = 450, 
    init = 'k-means++'
)

In [8]:
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=450,
    n_clusters=4, n_init=15, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
fazenda['cluster'] = kmeans.labels_
fazenda

Unnamed: 0,0,cluster
0,[S] VENDA DE MERCADORIA P-REVENDA,0
1,105 - Transf. Mercadoria Saida,3
2,106 - Devolucao Troca/Impropia,3
3,1202.058,3
4,1202.064,3
5,1-Transferencia de Mercadoria Lojas,0
6,300-VENDA DE PRODUTOS,3
7,320-BONIFICACAO,3
8,5102 - VENDA DE MERCADORIA ADQUIRIDA OU RECEBI...,0
9,5102/6102 Venda de mercadoria adquirida ou rec...,0


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
vectorizer = CountVectorizer() # variar parametros (L1/L2, etc)
X = vectorizer.fit_transform(fazenda[0])

In [14]:
# vetoriza
vectorizer = TfidfVectorizer() # variar parametros (L1/L2, etc)
X = vectorizer.fit_transform(fazenda[0])

# LSA
lsa = TruncatedSVD(n_components = 10)
X_reduced = lsa.fit_transform(X)

In [15]:
# clusteriza c/ DBSCAN
dbscan = DBSCAN(
    eps = 0.5, # variar
    min_samples = 5 # variar
)
dbscan.fit(X_reduced)

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=5, n_jobs=1, p=None)

In [16]:
fazenda['dbscan'] = dbscan.labels_ 

In [17]:
fazenda

Unnamed: 0,0,cluster,dbscan
0,[S] VENDA DE MERCADORIA P-REVENDA,0,0
1,105 - Transf. Mercadoria Saida,3,0
2,106 - Devolucao Troca/Impropia,3,0
3,1202.058,3,0
4,1202.064,3,0
5,1-Transferencia de Mercadoria Lojas,0,0
6,300-VENDA DE PRODUTOS,3,0
7,320-BONIFICACAO,3,0
8,5102 - VENDA DE MERCADORIA ADQUIRIDA OU RECEBI...,0,0
9,5102/6102 Venda de mercadoria adquirida ou rec...,0,0


In [18]:
outliers = pd.DataFrame(fazenda)


In [20]:
print(outliers[dbscan.labels_==-1])

Empty DataFrame
Columns: [0, cluster, dbscan]
Index: []


In [22]:
from sklearn.metrics import silhouette_score
from collections import Counter

In [24]:
fazenda['cluster'].value_counts()

3    431
0    103
2     88
1     41
Name: cluster, dtype: int64

In [25]:
text = ['Vendas de mercadoria']

In [27]:
X_new = vectorizer.transform(text)
fazenda['coseno'] = cosine_similarity(X_new, X)[0]
print(fazenda.sort_values(by = 'coseno'))

                                                     0  cluster  dbscan  \
331                    TRANSF ENTRE FILIAIS(54/53)C/ST        3       0   
350                   TRANSF. MERC. P/ COMERCIALIZACAO        3       0   
349  TRANSF. MERC. ADQ./REC. TERC. C/MERC.SUJ. SUBS...        2       0   
348                       Transf. merc. adq. rec. terc        2       0   
347                             TRANSF. MERC. ADQ. 3 S        2       0   
346                                    TRANSF. FILIAIS        3       0   
342                                 TRANSF. ATIVO FIXO        3       0   
341                     TRANSF PROD DO ESTABELECIMENTO        3       0   
338                        TRANSF MERC RECDA TERCEIROS        3       0   
336                            TRANSF MERC ADQREC TERC        2       0   
335  Transf merc adq/rec terceiros, sujeita regime ...        2       0   
334                           TRANSF MERC ADQ/REC TERC        2       0   
355                      

In [28]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
import operator
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import words
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
# If you would like to work with the raw text you can use 'moby_raw'
with open('C:/Users/NB/Desktop/dados_fazenda/dados_fazenda_descricao.csv',  encoding='utf-8',) as f:
    moby_raw = f.read()
    
# If you would like to work with the novel in nltk.Text format you can use 'text1'
moby_tokens = nltk.word_tokenize(moby_raw)
text1 = nltk.Text(moby_tokens)

In [29]:
texto = [w.lower() for w in webtext.words('C:/Users/NB/Desktop/dados_fazenda/dados_fazenda_descricao.csv')]

In [30]:
procurar = BigramCollocationFinder.from_words(moby_tokens)

In [31]:
procurar.nbest(BigramAssocMeasures.likelihood_ratio, 30)

[('adquirida', 'ou'),
 ('VENDA', 'DE'),
 ('Venda', 'de'),
 ('de', 'mercadoria'),
 ('DE', 'MERCADORIA'),
 ('mercadoria', 'adquirida'),
 ('ou', 'recebida'),
 ('MERC', '.'),
 ('do', 'estabelecimento'),
 ('MERC', 'ADQ'),
 ('DO', 'ESTADO'),
 ('producao', 'do'),
 ('OU', 'RECEBIDA'),
 ('de', 'terceiros'),
 ('ENTRE', 'FILIAIS'),
 ('VENDA', 'MERC'),
 ('DEVOLUCAO', 'DE'),
 ('MERCADORIA', 'ADQUIRIDA'),
 ('DE', 'MERCADORIAS'),
 ('recebida', 'de'),
 ('ADQUIRIDA', 'OU'),
 ('OUTRAS', 'SAIDAS'),
 ('REMESSA', 'EM'),
 ('OU', 'BRINDE'),
 ('de', 'Mercadoria'),
 ('PRODUCAO', 'DO'),
 ('DE', 'MERC'),
 ('LANCAMENTO', 'EFETUADO'),
 ('DOACAO', 'OU'),
 (',', 'DOACAO')]

In [36]:
from nltk.corpus import stopwords
retira = set(stopwords.words('portuguese'))
filtra = lambda w: len(w) <3 or w in retira

In [37]:
procurar.apply_word_filter(filtra)

In [38]:
procurar.nbest(BigramAssocMeasures.likelihood_ratio, 30)

[('mercadoria', 'adquirida'),
 ('MERC', 'ADQ'),
 ('ENTRE', 'FILIAIS'),
 ('VENDA', 'MERC'),
 ('MERCADORIA', 'ADQUIRIDA'),
 ('OUTRAS', 'SAIDAS'),
 ('LANCAMENTO', 'EFETUADO'),
 ('OUTRA', 'SAIDA'),
 ('PARA', 'COMERCIALIZACAO'),
 ('MERCADORIA/', 'VENDA'),
 ('NAO', 'CONTRIBUINTE'),
 ('IMP', 'ICMS'),
 ('nao', 'contribuinte'),
 ('PRODUTO', 'CST'),
 ('DEVA', 'POR'),
 ('POR', 'ELE'),
 ('PRODUCAO', 'ESTABELECIMENTO'),
 ('CUPOM', 'FISCAL'),
 ('ESPECIFICADO', 'OUTRA'),
 ('ESTOQUE', 'DECOR'),
 ('VENDA', 'PROD'),
 ('COMPRA', 'PARA'),
 ('Vnd', 'mer.adq.rec.ter.op.mer.sj'),
 ('Vnd', 'mer.adq.rec.ter.op.mer.sj.rg.sub.trb.cnd.sub'),
 ('amostra', 'gratis'),
 ('entrega', 'futura'),
 ('merc.adq.receb.de', 'terceiros'),
 ('NAO', 'DEVA'),
 ('NAO', 'ESPECIFICADO'),
 ('QUE', 'NAO')]

In [39]:
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

In [40]:
procurar2 = TrigramCollocationFinder.from_words(moby_tokens)

In [41]:
procurar2.nbest(TrigramAssocMeasures.likelihood_ratio, 30)

[('adquirida', 'ou', 'recebida'),
 ('mercadoria', 'adquirida', 'ou'),
 ('Venda', 'de', 'mercadoria'),
 ('.', 'MERC', '.'),
 ('VENDA', 'DE', 'MERCADORIA'),
 ('de', 'mercadoria', 'adquirida'),
 ('adquirida', 'ou', 'rec'),
 ('VENDA', 'DE', 'MERCADORIAS'),
 ('producao', 'do', 'estabelecimento'),
 ('VENDA', 'DE', 'MERC'),
 ('DEVOLUCAO', 'DE', 'MERCADORIA'),
 ('adquirida', 'ou', 'receb'),
 ('DE', 'MERCADORIA', 'ADQUIRIDA'),
 ('VENDA', 'DE', 'PRODUCAO'),
 ('adquirida', 'ou', 'receb.de'),
 ('adquirida', 'ou', 'recebid'),
 ('VENDA', 'MERC', '.'),
 ('Venda', 'de', 'Mercadoria'),
 ('merc', 'adquirida', 'ou'),
 ('.', 'adquirida', 'ou'),
 ('ou', 'recebida', 'de'),
 ('VENDA', 'DE', 'MERCADORIA/'),
 ('.', 'TRANSF', '.'),
 ('DE', 'MERC', '.'),
 ('.', 'merc', '.'),
 ('MERCADORIA/', 'VENDA', 'DE'),
 ('VENDA', 'MERC', 'ADQ'),
 ('MERC', '.', 'TERC'),
 ('.', 'rec', '.'),
 ('ADQUIRIDA', 'OU', 'RECEBIDA')]

In [42]:
filtra2 = lambda w: len(w) <3 or w in retira

In [43]:
procurar2.apply_word_filter(filtra2)

In [44]:
procurar2.nbest(TrigramAssocMeasures.likelihood_ratio, 30)

[('VENDA', 'MERC', 'ADQ'),
 ('MERC', 'ADQ', 'TERC'),
 ('Transf', 'mercadoria', 'adquirida'),
 ('MERCADORIA/', 'VENDA', 'MERC'),
 ('MERC', 'ADQ', '3OS/'),
 ('MERC', 'ADQ', 'TERCEIROS'),
 ('VDA', 'MERC', 'ADQ'),
 ('TRANSF', 'MERC', 'ADQ'),
 ('TERCEIROS', 'VENDA', 'MERC'),
 ('MERC', 'ADQ', 'TERC./'),
 ('MERC', 'ADQ', 'TERC/'),
 ('VENDAS', 'MERC', 'ADQ'),
 ('TRANSFERENCIA', 'ENTRE', 'FILIAIS'),
 ('TRANSF', 'ENTRE', 'FILIAIS'),
 ('MERC', 'ADQ', 'TERCEI'),
 ('VEN', 'MERC', 'ADQ'),
 ('BONIF', 'MERC', 'ADQ'),
 ('TRANF', 'MERC', 'ADQ'),
 ('ANTERI', 'VENDA', 'MERC'),
 ('SUBSTITUTO', 'VENDA', 'MERC'),
 ('MERC', 'ADQ', 'REC'),
 ('CONTRIBUINTE', 'VENDA', 'MERC'),
 ('VENDA', 'MERC', 'ADQUIRIDA'),
 ('ENTRE', 'FILIAIS', 'TRANSFERENCIA'),
 ('TERC', 'VENDA', 'MERC'),
 ('VENDA', 'MERC', 'ADQU'),
 ('VENDAS/', 'OUTRAS', 'SAIDAS'),
 ('VENDA', 'MERCADORIA', 'ADQUIRIDA'),
 ('DO/', 'VENDA', 'MERC'),
 ('REMESS', 'VENDA', 'MERC')]

In [54]:
fazenda[fazenda['cluster']==3]

Unnamed: 0,0,cluster,dbscan,coseno
1,105 - Transf. Mercadoria Saida,3,0,0.164199
2,106 - Devolucao Troca/Impropia,3,0,0.000000
3,1202.058,3,0,0.000000
4,1202.064,3,0,0.000000
6,300-VENDA DE PRODUTOS,3,0,0.074046
7,320-BONIFICACAO,3,0,0.000000
11,5927 - LANCAMENTO EFETUADO A TITULO DE BAIXA D...,3,0,0.087611
12,5927.111,3,0,0.000000
13,5949 - OUTRA SAIDA DE MERCADORIA OU PRESTACAO ...,3,0,0.199142
14,6.101 - VENDA FORA DO ESTADO,3,0,0.000000
