In [4]:
import nltk
import re
from bs4 import BeautifulSoup
import unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
import pandas as pd
import numpy as np
from gensim import utils
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import matplotlib.pyplot as plt
from gensim.models.word2vec import LineSentence
from gensim.models import KeyedVectors

In [None]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('portuguese')

In [None]:
#Limpar tags html do texto se tiver
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

In [None]:
#Remover palavras com acento
def remove_accent(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [None]:
#Remover caracter especial
def remove_special_char(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

In [None]:
#Remove stopwords e aplica steeming
def remove_stopwords_stemming(text):
    stemmer = nltk.stem.RSLPStemmer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return ''.join(stemmer.stem(w) for w in filtered_text)

In [None]:
def normalizador(text):

    #remove html
    text = strip_html_tags(text)

    # remove 'stopword'
    text = remove_stopwords_stemming(text)

    #remove acento
    text = remove_accent(text)

    #remove new line extra
    text = re.sub(r'[\r|\n|\r\n]+', ' ', text)

    #inserir espaco entre caracter especial
    special_char_pattern = re.compile(r'[\}\}\\\(\)\./!-]')
    text = special_char_pattern.sub(" ", text)

    #remove caracter especial
    text = remove_special_char(text)

    # remove extra whitespace
    text = re.sub(' +', ' ', text)
    
    # Remove numero
    text = re.sub('[\d]', '', text)


    return text


In [None]:
df = pd.read_csv('Textos_Modelo.csv', header=None)

In [None]:
df.columns = ['Frases']

In [103]:
df.head()

Unnamed: 0,Frases
0,Como cumprir ordem judicial?
1,Se não cumprir determinação judicial estarei s...
2,"Cumprindo determinação judicial, efetuei bloqu..."
3,Posso recusar o cumprimento de ordem judicial?
4,Tenho uma pendência de cumprimento de ofício m...


In [104]:
frases = [normalizador(frase) for frase in df['Frases'][:10000]]

In [105]:
frases[:5]

['cumprir ordem judicial ',
 'cumprir determinacao judicial estarei sujeito alguma penalidade ',
 'cumprindo determinacao judicial efetuei bloqueio ativos determinado cliente deverei informar fato juizo ',
 'posso recusar cumprimento ordem judicial ',
 'pendencia cumprimento oficio consigo visualizar aof lista']

In [106]:
frases_labeled = []

for i in range(len(frases)):
    frases_labeled.append(TaggedDocument(frases[i].split(), ['Linha_%d' % (i) ]))

In [107]:
frases_labeled[:5]

[TaggedDocument(words=['cumprir', 'ordem', 'judicial'], tags=['Linha_0']),
 TaggedDocument(words=['cumprir', 'determinacao', 'judicial', 'estarei', 'sujeito', 'alguma', 'penalidade'], tags=['Linha_1']),
 TaggedDocument(words=['cumprindo', 'determinacao', 'judicial', 'efetuei', 'bloqueio', 'ativos', 'determinado', 'cliente', 'deverei', 'informar', 'fato', 'juizo'], tags=['Linha_2']),
 TaggedDocument(words=['posso', 'recusar', 'cumprimento', 'ordem', 'judicial'], tags=['Linha_3']),
 TaggedDocument(words=['pendencia', 'cumprimento', 'oficio', 'consigo', 'visualizar', 'aof', 'lista'], tags=['Linha_4'])]

In [108]:
frases_split = []

for i in frases:
    frases_split.append(i.split())

lengths = []
for i in range(len(frases_split)):
    lengths.append(len(frases_split[i]))
lengths = pd.DataFrame(lengths, columns=["count"])

In [109]:
lengths.describe()

Unnamed: 0,count
count,10000.0
mean,5.0544
std,2.662996
min,0.0
25%,3.0
50%,5.0
75%,6.0
max,39.0


In [110]:
np.percentile(lengths['count'], 99)

15.0

In [None]:
alpha = 0.025
# Criacao do modelo
model = Doc2Vec(size=300, min_count=0, alpha=0.025, min_alpha=0.025)
model.build_vocab(frases_labeled)

In [None]:
# Treinamento do modelo
for epoch in range(100):
    model.train(frases_labeled, total_examples=model.corpus_count, epochs=model.iter)
    print("Epoch #{} is complete.".format(epoch+1))
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

In [None]:
embeddings = KeyedVectors.load_word2vec_format('cbow_s50.txt',binary=False,unicode_errors="ignore")

In [None]:
embeddings.most_similar('conta')

In [53]:
embeddings
embeddings.n_similarity(normalizador('gostaria de saber se minha conta e universitaria').split(),normalizador('quero saber o saldo da minha conta'.split())

  if np.issubdtype(vec.dtype, np.int):


0.704392

In [None]:
sentence_obama = 'gostaria de saber se minha conta e universitaria'.lower().split()
sentence_obama2 = 'jeremias, rei delas'.lower().split()

similarity = embeddings.wmdistance(sentence_obama, sentence_obama2)
print(100 - similarity)

In [21]:
normalizador('gostaria de saber se minha conta e universitaria').split()

['gostaria', 'saber', 'conta', 'universitaria']

In [51]:
normalizador('quero saber o saldo da minha conta').split()

['quero', 'saber', 'saldo', 'conta']

In [5]:
new_sentence = "cumprir determinacao judicial estarei sujeito alguma penalidade "
result = model.docvecs.most_similar(positive=[model.infer_vector(normalizador(new_sentence).split())],topn=5)

NameError: name 'model' is not defined

In [150]:
result

[('Linha_9', 0.8152622580528259),
 ('Linha_9553', 0.8133138418197632),
 ('Linha_704', 0.8094474673271179),
 ('Linha_7046', 0.8063158988952637),
 ('Linha_7024', 0.8062857389450073)]

In [144]:
for i in frases_labeled:
    if i[1][0] == result[0][0]:
        print ('Palavras mais recorrentes da ' + str(i[1][0]) +  ': ' + str(', '.join(i[0])))

Palavras mais recorrentes da Linha_6680: deve, ser, feito, registro, instrumentos, alienacao, fiduciaria, to


In [None]:
print(model['Linha_1'])

In [None]:
#Modelo FastText

model_ft = ft(workers=16, window=3, size=150 ,min_count=1, sample=0, min_n=3, max_n=5, negative=5, iter=100)
model_ft.build_vocab(frases_split)
model_ft.train(frases_split,total_examples=model_ft.corpus_count, epochs=model_ft.iter)

In [168]:
model_ft.save('fasttext.model')
model_ft = FastText.load('wiki_model.model')

In [158]:
print(model_ft)

FastText(vocab=30080, size=150, alpha=0.025)


In [159]:
model_ft.wv.most_similar("abertura")

  if np.issubdtype(vec.dtype, np.int):


[('aberrtura', 0.9054057598114014),
 ('deabertura', 0.8803530335426331),
 ('cabertura', 0.866870641708374),
 ('aberturada', 0.8377639055252075),
 ('ebertura', 0.8302403092384338),
 ('bertura', 0.8286916613578796),
 ('abertua', 0.8232411742210388),
 ('abertuta', 0.8053635358810425),
 ('reabertura', 0.7729989290237427),
 ('aberturas', 0.7646409273147583)]

In [160]:
model_ft.wv.most_similar("computador")

  if np.issubdtype(vec.dtype, np.int):


[('caomputador', 0.9365118145942688),
 ('computadorr', 0.9351781010627747),
 ('computadort', 0.9127434492111206),
 ('compuutador', 0.9103546738624573),
 ('cmputador', 0.9095860719680786),
 ('computadoro', 0.9047839641571045),
 ('computtador', 0.9006338715553284),
 ('coimputador', 0.8923019766807556),
 ('computadodr', 0.8905304074287415),
 ('comnputador', 0.8782629370689392)]

In [161]:
model_ft.wv.most_similar("apagar")

  if np.issubdtype(vec.dtype, np.int):


[('qapagar', 0.950679361820221),
 ('cagar', 0.8128378987312317),
 ('pagar', 0.7789885401725769),
 ('pagaar', 0.7066824436187744),
 ('estragar', 0.7054852247238159),
 ('upar', 0.701307475566864),
 ('chupar', 0.6877528429031372),
 ('emtrar', 0.6870281100273132),
 ('erar', 0.6849189400672913),
 ('desboclear', 0.6848874092102051)]

In [162]:
index2word_set = [key for key in model_ft.wv.vocab]

In [163]:
model_ft.wv.similarity("cache","browser")

  if np.issubdtype(vec.dtype, np.int):


0.15573229

In [164]:
from scipy import spatial

In [165]:
def avg_feature_vector_ft(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        try:
            word = model.most_similar(word)[0][0]
        except KeyError:
            word = word
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [173]:
#calculo de smilariade entre frases FastText

s1_afv = avg_feature_vector_ft(normalizador('Quero limpar o cache'), model=model_ft, num_features=150, index2word_set =index2word_set)
s2_afv = avg_feature_vector_ft(normalizador('Utilizo o F12 para abrir o console?'), model=model_ft, num_features=150, index2word_set =index2word_set)
sim1 = 1 - spatial.distance.cosine(s1_afv, s2_afv)
print(('oi',sim1)[1])

-0.12414616346359253


  import sys
  if np.issubdtype(vec.dtype, np.int):
  if sys.path[0] == '':


In [97]:
def avg_feature_vector_wv(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [8]:
#Modelo Word2vec

from gensim.models import Word2Vec
#model_wv = Word2Vec(frases_split, min_count=2, workers=10)

In [467]:
model_wv.train(frases_split, total_examples=len(frases_split), epochs=10)

(13860737, 14448120)

In [473]:
index2word_setwv = model_wv.wv.vocab

In [476]:
s1_wv = avg_feature_vector_wv(normalizador('Hoje o nao tem pastel!'), model=model_wv, num_features=100, index2word_set=index2word_setwv)
s2_wv = avg_feature_vector_wv(normalizador('Hoje tem pastel!'), model=model_wv, num_features=100, index2word_set=index2word_setwv)
sim2 = 1 - spatial.distance.cosine(s1_wv, s2_wv)
print(sim2)

0.9963108897209167


  


In [481]:
model_wv.most_similar('natural')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('vocabular', 0.5262881517410278),
 ('assombrado', 0.5185117721557617),
 ('quam', 0.5080987215042114),
 ('iucn', 0.5079662799835205),
 ('ecologia', 0.5020368695259094),
 ('artificial', 0.4960860311985016),
 ('representamos', 0.4913288354873657),
 ('fritzmuelleri', 0.4895269572734833),
 ('centramento', 0.4886212646961212),
 ('crucial', 0.48700249195098877)]

In [57]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt') # if necessary...


stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

'''remove punctuation, lowercase, stem'''
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    print( ((tfidf * tfidf.T)) )
    return ((tfidf * tfidf.T).A)[0,1]




[nltk_data] Downloading package punkt to /home/gabriel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
print (cosine_sim('a little bird', 'a little bird black'))

  (0, 1)	0.7092972666062738
  (0, 0)	0.9999999999999998
  (1, 1)	1.0
  (1, 0)	0.7092972666062738
0.7092972666062738


In [9]:
model_w2v = Word2Vec.load_word2vec_format('/home/gabriel/Desktop/wang2vec-master/wang2vec_model')

DeprecationWarning: Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.

In [10]:
model_w2v

NameError: name 'model_w2v' is not defined