# <font color='blue'>Data Science Academy - Machine Learning</font>

# <font color='blue'>Capítulo 10 - Processamento de Linguagem Natural</font>

****** Este Jupyter Notebook foi atualizado para a versão 3.6.1. da Linguagem Python em 05/07/2017 ******

## Bag of Words


O modelo de "saco de palavras" é uma representação simplificada usada no processamento de linguagem natural e recuperação de informação. Neste modelo, um texto (como uma sentença ou um documento) é representado como o saco (multiset) de suas palavras, desconsiderando a gramática e até a ordem das palavras, mas mantendo a multiplicidade.

Na classificação de documentos, um saco de palavras é um vetor esparso de ocorrência de contagens de palavras; Ou seja, um histograma esparso sobre o vocabulário.

### Carregando um Dataset de um Site de E-commerce (em português)

In [None]:
import gzip
import json

In [None]:
# Carregando o dataset
corpus = list()
with gzip.open('ecommerce.json.gz') as fp:
    for line in fp:
        entry = line.decode('utf8')
        corpus.append(json.loads(entry))

In [None]:
from pprint import pprint
pprint(corpus[0])

In [None]:
print (corpus[0]['descr'])

## Gensim - Modelagem de Tópicos

https://github.com/RaRe-Technologies/gensim

In [None]:
# Por enquanto disponíve, apenas para Python 2.7
#!pip install pattern

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install gensim

In [None]:
import gensim
print (gensim.summarization.summarize(corpus[0]['descr']))

In [None]:
len(corpus)

In [None]:
# Construindo um classificador para produtos e categorias (considerando apenas os 10 mil primeiros produtos)
dataset = list()
for entry in corpus[:50000]:
    if 'cat' in entry:
        dataset.append( (entry['name'], entry['cat'].lower().strip()) )

In [None]:
len(dataset)

In [None]:
pprint(dataset[:10])

In [None]:
# Quantas categorias distintas nós temos e quantos itens por categoria?
from collections import Counter
counter = Counter([cat for prod, cat in dataset])
pprint(counter.most_common())

# Construindo um Classificador SVM com Bag of Words

http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import nltk as nltk
import pandas as pd
import numpy as np
#nltk.download()

In [None]:
len(corpus)


In [None]:
for i in corpus[:2]:
    print(i['title'].lower().strip())

In [None]:
stopwords = nltk.corpus.stopwords.words('portuguese')
stopwords = nltk.corpus.stopwords.words('english')
stopwords = [ 'r' + "'\b" + s + "\b'" + '|' for s in stopwords]
stopwords = ''.join(stopwords)
#stopwords = '['+ stopwords + ']'

for t, doc in dataset[:15]:
    for w in t.split(' '):
        if w not in stopwords:
            t = t + w
            
    # stem each word
    #sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
#stopwords

In [None]:
# Construindo o modelo SVM com Pipeline
modelo = Pipeline([('vect', TfidfVectorizer()), ('clf', SVC(kernel = 'linear', probability = True))])

In [None]:
print(modelo)

In [None]:
#?LabelEncoder

In [None]:
# Objeto para Normalização dos labels

encoder = LabelEncoder()

In [None]:
# Obtendo dados e labels
data = [prod for prod, cat in dataset]
labels = [cat for prod, cat in dataset]

In [None]:
# Normalização dos labels
target = encoder.fit_transform(labels)
set(target)

In [None]:
# Items
encoder.classes_.item(1)

In [None]:
# Fit do modelo
modelo.fit(data, target)

In [None]:
pred = modelo.predict(test)
pred

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

print( confusion_matrix(target, pred) ) 
print( '\n acuracia : ', accuracy_score(test_y, pred) )


In [None]:
# Prevendo a categoria a partir da descrição
print (encoder.classes_[1])

In [None]:
# Probabilidades de um produto
probs = modelo.predict_proba(["not recommend","good peoples","wors job"])
print(probs)
for i in probs:
    print(np.argmax(i))

In [None]:
# Probabidades de categorias para o objeto Ventilador
guess = [(class_, probs.item(n)) for n, class_ in enumerate(encoder.classes_)]
pprint(guess)

In [None]:
# Probabidade ajustada de categorias para o objeto Ventilador
from operator import itemgetter
for cat, proba in sorted(guess, key = itemgetter(1), reverse = True):
    print ('{}: {:.4f}'.format(cat, proba))

In [None]:
import re
strings = ["Important text not, me i      !Comment that could be removed", "not Other String"]
[re.sub('i, "", x) for x in strings]

### Fim

### Obrigado - Data Science Academy - <a href=http://facebook.com/dsacademy>facebook.com/dsacademybr</a>

In [None]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [None]:
twenty_train.target_names #prints all the categories
print("\n".join(twenty_train.data[0].split("\n")[:3])) #prints first line of the first data file


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
"""#print(count_vect)
x = []
y = []
for d in dataset:
    x.append(d[0])
    y.append(d[1])
    
test = [ i['title'] for i in corpus[20000:30000]]
test_y = [ i['recommend'] for i in corpus[20000:30000]]

"""
X_train_counts = count_vect.fit_transform(data)
print(X_train_counts.shape)



In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
twenty_train.target

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, target)

In [None]:
"""
X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)
#print(X, '\n\n', X[2:3])
"""
print(test[1],test_y[1])

In [None]:
print(clf.predict(test[1],test_y[1] )  )


In [None]:
print(type( [1,2,.6]) )

In [None]:
import numpy as np
import pandas as pd

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
#print(twenty_test.data[0])
d = {'col1': [1, 2], 'col2': [3, 4]}
l = pd.DataFrame(data=d)
print(l)

#predicted = clf.predict(l) 
clf.predict(twenty_test.data)
#print(predicted)
#np.mean(predicted == twenty_test.target)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

text_clf_svm = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
 ])
text_clf_svm
text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

# DADOS 

In [None]:
#https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import random
from random import shuffle
from nltk.stem.snowball import SnowballStemmer
import string
import re
from nltk.corpus import stopwords


In [3]:
dados = pd.read_csv('~/Documents/dsa/data/reviewFinal.csv',encoding='latin-1')
corpus = pd.DataFrame()
listTitle = []
listRecom = []
for i, r in dados.iterrows():
    #print(i)
    if r['recommend'] != 'NI' and r['title'] != "" and r['title'] is not np.nan and r['site'] == "glassdoor":
        #corpus.append(r)
        #corpus['title'] = r['title']
        #corpus['recommend'] = r['recommend']
        listTitle.append(r['title'])
        listRecom.append(r['recommend'])

"""
for entry in corpus[:rows_train]:
    #print(entry)
    dataset.append( (entry['title'].lower().strip(), entry['recommend'].lower().strip()) )
"""
encoder = LabelEncoder()

In [4]:
corpus['title'] = listTitle
corpus['recommend'] = listRecom

In [5]:
### GERANDO LINHAS PARA DADOS DE TREINO E TESTE

x = [ i for i in range(len(corpus))]
shuffle(x)

rows_train =  x[0:round( len(corpus) * .75)] #25000
rows_test  = x[round( len(corpus) * .75):(len(corpus))]
dataset = []

In [None]:
#a =
print(len(rows_train)+len(rows_test), len(corpus) )

In [24]:
### LIMPANDO TEXTO
stemmer = SnowballStemmer("english")
sClear = []
tr = str.maketrans("", "", string.punctuation)
i = 0
for i,text in corpus.iterrows():    
    #print(text['title'].split())
    clear = ""
    for word in text['title'].split():        
        word = word.lower() ## CONVERTENDO PARA MINUSCULO
        word = word.translate(tr) ##REMOVENDO PONTUAÇÕES        
        word = re.sub("\d"," ",word) ## REMOVENDO NUMEROS
        word = stemmer.stem(word) ## STEMM 
        
        clear = clear + ' ' + word
        clear = re.sub("\s+"," ",clear) ## REMOVENDO ESPAÇOS DUPLICADOS
        
        
    sClear.append(clear)
    text['title'] = clear
    i = i + 1    

#print(sClear)

In [33]:
train   = corpus.iloc[rows_train, [0]]
train   = train['title'].tolist()
train_y = corpus.iloc[rows_train, [1] ] 
train_y = encoder.fit_transform(train_y)

test   = corpus.iloc[rows_test, [0]]
test   = test['title'].tolist()
test_y = corpus.iloc[rows_test, [1]]
test_y = encoder.fit_transform(test_y)

print(set(train_y), set(test_y) )

{0, 1} {0, 1}


  y = column_or_1d(y, warn=True)


In [31]:
test_y

array([1, 1, 1, ..., 1, 1, 1])

# MODELOS 

### NAIVE BAYES MULTINOMIAL

In [37]:
# Tokenizing
count_vect     = CountVectorizer(stop_words= stopwords.words('english'))
X_train_counts = count_vect.fit_transform(train)
count_vect.vocabulary_.get(u'algorithm')
X_train_counts.shape

(64460, 8943)

In [38]:
# Mesmo resultado da célula anterior, mas combinando as funções
tfidf_transformer = TfidfTransformer()
X_train_tfidf     = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(64460, 8943)

In [39]:
clf = MultinomialNB().fit(X_train_tfidf, train_y)


In [40]:
# Previsões
X_new_counts = count_vect.transform(test)
X_new_tfidf  = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

#print(predicted)
#for doc, category in zip(docs_new, predicted):    print('%r => %s' % (doc, twenty_train.target_names[category]))

In [41]:
print( confusion_matrix(test_y, predicted) ) 
print( '\n acuracia : ', accuracy_score(test_y, predicted) )

[[ 2906  4252]
 [  923 13405]]

 acuracia :  0.759145490086568


### BINOMIAL NAIVE BAYES

In [42]:
nb = BernoulliNB()
nb.fit(X_train_tfidf, train_y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [43]:
predictBIN = nb.predict(X_new_tfidf)

In [44]:
print( confusion_matrix(test_y, predicted) ) 
print( '\n acuracia : ', accuracy_score(test_y, predictBIN) )

[[ 2906  4252]
 [  923 13405]]

 acuracia :  0.757144186912408


### SVC

In [45]:
modelo  = Pipeline([('vect', TfidfVectorizer(stop_words=stopwords.words('english'))), 
                    ('clf', SVC(kernel = 'linear', probability = True))])

In [46]:
modelo.fit(train, train_y)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [47]:
predictSVC = modelo.predict(test)


In [48]:
print( confusion_matrix(test_y, predictSVC) ) 
print( '\n acuracia : ', accuracy_score(test_y, predictSVC) )


[[ 3176  3982]
 [ 1205 13123]]

 acuracia :  0.7585869868751746


### SVM

In [49]:
text_clf = Pipeline([('vect', CountVectorizer(stopwords.words('english'))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=500, tol=None)),
])
text_clf.fit(train, train_y)  

predicted = text_clf.predict(test)
print( confusion_matrix(test_y, predicted) ) 
print( '\n acuracia : ', accuracy_score(test_y, predicted) )

[[  915  6243]
 [  276 14052]]

 acuracia :  0.6965931304104999


# DEEP LEARNING

In [51]:
#https://github.com/akshaypai/tfClassifier/blob/master/text_classification/classify_text.py
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
!pip install tflearn 
import tflearn
import tensorflow as tf
import random
import json
import string
import unicodedata
import sys

Collecting tflearn
  Downloading tflearn-0.3.2.tar.gz (98kB)
[K    100% |████████████████████████████████| 102kB 850kB/s a 0:00:01
Building wheels for collected packages: tflearn
  Running setup.py bdist_wheel for tflearn ... [?25ldone
[?25h  Stored in directory: /Users/gabriel.gomes/Library/Caches/pip/wheels/fb/06/72/0478c938ca315c6fddcce8233b80ec91a115ce4496a27e8c90
Successfully built tflearn
Installing collected packages: tflearn
Successfully installed tflearn-0.3.2


  from ._conv import register_converters as _register_converters


In [52]:
# a table structure to hold the different punctuation used
tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                    if unicodedata.category(chr(i)).startswith('P'))


# method to remove punctuations from sentences.
def remove_punctuation(text):
    return text.translate(tbl)

# initialize the stemmer
stemmer = LancasterStemmer()
# variable to hold the Json data read from the file
data = None

In [57]:
train_y

array([1, 0, 1, ..., 0, 0, 0])

In [68]:
# get a list of all categories to train for
categories = train_y
words = []
# a list of tuples with words in the sentence and category name
docs = []

c = corpus[0:10]
for i,text in c.iterrows():
    #print(text['title'])
    #print(remove_punctuation(text['title']))
    w = nltk.word_tokenize(text['title'])
    #print(words)
    words.extend(w)
    docs.append((w, text['recommend']))
"""
#for each_category in data.keys():
    for each_sentence in data[each_category]:
        # remove any punctuation from the sentence
        each_sentence = remove_punctuation(each_sentence)
        print(each_sentence)
        # extract words from each sentence and append to the word list
        w = nltk.word_tokenize(each_sentence)
        print("tokenized words: ", w)
        words.extend(w)
        docs.append((w, each_category))
"""

'\n#for each_category in data.keys():\n    for each_sentence in data[each_category]:\n        # remove any punctuation from the sentence\n        each_sentence = remove_punctuation(each_sentence)\n        print(each_sentence)\n        # extract words from each sentence and append to the word list\n        w = nltk.word_tokenize(each_sentence)\n        print("tokenized words: ", w)\n        words.extend(w)\n        docs.append((w, each_category))\n'

In [69]:
# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words]
words = sorted(list(set(words)))

print(words)
print(docs)


['al', 'awesom', 'but', 'cli', 'commod', 'compan', 'cult', 'del', 'develop', 'emc', 'employ', 'endless', 'fantast', 'for', 'glob', 'good', 'gre', 'man', 'marry', 'not', 'plac', 'poss', 'review', 'salar', 'seny', 'softw', 'technolog', 'their', 'to', 'work']
[(['great', 'technolog', 'compani', 'great', 'cultur'], 'Yes'), (['dell', 'emc', 'marriag', 'awesom', 'for', 'their', 'client', 'but', 'not', 'all', 'employe'], 'Yes'), (['review'], 'Yes'), (['great', 'place', 'to', 'work'], 'Yes'), (['salari'], 'Yes'), (['senior', 'softwar', 'develop'], 'Yes'), (['fantast', 'employ'], 'Yes'), (['global', 'commod', 'manag'], 'No'), (['good', 'place', 'to', 'work'], 'Yes'), (['endless', 'possibl'], 'Yes')]


In [80]:
d = {'d':"gabriel", 'r':"lima"}
print(type(d.keys().index(d[1]) ) )


AttributeError: 'dict_keys' object has no attribute 'index'

In [71]:

# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(set(train_y))


for doc in docs:
    # initialize our bag of words(bow) for each document in the list
    bow = []
    # list of tokenized words for the pattern
    token_words = doc[0]
    # stem each word
    token_words = [stemmer.stem(word.lower()) for word in token_words]
    # create our bag of words array
    for w in words:
        bow.append(1) if w in token_words else bow.append(0)

    output_row = list(output_empty)
    output_row[categories.index(doc[1])] = 1

    # our training set will contain a the bag of words model and the output row that tells
    # which catefory that bow belongs to.
    training.append([bow, output_row])


AttributeError: 'numpy.ndarray' object has no attribute 'index'