# <font color='blue'>Data Science Academy - Machine Learning</font>

# <font color='blue'>Capítulo 10 - Processamento de Linguagem Natural</font>

****** Este Jupyter Notebook foi atualizado para a versão 3.6.1. da Linguagem Python em 05/07/2017 ******

## Bag of Words


O modelo de "saco de palavras" é uma representação simplificada usada no processamento de linguagem natural e recuperação de informação. Neste modelo, um texto (como uma sentença ou um documento) é representado como o saco (multiset) de suas palavras, desconsiderando a gramática e até a ordem das palavras, mas mantendo a multiplicidade.

Na classificação de documentos, um saco de palavras é um vetor esparso de ocorrência de contagens de palavras; Ou seja, um histograma esparso sobre o vocabulário.

### Carregando um Dataset de um Site de E-commerce (em português)

In [None]:
import gzip
import json

In [None]:
# Carregando o dataset
corpus = list()
with gzip.open('ecommerce.json.gz') as fp:
    for line in fp:
        entry = line.decode('utf8')
        corpus.append(json.loads(entry))

In [None]:
from pprint import pprint
pprint(corpus[0])

In [None]:
print (corpus[0]['descr'])

## Gensim - Modelagem de Tópicos

https://github.com/RaRe-Technologies/gensim

In [None]:
# Por enquanto disponíve, apenas para Python 2.7
#!pip install pattern

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install gensim

In [None]:
import gensim
print (gensim.summarization.summarize(corpus[0]['descr']))

In [None]:
len(corpus)

In [None]:
# Construindo um classificador para produtos e categorias (considerando apenas os 10 mil primeiros produtos)
dataset = list()
for entry in corpus[:50000]:
    if 'cat' in entry:
        dataset.append( (entry['name'], entry['cat'].lower().strip()) )

In [None]:
len(dataset)

In [None]:
pprint(dataset[:10])

In [None]:
# Quantas categorias distintas nós temos e quantos itens por categoria?
from collections import Counter
counter = Counter([cat for prod, cat in dataset])
pprint(counter.most_common())

# Construindo um Classificador SVM com Bag of Words

http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import nltk as nltk
import pandas as pd
import numpy as np
#nltk.download()

In [None]:
len(corpus)


In [None]:
for i in corpus[:2]:
    print(i['title'].lower().strip())

In [None]:
stopwords = nltk.corpus.stopwords.words('portuguese')
stopwords = nltk.corpus.stopwords.words('english')
stopwords = [ 'r' + "'\b" + s + "\b'" + '|' for s in stopwords]
stopwords = ''.join(stopwords)
#stopwords = '['+ stopwords + ']'

for t, doc in dataset[:15]:
    for w in t.split(' '):
        if w not in stopwords:
            t = t + w
            
    # stem each word
    #sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
#stopwords

In [None]:
# Construindo o modelo SVM com Pipeline
modelo = Pipeline([('vect', TfidfVectorizer()), ('clf', SVC(kernel = 'linear', probability = True))])

In [None]:
print(modelo)

In [None]:
#?LabelEncoder

In [None]:
# Objeto para Normalização dos labels

encoder = LabelEncoder()

In [None]:
# Obtendo dados e labels
data = [prod for prod, cat in dataset]
labels = [cat for prod, cat in dataset]

In [None]:
# Normalização dos labels
target = encoder.fit_transform(labels)
set(target)

In [None]:
# Items
encoder.classes_.item(1)

In [None]:
# Fit do modelo
modelo.fit(data, target)

In [None]:
pred = modelo.predict(test)
pred

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

print( confusion_matrix(target, pred) ) 
print( '\n acuracia : ', accuracy_score(test_y, pred) )


In [None]:
# Prevendo a categoria a partir da descrição
print (encoder.classes_[1])

In [None]:
# Probabilidades de um produto
probs = modelo.predict_proba(["not recommend","good peoples","wors job"])
print(probs)
for i in probs:
    print(np.argmax(i))

In [None]:
# Probabidades de categorias para o objeto Ventilador
guess = [(class_, probs.item(n)) for n, class_ in enumerate(encoder.classes_)]
pprint(guess)

In [None]:
# Probabidade ajustada de categorias para o objeto Ventilador
from operator import itemgetter
for cat, proba in sorted(guess, key = itemgetter(1), reverse = True):
    print ('{}: {:.4f}'.format(cat, proba))

In [None]:
import re
strings = ["Important text not, me i      !Comment that could be removed", "not Other String"]
[re.sub('i, "", x) for x in strings]

### Fim

### Obrigado - Data Science Academy - <a href=http://facebook.com/dsacademy>facebook.com/dsacademybr</a>

In [None]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [None]:
twenty_train.target_names #prints all the categories
print("\n".join(twenty_train.data[0].split("\n")[:3])) #prints first line of the first data file


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
"""#print(count_vect)
x = []
y = []
for d in dataset:
    x.append(d[0])
    y.append(d[1])
    
test = [ i['title'] for i in corpus[20000:30000]]
test_y = [ i['recommend'] for i in corpus[20000:30000]]

"""
X_train_counts = count_vect.fit_transform(data)
print(X_train_counts.shape)



In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [None]:
twenty_train.target

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train_tfidf, target)

In [None]:
"""
X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)
#print(X, '\n\n', X[2:3])
"""
print(test[1],test_y[1])

In [None]:
print(clf.predict(test[1],test_y[1] )  )


In [None]:
print(type( [1,2,.6]) )

In [None]:
import numpy as np
import pandas as pd

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
#print(twenty_test.data[0])
d = {'col1': [1, 2], 'col2': [3, 4]}
l = pd.DataFrame(data=d)
print(l)

#predicted = clf.predict(l) 
clf.predict(twenty_test.data)
#print(predicted)
#np.mean(predicted == twenty_test.target)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

text_clf_svm = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, n_iter=5, random_state=42)),
 ])
text_clf_svm
text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

# DADOS 

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
dados = pd.read_csv('data/reviewFinal.csv',encoding='latin-1')
corpus = []

for i, r in dados.iterrows():
    #print(i)
    if r['recommend'] != 'NI' and r['title'] != "" and r['title'] is not np.nan and r['site'] == "glassdoor":
        corpus.append(r)

rows_train = 25000
rows_test  = [50000,(len(corpus)-1)]
dataset = []
"""
for entry in corpus[:rows_train]:
    #print(entry)
    dataset.append( (entry['title'].lower().strip(), entry['recommend'].lower().strip()) )
"""
encoder = LabelEncoder()

In [3]:
train  = [ i['title'] for i in corpus[:rows_train] ]
train_y= [ i['recommend'] for i in corpus[:rows_train] ]
train_y = encoder.fit_transform(train_y)

test   = [ i['title'] for i in corpus[rows_test[0]:rows_test[1]] ]
test_y = [ i['recommend'] for i in corpus[rows_test[0]:rows_test[1]] ]
test_y = encoder.fit_transform(test_y)

print(set(train_y), set(test_y) )

{0, 1} {0, 1}


# MODELOS 

### NAIVE BAYES MULTINOMIAL

In [32]:
# Tokenizing
count_vect     = CountVectorizer()
X_train_counts = count_vect.fit_transform(train)
count_vect.vocabulary_.get(u'algorithm')
X_train_counts.shape

(25000, 5957)

In [33]:
# Mesmo resultado da célula anterior, mas combinando as funções
tfidf_transformer = TfidfTransformer()
X_train_tfidf     = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(25000, 5957)

In [32]:
clf = MultinomialNB().fit(X_train_tfidf, train_y)


In [34]:
# Previsões
X_new_counts = count_vect.transform(test)
X_new_tfidf  = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

#print(predicted)
#for doc, category in zip(docs_new, predicted):    print('%r => %s' % (doc, twenty_train.target_names[category]))

In [35]:
print( confusion_matrix(test_y, predicted) ) 
print( '\n acuracia : ', accuracy_score(test_y, predicted) )

[[ 3960  5400]
 [ 2859 23726]]

 acuracia :  0.7702322993462234


### BINOMIAL NAIVE BAYES

In [37]:
nb = BernoulliNB()
nb.fit(X_train_tfidf, train_y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [38]:
predictBIN = nb.predict(X_new_tfidf)

In [39]:
print( confusion_matrix(test_y, predicted) ) 
print( '\n acuracia : ', accuracy_score(test_y, predictBIN) )

[[ 3960  5400]
 [ 2859 23726]]

 acuracia :  0.761552371678954


### SVC

In [49]:
modelo  = Pipeline([('vect', TfidfVectorizer(stop_words='english')), ('clf', SVC(kernel = 'linear', probability = True))])

In [50]:
modelo.fit(train, train_y)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [51]:
predictSVC = modelo.predict(test)


In [52]:
print( confusion_matrix(test_y, predictSVC) ) 
print( '\n acuracia : ', accuracy_score(test_y, predictSVC) )


[[ 3902  5353]
 [ 3251 23122]]

 acuracia :  0.758504546986


### SVM

In [48]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=500, tol=None)),
])
text_clf.fit(train, train_y)  

predicted = text_clf.predict(test)
print( confusion_matrix(test_y, predicted) ) 
print( '\n acuracia : ', accuracy_score(test_y, predicted) )

[[ 1555  7700]
 [  714 25659]]

 acuracia :  0.763837431234


In [20]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=False)


In [53]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                      ('tfidf', TfidfTransformer()),
                      ('mnb', MultinomialNB())])



TypeError: __init__() got an unexpected keyword argument 'stop_words'

In [65]:
stemmer = SnowballStemmer("english")

data = ["caresses","the"," NLTK comes with various stemmers (details on how stemmers work are out of scope for this article)"]# which can help reducing the words to their root form]
target = [1]

[stemmer.stem(plural) for plural in data]

['caress',
 'the',
 ' nltk comes with various stemmers (details on how stemmers work are out of scope for this article)']

In [26]:
text_mnb_stemmed = text_mnb_stemmed.fit(train, train_y)
#text_mnb_stemmed

In [27]:
predicted_mnb_stemmed = text_mnb_stemmed.predict(test)
np.mean(predicted_mnb_stemmed == test_y)

0.76501627933086336

In [54]:
s = ["gabriel . lima @ gomes !"]

In [55]:
import string

In [81]:
punct = str.maketrans("","",string.punctuation)
for i in s:
    print(i.translate(punct).replace("\\S+","*"))
    

gabriel  lima  gomes 
