[View in Colaboratory](https://colab.research.google.com/github/edgarbanhesse/ia369y-affective-computing/blob/master/ia369y_test_note_p3.ipynb)

In [3]:
# https://pythonspot.com/python-sentiment-analysis
# Exemplo de uso do classificados NaiveBayes

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names
 
def word_feats(words):
    return dict([(word, True) for word in words])
 
positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ]
negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]
 
positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
 
train_set = negative_features + positive_features + neutral_features
 
classifier = NaiveBayesClassifier.train(train_set) 
 
# Predict
neg = 0
pos = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify( word_feats(word))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1

# print(positive_vocab)
# print(positive_features)
# print(train_set)

print('Positive: ' + str(float(pos)/len(words)))
print('Negative: ' + str(float(neg)/len(words)))


Positive: 0.6
Negative: 0.2


In [4]:
# Fonte: https://medium.freecodecamp.org/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3
# Teste de cálculo do tf
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count/float(bowCount)
    return tfDict

S1 = "The car is driven on the road"
S2 = "The truck is driven on the highway"

bow1 = S1.split(" ")
bow2 = S2.split(" ")

wordSet = set(bow1).union(set(bow2))

wordDict1 = dict.fromkeys(wordSet, 0) 
wordDict2 = dict.fromkeys(wordSet, 0) 

for word in bow1:
    wordDict1[word]+=1
    
for word in bow2:
    wordDict2[word]+=1
    
import pandas as pd
pd.DataFrame([wordDict1, wordDict2])

tfBow1 = computeTF(wordDict1, bow1)
tfBow2 = computeTF(wordDict2, bow2)

tfBow1

{'The': 0.14285714285714285,
 'car': 0.14285714285714285,
 'driven': 0.14285714285714285,
 'highway': 0.0,
 'is': 0.14285714285714285,
 'on': 0.14285714285714285,
 'road': 0.14285714285714285,
 'the': 0.14285714285714285,
 'truck': 0.0}

In [5]:
tfBow2

{'The': 0.14285714285714285,
 'car': 0.0,
 'driven': 0.14285714285714285,
 'highway': 0.14285714285714285,
 'is': 0.14285714285714285,
 'on': 0.14285714285714285,
 'road': 0.0,
 'the': 0.14285714285714285,
 'truck': 0.14285714285714285}

In [6]:
# Teste de cálculo do idf
def computeIDF(docList):
    import math
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))
        
    return idfDict
  
idfs = computeIDF([wordDict1, wordDict2])
idfs

{'The': 0.0,
 'car': 0.3010299956639812,
 'driven': 0.0,
 'highway': 0.3010299956639812,
 'is': 0.0,
 'on': 0.0,
 'road': 0.3010299956639812,
 'the': 0.0,
 'truck': 0.3010299956639812}

In [7]:
# Teste de cálculo do tfidf
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return tfidf
  
tfidfBow1 = computeTFIDF(tfBow1, idfs)
tfidfBow2 = computeTFIDF(tfBow2, idfs)

import pandas as pd
pd.DataFrame([tfidfBow1, tfidfBow2])

Unnamed: 0,The,car,driven,highway,is,on,road,the,truck
0,0.0,0.043004,0.0,0.0,0.0,0.0,0.043004,0.0,0.0
1,0.0,0.0,0.0,0.043004,0.0,0.0,0.0,0.0,0.043004


In [14]:
import re
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

# Classe do T2
class TfidfImpl:

    def __init__(self, stopwords=None):
        self.stopwords = stopwords
    
    def clean_phrase(self, phrase):
        phrase = phrase.lower()
        # Remove pontuação
        phrase = re.sub(r'[\"\'!@#$%&*\(\)-_=+{}\[\]:;>.<,|\\`´]', '', phrase)
        if self.stopwords == 'english':
            # Remover stopwords em inglês e Lematização das palavras
            wordnet_lemmatizer = WordNetLemmatizer()
            # stwords = set(stopwords.words('english'))
            stwords = set(ENGLISH_STOP_WORDS)
            phrase = ' '.join([wordnet_lemmatizer.lemmatize(word) for word in phrase.split() if word not in stwords and len(word) > 2])
        # Remove espaços em branco extras
        phrase = re.sub(r'\s{1,}', ' ', phrase)
        return phrase


    def tokenize(self, phrase):
        # Limpar e retorna trigramas da frase
        return self.clean_phrase(phrase).split()
    
    def bag_of_words(self, phrases):
        bow = []
        for phrase in phrases:
            bow += phrase
        return sorted(set(bow))


    def compute_tf(self, words):
        tf = {}
        lbow = len(words)
        for word in words:
            tf[word] = tf.get(word, 0) + 1
        for word, count in tf.items():
            tf[word] = count / lbow
        return tf


    def compute_idf(self, phrases, N, bow):
        idfs = {}
        for df in bow:
            idfs[df] = idfs.get(df, 0)
            for words in phrases:
                if df in words:
                    idfs[df] = idfs.get(df, 0) + 1
        for df in idfs.keys():
            idfs[df] = np.log10(N / idfs[df])
        return idfs


    def compute(self, phrases):
        # Checagem... e conversão
        assert len(phrases) > 0
        if type(phrases[0]) is str:
            phrases = [self.tokenize(phrase) for phrase in phrases]
        
        tf_idf = {}
        N = len(phrases)
        bow = self.bag_of_words(phrases)
        idf = self.compute_idf(phrases, N, bow)
        for words in phrases:
            tf = self.compute_tf(words)
            for word, val in tf.items():
                tf_idf[word] = val * idf[word]
        return tf_idf

# Teste tfidf

my_phrases = ['The car is driven on the road', 'The truck is driven on the highway']

print('Teste da nossa implementação:')
tfidf = TfidfImpl()
tfidf = tfidf.compute(my_phrases)
tfidf

Teste da nossa implementação:


{'car': 0.043004285094854454,
 'driven': 0.0,
 'highway': 0.043004285094854454,
 'is': 0.0,
 'on': 0.0,
 'road': 0.043004285094854454,
 'the': 0.0,
 'truck': 0.043004285094854454}

In [38]:
# Teste tfidf sklearn

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize
import pandas as pd

# corpus = {1: "The game of life is a game of everlasting learning", 2: "The unexamined life is not worth living", 3: "Never stop learning"}
corpus = {1: "The car is driven on the road", 2: "The truck is driven on the highway"}

cvect = CountVectorizer(ngram_range=(1,1), token_pattern='(?u)\\b\\w+\\b')
counts = cvect.fit_transform(corpus.values())
normalized_counts = normalize(counts, norm='l1', axis=1)

tfidf = TfidfVectorizer(ngram_range=(1,1), token_pattern='(?u)\\b\\w+\\b', smooth_idf=False)
tfs = tfidf.fit_transform(corpus.values())
new_tfs = normalized_counts.multiply(tfidf.idf_)

feature_names = tfidf.get_feature_names()
corpus_index = [n for n in corpus]
df = pd.DataFrame(new_tfs.T.todense(), index=feature_names, columns=corpus_index)

print(df.loc[['The', 'car', 'driven', 'highway', 'is', 'on', 'road', 'the', 'truck']])


                1         2
The           NaN       NaN
car      0.241878  0.000000
driven   0.142857  0.142857
highway  0.000000  0.241878
is       0.142857  0.142857
on       0.142857  0.142857
road     0.241878  0.000000
the      0.285714  0.285714
truck    0.000000  0.241878


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [1]:
import numpy as np
X = np.random.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X, y)

print(clf.predict(X[2:3]))

[3]


In [2]:
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y) 

print(neigh.predict([[1.1]]))

print(neigh.predict_proba([[0.9]]))

[0]
[[0.66666667 0.33333333]]
