In [None]:
import nbimporter
import pandas as pd
import Convert_NRC as nrc
from header import EmotionAnalysis as ea

In [None]:
nrc.create_emolex()

## Original

In [None]:
filepath = 'books/senhora.txt'

In [None]:
raw = ea.open_file(filepath)

In [None]:
tokens = ea.tokenize(raw)

In [None]:
filtered = ea.filter(tokens)

In [None]:
text = ea.convert_to_text(tokens)

In [None]:
dist = ea.tokens_frequency(filtered)

In [None]:
ea.info(tokens, filtered)

In [None]:
freq = ea.most_frequent(filtered, dist, 20)

In [None]:
print(f"20 palavras mais frequentes: {freq}")

In [None]:
# Look the top token up once, then report it together with its count.
top_word = dist.max()
print(f"Palavra mais frequente: {top_word} - {dist[top_word]} vezes")

In [None]:
print("Colocações significantes: \n")
text.collocations()

In [None]:
ea.context(text, dist)

In [None]:
wordList, emotionList = ea.emolex()

In [None]:
emoList = ea.newList(emotionList, filtered)

In [None]:
emotionCounts = ea.generate_count(wordList, filtered)

In [None]:
emotionCounts.most_common()

In [None]:
wordCounts = ea.generate_count(emoList, filtered)

In [None]:
wordCounts.most_common(20)

In [None]:
# For each of the 20 most common entries of wordCounts, show the entry,
# whatever wordList stores for it, and a separator line.
separator = '------------------------------------'
for entry, _count in wordCounts.most_common(20):
    print(entry, wordList[entry], separator, sep='\n')

# Testes

In [None]:
import nltk
from collections import OrderedDict, defaultdict, Counter
import numpy as np

In [None]:
text.concordance("casamento")

In [None]:
text.similar("casamento")

In [None]:
text.common_contexts([freq[0][0], freq[1][0]])

In [None]:
# Rank the distinct filtered tokens by their count in `dist` (highest first)
# and plot where the top six occur in the text.
unique_tokens = set(filtered)
sortedToken = sorted(unique_tokens, key=lambda tok: dist[tok], reverse=True)
top_six = sortedToken[:6]
text.dispersion_plot(top_six)

In [None]:
# Plot the five most common entries of wordCounts.
# The (entry, count) pairs are unpacked directly: routing them through
# np.asarray forced entries and counts into one array dtype (strings, when
# the entries are words) and handed NumPy scalars to the plot for no benefit.
wordArray = [entry for entry, _count in wordCounts.most_common(5)]
text.dispersion_plot(wordArray)

## Tratar negação

In [None]:
def filter(tokens):
    """Drop Portuguese stopwords, non-alphabetic tokens and single characters.

    The word 'não' is removed from the stopword list before filtering, so
    negations stay in the output.

    Note: defining this function shadows the built-in ``filter`` for every
    cell that runs afterwards.

    Args:
        tokens: iterable of token strings.

    Returns:
        list: the surviving tokens, in their original order.
    """
    # A set gives constant-time membership tests; the previous list was
    # scanned from the start for every single token.
    stopwords = set(nltk.corpus.stopwords.words('portuguese'))
    stopwords.discard('não')

    return [t for t in tokens
            if t not in stopwords
            and t.isalpha()
            and len(t) > 1]

In [None]:
# Calls the notebook-level filter() defined in the cell above (the variant
# that keeps 'não'), not ea.filter.
new_tokens = filter(tokens)

In [None]:
def generate_count(tokens=None, lexicon=None):
    """Count emotion labels over a token sequence, flipping them after 'não'.

    A token's labels are passed through ``revert_emotion`` when the token
    immediately before it is 'não'.

    Args:
        tokens: sequence of token strings. When omitted, the notebook-level
            ``singular`` list is used, which keeps zero-argument calls working.
        lexicon: mapping from a word to its list of emotion labels. When
            omitted, the notebook-level ``wordList`` is used.

    Returns:
        collections.Counter: total occurrences of each emotion label.
    """
    if tokens is None:
        tokens = singular
    if lexicon is None:
        lexicon = wordList

    emoCount = Counter()
    previous = None  # the first token has no predecessor (no wrap-around to the end)
    for token in tokens:  # every token is visited, including the last one
        # .get() reads without inserting, so a defaultdict lexicon does not
        # grow by one empty entry per unknown token.
        emotions = lexicon.get(token, ())
        if emotions:
            if previous == 'não':
                # Flip the labels for this occurrence only; the lexicon
                # itself is left untouched so later occurrences and
                # re-runs see the original labels.
                emotions = revert_emotion(emotions)
            emoCount.update(emotions)
        previous = token
    return emoCount

In [None]:
# Each emotion label paired with the label it is swapped for.
_OPPOSITE_EMOTION = {
    'positivo': 'negativo',
    'negativo': 'positivo',
    'alegria': 'tristeza',
    'tristeza': 'alegria',
    'antecipação': 'surpresa',
    'surpresa': 'antecipação',
    'medo': 'raiva',
    'raiva': 'medo',
    'nojo': 'confiança',
    'confiança': 'nojo',
}


def revert_emotion(wordList):
    """Return the opposite of every known emotion label in ``wordList``.

    Args:
        wordList: iterable of emotion label strings.

    Returns:
        list: the swapped labels in input order. Labels without a known
        opposite are left out of the result.
    """
    return [_OPPOSITE_EMOTION[label]
            for label in wordList
            if label in _OPPOSITE_EMOTION]

In [None]:
# generate_count() reads the notebook-level `singular`, which is only assigned
# in the "Tratar plural" section further down; on a fresh kernel that cell has
# to run before this one.
newCounts = generate_count()
newCounts

In [None]:
def merge_negation(tokens, marker='não'):
    """Join every ``marker`` token with the token that follows it.

    Args:
        tokens: list of token strings.
        marker: the negation token to merge forward.

    Returns:
        list: a new list in which each ``marker`` and its successor are
        replaced by the single token "marker successor". A ``marker`` in the
        last position has nothing to merge with and is kept unchanged.
    """
    merged = []
    i = 0
    while i < len(tokens):
        if tokens[i] == marker and i + 1 < len(tokens):
            merged.append(' '.join(tokens[i:i + 2]))
            i += 2  # the successor has been consumed
        else:
            merged.append(tokens[i])
            i += 1
    return merged


# Build the merged list instead of shrinking new_tokens while looping over a
# range computed from its original length: that raised IndexError as soon as
# two negations had been merged.
new_tokens = merge_negation(new_tokens)

## Tratar plural

In [None]:
import re

# Precompiled plural-suffix patterns, each anchored at the end of the token.
s = re.compile('s$')
oes = re.compile('ões$|ãos$|ães$')
res = re.compile('res$')
zes = re.compile('zes$')
ses = re.compile('ses$')
ais = re.compile('ais$')
# `eis` used to be assigned twice ('éis$', then 'is$|eis$'); the second
# assignment silently replaced the first, so an accented ending lost only its
# final 'is' ('papéis' + 'el' became 'papéel'). A single pattern now strips
# the whole ending whether it is 'éis', 'eis' or a bare 'is'.
eis = re.compile('[ée]?is$')
ois = re.compile('óis$')
uis = re.compile('uis$')
ns = re.compile('ns$')
nes = re.compile('nes$')

In [None]:
# Ordered (plural suffix, singular replacement) pairs. The first suffix that
# matches wins, so the specific endings are listed before the bare 's'.
_PLURAL_RULES = (
    ('ões', 'ão'), ('ãos', 'ão'), ('ães', 'ão'),
    ('res', 'r'),
    ('zes', 'z'),
    ('ses', 's'),
    ('ais', 'al'),
    ('éis', 'el'),
    ('óis', 'ol'),
    ('uis', 'ul'),
    ('eis', 'il'),
    ('is', 'il'),
    ('ns', 'm'),
    ('nes', 'n'),
    ('s', ''),
)


def singularize(tokens):
    """Rewrite Portuguese plural endings to a singular form, token by token.

    The rules are kept inside this cell rather than read from module-level
    regex names. Previously the 'éis' branch picked up a pattern that had been
    reassigned to 'is$|eis$', so 'papéis' came out as 'papéel' instead of
    'papel'.

    The rewrite is purely suffix based: any token with a listed ending is
    changed whether or not it is a plural (for example 'mais' becomes 'mal').

    Args:
        tokens: iterable of token strings.

    Returns:
        list: one output token per input token, in the same order. Tokens
        with no listed ending are returned unchanged.
    """
    new_t = []
    for t in tokens:
        for suffix, replacement in _PLURAL_RULES:
            if t.endswith(suffix):
                t = t[:-len(suffix)] + replacement
                break
        new_t.append(t)
    return new_t

In [None]:
singular = singularize(new_tokens)

## Tratar gênero dos adjetivos

In [None]:
#tree

## Tratar verbos