In [None]:
import nltk
import pandas as pd
import csv
from collections import OrderedDict, defaultdict, Counter
from urllib import request
from nltk import ngrams, FreqDist
from nltk.corpus import floresta

# Fetch the 'punkt' tokenizer data; the tokenization cell below calls nltk.word_tokenize.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Project Gutenberg URL of "Ubirajara" (plain-text edition).
# HTTPS is used so the text is not fetched over an unencrypted connection.
url = 'https://www.gutenberg.org/cache/epub/38496/pg38496.txt'

In [None]:
# Open the HTTP request; the body is read and decoded in the next cell.
response = request.urlopen(url)

In [None]:
# Read the whole response body and decode it as UTF-8. The `with` block closes the
# HTTP connection as soon as the body has been read instead of leaving it open for
# the rest of the session; re-run the request cell before re-running this one.
with response:
    raw = response.read().decode('utf8')

In [None]:
# Tokenization: split the raw text into tokens.
# NOTE(review): no `language` argument is passed although the text is Portuguese.
# The hard-coded token offsets used in the header/footer cell below were presumably
# measured with this exact call, so changing the tokenizer settings would require
# re-measuring them — confirm before touching.
tokens = nltk.word_tokenize(raw)

In [None]:
# Strip the document header and footer from the token list.
# Both offsets are token positions, not character positions. The footer span is
# expressed in indices that are valid only AFTER the header has been removed.
# This cell edits `tokens` in place, so run it exactly once per tokenization.
HEADER_TOKEN_COUNT = 280
FOOTER_TOKEN_SPAN = slice(25677, 42803)

del tokens[:HEADER_TOKEN_COUNT]
del tokens[FOOTER_TOKEN_SPAN]

In [None]:
# Lowercase every token so the stopword comparison below is case-insensitive.
words = list(map(str.lower, tokens))

In [None]:
# Drop Portuguese stopwords.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
# `stopwords` is a list, so `w not in stopwords` would rescan it for every token.
# A set gives constant-time membership tests; the list itself is left untouched.
stopword_set = set(stopwords)
tokens = [w for w in words if w not in stopword_set]

In [None]:
# Keep purely alphabetic tokens only: this drops punctuation, numbers and any
# token that contains a non-letter character.
tokens = list(filter(str.isalpha, tokens))

In [None]:
# Keep only tokens that are at least three characters long.
tokens = [t for t in tokens if len(t) >= 3]

In [None]:
# Two empty indexes for the emotion lexicon, filled when the CSV is read:
#   wordList:    word    -> list of emotions
#   emotionList: emotion -> list of words
wordList, emotionList = defaultdict(list), defaultdict(list)

In [None]:
# Load the emotion lexicon and index it both ways (word -> emotions and
# emotion -> words). Only rows whose `present` column equals 1 are kept.
#
# The indexes are rebuilt from scratch here so that re-running this cell does not
# append every row a second time and inflate all downstream counts.
wordList = defaultdict(list)
emotionList = defaultdict(list)

# newline='' is what the csv module asks for when it is handed a file object.
# The explicit encoding avoids decoding accented words with a platform-dependent
# default — assumes the lexicon file is UTF-8; TODO confirm.
with open('teste.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        if int(row['present']) == 1:
            wordList[row['word']].append(row['emotion'])
            emotionList[row['emotion']].append(row['word'])

In [None]:
# Tally the emotions evoked by the document's tokens.
def generate_emotion_count(tokens, lexicon=None):
    """Count how many times each emotion is evoked by ``tokens``.

    Every token found in the lexicon contributes one count to each emotion
    listed for it; tokens absent from the lexicon contribute nothing.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to look up.
    lexicon : mapping of str -> iterable of str, optional
        Word -> emotions mapping. Defaults to the notebook-level ``wordList``.

    Returns
    -------
    collections.Counter
        Emotion -> number of occurrences.
    """
    if lexicon is None:
        lexicon = wordList
    emoCount = Counter()
    for t in tokens:
        # .get() instead of indexing: indexing a defaultdict inserts an empty
        # entry for every unknown token, silently growing the lexicon.
        emoCount.update(lexicon.get(t, ()))
    return emoCount

In [None]:
# Emotion tally for the filtered document tokens.
emotionCounts = generate_emotion_count(tokens)

In [None]:
# Display the emotion tally computed in the previous cell.
emotionCounts

In [None]:
# Per emotion, collect the lexicon words that actually occur in the document:
# one entry per occurrence, in document order.
#
# Each emotion's word list is counted once up front, so every token costs one
# dictionary lookup per emotion instead of a scan over the entire lexicon.
# The multiplicity is kept so a word listed more than once for an emotion is
# still appended that many times.
lexicon_counts = {emotion: Counter(emotion_words)
                  for emotion, emotion_words in emotionList.items()}

emoList = defaultdict(list)
for t in tokens:
    for e, per_word in lexicon_counts.items():
        hits = per_word.get(t, 0)
        if hits:
            emoList[e].extend([t] * hits)

In [None]:
# Tally the emotion-related words found in the document.
def generate_word_count(tokens):
    """Count occurrences of each word stored in the notebook-level ``emoList``.

    The per-emotion word lists are merged into a single tally, so a word that
    appears under several emotions is counted once for each of them.

    ``tokens`` is accepted so existing calls keep working, but it is not read.

    Returns
    -------
    collections.Counter
        Word -> number of entries across all emotions.
    """
    totals = Counter()
    for matched_words in emoList.values():
        totals.update(matched_words)
    return totals

In [None]:
# Tally of emotion-related words; the `tokens` argument is not read by the function.
wordCounts = generate_word_count(tokens)

In [None]:
# Display the word tally computed in the previous cell.
wordCounts

In [None]:
# For every emotion-related word found in the document, print the word, the
# emotions the lexicon lists for it, and a separator line.
separator = '------------------------------------'
for w in wordCounts:
    print(w, wordList[w], separator, sep='\n')

In [None]:
# Number of tokens left after the filtering cells above.
len(tokens)

In [None]:
# Vocabulary size: number of distinct tokens.
len(set(tokens))

In [None]:
# Frequency distribution of the filtered tokens.
dist = FreqDist(tokens)

In [None]:
# Display the frequency distribution built in the previous cell.
dist

In [None]:
def most_frequent(n=20, freq=None):
    """Return the ``n`` most frequent tokens as ``(token, count)`` pairs.

    Pairs are ordered by decreasing count. Tokens with equal counts keep the
    order in which they were first counted, so the result is the same on every
    run (sorting a ``set`` of tokens made the order of ties vary between runs).

    Parameters
    ----------
    n : int, default 20
        Maximum number of pairs to return.
    freq : Counter-like, optional
        Frequency table providing ``most_common``. Defaults to the
        notebook-level ``dist``.

    Returns
    -------
    list of (str, int)
    """
    if freq is None:
        freq = dist
    return freq.most_common(n)
# Show the most frequent tokens with their counts.
most_frequent()

In [None]:
# The single most frequent token in the distribution.
dist.max()

In [None]:
def n_grams(target_word=None, n=5):
    """Print every distinct n-gram of ``tokens`` that contains ``target_word``.

    Each n-gram is printed once, on its own line, with its tokens joined by
    single spaces. The n-grams are collected in a ``FreqDist``, but the counts
    themselves are not shown.

    Parameters
    ----------
    target_word : str, optional
        Word that must appear in the n-gram. Defaults to ``dist.max()``, the
        most frequent token of the notebook-level distribution.
    n : int, default 5
        Size of the n-grams.
    """
    if target_word is None:
        target_word = dist.max()
    fd = FreqDist(ng for ng in ngrams(tokens, n) if target_word in ng)
    for hit in fd:
        print(' '.join(hit))

n_grams()

In [None]:
# Load the word list of NLTK's `floresta` corpus; the spelling-suggestion function
# below uses it as its dictionary of correct spellings.
# `floresta` is already imported in the notebook's first cell; the import is repeated here.
from nltk.corpus import floresta
nltk.download('floresta')
correct_spellings = floresta.words()

In [None]:
def answer_eleven(entries=None):
    """Suggest, for each entry, the closest word in ``correct_spellings``.

    Candidates are limited to dictionary words that start with the same
    character as the entry. Among those, the word with the smallest
    ``nltk.edit_distance`` to the entry wins; on ties, the word that appears
    earliest in ``correct_spellings`` is chosen.

    Parameters
    ----------
    entries : iterable of str, optional
        Words to look up. Defaults to ``set(tokens)``, evaluated when the
        function is called so it always reflects the current ``tokens`` (a
        ``set(tokens)`` default in the signature is frozen when the ``def``
        runs).

    Returns
    -------
    list of str
        One suggestion per entry, in the iteration order of ``entries``. An
        entry with no candidate sharing its first character is returned
        unchanged rather than raising ``ValueError`` from ``min()`` on an
        empty sequence.
    """
    if entries is None:
        entries = set(tokens)

    # Index the dictionary by first character once, instead of rescanning the
    # whole word list for every entry. dict.fromkeys drops repeated words while
    # keeping first-occurrence order, so tie-breaking in min() is unaffected.
    by_initial = defaultdict(list)
    for word in dict.fromkeys(correct_spellings):
        if word:  # skip empty strings, which have no first character
            by_initial[word[0]].append(word)

    results = []
    for entry in entries:
        # entry[:1] is '' for an empty entry, which simply finds no candidates.
        candidates = by_initial.get(entry[:1], ())
        if not candidates:
            results.append(entry)
            continue
        results.append(min(candidates, key=
                           lambda candidate: nltk.edit_distance(entry, candidate)))
    return results

answer_eleven()

In [None]:
# Load the words of the 'Portuguese_Portugues-Latin1' file from NLTK's `udhr` corpus.
# NOTE(review): `correct` is only displayed in the next cell; no function visible
# in this notebook reads it.
from nltk.corpus import udhr
nltk.download('udhr')
correct = udhr.words('Portuguese_Portugues-Latin1')

In [None]:
# Display the word list loaded in the previous cell.
correct