## Imports e downloads

In [None]:
import nltk
import pandas as pd
import csv
from collections import OrderedDict, defaultdict, Counter
from urllib import request
from nltk import ngrams, FreqDist
from nltk.corpus import floresta

In [None]:
# Fetch the NLTK resources this notebook requests: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'floresta' corpus.
for resource in ('punkt', 'stopwords', 'floresta'):
    nltk.download(resource)

## Coleta de documento

In [None]:
def open_file(filepath, encoding=None):
    """Read a text file and return its entire content as a string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding, as before.

    Returns:
        The full content of the file.
    """
    # The context manager guarantees the handle is closed even if read() fails;
    # the previous version never closed the file.
    with open(filepath, 'r', encoding=encoding) as file:
        return file.read()

## Pré-processamento

In [None]:
def pre_process(raw):
    """Tokenize a raw text and derive the structures used by the analyses.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. The
    filtered list keeps only tokens that are not Portuguese stopwords, are
    purely alphabetic and are longer than one character.

    Args:
        raw: The text to process, as a string.

    Returns:
        A tuple ``(tokens, filtered, text, dist)`` where ``tokens`` is the
        full token list, ``filtered`` is the filtered token list, ``text`` is
        an ``nltk.Text`` built from the unfiltered tokens and ``dist`` is a
        ``FreqDist`` of the filtered tokens.
    """
    # Build a set once: membership tests against the stopword list would
    # otherwise rescan the whole list for every token.
    stopwords = set(nltk.corpus.stopwords.words('portuguese'))

    tokens = nltk.word_tokenize(raw.lower())
    filtered = [
        t for t in tokens
        if t not in stopwords and t.isalpha() and len(t) > 1
    ]
    # The Text object deliberately wraps the unfiltered tokens.
    text = nltk.Text(tokens)
    dist = FreqDist(filtered)

    return tokens, filtered, text, dist

## Informações básicas sobre o texto

In [None]:
def info(tokens, filtered_tokens):
    """Print basic size and vocabulary statistics about a tokenized text.

    Prints the total and unique token counts before and after filtering,
    followed by the lexical diversity: the percentage of unique tokens among
    the filtered tokens.

    Args:
        tokens: All tokens of the text.
        filtered_tokens: The tokens that survived filtering.
    """
    unique_tokens = set(tokens)
    unique_filtered = set(filtered_tokens)
    total_filtered = len(filtered_tokens)

    # An empty filtered list used to raise ZeroDivisionError; report 0.0.
    if total_filtered:
        diversity = (len(unique_filtered) / total_filtered) * 100
    else:
        diversity = 0.0

    print(f"quantidade de palavras: {len(tokens)}")
    print(f"quantidade de palavras após o filtro: {total_filtered}")
    print(f"quantidade de palavras únicas: {len(unique_tokens)}")
    print(f"quantidade de palavras únicas após o filtro: {len(unique_filtered)}")
    print(f"diversidade léxica: {diversity}")

### Frequência e distribuição de palavras

In [None]:
def most_frequent(tokens, dist, n=20):
    """Return the most frequent distinct tokens with their counts.

    Args:
        tokens: Iterable of tokens; duplicates are ignored.
        dist: Mapping from token to frequency, indexed as ``dist[token]``
            (presumably the ``FreqDist`` produced by ``pre_process``).
        n: Maximum number of entries to return. Defaults to 20.

    Returns:
        A list of ``(token, count)`` tuples sorted by descending count.
        Tokens with equal counts keep the order of their first appearance
        in ``tokens``.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order; going
    # through set() made the order of ties vary between runs.
    unique_tokens = dict.fromkeys(tokens)
    ranked = sorted(unique_tokens, key=lambda token: dist[token], reverse=True)

    return [(token, dist[token]) for token in ranked[:n]]

### Análise de contexto

In [None]:
def n_grams(text, dist):
    """Print every distinct bigram of ``text`` that contains the top word.

    The top word is whatever ``dist.max()`` returns. Each distinct matching
    bigram is printed once, with its two items joined by a space.

    Args:
        text: Sequence of tokens from which bigrams are built.
        dist: Frequency distribution exposing ``max()``.
    """
    top_word = dist.max()
    matching = (pair for pair in ngrams(text, 2) if top_word in pair)
    for pair in FreqDist(matching):
        print(' '.join(pair))

In [None]:
def context(text, dist):
    """Show the top word of ``dist`` in context via ``text.concordance``.

    The concordance method lets a word be seen in its surrounding context.

    Args:
        text: Object exposing ``concordance(word)`` (presumably an
            ``nltk.Text``).
        dist: Frequency distribution exposing ``max()``.

    Returns:
        Whatever ``text.concordance`` returns for the top word.
    """
    return text.concordance(dist.max())

## Análise de Emoção

In [None]:
def emolex(path='lexico/emolex.csv', encoding=None):
    """Load the emotion lexicon from a CSV file.

    The file must have a header with the columns ``word``, ``emotion`` and
    ``present``. Only rows whose ``present`` value is the integer 1 are used.

    Args:
        path: Location of the lexicon CSV. Defaults to the path previously
            hard-coded in this function.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding, as before.

    Returns:
        A tuple ``(wordList, emotionList)`` of ``defaultdict(list)`` objects:
        ``wordList`` maps each word to its emotions and ``emotionList`` maps
        each emotion to its words, both in file order.
    """
    wordList = defaultdict(list)
    emotionList = defaultdict(list)

    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for files passed to a reader.
    with open(path, 'r', newline='', encoding=encoding) as f:
        for row in csv.DictReader(f):
            if int(row['present']) == 1:
                wordList[row['word']].append(row['emotion'])
                emotionList[row['emotion']].append(row['word'])

    return wordList, emotionList

In [None]:
def generate_count(word_list, filtered_tokens):
    """Count how often each emotion is evoked by the given tokens.

    Args:
        word_list: Mapping from word to a list of emotions.
        filtered_tokens: Iterable of tokens to look up in ``word_list``.

    Returns:
        A ``Counter`` mapping each emotion to the number of times it was
        contributed by a token; tokens absent from ``word_list`` add nothing.
    """
    emoCount = Counter()
    for t in filtered_tokens:
        # .get avoids inserting an empty entry into a defaultdict lexicon for
        # every unknown token, and also works with a plain dict.
        # update() accumulates in place instead of building a new Counter.
        emoCount.update(word_list.get(t, ()))
    return emoCount

In [None]:
def newList(emotionList, filtered_tokens):
    """Group the tokens that appear in the lexicon by emotion.

    Args:
        emotionList: Mapping from emotion to a list of words.
        filtered_tokens: Iterable of tokens to classify.

    Returns:
        A ``defaultdict(list)`` mapping each emotion to the matching tokens,
        in token order. A token is appended once per occurrence in
        ``filtered_tokens`` and per occurrence in that emotion's word list.
    """
    # Invert the lexicon once (word -> emotions, in emotion order) so each
    # token is a single lookup instead of a scan over every lexicon word.
    emotions_by_word = defaultdict(list)
    for emotion in emotionList:
        for word in emotionList[emotion]:
            emotions_by_word[word].append(emotion)

    emoList = defaultdict(list)
    for t in filtered_tokens:
        for emotion in emotions_by_word.get(t, ()):
            emoList[emotion].append(t)

    return emoList