In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter


In [None]:
# Order of the context n-gram; this name is reassigned in later cells before each counting step.
N_GRAM = 2
# Workbook holding the sentences to score; later cells read it and write the computed columns back to it.
EXCEL_FILE = 'sentences.xlsx'

# Lendo arquivo

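Lendo o arquivo do corpus CETEN (o código abaixo lê `./data/final.txt`) e adicionando marcadores de início e fim (_dummy tokens_) a cada linha; cada marcador é repetido `n - 1` vezes, onde `n` é o tamanho do n-grama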

- `<s>`: início da sentença
- `</s>`: fim da sentença

In [None]:
def process_line(line, n_gram=3):
    """Turn one raw corpus line into a tuple of tokens with padded boundary markers.

    Newlines and surrounding whitespace are removed, then every ``'<s> '`` and
    ``' </s>'`` marker is repeated ``n_gram - 1`` times. The result is split on
    single spaces, so consecutive spaces produce empty-string tokens.

    Args:
        line: Raw text line.
        n_gram: N-gram order used to size the padding.

    Returns:
        Tuple of string tokens.
    """
    repeat = n_gram - 1
    cleaned = line.replace('\n', '').strip()
    # Start marker first, then end marker.
    for marker in ('<s> ', ' </s>'):
        cleaned = cleaned.replace(marker, marker * repeat)
    return tuple(cleaned.split(' '))

In [None]:
filepath = './data/final.txt'
# None loads every line; set a positive integer to load only the first n_lines while experimenting.
# The previous value of -1 was used as a slice bound and silently dropped the last line of the file.
n_lines = None
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm it matches the corpus file.
with open(filepath) as f:
    # Markers are padded for order N_GRAM+1, the largest n-gram counted later in the notebook.
    lines = [process_line(line, N_GRAM+1) for line in tqdm(f.readlines()[:n_lines])]

In [None]:
print(len(lines))  # 2835994 is the count noted by the original author
lines[2758962]  # shows one hard-coded sample sentence; raises IndexError if fewer lines were loaded

## Definindo vocabulário

In [None]:
%%time
def get_vocabulary(lines):
    """Count how many times each word occurs across all sentences.

    The boundary markers ``'<s>'`` and ``'</s>'`` are not counted.

    Args:
        lines: Iterable of token sequences (one per sentence).

    Returns:
        Dict mapping each word to its number of occurrences.
    """
    boundary_markers = ['<s>', '</s>']
    counts = {}
    for tokens in tqdm(lines):
        for token in tokens:
            if token in boundary_markers:
                continue
            counts[token] = counts.get(token, 0) + 1
    return counts

vocabulary = get_vocabulary(lines)
print ('Vocabulary size:', len(vocabulary))
# .get avoids a KeyError when the sample word is absent (e.g. when only part of the corpus is loaded).
print('qtd ocorrências:', vocabulary.get('jobim', 0))

# max() iterates the dict's keys; key=vocabulary.get ranks them by their count.
max_value = max(vocabulary, key=vocabulary.get)
print('A palavra mais frequente é:', max_value)
print('Quantidade de ocorrências:', vocabulary[max_value])

In [None]:
# Counter.most_common(10) yields the ten (word, count) pairs with the highest counts.
most_common_words = Counter(vocabulary).most_common(10)

print('As 10 palavras mais frequentes são:')
for word, count in most_common_words:
    print(f'{word}: {count}')

# Extraindo N-grams

Os n-gramas de um texto são todas as sequências contíguas de `N` palavras que ocorrem nas suas sentenças

In [None]:
def get_n_grams(lines, n=3):
    """Count every contiguous window of ``n`` tokens in each sentence.

    Args:
        lines: Iterable of sentences; each sentence must slice to a hashable
            value (e.g. a tuple of tokens).
        n: Window length.

    Returns:
        Dict mapping each window (a slice of the sentence) to its number of
        occurrences across all sentences.
    """
    counts = {}
    for tokens in tqdm(lines):
        # One window per start position; the range is empty when the sentence is shorter than n.
        for start in range(len(tokens) - n + 1):
            window = tokens[start:start + n]
            counts[window] = counts.get(window, 0) + 1
    return counts

#sentence = [
#    ('<s>', 'hoje', 'choveu', 'muito', 'no', 'rio', 'de', 'janeiro', 'hoje' '</s>'),
   # ('<s>', 'hoje', 'choveu', 'muito', 'no', 'espírito', 'santo', '</s>'),
   # ('<s>', 'hoje', 'nevou', 'muito', 'na', 'bahía', '</s>'),
#]
#xpto = get_n_grams(sentence, n=1)
#xpto.get(('hoje', 'choveu'), 0)
#xpto.get(('hoje'), 0)
#len(xpto)

# Calculando a _Forward Probability_

$$P(word|sentence) = \frac{Count(sentence+word)}{Count(sentence)}$$

In [None]:
def get_forward_probability(sentence, word, n_grams, n_plus1_grams, vocabulary):
    """Estimate P(word | sentence) as Count(sentence + word) / Count(sentence).

    Args:
        sentence: Context, either a tuple of words or a single non-tuple value
            (treated as a one-word context).
        word: Candidate next word.
        n_grams: Dict of counts keyed by context tuples.
        n_plus1_grams: Dict of counts keyed by context + word tuples.
        vocabulary: Container of known words.

    Returns:
        0 when ``word`` is not in ``vocabulary`` or the context count is 0;
        otherwise the ratio of the two counts. Both counts are printed
        whenever the word is in the vocabulary.
    """
    if word not in vocabulary:
        print(f'Word {word} not in vocabulary')
        return 0

    # A non-tuple context is looked up as a 1-tuple key.
    context = sentence if isinstance(sentence, tuple) else (sentence,)
    full_sentence = (*context, word)

    context_count = n_grams.get(context, 0)
    continuation_count = n_plus1_grams.get(full_sentence, 0)

    print(f'Ocorrências da sentença {sentence}: {context_count}')
    print(f'Ocorrências da sentença {full_sentence}: {continuation_count}')

    if context_count == 0:
        return 0
    return continuation_count / context_count

#N_GRAM = 1
#n_grams = get_n_grams(lines, n=N_GRAM)
#n_plus1_grams = get_n_grams(lines, n=N_GRAM+1)
#sentence = ('que', 'lhe')
#next_word = 'agradeço'
#prob = get_forward_probability(sentence, next_word, n_grams, n_plus1_grams, vocabulary)
#prob = get_forward_probability('se', 'descontrolou', n_grams, n_plus1_grams, vocabulary) #'se', 'descontrolou'
#prob

# Calculando a _backward probability_

$$P(word|sentence) = \frac{Count(word+sentence)}{Count(sentence)}$$

In [None]:
def get_backward_probability(sentence, word, n_grams, n_plus1_grams, vocabulary):
    """Estimate P(word | sentence) as Count(word + sentence) / Count(sentence).

    Args:
        sentence: Following context, either a tuple of words or a single
            non-tuple value (treated as a one-word context).
        word: Candidate preceding word.
        n_grams: Dict of counts keyed by context tuples.
        n_plus1_grams: Dict of counts keyed by word + context tuples.
        vocabulary: Container of known words.

    Returns:
        0 when ``word`` is not in ``vocabulary`` or the context count is 0;
        otherwise the ratio of the two counts. Both counts are printed
        whenever the word is in the vocabulary.
    """
    if word not in vocabulary:
        print(f'Word {word} not in vocabulary')
        return 0

    # A non-tuple context is looked up as a 1-tuple key.
    context = sentence if isinstance(sentence, tuple) else (sentence,)
    full_sentence = (word, *context)

    context_count = n_grams.get(context, 0)
    preceded_count = n_plus1_grams.get(full_sentence, 0)

    print(f'Ocorrências da sentença {sentence}: {context_count}')
    print(f'Ocorrências da sentença {full_sentence}: {preceded_count}')

    if context_count == 0:
        return 0
    return preceded_count / context_count

#sentence = ('que', 'lhe')
#previous_word = 'honra'
#prob = get_backward_probability(sentence, previous_word, n_grams, n_plus1_grams, vocabulary)
#prob

In [None]:
# Load the sentences to score from the workbook, using the openpyxl engine.
df_sentences = pd.read_excel(EXCEL_FILE, engine='openpyxl')
df_sentences.tail()

In [None]:
def get_ngrams(sentence, n):
    """Return the last ``n`` space-separated words of ``sentence`` as a tuple.

    Args:
        sentence: Text to split on single spaces.
        n: Number of trailing words to keep.

    Returns:
        Tuple with at most ``n`` words; empty when ``n`` is zero or negative.
    """
    if n <= 0:
        # words[-0:] is words[0:] (the whole sentence) and a negative n would
        # drop leading words instead, so handle these cases explicitly.
        return ()
    return tuple(sentence.split(' ')[-n:])
#df_sentences['n_grams'] = df_sentences['target'].apply(lambda x: get_ngrams(x, N_GRAM))#N_GRAM =2
#df_sentences[['target', 'n_grams']]

## Monta tuplas das sentenças

In [None]:
def get_wordsFromSentence(sentence):
    """Lower-case ``sentence`` and split it on whitespace, returning the words as a tuple."""
    lowered = sentence.lower()
    return tuple(lowered.split())

# Tokenise every target sentence into a tuple of lower-cased words.
sentences_ngrams = df_sentences['target'].apply(get_wordsFromSentence)
df_sentences['n_grams_sentences'] = sentences_ngrams

# Preview the column created just above. The previous code selected 'n_grams',
# which this notebook never creates, so it raised a KeyError unless the workbook
# already happened to contain that column.
print(df_sentences.iloc[0][['target', 'n_grams_sentences']])
# Note: this overwrites the workbook the sentences were read from.
df_sentences.to_excel(EXCEL_FILE, index=False)

### Obtendo frequência de cada palavra

In [None]:
def getFrequency(words):
    """Pair each word with its count in the global ``vocabulary`` (0 when absent).

    Args:
        words: Iterable of words.

    Returns:
        List of ``(word, count)`` tuples in the same order as ``words``.
    """
    return [(word, vocabulary.get(word, 0)) for word in words]


# Store the (word, count) pairs of each sentence and write the workbook back to disk.
df_sentences['word-freq'] = sentences_ngrams.apply(getFrequency)
df_sentences.to_excel(EXCEL_FILE, index=False)

## Carregar 3 gram

In [None]:
# Context order for the trigram model: 2-word contexts scored against 3-word sequences.
N_GRAM = 2

# Count every 2-gram and 3-gram; each call makes a full pass over `lines`.
n_grams = get_n_grams(lines, n=N_GRAM)
n_plus1_grams = get_n_grams(lines, n=N_GRAM+1)

In [None]:
# Number of distinct keys in each count table.
print("n_grams size:", len(n_grams))
print("n_plus1_grams size:", len(n_plus1_grams))

## Calcular 3 gram forward

In [None]:
def calculateProbForward(t):
    """Score every run of three consecutive words in ``t`` with the forward probability.

    For each triple ``(w1, w2, w3)`` the third word is scored given the context
    ``(w1, w2)``, using the global ``n_grams``, ``n_plus1_grams`` and ``vocabulary``.

    Args:
        t: Sliceable sequence of words.

    Returns:
        List of ``(w1, w2, w3, probability)`` tuples; empty when ``t`` has fewer
        than three words.
    """
    scored = []
    for first, second, third in zip(t, t[1:], t[2:]):
        probability = get_forward_probability(
            (first, second), third, n_grams, n_plus1_grams, vocabulary
        )
        scored.append((first, second, third, probability))
    return scored

# Store the forward trigram scores of each sentence and write the workbook back to disk.
df_sentences['3-gram-forward'] = sentences_ngrams.apply(calculateProbForward)
df_sentences.to_excel(EXCEL_FILE, index=False)

## Calcular 3 gram backward

In [None]:
def calculateProbBackward(t):
    """Score every run of three consecutive words in ``t`` with the backward probability.

    For each triple ``(w1, w2, w3)`` the first word is scored given the following
    context ``(w2, w3)``, using the global ``n_grams``, ``n_plus1_grams`` and
    ``vocabulary``.

    Args:
        t: Sliceable sequence of words.

    Returns:
        List of ``(w1, w2, w3, probability)`` tuples; empty when ``t`` has fewer
        than three words.
    """
    scored = []
    for first, second, third in zip(t, t[1:], t[2:]):
        probability = get_backward_probability(
            (second, third), first, n_grams, n_plus1_grams, vocabulary
        )
        scored.append((first, second, third, probability))
    return scored

# Store the backward trigram scores of each sentence and write the workbook back to disk.
df_sentences['3-gram-backward'] = sentences_ngrams.apply(calculateProbBackward)
df_sentences.to_excel(EXCEL_FILE, index=False)

## Carregar 2 gram

In [None]:
# Context order for the bigram model: 1-word contexts scored against 2-word sequences.
N_GRAM = 1

# Reuse the table currently bound to n_grams (the 2-gram counts in a top-to-bottom run)
# as the (n+1)-gram table, then recount n_grams at order 1.
# NOTE(review): this cell is not idempotent — running it a second time binds
# n_plus1_grams to the 1-gram table, because n_grams has already been replaced.
n_plus1_grams = n_grams
n_grams = get_n_grams(lines, n=N_GRAM)

In [None]:
# Number of distinct keys in each count table after the rebinding above.
print("n_grams size:", len(n_grams))
print("n_plus1_grams size:", len(n_plus1_grams))

## Calcular 2 gram forward

In [None]:
def calculateProb2Forward(t):
    """Score every pair of consecutive words in ``t`` with the forward probability.

    For each pair ``(w1, w2)`` the second word is scored given the first, using
    the global ``n_grams``, ``n_plus1_grams`` and ``vocabulary``.

    Args:
        t: Sliceable sequence of words.

    Returns:
        List of ``(w1, w2, probability)`` tuples; empty when ``t`` has fewer
        than two words.
    """
    scored = []
    for previous, following in zip(t, t[1:]):
        probability = get_forward_probability(
            previous, following, n_grams, n_plus1_grams, vocabulary
        )
        scored.append((previous, following, probability))
    return scored

# Store the forward bigram scores of each sentence and write the workbook back to disk.
df_sentences['2-gram-forward'] = sentences_ngrams.apply(calculateProb2Forward)
df_sentences.to_excel(EXCEL_FILE, index=False)

## Calcular 2 gram backward

In [None]:
def calculateProb2Backward(words):
    """Score every pair of consecutive words in ``words`` with the backward probability.

    For each pair ``(w1, w2)`` the first word is scored given the second, using
    the global ``n_grams``, ``n_plus1_grams`` and ``vocabulary``.

    Args:
        words: Sliceable sequence of words.

    Returns:
        List of ``(w1, w2, probability)`` tuples; empty when ``words`` has fewer
        than two words.
    """
    scored = []
    for previous, following in zip(words, words[1:]):
        probability = get_backward_probability(
            following, previous, n_grams, n_plus1_grams, vocabulary
        )
        scored.append((previous, following, probability))
    return scored


# Store the backward bigram scores of each sentence and write the workbook back to disk.
df_sentences['2-gram-backward'] = sentences_ngrams.apply(calculateProb2Backward)
df_sentences.to_excel(EXCEL_FILE, index=False)

## teste forward_probability

In [None]:
sentence = ('homem', 'é')
next_word = 'preto'

# In a top-to-bottom run, n_grams / n_plus1_grams hold the 1-gram / 2-gram counts here
# (rebound in the "Carregar 2 gram" cell), so a two-word context always scored 0 against them.
# Use tables whose order matches the query: 2-gram counts for the context and 3-gram
# counts for context + word. The 3-gram table is rebuilt once and kept under its own name.
if 'trigram_counts' not in globals():
    trigram_counts = get_n_grams(lines, n=3)
bigram_counts = n_plus1_grams
prob = get_forward_probability(sentence, next_word, bigram_counts, trigram_counts, vocabulary)
prob

### Apenas pesquisa de quais palavras aparecem depois de uma determinada sentença

In [None]:
def get_next_words(sentence, n_grams, n_plus1_grams):
    """Find every (n+1)-gram that starts with ``sentence``.

    Args:
        sentence: Context tuple to look up.
        n_grams: Dict of counts keyed by context tuples.
        n_plus1_grams: Dict of counts keyed by tuples one word longer.

    Returns:
        Dict mapping each matching (n+1)-gram to its count. Empty (after
        printing a message) when ``sentence`` has no count in ``n_grams``.
    """
    if n_grams.get(sentence, 0) == 0:
        print(f'Sentença {sentence} não encontrada')
        return {}

    # Full scan: keep the keys whose leading words equal the context.
    return {
        candidate: n_plus1_grams.get(candidate)
        for candidate in tqdm(n_plus1_grams.keys())
        if candidate[:-1] == sentence
    }

sentence = ('homem', 'é')

# In a top-to-bottom run, n_grams / n_plus1_grams hold the 1-gram / 2-gram counts here,
# so a two-word context was always reported as not found. Look the context up in the
# 2-gram counts and search its continuations in the 3-gram counts (rebuilt once and cached).
if 'trigram_counts' not in globals():
    trigram_counts = get_n_grams(lines, n=3)
next_words = get_next_words(sentence, n_plus1_grams, trigram_counts)
next_words


### Apenas pesquisa de quais palavras aparecem antes de uma determinada sentença

In [None]:
def get_previous_words(sentence, n_grams, n_plus1_grams):
    """Find every (n+1)-gram that ends with ``sentence``.

    Args:
        sentence: Context tuple to look up.
        n_grams: Dict of counts keyed by context tuples.
        n_plus1_grams: Dict of counts keyed by tuples one word longer.

    Returns:
        Dict mapping each matching (n+1)-gram to its count. Empty (after
        printing a message) when ``sentence`` has no count in ``n_grams``.
    """
    if n_grams.get(sentence, 0) == 0:
        print(f'Sentença {sentence} não encontrada')
        return {}

    # Full scan: keep the keys whose trailing words equal the context.
    return {
        candidate: n_plus1_grams.get(candidate)
        for candidate in tqdm(n_plus1_grams.keys())
        if candidate[1:] == sentence
    }

sentence = ('com', 'a')

# In a top-to-bottom run, n_grams / n_plus1_grams hold the 1-gram / 2-gram counts here,
# so a two-word context was always reported as not found. Look the context up in the
# 2-gram counts and search its predecessors in the 3-gram counts (rebuilt once and cached).
if 'trigram_counts' not in globals():
    trigram_counts = get_n_grams(lines, n=3)
get_previous_words(sentence, n_plus1_grams, trigram_counts)
