# Construindo uma enciclopedia

Tarefas:

1 . Leia os documentos do corpus `reuters` e monte uma lista de palavras que:
- Passe todas as palavras para letra minuscula
- tem mais que 5 contagens,
- tem mais que 2 caracteres, e
- não contem digitos

2 . Ordene a lista de palavras e grave no disco. Abra em um editor de texto e explore a lista de palavras. Implemente medidas extras de limpeza do vocabulário.

3 . Agora sua tarefa é limpar cada texto associado às várias palavras do nosso vocabulário usando seu arsenal de ferramentas de NLP!
Para cada texto:
- Limpe o texto para remover caracteres estranhos.
- Divida o texto em sentenças usando o tokenizador Punkt https://www.nltk.org/_modules/nltk/tokenize/punkt.html
- Pense em uma forma de identificar a primeira sentença do texto que não seja "estranha" (como os cabeçalhos de documento)
- Use um modelo de linguagem treinado no corpus reuters.
- Pense em usar a perplexidade de cada sentença como medida de "estranheza" da sentença.
- Talvez usar a primeira sentença de perplexidade "baixa"?

## Inicializa o código

In [4]:
# Faz os imports relevantes
import re
import nltk
from nltk.corpus import reuters
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE, Laplace

import requests

import json
from joblib import Parallel, delayed

import math
from time import sleep

In [2]:
#nltk.download()

## Importa o dataset de palavras a serem estudadas

In [3]:
# Cria a lista de palavras
words = list(map(str.lower, reuters.words()))
words[0:10]

['asian', 'exporters', 'fear', 'damage', 'from', 'u', '.', 's', '.-', 'japan']

In [4]:
# filtra as palavras de acordo com os critérios definidos
freq_reuters = nltk.FreqDist(words)
filtered = list(set(word for word in words if (freq_reuters[word] > 5 and len(word) > 2 and re.fullmatch(r"[a-z]*", word))))

filtered[0: 10]

['atico',
 'tax',
 'utilities',
 'tactics',
 'retreat',
 'nat',
 'centers',
 'imbalances',
 'merely',
 'reaffirmation']

In [5]:
# ordena a lista e salva em um arquivo para evitar uma nova leitura
filtered = sorted(filtered)

with open("storage/good_words.txt", "w") as dump:
    for word in filtered:
        dump.write(f"{word}\n")

## Faz os downloads das paginas da wikipédia para construir o dataset

In [6]:
# Le o bloco de notas com as palavras
with open("storage/good_words.txt", "r") as load:
    good_words = load.read().split("\n")

In [7]:
# Define os parametros para a API
BATCH_SIZE = 50
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "main",
    "rvsection": "0",
    "titles": "",
    "format": "json",
}

# Define uma função para dividir os batches
def split_batches(words, batch_size=BATCH_SIZE):
    k = 0
    while k < len(words):
        yield words[k:(k + batch_size)]
        k += batch_size

main_texts = {}
error_log = []

# Inicia a seção
S = requests.Session()

In [8]:
print(f"{len(good_words)} palavras boas para baixar")
print()

# realiza o fetch para cada batch das palavras inicialmente boas
for k, batch in enumerate(split_batches(good_words)):
    try:
        print(f'\rProcessando batch #{k + 1:05}', end='')
        PARAMS['titles'] = '|'.join(batch)
        r = S.get(url=URL, params=PARAMS)
        r_json = r.json()

        # Reverse map of normalized titles.
        title_map = {}
        for item in r_json['query']['normalized']:
            title_map[item['to']] = item['from']
            
        # Get texts.
        texts = {}
        for pageid, page_content in r_json['query']['pages'].items():
            if int(pageid) < 0:
                continue
            text = page_content['revisions'][0]['slots']['main']['*']
            if page_content['title'] in title_map:
                w = title_map[page_content['title']]
            else:
                w = page_content['title']
                
            texts[w] = text

        # Add to global dict.
        main_texts.update(texts)

    except Exception as e:
        error_log.append((e, r))
        
print()
print()
print("Download concluido")

9179 palavras boas para baixar

 Processando batch #00184
Download concluido


In [9]:
# Fução para limpar as palavras encontradas no redirected
def treat_redirected(text):
    
    text = text.split("[")[-1].split("]")[0].split(" (")[0]
    text = text.strip(" ")
    
    if (re.fullmatch(r"[a-z_ ]*", text.lower())):
        return text
    
    return "Remove Me"

In [10]:
# Procura nos textos recebidos palavras que não tiveram a pagina enontrada
redirected_words = list(set(treat_redirected(main_texts[word]) for word in main_texts.keys() if "#redirect" in main_texts[word][:50].lower()))
try:
    redirect.remove("Remove Me")
except:
    pass

print(f"{len(redirected_words)} palavras redirecionadas para baixar")
print()

# realiza o fetch para cada batch das palavras redirecionadas
for k, batch in enumerate(split_batches(redirected_words)):
    try:
        print(f'\rProcessando batch #{k + 1:05}', end='')
        PARAMS['titles'] = '|'.join(batch)
        r = S.get(url=URL, params=PARAMS)
        r_json = r.json()

        # Reverse map of normalized titles.
        title_map = {}
        for item in r_json['query']['normalized']:
            title_map[item['to']] = item['from']
            
        # Get texts.
        texts = {}
        for pageid, page_content in r_json['query']['pages'].items():
            if int(pageid) < 0:
                continue
            text = page_content['revisions'][0]['slots']['main']['*']
            if page_content['title'] in title_map:
                w = title_map[page_content['title']]
            else:
                w = page_content['title']
                
            texts[w.lower] = text

        # Add to global dict.
        main_texts.update(texts)

    except Exception as e:
        error_log.append((e, r))
        
print()
print()
print("Download concluido")

2169 palavras redirecionadas para baixar

 Processando batch #00044
Download concluido


In [11]:
# Retira da lista palavras que foram redirecionadas
words = main_texts.keys()
not_good_data = list(set(word for word in words if "#redirect" in main_texts[word][:50].lower()))
print(f"{len(not_good_data)} paginas redirecionadas para remover")
for word in not_good_data:
    main_texts.pop(word)
    
# Retira palavras que vão para as paginas de desambiguação
words = main_texts.keys()
not_good_data = list(set(word for word in words if "may refer to:" in main_texts[word]))
print(f"{len(not_good_data)} paginas de desambiguação para remover")
for word in not_good_data:
    main_texts.pop(word)
    
# Retira palavras que vão para as paginas muito curtas
words = main_texts.keys()
not_good_data = list(set(word for word in words if "{{Short pages monitor}}" in main_texts[word]))
not_good_data = list(set(not_good_data + [word for word in words if "{{wiktionary redirect}}" in main_texts[word]]))

print(f"{len(not_good_data)} paginas com erro")
for word in not_good_data:
    main_texts.pop(word)

2882 paginas redirecionadas para remover
3049 paginas de desambiguação para remover
31 paginas com erro


In [12]:
# Exporta os resultados para arquivos
with open('storage/texts.json', 'w') as f:
    json.dump(main_texts, f, indent=4)

with open('storage/errors.txt', 'w') as f:
    for e, r in error_log:
        f.write(f'{e} ({type(e)})\nConteudo:\n{r.headers}\n{"*"*100}\n')

## Realia a limpeza dos textos

In [68]:
# Le o bloco de notas com as palavras
with open("storage/texts.json", "r") as load:
    main_texts = json.load(load)

### Demonstração da limpeza

In [104]:
sample = "hanover"

In [105]:
# Define a variavel de teste
clean = main_texts[sample]
clean

''

In [106]:
# Remove referencias do HTML
clean = re.sub(r"<ref.*?(/ref>|/>)", "", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"<sup.*?(/sup>|/>)", "", clean, flags=re.DOTALL|re.MULTILINE)
clean

''

In [84]:
# Remove marcas de objetos desenhados na pagina (entre chaves)
clean = re.sub(r"\{\{(?:[^\'\'\'])*?\}\}", "", clean, flags=re.DOTALL|re.MULTILINE)
clean

'\n\n'

In [85]:
# Corta as legendas pré texto
splited_clean = clean.split("'''")[1:]

if len(splited_clean) == 0:
    pass
else:
    clean = "".join(splited_clean)

clean

'\n\n'

In [86]:
# Subistitui palavras entre colxete pela própria palavra
clean = re.sub(r"\[\[((?:[^|])*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)
clean

'\n\n'

In [87]:
# Subistitui palavras entre colxete com pipe pela palavra depois do pipe
clean = re.sub(r"\[\[(?:[^|]|)*(.*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)
clean

'\n\n'

In [88]:
# limpa algumas outras variaveis irrelevantes
clean = re.sub(r"\n|\'", r"", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"}}", r"", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"\|", r"", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"\(;|\(,", r"(SPECIALCHAR", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"\(SPECIALCHAR.*?\)", "", clean, flags=re.DOTALL|re.MULTILINE)
clean

''

### Limpeza total

In [89]:
def clean_string(clean, remove_special=True):
    # Remove referencias do HTML
    clean = re.sub(r"<ref.*?(/ref>|/>)", "", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"<sup.*?(/sup>|/>)", "", clean, flags=re.DOTALL|re.MULTILINE)

    # Remove marcas de objetos desenhados na pagina (entre chaves)
    clean = re.sub(r"\{\{(?:[^\'\'\'])*?\}\}", "", clean, flags=re.DOTALL|re.MULTILINE)

    # Corta as legendas pré texto
    splited_clean = clean.split("'''")[1:]

    if len(splited_clean) == 0:
        pass
    else:
        clean = "".join(splited_clean)

    # Subistitui palavras entre colxete pela própria palavra
    clean = re.sub(r"\[\[((?:[^|])*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)

    # Subistitui palavras entre colxete com pipe pela palavra depois do pipe
    clean = re.sub(r"\[\[(?:[^|]|)*(.*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)

    # limpa algumas outras variaveis irrelevantes
    clean = re.sub(r"\n|\'", r"", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"}}", r"", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"\|", r"", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"\(;|\(,", r"(SPECIALCHAR", clean, flags=re.DOTALL|re.MULTILINE)
    
    if remove_special:
        clean = re.sub(r"\(SPECIALCHAR.*?\)", "", clean, flags=re.DOTALL|re.MULTILINE)
    
    return clean

In [108]:
for word in list(main_texts.keys()):
    
    new_text = clean_string(main_texts[word])
    
    if len(new_text) == 0:
        main_texts.pop(word)
    else:
        main_texts[word] = new_text

In [109]:
# Exporta os resultados para arquivos
with open('storage/clean_texts.json', 'w') as f:
    json.dump(main_texts, f, indent=4)

## Divide o texto em sentenças usando o tokenizador Punkt

In [110]:
# Le o bloco de notas com as palavras
with open("storage/clean_texts.json", "r") as load:
    main_texts = json.load(load)

In [111]:
sample = "hanover"

In [112]:
main_sents = {}
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
for key, value in main_texts.items():
    main_sents[key] = sent_tokenizer.tokenize(value)

In [113]:
main_sents[sample]

['Hanover or Hannover  is the capital and largest city of the German state of Lower Saxony.',
 'Its 535,061 (2017) inhabitants make it the thirteenth-largest city in Germany as well as the third-largest city in Northern Germany after Hamburg and Bremen.',
 'The city lies at the confluence of the River Leine (progression: ) and its tributary Ihme, in the south of the North German Plain, and is the largest city in the Hannover–Braunschweig–Göttingen–Wolfsburg Metropolitan Region.',
 'It is the fifth-largest city in the Low German dialect area after Hamburg, Dortmund, Essen and Bremen.Before it became the capital of Lower Saxony in 1946 Hanover was the capital of the Principality of Calenberg (1636–1692), the Electorate of Hanover (1692–1814), the Kingdom of Hanover (1814–1866), the Province of Hanover of the Kingdom of Prussia (1868–1918), the Province of Hanover of the Free State of Prussia (1918–1946) and of the State of Hanover (1946).',
 'From 1714 to 1837 Hanover was by personal uni

In [150]:
# Exporta os resultados para arquivos
with open('storage/def_sents.json', 'w') as f:
    json.dump(main_sents, f, indent=4)

## Análise por Perplexidade

In [3]:
# Le o bloco de notas com as palavras
with open("storage/def_sents.json", "r") as load:
    main_sents = json.load(load)

### Utilizando o corpus reuters de base

In [168]:
# Prepraração dos dados
reuters_sentences = reuters.sents()
reuters_train, reuters_vocab = padded_everygram_pipeline(2, reuters_sentences)

reuters_train = list(list(t) for t in reuters_train)
reuters_vocab = list(reuters_vocab)

#Treinamento do Modelo
lm_reuters = Laplace(2)
lm_reuters.fit(reuters_train, reuters_vocab)

In [169]:
sample = "hanover"

In [170]:
# Encontra as perplexidades da amostra
text_sentences = [t.split() for t in main_sents[sample]]
test, _ = padded_everygram_pipeline(2, text_sentences)
test = list(list(t) for t in test)

# Calcula a perplexidade das sentenças
idx = 0 
min_value = math.inf
for i, s in enumerate(test):
    px = lm_reuters.perplexity(s)
    
    if(min_value > px):
        min_value = px
        idx = i

    print(px)

3430.347882606758
8372.738449782717
3529.6318222835257
7686.65569545711
10478.996008500379
14306.610566047464
13130.070098942655
7638.787064473866
11221.220325754444
5488.853729160141
19418.311305672054
10551.84100606817
4051.3891598621776
5014.145133997854


In [215]:
def find_lowest_perplexity_reuter_parallel(main_sents, words, do_print=False):
    
    encyclopedia_reuters = {}
    for i, word in enumerate(words):
        idx, min_value = find_lowest_perplexity_reuters(main_sents, word)
        encyclopedia_reuters[word] = main_sents[word][idx]
        
    return encyclopedia_reuters
        
def find_lowest_perplexity_reuters(main_sents, word, do_print=False):

    # Prepraração dos dados
    text_sentences = [t.split() for t in main_sents[word]]
    test, _ = padded_everygram_pipeline(2, text_sentences)
    test = list(list(t) for t in test)

    # Calcula a perplexidade das sentenças
    idx = 0 
    min_value = math.inf
    for i, s in enumerate(test):
        px = lm_reuters.perplexity(s)

        if(min_value > px):
            min_value = px
            idx = i

        if do_print:
            print(px)
            
    return idx, min_value 

#### Rodando em serie

In [174]:
encyclopedia_reuters = {}
words = list(main_sents.keys())
total = len(words)
for i, word in enumerate(words):
    idx, min_value = find_lowest_perplexity_reuters(main_sents, word)
    encyclopedia_reuters[word] = main_sents[word][idx]
    
    print(f"\r{i+1} out of {total}", end='')

35 out of 3832

KeyboardInterrupt: 

#### Rodando em Paralelo

In [216]:
n_jobs = 12
encyclopedia_reuters = {}
words = list(main_sents.keys())
part = round(len(words)/n_jobs)

words_batch = []
for i in range(n_jobs):
    if ((i+1)*part > len(words)):
        words_batch.append(words[i*part:])
    else:
        words_batch.append(words[i*part:(i+1)*part])

In [None]:
encyclopedia_reuters = {}
results = Parallel(n_jobs=n_jobs, verbose=50)(delayed(find_lowest_perplexity_reuter_parallel)(main_sents, words) for words in words_batch)

In [None]:
# Exporta os resultados para arquivos
with open('storage/encyclopedia_reuters.json', 'w') as f:
    json.dump(encyclopedia_reuters, f, indent=4)

### Usando o próprio texto de base

In [5]:
sample = "hanover"

In [6]:
# Prepraração dos dados
text_sentences = [t.split() for t in main_sents[sample]]
train, vocab = padded_everygram_pipeline(2, text_sentences)

train = list(list(t) for t in train)
vocab = list(vocab)

# Treinamento do Modelo
lm = Laplace(2)
lm.fit(train, vocab)

# Calcula a perplexidade das sentenças
idx = 0 
min_value = math.inf
for i, s in enumerate(train):
    px = lm.perplexity(s)
    
    if(min_value > px):
        min_value = px
        idx = i

    print(px)

79.7550009448657
124.01750308067002
93.11147167318235
83.93973690163187
121.94755697767927
110.54408863750604
128.84281222783548
116.52755746298136
150.0887349015754
113.86013794070298
106.94385975498913
117.51953388139117
114.44498585763819
123.91459807786784


In [7]:
def find_lowest_perplexity_same_parallel(main_sents, words, do_print=False):
    
    encyclopedia_same = {}
    for i, word in enumerate(words):
        idx, min_value = find_lowest_perplexity_same(main_sents, word)
        encyclopedia_same[word] = main_sents[word][idx]
        
    return encyclopedia_same

def find_lowest_perplexity_same(main_sents, word, do_print=False):

    # Prepraração dos dados
    text_sentences = [t.split() for t in main_sents[word]]
    train, vocab = padded_everygram_pipeline(2, text_sentences)

    train = list(list(t) for t in train)
    vocab = list(vocab)

    # Treinamento do Modelo
    lm = Laplace(2)
    lm.fit(train, vocab)
    
    # Calcula a perplexidade das sentenças
    idx = 0 
    min_value = math.inf
    for i, s in enumerate(train):
        px = lm.perplexity(s)
        
        if(min_value > px):
            min_value = px
            idx = i
        
        if do_print:
            print(px)
            
    return idx, min_value 

In [16]:
encyclopedia_same = {}
words = list(main_sents.keys())
total = len(words)
for i, word in enumerate(words):
    idx, min_value = find_lowest_perplexity_same(main_sents, word)
    encyclopedia_same[word] = main_sents[word][idx]
    
    print(f"\r{i+1} out of {total}", end='')

3832 out of 3832

#### Rodando em paralelo

In [9]:
n_jobs = 12
encyclopedia_reuters = {}
words = list(main_sents.keys())
part = round(len(words)/n_jobs)

words_batch = []
for i in range(n_jobs):
    if ((i+1)*part > len(words)):
        words_batch.append(words[i*part:])
    else:
        words_batch.append(words[i*part:(i+1)*part])

In [10]:
encyclopedia_same = {}
results = Parallel(n_jobs=n_jobs, verbose=50)(delayed(find_lowest_perplexity_same_parallel)(main_sents, words) for words in words_batch)

for result in results:
    encyclopedia_same.update(result)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:  1.1min remaining:  5.6min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:  1.1min remaining:  3.4min
[Parallel(n_jobs=12)]: Done   4 out of  12 | elapsed:  1.1min remaining:  2.3min
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed:  1.2min remaining:  1.7min
[Parallel(n_jobs=12)]: Done   6 out of  12 | elapsed:  1.2min remaining:  1.2min
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed:  1.2min remaining:   53.0s
[Parallel(n_jobs=12)]: Done   8 out of  12 | elapsed:  1.3min remaining:   37.5s
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed:  1.3min remaining:   25.3s
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:  1.3min remaining:   15.2s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:  1.4min remaining:    0.0s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapse

In [13]:
# Exporta os resultados para arquivos
with open('storage/encyclopedia_same.json', 'w') as f:
    json.dump(encyclopedia_same, f, indent=4)

## Explora as encyclopedias

In [165]:
# Le o bloco de notas com os dados
with open("storage/encyclopedia_same.json", "r") as load:
    encyclopedia_same = json.load(load)
    
with open("storage/encyclopedia_reuters.json", "r") as load:
    encyclopedia_reuters = json.load(load)

FileNotFoundError: [Errno 2] No such file or directory: 'storage/encyclopedia_reuters.json'

In [166]:
encyclopedia_same["hanover"]

'Hanover or Hannover  is the capital and largest city of the German state of Lower Saxony.'

In [167]:
encyclopedia_reuters["hanover"]

NameError: name 'encyclopedia_reuters' is not defined