# Construindo uma enciclopedia

Tarefas:

1 . Leia os documentos do corpus `reuters` e monte uma lista de palavras que:
- Passe todas as palavras para letra minuscula
- tem mais que 5 contagens,
- tem mais que 2 caracteres, e
- não contem digitos

2 . Ordene a lista de palavras e grave no disco. Abra em um editor de texto e explore a lista de palavras. Implemente medidas extras de limpeza do vocabulário.

3 . Agora sua tarefa é limpar cada texto associado às várias palavras do nosso vocabulário usando seu arsenal de ferramentas de NLP!
Para cada texto:
- Limpe o texto para remover caracteres estranhos.
- Divida o texto em sentenças usando o tokenizador Punkt https://www.nltk.org/_modules/nltk/tokenize/punkt.html
- Pense em uma forma de identificar a primeira sentença do texto que não seja "estranha" (como os cabeçalhos de documento)
- Use um modelo de linguagem treinado no corpus reuters.
- Pense em usar a perplexidade de cada sentença como medida de "estranheza" da sentença.
- Talvez usar a primeira sentença de perplexidade "baixa"?

## Inicializa o código

In [2]:
# Faz os imports relevantes

from nltk.corpus import reuters
import nltk
import re

import requests
import json
from time import sleep

In [2]:
#nltk.download()

## Importa o dataset de palavras a serem estudadas

In [3]:
# Cria a lista de palavras
words = list(map(str.lower, reuters.words()))
words[0:10]

['asian', 'exporters', 'fear', 'damage', 'from', 'u', '.', 's', '.-', 'japan']

In [4]:
# filtra as palavras de acordo com os critérios definidos
freq_reuters = nltk.FreqDist(words)
filtered = list(set(word for word in words if (freq_reuters[word] > 5 and len(word) > 2 and re.fullmatch(r"[a-z]*", word))))

filtered[0: 10]

['modestly',
 'conditions',
 'associates',
 'speaking',
 'metromail',
 'john',
 'recover',
 'know',
 'pont',
 'intervene']

In [5]:
# ordena a lista e salva em um arquivo para evitar uma nova leitura
filtered = sorted(filtered)

with open("storage/good_words.txt", "w") as dump:
    for word in filtered:
        dump.write(f"{word}\n")

## Faz os downloads das paginas da wikipédia para construir o dataset

In [6]:
# Le o bloco de notas com as palavras
with open("storage/good_words.txt", "r") as load:
    good_words = load.read().split("\n")

In [7]:
# Define os parametros para a API
BATCH_SIZE = 50
URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "rvslots": "main",
    "rvsection": "0",
    "titles": "",
    "format": "json",
}

# Define uma função para dividir os batches
def split_batches(words, batch_size=BATCH_SIZE):
    k = 0
    while k < len(words):
        yield words[k:(k + batch_size)]
        k += batch_size

main_texts = {}
error_log = []

# Inicia a seção
S = requests.Session()

In [8]:
print(f"{len(good_words)} palavras boas para baixar")
print()

# realiza o fetch para cada batch das palavras inicialmente boas
for k, batch in enumerate(split_batches(good_words)):
    try:
        print(f'\r Processando batch #{k + 1:05}', end='')
        PARAMS['titles'] = '|'.join(batch)
        r = S.get(url=URL, params=PARAMS)
        r_json = r.json()

        # Reverse map of normalized titles.
        title_map = {}
        for item in r_json['query']['normalized']:
            title_map[item['to']] = item['from']
            
        # Get texts.
        texts = {}
        for pageid, page_content in r_json['query']['pages'].items():
            if int(pageid) < 0:
                continue
            text = page_content['revisions'][0]['slots']['main']['*']
            if page_content['title'] in title_map:
                w = title_map[page_content['title']]
            else:
                w = page_content['title']
                
            texts[w] = text

        # Add to global dict.
        main_texts.update(texts)

    except Exception as e:
        error_log.append((e, r))
    
print()
print("Download concluido")

9179 palavras boas para baixar

 Processando batch #00184
Download concluido


In [9]:
# Procura nos textos recebidos palavras que não tiveram a pagina enontrada
redirected_words = list(set(word.split("[")[-1].split("]")[0] for word in main_texts.keys() if "#REDIRECT" in main_texts[word][:15]))
redirected_words = redirected_words + list(set(word.split("[")[-1].split("]")[0] for word in main_texts.keys() if "#redirect" in main_texts[word][:15]))
redirected_words = sorted(redirected_words)
redirected_words

print(f"{len(redirected_words)} palavras redirecionadas para baixar")
print()

# realiza o fetch para cada batch das palavras redirecionadas
for k, batch in enumerate(split_batches(redirected_words)):
    try:
        print(f'\r Processando batch #{k + 1:05}', end='')
        PARAMS['titles'] = '|'.join(batch)
        r = S.get(url=URL, params=PARAMS)
        r_json = r.json()

        # Reverse map of normalized titles.
        title_map = {}
        for item in r_json['query']['normalized']:
            title_map[item['to']] = item['from']
            
        # Get texts.
        texts = {}
        for pageid, page_content in r_json['query']['pages'].items():
            if int(pageid) < 0:
                continue
            text = page_content['revisions'][0]['slots']['main']['*']
            if page_content['title'] in title_map:
                w = title_map[page_content['title']]
            else:
                w = page_content['title']
                
            texts[w] = text

        # Add to global dict.
        main_texts.update(texts)

    except Exception as e:
        error_log.append((e, r))
        
print()
print("Download concluido")

2769 palavras redirecionadas para baixar

 Processando batch #00056
Download concluido


In [10]:
# # remove da lista palavras que foram redirecionadas
not_good_data = list(set(word.split("[")[-1].split("]")[0] for word in main_texts.keys() if "#REDIRECT" in main_texts[word][:15]))
not_good_data = not_good_data + list(set(word.split("[")[-1].split("]")[0] for word in main_texts.keys() if "#redirect" in main_texts[word][:15]))

print(f"{len(not_good_data)} paginas redirecionadas para remover")
for word in not_good_data:
    main_texts.pop(word)
    
# Retira palavras que vão para as pagins de desambiguação
not_good_data = list(set(word.split("[")[-1].split("]")[0] for word in main_texts.keys() if "may refer to:" in main_texts[word]))

print(f"{len(not_good_data)} paginas de desambiguação para remover")
for word in not_good_data:
    main_texts.pop(word)

2769 paginas redirecionadas para remover
2339 paginas de desambiguação para remover


In [11]:
# Exporta os resultados para arquivos
with open('storage/texts.json', 'w') as f:
    json.dump(main_texts, f, indent=4)

with open('storage/errors.txt', 'w') as f:
    for e, r in error_log:
        f.write(f'{e} ({type(e)})\nConteudo:\n{r.headers}\n{"*"*100}\n')

## Realia a limpeza dos textos

In [12]:
# Le o bloco de notas com as palavras
with open("storage/texts.json", "r") as load:
    main_texts = json.load(load)

### Demonstração da limpeza

In [13]:
# Define a variavel de teste
clean = main_texts['hanover']
clean

'{{about|the German city|other uses|Hanover (disambiguation)}}\n{{Redirect|Hannover}}\n{{short description|City in Lower Saxony, Germany}}\n{{Infobox German location\n|name               = Hanover\n|German_name        = Hannover\n|type               = City\n|image_photo        = {{Photomontage|position=center\n| photo1a = Neues Rathaus Hannover abends.jpg\n| photo2a = HanoverMarktkirche.JPG\n| photo2b = TelemaxHannover.jpg\n| photo3a = Kröpcke_Uhr_Hannover.jpg\n| photo3b = Dschungelpalast.jpg\n| photo4a = Inselgraben_Eilenriede.jpg\n   | size = 280\n   | spacing = 2\n   | color = #FFFFFF\n   | border = 0\n   | foot_montage = {{nobreak|From top: [[New Town Hall (Hanover)|New Town Hall]] of Hanover}}<br/>[[Marktkirche, Hanover|Market Church]], [[Telemax]], Kröpcke-Clock, an [[Asian elephant]] at the [[Hanover Zoo]] and the [[Eilenriede]] forest\n}}\n|image_flag         = Flag of Hanover (city).png\n|image_coa          = Coat of arms of Hannover.svg\n|coordinates        = {{coord|52|22|N|

In [14]:
# Remove referencias do HTML
clean = re.sub(r"<ref.*?/ref>", "", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"<sup.*?/sup>", "", clean, flags=re.DOTALL|re.MULTILINE)
clean

"{{about|the German city|other uses|Hanover (disambiguation)}}\n{{Redirect|Hannover}}\n{{short description|City in Lower Saxony, Germany}}\n{{Infobox German location\n|name               = Hanover\n|German_name        = Hannover\n|type               = City\n|image_photo        = {{Photomontage|position=center\n| photo1a = Neues Rathaus Hannover abends.jpg\n| photo2a = HanoverMarktkirche.JPG\n| photo2b = TelemaxHannover.jpg\n| photo3a = Kröpcke_Uhr_Hannover.jpg\n| photo3b = Dschungelpalast.jpg\n| photo4a = Inselgraben_Eilenriede.jpg\n   | size = 280\n   | spacing = 2\n   | color = #FFFFFF\n   | border = 0\n   | foot_montage = {{nobreak|From top: [[New Town Hall (Hanover)|New Town Hall]] of Hanover}}<br/>[[Marktkirche, Hanover|Market Church]], [[Telemax]], Kröpcke-Clock, an [[Asian elephant]] at the [[Hanover Zoo]] and the [[Eilenriede]] forest\n}}\n|image_flag         = Flag of Hanover (city).png\n|image_coa          = Coat of arms of Hannover.svg\n|coordinates        = {{coord|52|22|N|

In [15]:
# Remove marcas de objetos desenhados na pagina (entre chaves)
clean = re.sub(r"{{.*?}}", "", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"{{|}}", "", clean, flags=re.DOTALL|re.MULTILINE)
clean

"\n\n\n<br/>[[Marktkirche, Hanover|Market Church]], [[Telemax]], Kröpcke-Clock, an [[Asian elephant]] at the [[Hanover Zoo]] and the [[Eilenriede]] forest\n\n|image_flag         = Flag of Hanover (city).png\n|image_coa          = Coat of arms of Hannover.svg\n|coordinates        = \n|image_plan         = Hannover in H.svg\n|state              = Lower Saxony\n|district           = Hannover\n|elevation          = 55\n|area               = 204.01\n|area_metro         = <!-- Metropolitan area, in km². XXX.XX (no commas or other text) -->\n<!-- |population         = 518386  filled via Gemeindeschlüssel \n|pop_date              = 31 December 2013\n|pop_ref            = \n-->\n|pop_metro          = 1119032\n|postal_code        = 30001 - 30669\n|area_code          = 0511\n|licence            = H\n|Gemeindeschlüssel  = 03 2 41 001\n|NUTS               = <!-- NUTS value: DEXXX  - WILL ''not'' BE DISPLAYED -->\n|LOCODE             = DE HAJ\n|divisions          = 13 districts\n|website            

In [16]:
# Corta as legendas pré texto
splited_clean = clean.split("'''")[1:]
clean = "".join(splited_clean)
clean

"Hanover or Hannover (;  ; ) is the capital and largest city of the German [[States of Germany|state]] of [[Lower Saxony]]. Its 535,061 (2017) inhabitants make it the [[List of cities in Germany by population|thirteenth-largest city]] in Germany as well as the third-largest city in [[Northern Germany]] after [[Hamburg]] and [[Bremen]]. The city lies at the [[confluence]] of the [[River Leine]] (progression: ) and its [[tributary]] [[Ihme]], in the south of the [[North German Plain]], and is the largest city in the [[Hannover–Braunschweig–Göttingen–Wolfsburg Metropolitan Region]]. It is the fifth-largest city in the [[Low German]] dialect area after Hamburg, [[Dortmund]], [[Essen]] and Bremen.\n\nBefore it became the capital of Lower Saxony in 1946 Hanover was the capital of the [[Principality of Calenberg]] (1636–1692), the [[Electorate of Hanover]] (1692–1814), the [[Kingdom of Hanover]] (1814–1866), the [[Province of Hanover]] of the [[Kingdom of Prussia]] (1868–1918), the [[Province

In [17]:
# Subistitui palavras entre colxete pela própria palavra
clean = re.sub(r"\[\[((?:[^|])*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)
clean

"Hanover or Hannover (;  ; ) is the capital and largest city of the German [[States of Germany|state]] of Lower Saxony. Its 535,061 (2017) inhabitants make it the [[List of cities in Germany by population|thirteenth-largest city]] in Germany as well as the third-largest city in Northern Germany after Hamburg and Bremen. The city lies at the confluence of the River Leine (progression: ) and its tributary Ihme, in the south of the North German Plain, and is the largest city in the Hannover–Braunschweig–Göttingen–Wolfsburg Metropolitan Region. It is the fifth-largest city in the Low German dialect area after Hamburg, Dortmund, Essen and Bremen.\n\nBefore it became the capital of Lower Saxony in 1946 Hanover was the capital of the Principality of Calenberg (1636–1692), the Electorate of Hanover (1692–1814), the Kingdom of Hanover (1814–1866), the Province of Hanover of the Kingdom of Prussia (1868–1918), the Province of Hanover of the Free State of Prussia (1918–1946) and of the State of H

In [18]:
# Subistitui palavras entre colxete com pipe pela palavra depois do pipe
clean = re.sub(r"\[\[(?:[^|]|)*(.*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)
clean

"Hanover or Hannover (;  ; ) is the capital and largest city of the German |state of Lower Saxony. Its 535,061 (2017) inhabitants make it the |thirteenth-largest city in Germany as well as the third-largest city in Northern Germany after Hamburg and Bremen. The city lies at the confluence of the River Leine (progression: ) and its tributary Ihme, in the south of the North German Plain, and is the largest city in the Hannover–Braunschweig–Göttingen–Wolfsburg Metropolitan Region. It is the fifth-largest city in the Low German dialect area after Hamburg, Dortmund, Essen and Bremen.\n\nBefore it became the capital of Lower Saxony in 1946 Hanover was the capital of the Principality of Calenberg (1636–1692), the Electorate of Hanover (1692–1814), the Kingdom of Hanover (1814–1866), the Province of Hanover of the Kingdom of Prussia (1868–1918), the Province of Hanover of the Free State of Prussia (1918–1946) and of the State of Hanover (1946). From 1714 to 1837 Hanover was by personal union t

In [19]:
# limpa algumas outras variaveis irrelevantes
clean = re.sub(r"\n|\'", r"", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"\|", r"", clean, flags=re.DOTALL|re.MULTILINE)
clean = re.sub(r"\(;|\(,", r"(SPECIALCHAR", clean, flags=re.DOTALL|re.MULTILINE)
clean

'Hanover or Hannover (SPECIALCHAR  ; ) is the capital and largest city of the German state of Lower Saxony. Its 535,061 (2017) inhabitants make it the thirteenth-largest city in Germany as well as the third-largest city in Northern Germany after Hamburg and Bremen. The city lies at the confluence of the River Leine (progression: ) and its tributary Ihme, in the south of the North German Plain, and is the largest city in the Hannover–Braunschweig–Göttingen–Wolfsburg Metropolitan Region. It is the fifth-largest city in the Low German dialect area after Hamburg, Dortmund, Essen and Bremen.Before it became the capital of Lower Saxony in 1946 Hanover was the capital of the Principality of Calenberg (1636–1692), the Electorate of Hanover (1692–1814), the Kingdom of Hanover (1814–1866), the Province of Hanover of the Kingdom of Prussia (1868–1918), the Province of Hanover of the Free State of Prussia (1918–1946) and of the State of Hanover (1946). From 1714 to 1837 Hanover was by personal uni

### Limpeza total

In [20]:
def clean_string(clean):
    # Remove referencias do HTML
    clean = re.sub(r"<ref.*?/ref>", "", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"<sup.*?/sup>", "", clean, flags=re.DOTALL|re.MULTILINE)

    # Remove marcas de objetos desenhados na pagina (entre chaves)
    clean = re.sub(r"{{.*?}}", "", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"{{|}}", "", clean, flags=re.DOTALL|re.MULTILINE)

    # Corta as legendas pré texto
    splited_clean = clean.split("'''")[1:]
    clean = "".join(splited_clean)

    # Subistitui palavras entre colxete pela própria palavra
    clean = re.sub(r"\[\[((?:[^|])*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)

    # Subistitui palavras entre colxete com pipe pela palavra depois do pipe
    clean = re.sub(r"\[\[(?:[^|]|)*(.*?)\]\]", r"\1", clean, flags=re.DOTALL|re.MULTILINE)

    # limpa algumas outras variaveis irrelevantes
    clean = re.sub(r"\n|\'", r"", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"\|", r"", clean, flags=re.DOTALL|re.MULTILINE)
    clean = re.sub(r"\(;|\(,", r"(SPECIALCHAR", clean, flags=re.DOTALL|re.MULTILINE)
    
    return clean

In [21]:
for word in main_texts.keys():
    main_texts[word] = clean_string(main_texts[word])

In [22]:
# Exporta os resultados para arquivos
with open('storage/clean_texts.json', 'w') as f:
    json.dump(main_texts, f, indent=4)

## Divide o texto em sentenças usando o tokenizador Punkt

In [3]:
# Le o bloco de notas com as palavras
with open("storage/clean_texts.json", "r") as load:
    main_texts = json.load(load)

In [4]:
main_sents = {}
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
for key, value in main_texts.items():
    main_sents[key] = sent_tokenizer.tokenize(value)

In [5]:
main_sents["hanover"][0] = re.sub(r"\(SPECIALCHAR.*?\)", "", main_sents["hanover"][0])

In [6]:
main_sents["hanover"]

['Hanover or Hannover  is the capital and largest city of the German state of Lower Saxony.',
 'Its 535,061 (2017) inhabitants make it the thirteenth-largest city in Germany as well as the third-largest city in Northern Germany after Hamburg and Bremen.',
 'The city lies at the confluence of the River Leine (progression: ) and its tributary Ihme, in the south of the North German Plain, and is the largest city in the Hannover–Braunschweig–Göttingen–Wolfsburg Metropolitan Region.',
 'It is the fifth-largest city in the Low German dialect area after Hamburg, Dortmund, Essen and Bremen.Before it became the capital of Lower Saxony in 1946 Hanover was the capital of the Principality of Calenberg (1636–1692), the Electorate of Hanover (1692–1814), the Kingdom of Hanover (1814–1866), the Province of Hanover of the Kingdom of Prussia (1868–1918), the Province of Hanover of the Free State of Prussia (1918–1946) and of the State of Hanover (1946).',
 'From 1714 to 1837 Hanover was by personal uni

## Análise por Perplexidade

In [7]:
# Prepraração dos dados
from nltk.lm.preprocessing import padded_everygram_pipeline

tmp = [t.split() for t in main_sents["hanover"]]
train, vocab = padded_everygram_pipeline(2, tmp)

train = list(list(t) for t in train)
vocab = list(vocab)

In [8]:
tmp

[['Hanover',
  'or',
  'Hannover',
  'is',
  'the',
  'capital',
  'and',
  'largest',
  'city',
  'of',
  'the',
  'German',
  'state',
  'of',
  'Lower',
  'Saxony.'],
 ['Its',
  '535,061',
  '(2017)',
  'inhabitants',
  'make',
  'it',
  'the',
  'thirteenth-largest',
  'city',
  'in',
  'Germany',
  'as',
  'well',
  'as',
  'the',
  'third-largest',
  'city',
  'in',
  'Northern',
  'Germany',
  'after',
  'Hamburg',
  'and',
  'Bremen.'],
 ['The',
  'city',
  'lies',
  'at',
  'the',
  'confluence',
  'of',
  'the',
  'River',
  'Leine',
  '(progression:',
  ')',
  'and',
  'its',
  'tributary',
  'Ihme,',
  'in',
  'the',
  'south',
  'of',
  'the',
  'North',
  'German',
  'Plain,',
  'and',
  'is',
  'the',
  'largest',
  'city',
  'in',
  'the',
  'Hannover–Braunschweig–Göttingen–Wolfsburg',
  'Metropolitan',
  'Region.'],
 ['It',
  'is',
  'the',
  'fifth-largest',
  'city',
  'in',
  'the',
  'Low',
  'German',
  'dialect',
  'area',
  'after',
  'Hamburg,',
  'Dortmund,',


In [85]:
train

[[('<s>',),
  ('Hanover',),
  ('or',),
  ('Hannover',),
  ('is',),
  ('the',),
  ('capital',),
  ('and',),
  ('largest',),
  ('city',),
  ('of',),
  ('the',),
  ('German',),
  ('state',),
  ('of',),
  ('Lower',),
  ('Saxony.',),
  ('</s>',),
  ('<s>', 'Hanover'),
  ('Hanover', 'or'),
  ('or', 'Hannover'),
  ('Hannover', 'is'),
  ('is', 'the'),
  ('the', 'capital'),
  ('capital', 'and'),
  ('and', 'largest'),
  ('largest', 'city'),
  ('city', 'of'),
  ('of', 'the'),
  ('the', 'German'),
  ('German', 'state'),
  ('state', 'of'),
  ('of', 'Lower'),
  ('Lower', 'Saxony.'),
  ('Saxony.', '</s>')],
 [('<s>',),
  ('Its',),
  ('535,061',),
  ('(2017)',),
  ('inhabitants',),
  ('make',),
  ('it',),
  ('the',),
  ('thirteenth-largest',),
  ('city',),
  ('in',),
  ('Germany',),
  ('as',),
  ('well',),
  ('as',),
  ('the',),
  ('third-largest',),
  ('city',),
  ('in',),
  ('Northern',),
  ('Germany',),
  ('after',),
  ('Hamburg',),
  ('and',),
  ('Bremen.',),
  ('</s>',),
  ('<s>', 'Its'),
  ('Its

In [86]:
#Treinamento do Modelo
from nltk.lm import MLE, Laplace
lm = Laplace(2)
lm.fit(train, vocab)

#lm = MLE(2)
#lm = KneserNeyInterpolated(2)

## Selecão das sentenças com menor perplexidade 

In [92]:
import math

idx = 0 
min_value = math.inf
for i, s in enumerate(train):
    px = lm.perplexity(s)
    if(min_value > px):
        min_value = px
        idx = i
        
    print(lm.perplexity(s))
    
print(f"Index: {idx}, Value {min_value}")

79.7550009448657
124.01750308067002
93.11147167318235
83.93973690163187
121.94755697767927
110.54408863750604
128.84281222783548
116.52755746298136
150.0887349015754
113.86013794070298
106.94385975498913
117.51953388139117
114.44498585763819
123.91459807786784
Index: 0, Value 79.7550009448657


In [93]:
encyclopedia = {}

## Próximos passos

A fazer:
    
- Pense em uma forma de identificar a primeira sentença do texto que não seja "estranha" (como os cabeçalhos de documento)
- Use um modelo de linguagem treinado no corpus reuters.
- Pense em usar a perplexidade de cada sentença como medida de "estranheza" da sentença.
- Talvez usar a primeira sentença de perplexidade "baixa"?