<a href="https://colab.research.google.com/github/guilhermelaviola/NaturalLanguageProcessing/blob/main/Class05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Text summarization**

In [10]:
# Installing the package to work with Wikipedia pages:
! pip3 install wikipedia

import wikipedia
wikipedia.set_lang('en')

document = wikipedia.page('Brazil')

document.title

document.content

document = document.content
document = document.replace('\n', ' ')



In [12]:
# Text segmentation with NLTK:
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize

# Download the 'punkt_tab' resource
nltk.download('punkt') # This line downloads the punkt resource, which is used for tokenization
nltk.download('punkt_tab') # This line downloads the 'punkt_tab' resource
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

tokens = word_tokenize(document)

print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['Brazil', ',', 'officially', 'the', 'Federative', 'Republic', 'of', 'Brazil', ',', 'is', 'the', 'largest', 'and', 'easternmost', 'country', 'in', 'South', 'America', '.', 'It', 'is', 'the', 'world', "'s", 'fifth-largest', 'country', 'by', 'area', 'and', 'the', 'seventh', 'largest', 'by', 'population', ',', 'with', 'over', '212', 'million', 'people', '.', 'The', 'country', 'is', 'a', 'federation', 'composed', 'of', '26', 'states', 'and', 'a', 'Federal', 'District', ',', 'which', 'hosts', 'the', 'capital', ',', 'Brasília', '.', 'Its', 'most', 'populous', 'city', 'is', 'São', 'Paulo', ',', 'followed', 'by', 'Rio', 'de', 'Janeiro', '.', 'Brazil', 'has', 'the', 'most', 'Portuguese', 'speakers', 'in', 'the', 'world', 'and', 'is', 'the', 'only', 'country', 'in', 'the', 'Americas', 'where', 'Portuguese', 'is', 'an', 'official', 'language', '.', 'Bounded', 'by', 'the', 'Atlantic', 'Ocean', 'on', 'the', 'east', ',', 'Brazil', 'has', 'a', 'coastline', 'of', '7,491', 'kilometers', '(', '4,655', '

In [13]:
# Using NLTK to get the 'root' from the words:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

roots = []

for token in tokens:
  if not token in punctuation and not token in stopwords:
    roots.append(stemmer.stem(token))

print(roots)

print(len(roots))
print(len(set(roots)))

['brazil', 'offici', 'feder', 'republ', 'brazil', 'largest', 'easternmost', 'countri', 'south', 'america', 'it', 'world', "'s", 'fifth-largest', 'countri', 'area', 'seventh', 'largest', 'popul', '212', 'million', 'peopl', 'the', 'countri', 'feder', 'compos', '26', 'state', 'feder', 'district', 'host', 'capit', 'brasília', 'it', 'popul', 'citi', 'são', 'paulo', 'follow', 'rio', 'de', 'janeiro', 'brazil', 'portugues', 'speaker', 'world', 'countri', 'america', 'portugues', 'offici', 'languag', 'bound', 'atlant', 'ocean', 'east', 'brazil', 'coastlin', '7,491', 'kilomet', '4,655', 'mi', 'cover', 'rough', 'half', 'south', 'america', "'s", 'land', 'area', 'border', 'countri', 'territori', 'contin', 'except', 'ecuador', 'chile', 'brazil', 'encompass', 'wide', 'rang', 'tropic', 'subtrop', 'landscap', 'well', 'wetland', 'savanna', 'plateaus', 'low', 'mountain', 'it', 'contain', 'amazon', 'basin', 'includ', 'world', '’', 'largest', 'river', 'system', 'extens', 'virgin', 'tropic', 'forest', 'brazi

In [14]:
# Counting the most frequent roots:
root_frequency = {
    root: roots.count(root) for root in set(roots)
    }

print(sorted(root_frequency.items(), key=lambda x: x[1], reverse=True))

# Deleting roots that are not useful to the study:
del root_frequency['===']
del root_frequency['==']
del root_frequency["''"]
del root_frequency['``']

[('brazil', 207), ('the', 133), ('brazilian', 98), ('countri', 97), ("'s", 84), ('world', 82), ('===', 68), ('in', 62), ('portugues', 53), ('state', 53), ('largest', 48), ('feder', 44), ('popul', 42), ('de', 40), ('also', 38), ('nation', 36), ('govern', 35), ('languag', 33), ('million', 32), ('includ', 31), ('centuri', 28), ('america', 27), ('it', 27), ('produc', 26), ('==', 26), ('south', 25), ('region', 25), ('system', 25), ('first', 25), ('rio', 25), ('intern', 25), ('law', 24), ('power', 24), ('one', 24), ('european', 23), ('polit', 22), ('constitut', 22), ('year', 22), ('forest', 21), ('militari', 20), ('são', 20), ('janeiro', 19), ('coloni', 19), ('among', 18), ('paulo', 18), ('war', 18), ('major', 18), ("''", 18), ('``', 18), ('larg', 17), ('cultur', 17), ('latin', 17), ('area', 17), ('energi', 17), ('influenc', 16), ('portug', 16), ('indigen', 16), ('court', 15), ('organ', 15), ('economi', 15), ('peopl', 15), ('differ', 15), ('number', 15), ('architectur', 15), ('term', 15), ('

In [17]:
# Ranking sentences in the document:
sentences = sent_tokenize(document)

print(sentences)

sentence_ranking = {}  # Change this to a dictionary

for sentence in sentences:
  punctuation = 0
  tokens = word_tokenize(sentence)
  roots = [stemmer.stem(token) for token in tokens]
  for root in roots:
    punctuation += sentence_ranking.get(root, 0) # Now 'get' will work on a dictionary
    # Calculating the average punctuation for the sentence:
    sentence_score = punctuation / len(tokens) if len(tokens) > 0 else 0

    # Updating the sentence ranking dictionary with the sentence score:
    sentence_ranking[sentence] = sentence_score

print(sentence_ranking)

['Brazil, officially the Federative Republic of Brazil, is the largest and easternmost country in South America.', "It is the world's fifth-largest country by area and the seventh largest by population, with over 212 million people.", 'The country is a federation composed of 26 states and a Federal District, which hosts the capital, Brasília.', 'Its most populous city is São Paulo, followed by Rio de Janeiro.', 'Brazil has the most Portuguese speakers in the world and is the only country in the Americas where Portuguese is an official language.', 'Bounded by the Atlantic Ocean on the east, Brazil has a coastline of 7,491 kilometers (4,655 mi).', "Covering roughly half of South America's land area, it borders all other countries and territories on the continent except Ecuador and Chile.", 'Brazil encompasses a wide range of tropical and subtropical landscapes, as well as wetlands, savannas, plateaus, and low mountains.', 'It contains most of the Amazon basin, including the world’s large

In [20]:
# Returning the sentences with punctuation above the average:
average_punctuation = sum(sentence_ranking.values()) / len(sentence_ranking) # Change here: summing the values of sentence_ranking
average_punctuation

summary = []
for i, sentence in enumerate(sentences):
  punctuation = sentence_ranking[sentence] # Change here: accessing dictionary by sentence
  if punctuation > (1.2 * average_punctuation):
    summary.append(sentence)
    if len(summary) == 5:
      resumo = ' '.join(summary)
      break

summary

[]