## Sumarizador de texto

### Inputs do usuario

In [1]:
#[*NLTK corpus, "sample_CNN", "CNN"]
corpus = "sample_CNN"

min_sent_size = 5
use_stopwords = True

### Imports relevantes

In [2]:
from nltk import corpus as nltk_corpus
stopwords = nltk_corpus.stopwords.words("english")

import re

from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim

from sklearn.cluster import MiniBatchKMeans

import pandas as pd
import numpy as np
import scipy
import pickle

import json
import os

import networkx as nx

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Abre o Corpus

In [3]:
manual_conversions = {"nt":"not", "ll":"will", "m":"am", "s":"TRASH"}
if use_stopwords:
    manual_conversions.update({stopword:"TRASH" for stopword in stopwords})

In [4]:
def preprocess(sent):
    sent = re.sub("-LRB-|-RRB-", "", sent, flags=re.DOTALL|re.MULTILINE)
    sent = sent.lower()
    sent = re.sub(r"[^a-z0-9\ ]", "", sent, flags=re.DOTALL|re.MULTILINE)
    sent = re.sub(r"[0-9]+", "num", sent, flags=re.DOTALL|re.MULTILINE)
    sent = re.sub(r" +(?= )", "", sent, flags=re.DOTALL|re.MULTILINE).strip()
    sent = sent.split(" ")
    sent = [manual_conversions[word] if word in manual_conversions.keys() else word for word in sent]
    return [word for word in sent if word != "TRASH"]

In [5]:
if corpus != "CNN" and corpus != "sample_CNN":

    if corpus == "reuters":
        data = nltk_corpus.reuters
    elif corpus == "brown":
        data = nltk_corpus.brown
    else:
        raise ValueError("Corpus not implemented yet")
    
    file_ids = data.fileids()

    sents = {}
    orig_text = {}
    for storie in file_ids:
        file_sents = data.raw(storie).split(".\n") 
        orig_text[storie] = file_sents
        preprocessed_sent = [preprocess(sent) for sent in file_sents]
        sents[storie] = [sent for sent in preprocessed_sent if len(sent) > min_sent_size]

In [6]:
if corpus == "sample_CNN":
    sents = {}
    orig_text = {}
    highlights = {}
    available_stories = os.listdir("./cnn_stories_sample")
    for storie in available_stories:
        with open(f"./cnn_stories_sample/{storie}", "r", encoding="utf-8") as file:
            file = file.read()
        original_sents = re.sub(r"\n\n", " ", file.split("@highlight")[0], flags=re.DOTALL|re.MULTILINE).split(" . ")
        highlights_sents = re.sub(r"\n\n*", " . ", "".join(file.split("@highlight")[1:]), flags=re.DOTALL|re.MULTILINE).split(" . ")

        # Guarda o texto original e os hghlights
        orig_text[storie] = original_sents
        highlights[storie] = highlights_sents
        
        # Guarda as sentenças préprocessadas
        preprocessed_sent = [preprocess(original_sent) for original_sent in original_sents]
        sents[storie] = [sent for sent in preprocessed_sent if len(sent) > min_sent_size]

if corpus == "CNN":
    sents = {}
    orig_text = {}
    highlights = {}
    available_stories = os.listdir("./cnn_stories_tokenized")
    for storie in available_stories:
        with open(f"./cnn_stories_sample/{storie}", "r", encoding="utf-8") as file:
            file = file.read()
        original_sents = re.sub(r"\n\n", " ", file.split("@highlight")[0], flags=re.DOTALL|re.MULTILINE).split(" . ")
        highlights_sents = re.sub(r"\n\n*", " . ", "".join(file.split("@highlight")[1:]), flags=re.DOTALL|re.MULTILINE).split(" . ")

        # Guarda o texto original e os hghlights
        orig_text[storie] = original_sents
        highlight[storie] = highlights_sents
        
        # Guarda as sentenças préprocessadas
        preprocessed_sent = [preprocess(original_sent) for original_sent in original_sents]
        sents[storie] = [sent for sent in preprocessed_sent if len(sent) > min_sent_size]

In [7]:
with open("storage/orig_sents.json", "w") as file:
    json.dump(orig_text, file)

if "CNN" in corpus:
    with open("storage/highlights.json", "w") as file:
        json.dump(sents, file)

sents_reference = {}
i = 0
with open('storage/all_sents.txt', 'w', encoding='utf8') as file:
    
    for text_id, text in sents.items():
        sents_reference[text_id] = []
        
        for sentence in text:
            file.write(" ".join([tok for tok in sentence]) + "\n")
            sents_reference[text_id].append(i)
            i += 1
            
with open("storage/sents_reference.json", "w") as file:
    json.dump(sents_reference, file)

### Define Funções relevantes

In [47]:
def load_data(cnn=False):
     
    with open('storage/all_sents.txt', 'r', encoding='utf8') as file:
        all_sents = [sent.split(" ") for sent in file.read().split("\n")]
    
    with open("storage/sents_reference.json", "r") as file:
        sents_reference = json.load(file)
        
    with open("storage/orig_sents.json", "r") as file:
        orig_sents = json.load(file)
    
    if cnn:
        with open("storage/highlights.json", "r") as file:
            highlights = json.load(file)
    else:
        highlights = None
        
    return all_sents, sents_reference, orig_sents, highlights


def find_most_relevant_cl(vecs, sents_reference, clusters=3, per_cluster_sent=1):
    
    best = {}
    for text_id in sents_reference.keys():
    
        # Pega os vetores da sentença desse texto
        target_vecs = vecs[sents_reference[text_id][0]:sents_reference[text_id][-1]+1] 

        # Faz a clusterização dessas sentenças
        kmeans_cbow = MiniBatchKMeans(n_clusters=3, random_state=42)
        result = kmeans_cbow.fit_transform(target_vecs)
        df = pd.DataFrame(result)

        # Seleciona a sentença mais próxima de cada centro de cluster
        this_best = []
        for cluster_number in range(result.shape[1]):
            this_best.append(df[kmeans_cbow.labels_ == cluster_number].sort_values(by=cluster_number).index.values[:per_cluster_sent])
        best[text_id] = sorted(list(np.array(this_best).flatten()))
            
    return best

def find_most_relevant_pr(vecs, sents_reference, n_sents=3):

    best = {}
    for text_id in sents_reference.keys():

        # Pega os vetores da sentença desse texto
        target_vecs = vecs[sents_reference[text_id][0]:sents_reference[text_id][-1]+1] 
        target_vecs = [vec.toarray() for vec in target_vecs]
        # Faz a clusterização dessas sentenças
        sim_mat = np.zeros((len(sents_reference[text_id]), len(sents_reference[text_id])))
        for i, v1 in enumerate(target_vecs):
            for j, v2 in enumerate(target_vecs):
                norm1 = np.linalg.norm(v1)
                norm2 = np.linalg.norm(v2)
                # Verifica se alguem vetor possui apenas zeros
                if v1.sum() != 0 and v2.sum() != 0:
                    # Verifica se o valor da normalização é razoavel
                    if norm1 > np.finfo(float).eps and norm2 > np.finfo(float).eps:
                        sim_mat[i][j] = (v1 * v2).sum() / (norm1 + norm2)
                    else:
                        sim_mat[i][j] = (v1 * v2).sum()

        graph = nx.from_numpy_array(sim_mat)
        pr = nx.pagerank(graph, max_iter=100)

        best[text_id] = sorted(pr, key=pr.get)[:n_sents]
        
    return best

### TF_IDF

In [9]:
all_sents, sents_reference, orig_text, highlights = load_data(("CNN" in corpus))

In [10]:
%%time
model = TfidfVectorizer(min_df=5, 
                        max_df=0.9, 
                        max_features=5000, 
                        sublinear_tf=False, 
                        analyzer=lambda x: x)

tfidf_vecs = model.fit_transform(all_sents)

Wall time: 312 ms


#### Clusterização

In [11]:
tfidf_cl_best = find_most_relevant_cl(tfidf_vecs, sents_reference)
tfidf_cl_summary = {}
for text_id in tfidf_cl_best.keys():
    tfidf_cl_summary[text_id] = [orig_text[text_id][sent] for sent in tfidf_cl_best[text_id]]

#### Page rank

In [12]:
tfidf_pr_best = find_most_relevant_pr(tfidf_vecs, sents_reference)
tfidf_pr_summary = {}
for text_id in tfidf_pr_best.keys():
    tfidf_pr_summary[text_id] = [orig_text[text_id][sent] for sent in tfidf_pr_best[text_id]]

#### Olha um resultado

In [13]:
text_id = list(tfidf_cl_summary.keys())[np.random.randint(0, len(tfidf_cl_summary))]

In [14]:
". ".join(orig_text[text_id])

"In the wake of criticism over a two-game suspension for Baltimore Ravens running back Ray Rice , the NFL has established a six-game unpaid ban for personnel who violate the league 's policy on domestic violence , Commissioner Roger Goodell said Thursday. A second incident would be punished by a lifetime ban from the league , Goodell said in a letter and memo to the owners of the league 's 32 teams. Without referring to Rice by name , he acknowledged in his letter that he made the wrong decision in that case. `` I did n't get it right. Simply put , we have to do better. And we will , '' he wrote. The policy , which is effective immediately , also applies to other types of violence. `` Violations of the Personal Conduct Policy regarding assault , battery , domestic violence or sexual assault that involve physical force will be subject to a suspension without pay of six games for a first offense , with consideration given to mitigating factors , as well as a longer suspension when circum

In [15]:
tfidf_cl_summary[text_id]

['The policy , which is effective immediately , also applies to other types of violence',
 "As we do in all disciplinary matters , if we believe that players ' due process rights are infringed upon during the course of discipline , we will assert and defend our members ' rights , '' the NFL Players Association said in its statement",
 'Read the letter and memo Rice was suspended for two games after video showed him dragging his unconscious fiancee -- whom he later married -- from an elevator']

In [16]:
tfidf_pr_summary[text_id]

['The policy , which is effective immediately , also applies to other types of violence',
 "'' Goodell said the circumstances that would warrant a longer suspension include incidents that predate a person 's time with an NFL team or acts that involve choking , repeated blows or a weapon",
 'The players union issued a response']

### Word_to_Vec

In [17]:
all_sents, sents_reference, orig_text, highlights = load_data(("CNN" in corpus))

In [18]:
%%time
model_cbow = gensim.models.Word2Vec(
    corpus_file='storage/all_sents.txt',
    window=5,
    size=200,
    seed=42,
    iter=100,
    workers=12,
)

Wall time: 36 s


In [19]:
def sum_word_vecs(model, sent):
    vec = np.zeros(model.wv.vector_size)
    for word in sent:
        if word in model:
            vec += model.wv.get_vector(word)
            
    norm = np.linalg.norm(vec)
    if norm > np.finfo(float).eps:
        vec /= norm
    return vec

In [24]:
word2vec_vecs = scipy.sparse.csr.csr_matrix([sum_word_vecs(model_cbow, sent) for sent in all_sents])

#### Clusterização

In [25]:
word2vec_cl_best = find_most_relevant_cl(word2vec_vecs, sents_reference)
word2vec_cl_summary = {}
for text_id in word2vec_cl_best.keys():
    word2vec_cl_summary[text_id] = [orig_text[text_id][sent] for sent in word2vec_cl_best[text_id]]

#### Page rank

In [48]:
word2vec_pr_best = find_most_relevant_pr(word2vec_vecs, sents_reference)
word2vec_pr_summary = {}
for text_id in word2vec_pr_best.keys():
    word2vec_pr_summary[text_id] = [orig_text[text_id][sent] for sent in word2vec_pr_best[text_id]]

PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')

#### Olha um resultado

In [27]:
text_id = list(word2vec_cl_summary.keys())[np.random.randint(0, len(word2vec_cl_summary))]

In [28]:
". ".join(orig_text[text_id])

"-LRB- CNN -RRB- -- War-plagued Somalia , with its crumbling government infrastructure , is the world 's most corrupt country , according to a global survey by the international watchdog Transparency International. The group 's annual Corruption Perception Index measures perceived levels of public sector corruption. As was the case last year , the 2009 survey found that countries that scored lowest all have something in common : they are fragile , unstable and scarred by war or long-standing conflicts. The group scored 180 countries on a scale of 0 -LRB- perceived to be highly corrupt -RRB- to 10 -LRB- perceived to have low levels of corruption -RRB-. Somalia scored 1.1. Next came Afghanistan at 1.3 , Myanmar at 1.4 , and Sudan and Iraq -- both at 1.5 On the other end of the scale , New Zealand ranked highest at 9.4 , followed by Denmark -LRB- 9.3 -RRB- , Singapore and Sweden -LRB- 9.2 -RRB- and Switzerland -LRB- 9.0 -RRB-. The United States came it at 19 -LRB- 7.5 -RRB- and the United

In [29]:
word2vec_cl_summary[text_id]

["The group 's annual Corruption Perception Index measures perceived levels of public sector corruption",
 'The group scored 180 countries on a scale of 0 -LRB- perceived to be highly corrupt -RRB- to 10 -LRB- perceived to have low levels of corruption -RRB-',
 'Somalia scored 1.1']

In [30]:
word2vec_pr_summary[text_id]

NameError: name 'word2vec_pr_summary' is not defined

### LDA

In [31]:
all_sents, sents_reference, orig_text, highlights = load_data(("CNN" in corpus))

In [32]:
dictionary = Dictionary(all_sents)
doc2bow = [dictionary.doc2bow(sent) for sent in all_sents]

In [33]:
%%time
NUM_TOPICS = 20
ldamodel = LdaMulticore(doc2bow, num_topics=NUM_TOPICS, id2word=dictionary, passes=30)

Wall time: 1min 2s


In [34]:
# Caso se queira explorar a LDA mudar para True
if False:
    lda_display = pyLDAvis.gensim.prepare(ldamodel, doc2bow, dictionary, sort_topics=False)
    pyLDAvis.display(lda_display)

In [35]:
raw_vecs = [ldamodel.get_document_topics(text) for text in doc2bow]

In [36]:
lda_vecs = []
for vec in raw_vecs:
    this_vec = []
    curr = 0
    for i in range(NUM_TOPICS):
        if (i == vec[curr][0]):
            this_vec.append(vec[curr][1])
            curr+=1
            if curr == len(vec):
                curr = -1
        else:
            this_vec.append(0)
    lda_vecs.append(this_vec)
    
lda_vecs = scipy.sparse.csr.csr_matrix(lda_vecs)

#### Clusterização

In [37]:
lda_cl_best = find_most_relevant_cl(lda_vecs, sents_reference)
lda_cl_summary = {}
for text_id in lda_cl_best.keys():
    lda_cl_summary[text_id] = [orig_text[text_id][sent] for sent in lda_cl_best[text_id]]

#### Page rank

In [38]:
lda_pr_best = find_most_relevant_pr(lda_vecs, sents_reference)
lda_pr_summary = {}
for text_id in lda_pr_best.keys():
    lda_pr_summary[text_id] = [orig_text[text_id][sent] for sent in lda_pr_best[text_id]]

#### Olha um resultado

In [39]:
text_id = list(lda_cl_summary.keys())[np.random.randint(0, len(lda_cl_summary))]

In [40]:
". ".join(orig_text[text_id])

"ATHENS , Georgia -LRB- CNN -RRB- -- Over the railroad tracks , near Agriculture Drive on the University of Georgia campus , sits a unique machine that may hold one of the solutions to big environmental problems like energy , food production and even global climate change. Biochar 's high carbon content and porous nature can help soil retain water , nutrients , protect soil microbes. `` This machine right here is our baby , '' said UGA research engineer Brian Bibens , who is one of a handful of researchers around the world working on alternative ways to recycle carbon. Bibens ' specialty is `` biochar , '' a highly porous charcoal made from organic waste. The raw material can be any forest , agricultural or animal waste. Some examples are woodchips , corn husks , peanut shells , even chicken manure. Bibens feeds the waste -- called `` biomass '' -- into an octagonally shaped metal barrel where it is cooked under intense heat , sometimes above 1,000 degrees Fahrenheit , the organic matt

In [41]:
lda_cl_summary[text_id]

["Bibens ' specialty is `` biochar , '' a highly porous charcoal made from organic waste",
 "Bibens feeds the waste -- called `` biomass '' -- into an octagonally shaped metal barrel where it is cooked under intense heat , sometimes above 1,000 degrees Fahrenheit , the organic matter is cooked through a thermochemical process called `` pyrolysis ''",
 'Day says biomass -- that otherwise would be thrown away -- could be developed into entirely new markets for biofuels , electricity , biomass extracts and pharmaceutical applications , in addition to biochar']

In [42]:
lda_pr_summary[text_id]

['In a few hours , organic trash is transformed into charcoal-like pellets farmers can turn into fertilizer',
 "`` We have 3 billion people out there who are at risk for climate change and they can be making money solving our global problem , '' said Day",
 'Some examples are woodchips , corn husks , peanut shells , even chicken manure']