## Sumarizador de texto

### Imports relevantes

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.ldamulticore import LdaMulticore
from sklearn.cluster import MiniBatchKMeans
from nltk import corpus as nltk_corpus
from gensim.corpora import Dictionary
from collections import defaultdict 
from rouge import Rouge 

import matplotlib.pyplot as plt
import pyLDAvis.gensim
import networkx as nx
import pandas as pd
import numpy as np
import warnings
import pickle
import gensim
import scipy
import json
import re
import os

stopwords = nltk_corpus.stopwords.words("english")
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Inputs do usuario

In [4]:
#[*NLTK corpus, "sample_CNN", "CNN"]

def print_options(dct):
    print ("Select Corpus Number:")
    for item, value in dct.items():
        print(f"{item}: {value}")

_corpus = {"0": "NLTK Reunters",
           "1": "NLTK Brown",
           "2": "CNN",
           "3": "CNN Sample"}

print_options(_corpus)
corpus = _corpus[input()]
    
min_sent_size = 5
use_stopwords = True

Select Corpus Number:
0: NLTK Reunters
1: NLTK Brown
2: CNN
3: CNN Sample


 3


### Abre o Corpus

In [5]:
manual_conversions = defaultdict(lambda: None)
manual_conversions.update({"nt":"not", "ll":"will", "m":"am", "s":"TRASH"})

if use_stopwords:
    manual_conversions.update({stopword:"TRASH" for stopword in stopwords})

In [6]:
def preprocess(sent):
    sent = re.sub("-LRB-|-RRB-", "", sent, flags=re.DOTALL|re.MULTILINE)
    sent = sent.lower()
    sent = re.sub(r"[^a-z0-9\ ]", "", sent, flags=re.DOTALL|re.MULTILINE)
    sent = re.sub(r"[0-9]+", "num", sent, flags=re.DOTALL|re.MULTILINE)
    sent = re.sub(r" +(?= )", "", sent, flags=re.DOTALL|re.MULTILINE).strip()
    sent = sent.split(" ")
    
    sent = [manual_conversions[word] if manual_conversions[word] else word for word in sent]
    return [word for word in sent if word != "TRASH"]

In [7]:
def nltk_(data):
    file_ids = data.fileids()

    sents     = {}
    orig_text = {}

    for storie in file_ids:
        file_sents = data.raw(storie).split(".\n") 
        
        orig_text[storie] = file_sents
        preprocessed_sent = [preprocess(sent) for sent in file_sents]
        sents[storie]     = [sent for sent in preprocessed_sent if len(sent) > min_sent_size]

In [8]:
def cnn(_dir):
    sents      = {}
    orig_text  = {}
    highlights = {}
    available_stories = os.listdir(_dir)
    
    for storie in available_stories:
        with open(f"{_dir}/{storie}", "r", encoding="utf-8") as file:
            file = file.read()
        
        original_sents   = re.sub(r"\n\n", " ", file.split("@highlight")[0], flags=re.DOTALL|re.MULTILINE).split(" . ")
        highlights_sents = re.sub(r"\n\n*", " . ", "".join(file.split("@highlight")[1:]), flags=re.DOTALL|re.MULTILINE).split(" . ")

        # Guarda o texto original e os hghlights
        orig_text[storie]  = original_sents
        highlights[storie] = highlights_sents
        
        # Guarda as sentenças préprocessadas
        preprocessed_sent = [preprocess(original_sent) for original_sent in original_sents]
        sents[storie]     = [sent for sent in preprocessed_sent if len(sent) > min_sent_size]
    
    return sents, orig_text, highlights

In [9]:
if(corpus == "NLTK Reunters"):
    sents, orig_text = nltk_(corpus.reuters)

elif(corpus == "NLTK Brown"):
    sents, orig_text = nltk_(corpus.brow)
    
elif(corpus == "CNN"):
    sents, orig_text, highlights = cnn("./cnn_stories")
    
elif(corpus == "CNN Sample"):
    sents, orig_text, highlights = cnn("./cnn_stories_sample")

In [10]:
with open("storage/orig_sents.json", "w") as file:
    json.dump(orig_text, file)

if "CNN" in corpus:
    with open("storage/highlights.json", "w") as file:
        json.dump(sents, file)

sents_reference = {}    
with open('storage/all_sents.txt', 'w', encoding='utf8') as file:
    i = 0
    for text_id, text in sents.items():
        sents_reference[text_id] = []
        
        for sentence in text:
            file.write(" ".join([tok for tok in sentence]) + "\n")
            sents_reference[text_id].append(i)
            i += 1
            
with open("storage/sents_reference.json", "w") as file:
    json.dump(sents_reference, file)

### Define Funções relevantes

In [54]:
def load_data(cnn=False):
     
    with open('storage/all_sents.txt', 'r', encoding='utf8') as file:
        all_sents = [sent.split(" ") for sent in file.read().split("\n")]
    
    with open("storage/sents_reference.json", "r") as file:
        sents_reference = json.load(file)
        
    with open("storage/orig_sents.json", "r") as file:
        orig_sents = json.load(file)
    
    if cnn:
        with open("storage/highlights.json", "r") as file:
            highlights = json.load(file)
    else:
        highlights = None
        
    return all_sents, sents_reference, orig_sents, highlights


def find_most_relevant_cl(vecs, sents_reference, clusters=3, per_cluster_sent=1):
    
    best = {}
    for text_id in sents_reference.keys():
    
        # Pega os vetores da sentença desse texto
        target_vecs = vecs[sents_reference[text_id][0]:sents_reference[text_id][-1]+1] 

        # Faz a clusterização dessas sentenças
        kmeans_cbow = MiniBatchKMeans(n_clusters=3, random_state=42)
        result = kmeans_cbow.fit_transform(target_vecs)
        df = pd.DataFrame(result)

        # Seleciona a sentença mais próxima de cada centro de cluster
        this_best = []
        for cluster_number in range(result.shape[1]):
            this_best.append(df[kmeans_cbow.labels_ == cluster_number].sort_values(by=cluster_number).index.values[:per_cluster_sent])
        
        best[text_id] = sorted(list(np.array(this_best).flatten()))
            
    return best

def find_most_relevant_pr(vecs, sents_reference, n_sents=3):

    best = {}
    for text_id in sents_reference.keys():

        # Pega os vetores da sentença desse texto
        target_vecs = vecs[sents_reference[text_id][0]:sents_reference[text_id][-1]+1] 
        target_vecs = [vec.toarray() for vec in target_vecs]
        
        # Faz a clusterização dessas sentenças
        sim_mat = np.zeros((len(sents_reference[text_id]), len(sents_reference[text_id])))
        for i, v1 in enumerate(target_vecs):
            for j, v2 in enumerate(target_vecs):
                norm1 = np.linalg.norm(v1)
                norm2 = np.linalg.norm(v2)
        
                # Verifica se alguem vetor possui apenas zeros
                if v1.sum() != 0 and v2.sum() != 0:
                    
                    # Verifica se o valor da normalização é razoavel
                    if norm1 > np.finfo(float).eps and norm2 > np.finfo(float).eps:
                        sim_mat[i][j] = (v1 * v2).sum() / (norm1 + norm2)
                    
                    else:
                        sim_mat[i][j] = (v1 * v2).sum()

        graph = nx.from_numpy_array(sim_mat)
        pr = nx.pagerank(graph, max_iter=100)

        best[text_id] = sorted(pr, key=pr.get)[:n_sents]
        
    return best

def find_most_relevant_rougue(orig_text, n_sents=3):
    best = {}
    for text_id in sents_reference.keys():

        # Pega as sentençãs do texto
        sents = all_sents[sents_reference[text_id][0]:sents_reference[text_id][-1]+1] 
        sents = [" ".join(sents) for sents in sents]
        
        #sim_mat = np.zeros((len(sents_reference[text_id]), len(sents_reference[text_id]))) 
        for i, s1 in enumerate(sents):
            for j, s2 in enumerate(sents):
                score = Rouge().get_scores(s1, s2)
                break
            break

#rougue_best = find_most_relevant_rougue(orig_text)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


### TF_IDF

In [14]:
all_sents, sents_reference, orig_text, highlights = load_data(("CNN" in corpus))

In [16]:
model = TfidfVectorizer(min_df=5, 
                        max_df=0.9, 
                        max_features=5000, 
                        sublinear_tf=False, 
                        analyzer=lambda x: x)

tfidf_vecs = model.fit_transform(all_sents)

#### Clusterização

In [12]:
tfidf_cl_best = find_most_relevant_cl(tfidf_vecs, sents_reference)
tfidf_cl_summary = {}
for text_id in tfidf_cl_best.keys():
    tfidf_cl_summary[text_id] = [orig_text[text_id][sent] for sent in tfidf_cl_best[text_id]]

#### Page rank

In [13]:
tfidf_pr_best = find_most_relevant_pr(tfidf_vecs, sents_reference)
tfidf_pr_summary = {}
for text_id in tfidf_pr_best.keys():
    tfidf_pr_summary[text_id] = [orig_text[text_id][sent] for sent in tfidf_pr_best[text_id]]

#### Olha um resultado

In [14]:
text_id = list(tfidf_cl_summary.keys())[np.random.randint(0, len(tfidf_cl_summary))]

In [15]:
". ".join(orig_text[text_id])

"-LRB- CNN -RRB- -- The worst kept secret in Formula One is finally out -- Fernando Alonso is leaving Ferrari and will be replaced by Sebastian Vettel. Red Bull 's four-time world champion has signed a three-year contract with the Scuderia , the oldest team in F1 , from 2015. After ending his five-year stint at Ferrari , Alonso remains coy on where he will be driving next season. The double move by two of the sport 's high profile world champions is the most significant in the driver market this season. But in the fickle world of F1 there are no guarantees it will work out for either of the ambitious racers. There are still seats to be filled at McLaren , Force India and Toro Rosso. With the curtain about to fall on the 2014 season at Sunday 's Abu Dhabi Grand Prix , hopeful drivers have just one more chance to stake their claim for the remaining seats. Where will Alonso go ? Alonso is regarded as the best all-round driver currently racing at the elite level of motorsport , a fact many

In [16]:
tfidf_cl_summary[text_id]

["Red Bull 's four-time world champion has signed a three-year contract with the Scuderia , the oldest team in F1 , from 2015",
 'There are still seats to be filled at McLaren , Force India and Toro Rosso',
 "`` We know you 're awaiting news on our driver line-up"]

In [17]:
tfidf_pr_summary[text_id]

['Button -- the 2009 world champion with Brawn Grand Prix which has since morphed into Mercedes -- has remained sanguine about his future and has even explored the idea of moving to sports car racing',
 'Grid shrinks to 18 in 2015 ? The F1 market may be flooded with plenty of eager racers but the number of seats has been squeezed',
 '`` I am extremely motivated to help the team get back to the top']

### Word_to_Vec

In [18]:
all_sents, sents_reference, orig_text, highlights = load_data(("CNN" in corpus))

In [19]:
%%time
model_cbow = gensim.models.Word2Vec(
    corpus_file='storage/all_sents.txt',
    window=5,
    size=200,
    seed=42,
    iter=100,
    workers=12,
)

Wall time: 4min 19s


In [20]:
def sum_word_vecs(model, sent):
    vec = np.zeros(model.wv.vector_size)
    for word in sent:
        if word in model:
            vec += model.wv.get_vector(word)
            
    norm = np.linalg.norm(vec)
    if norm > np.finfo(float).eps:
        vec /= norm
    return vec

In [21]:
word2vec_vecs = scipy.sparse.csr.csr_matrix([sum_word_vecs(model_cbow, sent) for sent in all_sents])

#### Clusterização

In [22]:
word2vec_cl_best = find_most_relevant_cl(word2vec_vecs, sents_reference)
word2vec_cl_summary = {}
for text_id in word2vec_cl_best.keys():
    word2vec_cl_summary[text_id] = [orig_text[text_id][sent] for sent in word2vec_cl_best[text_id]]

#### Page rank

In [23]:
word2vec_pr_best = find_most_relevant_pr(word2vec_vecs, sents_reference)
word2vec_pr_summary = {}
for text_id in word2vec_pr_best.keys():
    word2vec_pr_summary[text_id] = [orig_text[text_id][sent] for sent in word2vec_pr_best[text_id]]

PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 100 iterations')

#### Olha um resultado

In [None]:
text_id = list(word2vec_cl_summary.keys())[np.random.randint(0, len(word2vec_cl_summary))]

In [None]:
". ".join(orig_text[text_id])

In [None]:
word2vec_cl_summary[text_id]

In [None]:
word2vec_pr_summary[text_id]

### LDA

In [None]:
all_sents, sents_reference, orig_text, highlights = load_data(("CNN" in corpus))

In [None]:
dictionary = Dictionary(all_sents)
doc2bow = [dictionary.doc2bow(sent) for sent in all_sents]

In [None]:
%%time
NUM_TOPICS = 20
ldamodel = LdaMulticore(doc2bow, num_topics=NUM_TOPICS, id2word=dictionary, passes=30)

In [None]:
# Caso se queira explorar a LDA mudar para True
if False:
    lda_display = pyLDAvis.gensim.prepare(ldamodel, doc2bow, dictionary, sort_topics=False)
    pyLDAvis.display(lda_display)

In [None]:
raw_vecs = [ldamodel.get_document_topics(text) for text in doc2bow]

In [None]:
lda_vecs = []
for vec in raw_vecs:
    this_vec = []
    curr = 0
    for i in range(NUM_TOPICS):
        if (i == vec[curr][0]):
            this_vec.append(vec[curr][1])
            curr+=1
            if curr == len(vec):
                curr = -1
        else:
            this_vec.append(0)
    lda_vecs.append(this_vec)
    
lda_vecs = scipy.sparse.csr.csr_matrix(lda_vecs)

#### Clusterização

In [None]:
lda_cl_best = find_most_relevant_cl(lda_vecs, sents_reference)
lda_cl_summary = {}
for text_id in lda_cl_best.keys():
    lda_cl_summary[text_id] = [orig_text[text_id][sent] for sent in lda_cl_best[text_id]]

#### Page rank

In [None]:
lda_pr_best = find_most_relevant_pr(lda_vecs, sents_reference)
lda_pr_summary = {}
for text_id in lda_pr_best.keys():
    lda_pr_summary[text_id] = [orig_text[text_id][sent] for sent in lda_pr_best[text_id]]

#### Olha um resultado

In [None]:
text_id = list(lda_cl_summary.keys())[np.random.randint(0, len(lda_cl_summary))]

In [None]:
". ".join(orig_text[text_id])

In [None]:
lda_cl_summary[text_id]

In [None]:
lda_pr_summary[text_id]

#### Rougue