## Sumarizador de texto

### Inputs do usuario

In [1]:
#[*NLTK corpus, "sample_CNN", "CNN"]
corpus = "sample_CNN"

### Imports relevantes

In [2]:
from nltk import corpus as nltk_corpus
stopwords = nltk_corpus.stopwords.words("english")

import re

from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim

from sklearn.cluster import MiniBatchKMeans

import pandas as pd
import numpy as np
import pickle

import json
import os

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

### Abre o Corpus

In [3]:
manual_conversions = {"nt":"not", "ll":"will", "m":"am", "s":"TRASH"}
manual_conversions.update({stopword:"TRASH" for stopword in stopwords})
def preprocess(sent):
    sent = re.sub("-LRB-|-RRB-", "", sent, flags=re.DOTALL|re.MULTILINE)
    sent = sent.lower()
    sent = re.sub(r"[^a-z0-9\ ]", "", sent, flags=re.DOTALL|re.MULTILINE)
    sent = re.sub(r"[0-9]+", "num", sent, flags=re.DOTALL|re.MULTILINE)
    sent = re.sub(r" +(?= )", "", sent, flags=re.DOTALL|re.MULTILINE).strip()
    sent = sent.split(" ")
    sent = [manual_conversions[word] if word in manual_conversions.keys() else word for word in sent]
    return [word for word in sent if word != "TRASH"]

In [4]:
if corpus != "CNN" and corpus != "sample_CNN":

    if corpus == "reuters":
        data = nltk_corpus.reuters
    elif corpus == "brown":
        data = nltk_corpus.brown
    else:
        raise ValueError("Corpus not implemented yet")
    
    file_ids = data.fileids()

    sents = {}
    for file_id in file_ids:
        file_sents = data.raw(file_id).split(".\n")       
        preprocessed_sent = [preprocess(sent) for sent in file_sents]
        sents[file_id] = [sent for sent in preprocessed_sent if len(sent) > 1]

In [5]:
if corpus == "sample_CNN":
    sents = {}
    available_stories = os.listdir("./cnn_stories_sample")
    for storie in available_stories:
        with open(f"./cnn_stories_sample/{storie}", "r", encoding="utf-8") as file:
            file = file.read()
        original_sents = re.sub(r"\n\n", " ", file.split("@highlight")[0], flags=re.DOTALL|re.MULTILINE).split(" . ")
        highlights_sents = re.sub(r"\n\n*", " . ", "".join(file.split("@highlight")[1:]), flags=re.DOTALL|re.MULTILINE).split(" . ")

        preprocessed_sent = [preprocess(original_sent) for original_sent in original_sents]
        sents[storie] = [sent for sent in preprocessed_sent if len(sent) > 1]

if corpus == "CNN":
    sents = {}
    available_stories = os.listdir("./cnn_stories_tokenized")
    for storie in available_stories:
        with open(f"./cnn_stories_sample/{storie}", "r", encoding="utf-8") as file:
            file = file.read()
        original_sents = re.sub(r"\n\n", " ", file.split("@highlight")[0], flags=re.DOTALL|re.MULTILINE).split(" . ")
        highlights_sents = re.sub(r"\n\n*", " . ", "".join(file.split("@highlight")[1:]), flags=re.DOTALL|re.MULTILINE).split(" . ")

        preprocessed_sent = [preprocess(original_sent) for original_sent in original_sents]
        sents[storie] = [sent for sent in preprocessed_sent if len(sent) > 1]

In [6]:
# salva em um arquivo para evitar uma nova leitura
with open("storage/sents.json", "w") as file:
    json.dump(sents, file)
    
with open('storage/all_sents.txt', 'w', encoding='utf8') as file:
    for text in sents.values():
        for sentence in text:
            file.write(" ".join([tok for tok in sentence]) + "\n")

with open('storage/docs.txt', 'w', encoding='utf8') as file:
    for text in sents.values():
        doc = ""
        for sentence in text:
            doc = doc + " ".join([tok for tok in sentence]) + ". "
        file.write(doc + "\n")

# Documento Inteiro

### TF_IDF

In [7]:
# ordena a lista e salva em um arquivo para evitar uma nova leitura
with open('storage/docs.txt', 'r', encoding='utf8') as file:
    docs = file.read().split("\n")

In [8]:
model = TfidfVectorizer(min_df=5, 
                        max_df=0.9, 
                        max_features=5000, 
                        sublinear_tf=False, 
                        analyzer=lambda x: x)

vecs = model.fit_transform(docs)

In [9]:
kmeans_cbow = MiniBatchKMeans(n_clusters=10, random_state=42)
result = kmeans_cbow.fit_transform(vecs)
df = pd.DataFrame(result)

In [10]:
num_sents = 5
for cluster_number in range(result.shape[1]):
    best_sents = (
        df[kmeans_cbow.labels_ == cluster_number]
        .sort_values(by=cluster_number)
        .index
        .values[:num_sents]
    )
    print('#' * 80)
    print(f'\nCluster {cluster_number}: {best_sents}')
    for k in best_sents:
        print(''.join(docs[k])[:300])
        print('-' * 80)

################################################################################

Cluster 0: [499 733 725 724 694]
cnn eighttime gold medal winner beijing olympics multiple worldrecord holder michael phelps warmed swimming world championships rome setting new men world record num meters butterfly. michael phelps attacks pool indianapolis set new world mark numm fly. numyearold american shaved numhundredths secon
--------------------------------------------------------------------------------
los angeles cnn two charges chris brown dropped tuesday singer still accused hit run minor traffic crash. brown lawyer entered guilty plea behalf judge ordered show sheriff station within week officially booked charge according court spokesman. prosecutor dismissed charges driving without license dr
--------------------------------------------------------------------------------
jos nigeria cnn tensions ran high central nigerian city jos sunday aftermath explosion outside catholic church left six p

### Doc_2_Vec

In [11]:
# ordena a lista e salva em um arquivo para evitar uma nova leitura
with open('storage/docs.txt', 'r', encoding='utf8') as file:
    docs = file.read().split("\n")

In [12]:
model_cbow = gensim.models.Doc2Vec(
    corpus_file='storage/docs.txt',
    vector_size=200,
    window=5,
    min_count=5,
    workers=12,
    epochs=100)

In [13]:
model_cbow.docvecs.vectors_docs

array([[-1.383262  , -0.78129685,  1.6738824 , ..., -1.155832  ,
        -0.48108163,  0.78414184],
       [-0.6268071 ,  1.4433881 ,  1.7895472 , ..., -0.20339996,
        -2.6213005 , -0.85025996],
       [ 0.26107556, -1.841127  ,  0.5145911 , ..., -1.895537  ,
        -0.6030372 , -2.0021684 ],
       ...,
       [-0.9244743 ,  3.2305555 , -1.1793662 , ...,  1.8136733 ,
         2.1451426 ,  0.83805525],
       [-1.2377428 ,  2.5762346 ,  1.0171101 , ..., -2.4829178 ,
        -1.4035474 , -0.6157002 ],
       [ 0.73281795,  1.2762597 ,  0.13295859, ..., -2.4853935 ,
        -1.0125918 ,  1.7376262 ]], dtype=float32)

In [14]:
kmeans_cbow = MiniBatchKMeans(n_clusters=10, random_state=42)
result = kmeans_cbow.fit_transform(model_cbow.docvecs.vectors_docs)
df = pd.DataFrame(result)

In [15]:
df.shape

(1075, 10)

In [16]:
num_sents = 5
for cluster_number in range(result.shape[1]):
    best_sents = (
        df[kmeans_cbow.labels_ == cluster_number]
        .sort_values(by=cluster_number)
        .index
        .values[:num_sents]
    )
    print('#' * 80)
    print(f'\nCluster {cluster_number}: {best_sents}')
    for k in best_sents:
        print(''.join(docs[k])[:300])
        print('-' * 80)

################################################################################

Cluster 0: [ 512   61  529  679 1068]
cnn real madrid came behind beat crosscity rivals atletico num first leg copa del rey quarterfinal tie santiago bernabeu thursday. real fell behind goal uruguay diego forlan seven minutes defender sergio ramos headed equalizer home team six minutes later. cristiano ronaldo followed hattrick villarre
--------------------------------------------------------------------------------
cnn lionel messi captain argentina first time take venezuela friendly cricketmad india friday. messi handed responsibility new argentina coach alejandro sabella disappointing copa america campaign. went quarterfinals eventual winners uruguay costing coach sergio batista job. captain sabella told gat
--------------------------------------------------------------------------------
cnn spain davis cup fifth time third time four years rafael nadal recovered dropping opening set defeat argentina ju

# Todas as sentenças

### TF_IDF

In [17]:
with open('storage/all_sents.txt', 'r', encoding='utf8') as file:
    all_sents = [sent.split(" ") for sent in file.read().split("\n")]

In [18]:
# ordena a lista e salva em um arquivo para evitar uma nova leitura
with open('storage/docs.txt', 'r', encoding='utf8') as file:
    docs = file.read().split("\n")

In [19]:
model = TfidfVectorizer(min_df=5, 
                        max_df=0.9, 
                        max_features=5000, 
                        sublinear_tf=False, 
                        analyzer=lambda x: x)

vecs = model.fit_transform(all_sents)

In [20]:
kmeans_cbow = MiniBatchKMeans(n_clusters=10, random_state=42)
result = kmeans_cbow.fit_transform(vecs)
df = pd.DataFrame(result)

In [21]:
num_sents = 5
for cluster_number in range(result.shape[1]):
    best_sents = (
        df[kmeans_cbow.labels_ == cluster_number]
        .sort_values(by=cluster_number)
        .index
        .values[:num_sents]
    )
    print('#' * 80)
    print(f'\nCluster {cluster_number}: {best_sents}')
    for k in best_sents:
        print(' '.join(all_sents[k])[:300])
        print('-' * 80)

################################################################################

Cluster 0: [17139    92 21357  9577  8997]
apologies president truman buck stops
--------------------------------------------------------------------------------
still president said deserved singled
--------------------------------------------------------------------------------
president barack obama called pakistan president sunday express condolences airstrike
--------------------------------------------------------------------------------
us president barack obama let clear
--------------------------------------------------------------------------------
another jairo hoyos sent prophetic plea president mister president
--------------------------------------------------------------------------------
################################################################################

Cluster 1: [16376  3039  9780 26615  7694]
not gibberish not
--------------------------------------------------------------

### Word_to_Vec

In [22]:
with open('storage/all_sents.txt', 'r', encoding='utf8') as file:
    all_sents = file.read().split("\n")

In [23]:
%%time
model_cbow = gensim.models.Word2Vec(
    corpus_file='storage/all_sents.txt',
    window=5,
    size=200,
    seed=42,
    iter=100,
    workers=12,
)

Wall time: 36.4 s


In [24]:
def cbow(model, sent):
    vec = np.zeros(model.wv.vector_size)
    for word in sent:
        if word in model:
            vec += model.wv.get_vector(word)
            
    norm = np.linalg.norm(vec)
    if norm > np.finfo(float).eps:
        vec /= norm
    return vec

In [25]:
vecs_cbow = [cbow(model_cbow, sent) for sent in sents]

In [26]:
kmeans_cbow = MiniBatchKMeans(n_clusters=10, random_state=42)
result = kmeans_cbow.fit_transform(vecs_cbow)
df = pd.DataFrame(result)

In [27]:
num_sents = 5
for cluster_number in range(result.shape[1]):
    best_sents = (
        df[kmeans_cbow.labels_ == cluster_number]
        .sort_values(by=cluster_number)
        .index
        .values[:num_sents]
    )
    print('#' * 80)
    print(f'\nCluster {cluster_number}: {best_sents}')
    for k in best_sents:
        print(''.join(all_sents[k])[:300])
        print('-' * 80)

################################################################################

Cluster 0: [ 155  195   90  373 1058]
wear medal team
--------------------------------------------------------------------------------
said police arrived ciancia apartment num minutes suspect left airport
--------------------------------------------------------------------------------
time america
--------------------------------------------------------------------------------
retreat immigration means rubio missed opportunity set apart presumptive republican presidential candidates
--------------------------------------------------------------------------------
jokes might considered particularly edgy noniraqis even necessarily funny
--------------------------------------------------------------------------------
################################################################################

Cluster 1: [326 637 703 916  34]
hard describe one personality
------------------------------------------------

### LDA

In [28]:
with open('storage/all_sents.txt', 'r', encoding='utf8') as file:
    all_sents = [sent.split(" ") for sent in file.read().split("\n")]

In [29]:
dictionary = Dictionary(all_sents)
corpus = [dictionary.doc2bow(sent) for sent in all_sents]

pickle.dump(corpus, open('storage/corpus.pkl', 'wb'))
dictionary.save('storage/dictionary.gensim')

In [30]:
%%time
NUM_TOPICS = 20
ldamodel = LdaMulticore(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=30)
ldamodel.save('models/model5.gensim')

Wall time: 1min 19s


In [31]:
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)