# Prueba de Verosimilitud Logarítmica

En este cuaderno se realizará la implementación de una prueba de verosimilitud logarítmica orientada a generar resúmenes.

Para la implementación de esta técnica, se realizarán las siguientes fases:

1. Cargar datos.
2. Aplicar el preprocesamiento al nuevo corpus.
3. Crear diccionarios para relacionar frases e identificadores 
4. Calcular matriz de verosimilitud logarítmica
5. Determinar umbral.
6. Construir matriz frases-token con 0 y 1.
7. Seleccionar frases

In [1]:
#Importar elementos necesarios de las librerías
import os, shutil, re, pickle
import numpy as np
import pandas as pd
from nltk.corpus import PlaintextCorpusReader
from nltk.stem.snowball import SpanishStemmer
from scipy.stats import binom, chi2
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

In [2]:

#Funciones auxiliares

def tokenize_sentence(path, file_name):
    #Retorna un documento tokenizado por frases
    doc = []
    text = PlaintextCorpusReader(path, file_name)
    paragraphs = text.paras()
    for paragraph in paragraphs:
        for sentence in paragraph:
            low, i = 0,0
            while i < len(sentence):
                token = sentence[i].split('.')
                if len(token)-1:
                    doc.append(sentence[low:i])
                    low=i+1
                    i+=2
                else:
                    i+=1
            if low!=i-1:
                doc.append(sentence[low:i])
    return doc

def preprocess(doc, stopwords, stemmer):
    #Aplica el preprocesamiento establecido
    #Adicionalmente, retorna el documento original sin las filas vacías por el preprocesamiento 
    doc_preprocesed, doc_reduced = [], []
    for original_sentence in doc:
        preprocessed_sentence = []
        for token in original_sentence:
            if stemmer.stem(token) not in stopwords:
                preprocessed_sentence.append(stemmer.stem(token))
        if len(preprocessed_sentence) and preprocessed_sentence not in doc_preprocesed:
            doc_preprocesed.append(preprocessed_sentence)
            doc_reduced.append(original_sentence)
    return doc_preprocesed, doc_reduced

def get_dictionaries(doc):
    #Retorna un par de diccionarios que relacionan una frase con un id, y un id con una frase.
    sentence2id, id2sentence = {},{}
    n_sentences = len(doc)
    for i in range(n_sentences):
        sentence = ' '.join(doc[i])
        if sentence not in sentence2id:
            sentence2id[sentence] = i
            id2sentence[i] = sentence
    return sentence2id, id2sentence

def log_likelihood_ratio(bag_of_words):
    tokens_per_document = bag_of_words.sum(1)
    occurrences = bag_of_words.sum(0)
    bag_of_words = bag_of_words.toarray()
    total_tokens = tokens_per_document.sum(0)[0,0]
    n_documents,m_tokens = bag_of_words.shape
    loglikelihood_general = []
    for i in range(m_tokens):
        loglikelihood = binom.logpmf(occurrences[0,i], total_tokens, occurrences[0,i]/total_tokens)
        loglikelihood_general.append(loglikelihood)
    data = []
    for i in range(n_documents):
        row = []
        tokens_input = tokens_per_document[i,0]
        tokens_background = total_tokens - tokens_input
        for j in range(m_tokens):
            ocurrences_total = occurrences[0,j]
            ocurrences_input = bag_of_words[i,j]
            ocurrences_background = ocurrences_total - ocurrences_input
            loglikelihood_input = binom.logpmf(ocurrences_input, tokens_input, ocurrences_input/tokens_input)
            loglikelihood_background = binom.logpmf(ocurrences_background, tokens_background, ocurrences_background/tokens_background)
            ratio = loglikelihood_general[j] - loglikelihood_input - loglikelihood_background
            row.append(ratio)
        data.append(row)
    data = np.array(data,dtype=np.float64)
    data = data*(-2)
    return data

def corpus_topic_signatures(log_likelihood_matrix, significance_level):
    treshold = chi2.pdf(x=significance_level,df=1)
    data, col_index, row_index = [],[],[]
    n_documents, m_tokens = log_likelihood_matrix.shape
    for i in range(n_documents):
        for j in range(m_tokens):
            if log_likelihood_matrix[i,j] >= treshold:
                col_index.append(i)
                row_index.append(j)
                data.append(1)
    data = np.array(data)
    row_index = np.array(row_index)
    col_index = np.array(col_index)
    topic_signatures = csr_matrix((data,(row_index,col_index)),shape=(n_documents, m_tokens),dtype=np.float64)
    return topic_signatures

def document_topic_signatures(document, topic_signatures, token2id, doc_id):
    #Construye una matriz de un solo documento de acuerdo a la matriz de topic_signatures 
    data,row_index,col_index = [],[],[]
    topic_signatures = topic_signatures.toarray()
    n_sentences, m_tokens = len(document), len(token2id)
    for i in range(n_sentences):
        sentence = document[i]
        j = 0
        for token in sentence:
            if token in token2id and topic_signatures[doc_id,token2id[token]] == 1:
                if (j==0) or (j>0 and token2id[token] not in col_index[-j:]):
                    value = topic_signatures[doc_id,token2id[token]]
                    topic_signature_value = value
                    data.append(topic_signature_value)
                    row_index.append(i)
                    col_index.append(token2id[token])
                    j+=1
    data = np.array(data)
    row_index = np.array(row_index)
    col_index = np.array(col_index)
    topic_signatures_document = csr_matrix((data,(row_index,col_index)),shape=(n_sentences, m_tokens),dtype=np.float64)
    return topic_signatures_document


## Fase 1. Cargar datos

In [3]:
#Cargar las stopwords obtenido en preprocesamiento.
filename = 'stopwords.pkl'
stopwords = pickle.load(open(filename, 'rb'))

#Carga el corpus usado para el preprocesamiento
filename = 'corpus_single_string.pkl'
corpus = pickle.load(open(filename, 'rb'))

## Fase 2. Aplicar preprocesamiento
### Tokenización por frases

In [4]:
ruta = "D:/Documents/Documentos Universidad/Noveno/Proyecto de grado/textos"
stemmer = SpanishStemmer()
bolivar = tokenize_sentence(ruta, 'segurosbolivar-privacidad.txt')
mozilla = tokenize_sentence(ruta, 'mozilla-privacidad.txt')

### Stopwords y Stemming

In [5]:
bolivar_preprocesado, bolivar = preprocess(bolivar, stopwords, stemmer)
mozilla_preprocesado, mozilla = preprocess(mozilla, stopwords, stemmer)

## Fase 3. Crear diccionarios para relacionar frases e identificadores 

In [6]:
frase2id_bolivar, id2frase_bolivar = get_dictionaries(bolivar)
frase2id_mozilla, id2frase_mozilla = get_dictionaries(mozilla)

## Fase 4. Calcular matriz de verosimilitud logarítmica

In [7]:
vectorizer = CountVectorizer(stop_words = stopwords)
bag_of_words = vectorizer.fit_transform(corpus)

In [8]:
log_likelihood = log_likelihood_ratio(bag_of_words)
log_likelihood

array([[ 2.04202677e-07,  2.04202677e-07,  1.02092054e-06, ...,
         4.08017871e-07, -1.95564343e+00,  6.12382592e-07],
       [ 1.39663101e-07,  1.39663101e-07,  6.99770361e-07, ...,
         2.79747215e-07,  3.21851883e-06,  4.19324899e-07],
       [ 9.55809645e-08,  9.55809645e-08,  4.75879814e-07, ...,
         1.90304883e-07, -1.95538247e+00,  2.85105706e-07],
       ...,
       [ 1.09055254e-07,  1.09055254e-07,  5.43310557e-07, ...,
         2.17165884e-07,  2.49577261e-06,  3.25731875e-07],
       [ 8.37986094e-08,  8.37986094e-08,  4.17017960e-07, ...,
         1.66838626e-07,  1.91412636e-06,  2.49588008e-07],
       [ 1.73025893e-07,  1.73025893e-07,  8.63163011e-07, ...,
         3.45058735e-07,  3.96779159e-06,  5.17500304e-07]])

In [9]:
log_likelihood.shape

(56, 4740)

## Fase 5. Determinar umbral.

In [10]:
topic_signatures = corpus_topic_signatures(log_likelihood,0.05)
topic_signatures.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
topic_signatures.shape

(56, 4740)

# Fase 6. Construir matriz frases-token con 0 y 1.

In [13]:
#Diccionario que relaciona tokens con un identificador único
token2id = vectorizer.vocabulary_

#Lista de textos dentro del corpus
textos = os.listdir(ruta)

topic_signatures = document_topic_signatures(bolivar_preprocesado, topic_signatures, token2id, textos.index('segurosbolivar-privacidad.txt'))
topic_signatures.shape

(55, 4740)