# Preprocesamiento Infoleg y Datasets Anexos

In [1]:
import collections
import multiprocessing
import os
import random
import re
import sys
import warnings
from pprint import pprint
from time import time

import docx2txt
import gensim
import numpy as np
import pandas as pd
import smart_open
import pprint as pprint

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora
from IPython.display import display
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import defaultdict

pd.options.display.float_format = "{:.2f}".format

# %load_ext lab_black
# warnings.filterwarnings('ignore')

In [2]:
class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield gensim.models.doc2vec.LabeledSentence(doc, [self.labels_list[idx]])

In [3]:
# Se verfica entorno de ejecución
in_colab = "google.colab" in sys.modules

if in_colab:
    from google.colab import drive

    drive.mount("/content/drive")
    BASE_DIR = "/content/drive/My Drive/Colab Notebooks/Data/"
else:
    BASE_DIR = "./Data"

## Inicio

In [4]:
# Definición de directorios y nombres para train y test data
train_data_dir = "{}".format(os.sep).join([BASE_DIR, "Infoleg/"])
test_data_dir = "{}".format(os.sep).join([BASE_DIR, "Infoleg_test/"])
train_data = train_data_dir + "Infoleg_train.cor"
test_data = test_data_dir + "Infoleg_test.cor"

In [5]:
display(train_data_dir)
display(test_data_dir)
display(train_data)
display(test_data)

'./Data/Infoleg/'

'./Data/Infoleg_test/'

'./Data/Infoleg/Infoleg_train.cor'

'./Data/Infoleg_test/Infoleg_test.cor'

In [6]:
try:
    train = next(os.walk(train_data_dir))
except StopIteration:
    pass  # Some error handling here

In [7]:
try:
    test = next(os.walk(test_data_dir))
except StopIteration:
    pass  # Some error handling here

In [8]:
# Limpieza del texto con la opción remover stopwords


def review_to_wordlist(review, remove_stopwords=False):

    # Conversión de palabras a minúsculas y separación
    words = review.lower().split()

    # Opcionalmente se remueven stop words (true por default)
    if remove_stopwords:
        stops = set(stopwords.words("spanish"))
        words = [w for w in words if not w in stops]

    review_text = " ".join(words)

    # Clean the text
    review_text = re.sub(r"[^A-Za-z0-9(),!.?\'\`]", " ", review_text)
    review_text = re.sub(r"\'s", " 's ", review_text)
    review_text = re.sub(r"\'ve", " 've ", review_text)
    review_text = re.sub(r"n\'t", " 't ", review_text)
    review_text = re.sub(r"\'re", " 're ", review_text)
    review_text = re.sub(r"\'d", " 'd ", review_text)
    review_text = re.sub(r"\'ll", " 'll ", review_text)
    review_text = re.sub(r",", " ", review_text)
    review_text = re.sub(r"\.", " ", review_text)
    review_text = re.sub(r"!", " ", review_text)
    review_text = re.sub(r"\(", " ( ", review_text)
    review_text = re.sub(r"\)", " ) ", review_text)
    review_text = re.sub(r"\?", " ", review_text)
    review_text = re.sub(r"\s{2,}", " ", review_text)

    words = review_text.split()

    stemmer = SnowballStemmer("spanish")
    stemmed_words = [stemmer.stem(word) for word in words]

    review_text = " ".join(stemmed_words)

    # Return a list of words
    return review_text

### Generacion del Corpus de Entrenamiento

#### Definicion de una Funcion para Leer y Preprocesar Texto

In [9]:
def get_doc_text(filepath, file):
    docx_file = file
    text = docx2txt.process(docx_file)
    return text

#### Búsqueda de archivos .Doc y renombrado a .Docx

In [10]:
doc_files = []
path_dir = os.getcwd()
for root, subdirs, files in os.walk(path_dir):
    for file in files:
        if file.endswith(".doc"):
            doc_files.append(((root, subdirs, file)))
        elif file.endswith(".docx"):
            doc_files.append(((root, subdirs, file)))

#### Concatenación de archivos de entrenamiento

In [11]:
docLabels = []
row_list = []
i = 0

for folder, subfolders, filenames in os.walk(train_data_dir):
    for file in filenames:
        if file.endswith(".txt"):
            d = {folder}
            with open(folder + file) as f:
                if f.read():
                    f.seek(0)
                    d = f.read()
                    docLabels.append(folder + file)
                    row_list.append(review_to_wordlist(d))
                    i = i + 1
    break

In [12]:
print(u"Archivos txt de entrenamiento leidos %s \n" % i)

Archivos txt de entrenamiento leidos 8 



#### Barrido de Archivos .Doc para Conversion y Grabacion como Txt

In [13]:
txt_file = ""
i = 0

for path, subdir, file in doc_files:
    doc_file = str(path) + "/" + str(file)

    if file.endswith(".doc"):
        docx_file = doc_file + "x"
        if not os.path.exists(docx_file):
            mycmd = "antiword " + "'" + doc_file + "'" + " > " + "'"
            +docx_file + "'"
            os.system(mycmd)
            os.remove("'" + doc_file + "'")  # it was just to read,so deleting
    elif file.endswith(".docx"):
        docx_file = doc_file

    txt_file = get_doc_text(str(path), docx_file)
    docLabels.append(str(path) + "/" + txt_file)
    row_list.append(review_to_wordlist(txt_file))
    i = i + 1

In [14]:
print(u"Archivos docx leidos y procesados a txt %s:\n" % i)

Archivos docx leidos y procesados a txt 153:



In [15]:
# iterator returned over all documents
it = LabeledLineSentence(row_list, docLabels)

In [16]:
df_train = pd.DataFrame(row_list)
df_train.head(10)

Unnamed: 0,0
0,ente nacional regul de la electr resoluci n en...
1,administraci n federal de ingres p blic impues...
2,defens del consumidor ley n 24 240 norm de pro...
3,direccion general de aduan resoluci n 63 2008 ...
4,indemniz desaparicion forz de person ley 25 98...
5,ministeri del interior bomber voluntari resolu...
6,ente nacional regul del gas tarif resoluci n 3...
7,ministeri de justici segur y derech human decr...
8,sal electoral y de comp originari tribunal sup...
9,sal electoral y de comp originari tribunal sup...


In [17]:
# Generación del corpus de entrenamiento
df_train.to_csv(train_data, sep=" ")

In [18]:
# Concatenación de archivos de test
testLabels = []
row_list = []
i = 0

for folder, subfolders, filenames in os.walk(test_data_dir):
    for file in filenames:
        if file.endswith(".txt"):
            d = {folder}
            with open(folder + file) as f:
                if f.read():
                    f.seek(0)
                    d = f.read()
                    testLabels.append(folder + file)
                    row_list.append(review_to_wordlist(d))
                    i = i + 1
    break

In [19]:
print(u"Archivos txt de test leidos %s:\n" % i)

Archivos txt de test leidos 3:



In [20]:
df_test = pd.DataFrame(row_list)
df_test.head()

Unnamed: 0,0
0,secret a de agricultur ganad a pesc y aliment ...
1,administracion federal de ingres public direcc...
2,administracion federal de ingres public direcc...


In [21]:
# Generación del corpus de test
df_test.to_csv(test_data_dir + "Infoleg_test.cor", sep=" ")

#### Definicion de una Funcion para Leer y Preprocesar Texto

In [22]:
def read_corpus(fname, tokens_only=False):
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # Entrenamiento de datos y tags
                yield gensim.models.doc2vec.TaggedDocument(
                    gensim.utils.simple_preprocess(line), [i]
                )

In [23]:
train_corpus = list(read_corpus(train_data))
test_corpus = list(read_corpus(test_data, tokens_only=True))

Damos un vistazo al corpus de entrenamiento

Ahora al corpus de test

Notice that the testing corpus is just a list of lists and does not contain any tags.

### Entrenamiento del Modelo

### Instancia de un Objeto Doc2Vec 

In [24]:
model = Doc2Vec(vector_size=128, min_count=2, epochs=40)

### Construcción de un Vocabulario

In [25]:
# Definición de diferentes alternativas de modelo

cores = multiprocessing.cpu_count()

models = [
    # PV-DBOW
    Doc2Vec(
        dm=0,
        dbow_words=1,
        vector_size=200,
        window=8,
        min_count=19,
        epochs=10,
        workers=cores,
    ),
    # PV-DM w/average
    Doc2Vec(
        dm=1,
        dm_mean=1,
        vector_size=200,
        window=8,
        min_count=19,
        epochs=10,
        workers=cores,
    ),
]

In [26]:
models[0].build_vocab(train_corpus)
print(str(models[0]))
models[1].reset_from(models[0])
print(str(models[1]))

Doc2Vec(dbow+w,d200,n5,w8,mc19,s0.001,t4)
Doc2Vec(dm/m,d200,n5,w8,mc19,s0.001,t4)


### Momento de Entrenar

In [27]:
for model in models:
    %time model.train(train_corpus, total_examples=model.corpus_count, epochs=10)

CPU times: user 2min 35s, sys: 427 ms, total: 2min 36s
Wall time: 53.4 s
CPU times: user 29.4 s, sys: 192 ms, total: 29.5 s
Wall time: 11.1 s


### Infiriendo un Vector

In [28]:
model.infer_vector(
    ["ejecucion", "nacional", "acuerdo", "procesos", "aplicacion", "programa"]
)

array([ 0.03543337, -0.02179321, -0.0186638 ,  0.02324385, -0.01308073,
        0.0104502 ,  0.01946487,  0.02467658,  0.00737994,  0.01650654,
        0.03434812,  0.00699867, -0.00207358, -0.02242089,  0.0146283 ,
        0.01501486,  0.02557651, -0.0006762 ,  0.00915986,  0.06618509,
       -0.0242402 , -0.00293844,  0.00793903,  0.01735396, -0.01516493,
        0.00178138,  0.01627547,  0.02827312,  0.01210817,  0.0386159 ,
        0.02465968, -0.01833941, -0.01266908, -0.03641306, -0.00432767,
       -0.03514048, -0.00524915,  0.0297388 , -0.00699908,  0.00695747,
        0.00651056, -0.02961001,  0.05350425, -0.0047966 ,  0.05040609,
        0.02263964,  0.04255007, -0.02451929, -0.00087611, -0.00566807,
        0.00282642,  0.05884235,  0.01917174,  0.00234831,  0.04085301,
       -0.04272524,  0.00304498,  0.01290464,  0.02652242, -0.00544464,
       -0.02732924,  0.03729615, -0.00110397, -0.00699362, -0.02283825,
        0.02166421,  0.00213674, -0.00628853,  0.01079531, -0.02

## Evaluación del Modelo

In [29]:
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

Se cuenta como cada documento rankea con respecto al corpus de entrenamiento 

In [30]:
# Los resultados pueden variar debido a la semilla random y a un corpus muy pequeño
collections.Counter(ranks)

Counter({123: 1,
         0: 131,
         5: 1,
         1: 9,
         2: 6,
         14: 1,
         4: 2,
         3: 2,
         9: 3,
         6: 3,
         11: 1,
         13: 2})

In [31]:
# print("Documentos ({}): «{}»\n".format(doc_id, " ".join(train_corpus[doc_id].words)))
print(u"DOCS SIMILARES POR MODELO %s:\n" % model)
for label, index in [
    ("MAS SIMILAR ", 0),
    ("SEGUNDO ", 1),
    ("TERCERO", 2),
    #    ("LEAST", len(sims) - 1),
]:
    print(label, docLabels[index])
#        u"%s %s: «%s»\n"
#        % (label, sims[index], " ".join(train_corpus[sims[index][0]].words))
#    )

DOCS SIMILARES POR MODELO Doc2Vec(dm/m,d200,n5,w8,mc19,s0.001,t4):

MAS SIMILAR  ./Data/Infoleg/124898.txt
SEGUNDO  ./Data/Infoleg/113710.txt
TERCERO ./Data/Infoleg/638.txt


Notice above that the most similar document (usually the same text) is has a similarity score approaching 1.0. However, the similarity score for the second-ranked documents should be significantly lower (assuming the documents are in fact different) and the reasoning becomes obvious when we examine the text itself.

We can run the next cell repeatedly to see a sampling other target-document comparisons. 

## Testing del Modelo

In [32]:
# Tomamos un documento random para testear el corpus e inferir un vector desde the modelo
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
doc_test = testLabels[doc_id]

print(u"DOCS SIMILARES POR MODELO %s:\n" % model)
for label, index in [
    ("MAS SIMILAR ", 0),
    ("SEGUNDO ", 1),
    ("TERCERO", 2),
    #    ("LEAST", len(sims) - 1),
]:
    print(label, docLabels[index])
#        u"%s %s: «%s»\n"
#        % (label, sims[index], " ".join(train_corpus[sims[index][0]].words))
#    )

# print("Testeo Documento ({}): «{}»\n".format(doc_id, " ".join(test_corpus[doc_id])))
# print("Testeo Documento ({}): «{}»\n".format(doc_id, " ".join(testLabels[doc_id]))

# print(u"DOCS SIMILARES POR MODELO %s:\n" % model)
# for label, index in [("MAS SIMILAR", 0), ("SEGUNDO", 1), ("TERC", 2)]:
#    print((sims[index], docLabels[index])
#       u"%s %s: «%s»\n"
#        %
#        % (label, sims[index], " ".join(train_corpus[sims[index][0]].words))
#    )

DOCS SIMILARES POR MODELO Doc2Vec(dm/m,d200,n5,w8,mc19,s0.001,t4):

MAS SIMILAR  ./Data/Infoleg/124898.txt
SEGUNDO  ./Data/Infoleg/113710.txt
TERCERO ./Data/Infoleg/638.txt


### Grabación Modelo Serializado

In [33]:
file_name = BASE_DIR + "/my_doc2vec_model"
print(file_name)
model.save(file_name)

./Data/my_doc2vec_model
