# Title: INDECOPI Search Engine Using NLP
## Using Doc2Vec

In [31]:
__author__ = 'Daniel Villanueva'
__email__ = '2144810@brunel.ac.uk'
__website__ = 'https://www.linkedin.com/in/danielvillanuevanunez/'
__copyright__ = 'Copyright 2022, Daniel Villanueva'

## 2. Necessary Libraries

In [6]:
import pandas as pd
import numpy as np
import pdfplumber
import pickle
import string
import os

import nltk
from nltk.tokenize import word_tokenize

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## 3. Cleaning & Building The Doc2Vec Model

In [33]:
# Extracting directory of the "resoluciones"
resoluciones = ["resoluciones/" + filename for filename in os.listdir("resoluciones/")]

# List of stopwords in Spanish.
stopword_es = nltk.corpus.stopwords.words('spanish')
# Add the "\n" in the list.
stopword_es.append("\n")
stopword_es.append("-")
# List of punctuations such as ".", ",". "\".
punctuation = [punct for punct in string.punctuation]
# Combine both lists into only one.
undesirable_values = punctuation + stopword_es

In [34]:
def pdf_to_text(list_resoluciones):
    """ This function converts the pdfs to text.  
    Args:
        * list_resoluciones (list): this list contains the relative location of the files. 
    Returns:
        * corpus (list): this list contains the string version of the pdfs.
    """
    # Empty list where documents will be stored.
    corpus = []
    # Iterating over files.
    for file in list_resoluciones:
        # Open the file.
        with pdfplumber.open(file) as pdf:
            doc = ""
            # This loop merges the pages of 1 document into 1 string.
            for page in pdf.pages:
                doc += page.extract_text()
            # Merged doc gets saved in the list.
            corpus.append(doc)
    return corpus

def cleaning_corpus(corpus):
    """ This function cleans the corpus by removing undesirable strings (e.g., "/", ".", "\n") and converts words
        into lower cases.
    Args:
        * corpus(list): list containining the documents that needs to be cleaned.
    Returns:
        * corpus_clean(list): list containining a list of the words for each document.
    """
    # Empty list where cleaned documents will be stored. 
    corpus_clean = []
    
    # Iterating over the corpus
    for doc in corpus:
        # 1. Removing strings from the document that belong to the undersirable_values list.
        # 2. Lower case each word.
        doc = [word.lower() for word in word_tokenize(doc) if word not in undesirable_values]       
        # Adding the document to the corpus. This is a list of lists -> [["hello", "eat"]["bye", food"]]
        corpus_clean.append(doc)
        
    return corpus_clean

def tagged_documents(corpus_clean):
    """ This function cleans the corpus a tags them with a number. 
    Args:
        * corpus_clean(list): list containining a list of the words for each document.
    """
    for i, doc in enumerate(corpus_clean):
        yield TaggedDocument(doc, [i])

In [35]:
corpus = pdf_to_text(resoluciones)
clean_corpus = cleaning_corpus(corpus)
data_training = list(tagged_documents(clean_corpus))

In [36]:
# Build model with each document of vector size 1000
model = Doc2Vec(vector_size=1000, min_count=1, epochs=50)
model.build_vocab(data_training)
model.train(data_training, total_examples = model.corpus_count, epochs=model.epochs)
# Save the model
model.save("model/indecopi_resoluciones_doc2vec.model")

In [37]:
corpus_vector = [model.dv[i] for i in range(len(model.dv))]

In [38]:
database = pd.DataFrame(corpus_vector)
database["resoluciones"] = resoluciones

In [39]:
database.to_csv("data/resoluciones_doc2vec.csv", index=False)

## 4. Ranking Class

In [1]:
from indesearch import INDESearch
import pandas as pd

In [2]:
query = """no reembolso"""

In [8]:
# Location of the tf-idf model.
doc2vec_filename = "model/indecopi_resoluciones_doc2vec.model"
# Database
df_corpus = pd.read_csv("data/resoluciones_doc2vec.csv")
# Instantiating the class.
query_class = INDESearch(query, top_values = 10, database = df_corpus)
# Clean the query.
clean_query = query_class.cleaning_query_doc2vec(doc2vec_filename)
# Dataframe containing the results
result = query_class.similarity(clean_query)

In [9]:
result

Unnamed: 0,resoluciones,similarity_score
0,resoluciones/doc_202004281556238031.pdf,0.667612
1,resoluciones/doc_202004281547478711.pdf,0.655933
2,resoluciones/doc_202004281547471577.pdf,0.649919
3,resoluciones/doc_202004281540142661.pdf,0.64236
4,resoluciones/doc_202004281540134017.pdf,0.635256
5,resoluciones/doc_202004281540145617.pdf,0.634874
6,resoluciones/doc_202004281540138585.pdf,0.600505
7,resoluciones/doc_202004290926062486.pdf,0.591984
8,resoluciones/doc_202004281540131638.pdf,0.55059
9,resoluciones/doc_202008260017144816.pdf,0.497838


## References
* https://radimrehurek.com/gensim/models/doc2vec.html
* https://www.youtube.com/watch?v=GQbkthBPqV8
* https://radimrehurek.com/gensim/models/word2vec.html