# Title: INDECOPI Search Engine Using NLP
## Using TF-IDF

In [1]:
__author__ = 'Daniel Villanueva'
__email__ = '2144810@brunel.ac.uk'
__website__ = 'https://www.linkedin.com/in/danielvillanuevanunez/'
__copyright__ = 'Copyright 2022, Daniel Villanueva'

## 2. Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import pdfplumber
import pickle
import string
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

seed=101096

## 3. Cleaning & Building The TFIDF Model

In [3]:
# Extracting directory of the "resoluciones"
resoluciones = ["resoluciones/" + filename for filename in os.listdir("resoluciones/")]

# List of stopwords in Spanish.
stopword_es = nltk.corpus.stopwords.words('spanish')
# Add the "\n" in the list.
stopword_es.append("\n")
# List of punctuations such as ".", ",". "\".
punctuation = [punct for punct in string.punctuation]
# Combine both lists into only one.
undesirable_values = punctuation + stopword_es

# Instantiating the Stemmer class to stem Spanish Words.
stemmer = SnowballStemmer('spanish')
# Instantiating the TF-IDF vectorizer.
vectorizer = TfidfVectorizer()

In [4]:
def pdf_to_text(list_resoluciones):
    """ This function converts the pdfs to text.  
    Args:
        * list_resoluciones (list): this list contains the relative location of the files. 
    Returns:
        * corpus (list): this list contains the string version of the pdfs.
    """
    # Empty list where documents will be stored.
    corpus = []
    # Iterating over files.
    for file in list_resoluciones:
        # Open the file.
        with pdfplumber.open(file) as pdf:
            doc = ""
            # This loop merges the pages of 1 document into 1 string.
            for page in pdf.pages:
                doc += page.extract_text()
            # Merged doc gets saved in the list.
            corpus.append(doc)
    return corpus

def cleaning_corpus(corpus):
    """ This function cleans the corpus by removing undesirable strings (e.g., "/", ".", "\n") and stems the words
        inside.
    Args:
        * corpus(list): list containining the documents that needs to be cleaned.
    Returns:
        * corpus_clean(list): list containining the cleaned documents.
    """
    # Empty list where cleaned documents will be stored. 
    corpus_clean = []
    
    # Iterating over the corpus
    for doc in corpus:
        # 1. Removing strings from the document that belong to the undersirable_values list.
        # 2. Stemming each word.
        words = [stemmer.stem(word) for word in word_tokenize(doc) if word not in undesirable_values]
        # Joining the words into 1 document.
        doc = " ".join(words)
        # Adding the document to the corpus.
        corpus_clean.append(doc)
        
    return corpus_clean

def vectorizing_tfidf(cleaned_corpus, name_resoluciones, tfidf_filename):
    """ This function converts list of documents into an array holding the tf-idf values. Then this array is converted
        into a dataframe. 
    Args:
        * cleaned_corpus(list):
        * name_resoluciones(list):
    Returns:
        * df_corpus(dataframe):d
    """
    # Convert corpus list into tf-idf
    X = vectorizer.fit_transform(cleaned_corpus)
    # The names of the columns are the words. Each word is a feature.
    column_names = vectorizer.get_feature_names_out()
    # Create dataframe with the list containing the documents.
    df_corpus = pd.DataFrame(X.toarray(), columns=column_names)
    # Add the column "label" that contains the name of the files.
    df_corpus["label"] = name_resoluciones
    # Save vectorizer model
    pickle.dump(vectorizer, open(tfidf_filename, 'wb'))
    
    return df_corpus

In [5]:
corpus = pdf_to_text(resoluciones)

In [6]:
corpus_clean = cleaning_corpus(corpus)

In [7]:
tfidf_filename = "model/indecopi_resoluciones_tfidf.sav"
df_corpus = vectorizing_tfidf(corpus_clean, resoluciones, tfidf_filename)
df_corpus.to_csv("data/resoluciones_tfidf.csv", index=False)

## 4. Ranking Class

In [7]:
from indesearch import INDESearch
import pandas as pd

In [15]:
query = """no reembolso"""

In [16]:
# Location of the tf-idf model.
tfidf_filename = "model/indecopi_resoluciones_tfidf.sav"
# Database
df_corpus = pd.read_csv("data/resoluciones_tfidf.csv")
# Instantiating the class.
query_class = INDESearch(query, top_values = 10, database = df_corpus)
# Clean the query.
clean_query = query_class.cleaning_query_tfidf(tfidf_filename)
# Dataframe containing the results
result = query_class.similarity(clean_query)

In [17]:
result

Unnamed: 0,label,similarity_score
0,resoluciones/doc_202010140846376795.pdf,0.020277
1,resoluciones/doc_202008260017144816.pdf,0.007481
2,resoluciones/doc_202008302319029384.pdf,0.004806
3,resoluciones/doc_202010212308215994.pdf,0.004543
4,resoluciones/doc_202011290056011368.pdf,0.003378
5,resoluciones/doc_202004281540131638.pdf,0.0
6,resoluciones/doc_202008302340032467.pdf,0.0
7,resoluciones/doc_202012130051341411.pdf,0.0
8,resoluciones/doc_202011081957579777.pdf,0.0
9,resoluciones/doc_202011081842348593.pdf,0.0


## 5. References

* https://stackoverflow.com/questions/4211209/remove-all-the-elements-that-occur-in-one-list-from-another
* https://stackoverflow.com/questions/5618878/how-to-convert-list-to-string
* https://discuss.streamlit.io/t/how-to-download-file-in-streamlit/1806/27