# TF-IDF Vectors

### We obtain the tf-idf vectors from a set of 600,000 samples.

In [262]:
import pandas as pd
import spacy
import re
import scipy
import unidecode
import warnings
import numpy as np
import sys

warnings.filterwarnings('ignore')


#### StopWords Portugues

In [263]:
spacy_nlp = spacy.load('pt')
spacy_stopwords = spacy.lang.pt.STOP_WORDS

#### Stematize Function

RSLPStemmer use portugues dictionary

In [265]:
import nltk
from nltk import word_tokenize          
from nltk.stem import RSLPStemmer
#>>>st = RSLPStemmer()
class SteamTokenizer(object):
    def __init__(self):
        self.wnl = RSLPStemmer()
    def __call__(self, doc):
        return [self.wnl.stem(t) for t in word_tokenize(doc)]

In [222]:
#nltk.download('rslp')

In [267]:
data = pd.read_csv("products.csv",encoding='latin')

In [269]:
data.shape

(674719, 10)

In [270]:
data.head()

Unnamed: 0,ID_PDC,ESPECIALIDADE,PROCEDIMENTO_PRINCIPAL,DESCRICAO_DO_PRODUTO,ANVISA,MARCA,REFERENCIA,PRECO,ANVISA_DEF,GMDN_TERMO
0,174274,SISTEMA GENITAL E REPRODUTOR FEMININO,HISTERECTOMIA TOTAL LAPAROSCOPICA COM ANEXECTO...,MANIPULADOR UTERINO CLEARVIEW 7CM,80517980053,CLINICAL INNOVATIONS,UM700,9625.0,-,"uterine manipulator, single-use"
1,176721,SISTEMA MUSCULO-ESQUELETICO E ARTICULAÇÕES,LESÃO AGUDA DE LIGAMENTO COLATERAL DO JOELHO ...,ENXERTO OSSEO EM BLOCO 6 CC 2 X 3 CC ATTRAX PUTTY,80074640024,NUVASIVE,5018006,6875.0,attrax putty nuvasive nuvasive inc,"bone matrix implant, synthetic"
2,181909,METODOS DIAGNOSTICOS POR IMAGEM,EMBOLIZAÇÃO DE ANEURISMA CEREBRAL POR OCLUSÃO ...,INTRODUTOR 6F 12CM ULTIMUM,10332340107,ST JUDE,407694,1375.0,-,vascular catheter introduction set
3,183381,SISTEMA DIGESTIVO E ANEXOS,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,PINÇA LIGA SURE 5MM,10349000188,VALLEYLAB,LS1500,33687.5,-,-
4,183381,SISTEMA DIGESTIVO E ANEXOS,PANCREATO-DUODENECTOMIA COM LINFADENECTOMIA [ ...,TROCATER DESCARTAVEL 12.5X100MM,80082910071,HANGZHOU KANGJI MEDI,101Y524,2062.5,trocarter descartavel hangzhou kangji medical ...,-


##### Preprocess data

In [271]:
data["DOCS"] = data["PROCEDIMENTO_PRINCIPAL"] + " " + data["DESCRICAO_DO_PRODUTO"]

In [272]:
data["DOCS"] = data["DOCS"].apply(lambda x : x.lower() )
data["DOCS"] = data["DOCS"].apply(lambda x : re.sub("[^a-zA-ZÀ-ú']+",' ',x) )
data["DOCS"] = data["DOCS"].apply(lambda x : unidecode.unidecode(x) )

In [273]:
data["DOCS"].head()

0    histerectomia total laparoscopica com anexecto...
1    lesao aguda de ligamento colateral do joelho t...
2    embolizacao de aneurisma cerebral por oclusao ...
3    pancreato duodenectomia com linfadenectomia pa...
4    pancreato duodenectomia com linfadenectomia pa...
Name: DOCS, dtype: object

#### Vocabulary of Words

In [276]:
from sklearn.feature_extraction.text import CountVectorizer

In [277]:
docs = data["DOCS"].tolist()

In [278]:
#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
CV = CountVectorizer(max_df=0.85,stop_words=spacy_stopwords,lowercase=True,tokenizer=SteamTokenizer())

In [279]:
word_count_vector = CV.fit_transform(docs)

In [282]:
CV.vocabulary_

{'histerectom': 3194,
 'total': 6746,
 'laparoscop': 3728,
 'anexectom': 301,
 'bilater': 730,
 'uter': 7137,
 'manipul': 3987,
 'uterin': 7138,
 'clearview': 1284,
 'lesa': 3786,
 'agud': 184,
 'colater': 1341,
 'joelh': 3627,
 'trat': 6866,
 'cirurg': 1241,
 'reconstruca': 5617,
 'titani': 6694,
 'interferenc': 3476,
 'traumed': 6876,
 'enxert': 2293,
 'osse': 4739,
 'bloc': 849,
 'attrax': 554,
 'putty': 5466,
 'embolizaca': 2139,
 'aneurism': 298,
 'cerebr': 1161,
 'oclusa': 4617,
 'sacul': 5912,
 'metod': 4159,
 'interven': 3503,
 'terapeu': 6594,
 'imag': 3282,
 'introdu': 3560,
 'ultimum': 7024,
 'pancreat': 4834,
 'duodenectom': 2021,
 'linfadenectom': 3838,
 'pancre': 4832,
 'pinc': 5104,
 'trocat': 6944,
 'descarta': 1810,
 'simpatectom': 6099,
 'videotoracoscop': 7286,
 'nerv': 4480,
 'perifer': 5014,
 'dissec': 1945,
 'ganch': 2875,
 'angioplast': 315,
 'translum': 6812,
 'percutane': 4983,
 'multipl': 4385,
 'impl': 3297,
 'stent': 6322,
 'hemodinam': 3093,
 'cardiolog': 1

In [289]:
word_count_vector.shape

(674719, 7440)

#### Get TF-IDF Vectors

In [290]:
from sklearn.feature_extraction.text import TfidfTransformer

In [291]:
tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True)

In [292]:
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [293]:
tf_idf_vector = tfidf_transformer.transform(CV.transform(docs))

In [294]:
tf_idf_vector.shape

(674719, 7440)

#### Export TF-IDF Vectors

In [295]:
import scipy.sparse
scipy.sparse.save_npz("tfidf_vectors.npz", tf_idf_vector)