In [26]:
import json
import re
import pandas as pd

- Functions to clean data

In [27]:
# Strip commas from a phrase
def remove_comma(phrase):
    """Return ``phrase`` with every ',' character deleted."""
    pieces = phrase.split(',')
    return ''.join(pieces)

In [28]:
# Remove stop words from phrase
# Portuguese stop words, stored once as a frozenset so each token lookup is O(1)
# instead of rebuilding and scanning a 200+ item list on every call.
_STOP_WORDS = frozenset(['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', 'é', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'foi', 'ao', 'ele', 'das', 'tem', 'à', 'seu', 'sua', 'ou', 'ser', 'quando', 'muito', 'há', 'nos', 'já', 'está', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'era', 'depois', 'sem', 'mesmo', 'aos', 'ter', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'estão', 'você', 'tinha', 'foram', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'têm', 'numa', 'pelos', 'elas', 'havia', 'seja', 'qual', 'será', 'nós', 'tenho', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'fosse', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam'])


def remove_stop_words(phrase):
    """Return ``phrase`` without its Portuguese stop words.

    The phrase is split on single spaces, so the spacing between the
    remaining tokens is preserved as-is. Tokens are compared in lower case,
    which lets the function work on text that has not been lower-cased yet;
    the tokens that are kept retain their original casing.

    Args:
        phrase: Text whose tokens are separated by spaces.

    Returns:
        The phrase rebuilt from the tokens that are not stop words.
    """
    words = [item for item in phrase.split(' ') if item.lower() not in _STOP_WORDS]
    return ' '.join(words)

In [29]:
# Simplify Portuguese characters
# Translation table from lower-case accented Portuguese letters to plain ASCII.
# Besides the acute/grave forms it covers the tilde, circumflex and diaeresis
# forms ('ã', 'õ', 'â', 'ê', 'ô', 'ü'), which are common in Portuguese words.
_PORTUGUESE_TO_ASCII = str.maketrans({
    'ç': 'c',
    'á': 'a', 'à': 'a', 'â': 'a', 'ã': 'a',
    'é': 'e', 'ê': 'e',
    'í': 'i',
    'ó': 'o', 'ô': 'o', 'õ': 'o',
    'ú': 'u', 'ü': 'u',
})


def replace_portuguese(phrase):
    """Lower-case ``phrase`` and replace accented Portuguese letters with ASCII.

    Args:
        phrase: Text to normalise.

    Returns:
        The lower-cased text with the letters listed in the translation
        table replaced; every other character is left untouched.
    """
    # Lower-casing first means the table only needs the lower-case letters.
    return phrase.lower().translate(_PORTUGUESE_TO_ASCII)

In [30]:
# Replace symbols and expand slash abbreviations
# Expansions for the one-letter abbreviations written as '<letter>/'.
_ABBREVIATION_EXPANSIONS = {'p': 'para ', 'c': 'com ', 's': 'sem '}


def replace_symbols(phrase):
    """Replace punctuation with spaces and expand common abbreviations.

    Steps, in order:
      1. '(m)' becomes 'male' and '(f)' becomes 'female'.
      2. ':', '-', '+', '(', ')' and '.' each become a space.
      3. 'p/', 'c/' and 's/' become 'para ', 'com ' and 'sem ', but only when
         the letter starts a token. Without that restriction a word that
         merely ends in one of those letters is corrupted
         (e.g. 'gas/agua' would turn into 'gasem agua').
      4. Any remaining '/' becomes a space.

    Args:
        phrase: Text to clean; matching is case-sensitive.

    Returns:
        The cleaned text.
    """
    # The gender markers must be handled before the parentheses are blanked out.
    phrase = phrase.replace('(m)', 'male').replace('(f)', 'female')

    for symbol in ':-+().':
        phrase = phrase.replace(symbol, ' ')

    # \b requires a word boundary before the letter, i.e. the abbreviation is a
    # token of its own rather than the tail of a longer word.
    phrase = re.sub(
        r'\b([pcs])/',
        lambda match: _ABBREVIATION_EXPANSIONS[match.group(1)],
        phrase,
    )
    return phrase.replace('/', ' ')

In [31]:
# Drop one-character tokens
def remove_small(phrase):
    """Return ``phrase`` keeping only the space-separated tokens of length 2 or more.

    Empty tokens produced by consecutive spaces are dropped as well.
    """
    kept = filter(lambda token: len(token) >= 2, phrase.split(' '))
    return ' '.join(kept)

In [32]:
# Create new words
def add_words(phrase):
    """Prefix ``phrase`` with extra tokens built by gluing pairs of its words.

    For a phrase with more than two words, five combined tokens are built, in
    this order: first+second, second+third, first+third, first+last and
    first+second-to-last. They are placed in front of the unchanged phrase.
    Phrases with two words or fewer are returned as they are.

    Words are obtained with ``str.split()`` so that runs of spaces do not
    create empty "words"; an empty word would otherwise yield combined tokens
    that merely repeat an existing word and would make a two-word phrase look
    like a longer one.

    Args:
        phrase: Text whose words are separated by whitespace.

    Returns:
        The combined tokens followed by the original phrase, or the original
        phrase alone when it has fewer than three words.
    """
    words = phrase.split()
    if len(words) <= 2:
        return phrase

    first = words[0]
    combined = [
        first + words[1],
        words[1] + words[2],
        first + words[2],
        first + words[-1],
        first + words[-2],
    ]
    return ' '.join(combined) + ' ' + phrase

- Load data from Base

We have 22009 products and 2578 unique subcats.

In [33]:
# Read the base product table (comma-separated, which is the read_csv default)
data = pd.read_csv('../Datos/Base/basico.csv')

In [34]:
# Preview the first rows of the loaded frame to check its columns
data.head()

Unnamed: 0,description,cat_sub,category,subcategory
0,"pasta int vitapower 1,005kg amend/shot",0_0,0,0
1,esponja bettanin brilhus c/1,1_1,1,1
2,agua min schin s/gas 500ml,2_2,2,2
3,fita dupla face c/suporte scotch,3_3,3,3
4,massa pizza romanha oregano pct 160g,4_4,4,4


In [35]:
# 1. Start the cleaned 'desc' column from 'description' with commas removed
#    (they only appear as decimal separators in quantities and are not needed)
data['desc'] = data['description'].map(remove_comma)

In [36]:
# 2. Drop Portuguese stop words from every description
data['desc'] = data['desc'].map(remove_stop_words)

In [37]:
# 3. Simplify Portuguese characters
data['desc'] = data['desc'].map(replace_portuguese)

In [38]:
# 4. Replace symbols and abbreviations
data['desc'] = data['desc'].map(replace_symbols)

In [39]:
# 5. Drop tokens shorter than 2 characters
data['desc'] = data['desc'].map(remove_small)

In [40]:
# 5b. Lower-case every description; the unit patterns applied in step 6
#     ('ml', 'kg', 'cm', ...) are written in lower case
data['desc'] = data['desc'].str.lower()

In [41]:
# 6. Replacing measures
data['desc'] = data['desc'].replace(to_replace=r'\b\d*ml\b', value='liquido', regex=True)
data['desc'] = data['desc'].replace(to_replace=r'\b\d*l\b', value='liquido', regex=True)

In [42]:
# Weights: an optional number followed by 'g' / 'gr' becomes 'gramos', by 'kg' becomes 'kilos'
data['desc'] = (
    data['desc']
    .replace(r'\b\d*g\b', 'gramos', regex=True)
    .replace(r'\b\d*gr\b', 'gramos', regex=True)
    .replace(r'\b\d*kg\b', 'kilos', regex=True)
)

In [43]:
# Lengths: an optional number followed by 'cm' becomes 'tamanho'
data['desc'] = data['desc'].replace(r'\b\d*cm\b', 'tamanho', regex=True)

In [44]:
# Dimensions such as '10x15' or '10x15cm' become 'volume'
data['desc'] = (
    data['desc']
    .replace(r'\b(\d+x\d+)\b', 'volume', regex=True)
    .replace(r'\b(\d+x\d+cm)\b', 'volume', regex=True)
)

In [45]:
# Percentages, wattages and voltages become generic tokens.
# The percentage token is padded with spaces because '%' may be glued to the next word.
data['desc'] = (
    data['desc']
    .replace(r'\d+%', ' porcentagem ', regex=True)
    .replace(r'\b(\d+w)\b', 'watts', regex=True)
    .replace(r'\b(\d+v)\b', 'voltagem', regex=True)
)

In [46]:
# 7. Prepend the combined word pairs built by add_words
data['desc'] = data['desc'].map(add_words)

- Generate Train and Test datasets

> Each cat_sub is treated as an independent group, and each group is split 50% / 50% between train and test. However, groups with fewer than 2 products are not considered.

In [47]:
# Share of each cat_sub group, in percent, that goes to the train set;
# the remaining rows of the group go to the test set
percentage = 50

In [48]:
# Load the cat_sub -> product-count mapping.
# The encoding is set explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
with open('../Datos/Base/cat_sub_counter.json', encoding='utf-8') as json_file:
    cat_sub_counter = json.load(json_file)

In [None]:
# Split every cat_sub group into a leading train part and a trailing test part.
train = []
test = []

# Group the frame once instead of re-scanning it for every cat_sub. This also
# avoids building a query string by hand, which breaks on values containing quotes.
groups = {key: frame for key, frame in data.groupby('cat_sub', sort=False)}

for cat_sub in cat_sub_counter:
    df_temp = groups.get(cat_sub)

    # Groups with fewer than 2 products cannot appear in both splits, so they
    # are left out entirely (a single product would otherwise land in test only).
    if df_temp is None or len(df_temp) < 2:
        continue

    # Size the split from the rows actually present and cut at a single position,
    # so train and test can never overlap or skip rows even if the stored counter
    # disagrees with the frame.
    rows_number_train = int(len(df_temp) * (percentage / 100))
    train.append(df_temp.iloc[:rows_number_train])
    test.append(df_temp.iloc[rows_number_train:])

In [None]:
# Stack the per-group pieces into one frame per split
train_df, test_df = pd.concat(train), pd.concat(test)

In [None]:
# Write both splits to disk without the pandas index column
for frame, path in (
    (train_df, '../Datos/Generados/05train.csv'),
    (test_df, '../Datos/Generados/05test.csv'),
):
    frame.to_csv(path, index=False)