In [1]:
import json
import re
import pandas as pd

- Load data

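We have 22009 products and 2578 unique subcats. Note that `describe()` below reports 1968 distinct `SUB-CATEGORIA` names, so the 2578 figure presumably counts distinct category + sub-category pairs (the `cat_sub` key built later), not sub-category names alone.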

In [2]:
# Load the raw comma-separated product file into the working DataFrame.
data_train = pd.read_csv("original.csv", sep=",")

In [3]:
# Quick sanity check of the loaded frame: display per-column summary statistics.
data_train.describe()

Unnamed: 0,DESCRIÇÃO PARCEIRO,SUB-CATEGORIA,CATEGORIA,DEPARTAMENTO
count,22009,22009,22009,22009
unique,22009,1968,332,53
top,"ASSAD VITRIZI RETG 3,6L MEDIA",TRADICIONAL,VINHO,MERCEARIA DOCE
freq,1,810,789,3614


- Changes applied to the data before generating the input
    
    1. Transform subcategory and category to numbers.
    2. Join categories and subcategories to create the cat_sub name.
    3. Set the description to lowercase.
    4. Build and export the dictionaries for category and subcategory names (category → subcategories tree, id → name, and cat_sub → name).
    5. Export the basic dataset, without any refinement.
    6. Export the counter of products in each cat_sub.

In [4]:
# Step 1: encode the sub-category and category names as numeric codes
# (via pd.factorize) and keep those codes as strings.
for code_col, name_col in (('subcategory', 'SUB-CATEGORIA'), ('category', 'CATEGORIA')):
    data_train[code_col] = pd.factorize(data_train[name_col])[0].astype(str)

In [5]:
# Step 2: build the combined "<category>_<subcategory>" key for every row.
category_ids = data_train['category'].map(str)
subcategory_ids = data_train['subcategory'].map(str)
data_train['cat_sub'] = category_ids.str.cat(subcategory_ids, sep='_')

In [6]:
# Step 3: store a lowercase copy of the partner description.
partner_description = data_train['DESCRIÇÃO PARCEIRO']
data_train['description'] = partner_description.str.lower()

In [7]:
# Step 4 (tree): map each category id to the list of subcategory ids seen with it,
# i.e. {catid1: [subcatid1, subcatid2, ...], catid2: [...], ...}
subcats_by_category = data_train.groupby('category')['subcategory'].unique()
tree = {
    category_id: list(subcategory_ids)
    for category_id, subcategory_ids in subcats_by_category.items()
}

In [8]:
# Step 4 (id -> name): lookup tables from numeric id to original name,
# i.e. {catid1: namecat1, ...} and {subcatid1: namesubcat1, ...}
categories_names = dict(
    data_train[['category', 'CATEGORIA']].itertuples(index=False, name=None)
)
subcategories_names = dict(
    data_train[['subcategory', 'SUB-CATEGORIA']].itertuples(index=False, name=None)
)

In [9]:
# Step 4 (cat_sub -> name): lookup tables keyed by the combined cat_sub id,
# i.e. {cat_sub1: namecat1, cat_sub2: namecat2, ...} and
#      {cat_sub1: namesubcat1, cat_sub2: namesubcat2, ...}
categories = dict(
    data_train[['cat_sub', 'CATEGORIA']].itertuples(index=False, name=None)
)
subcategories = dict(
    data_train[['cat_sub', 'SUB-CATEGORIA']].itertuples(index=False, name=None)
)

In [10]:
# Step 4 (save): write every lookup dictionary to its own JSON file.
json_exports = (
    ('tree.json', tree),
    ('categories_names.json', categories_names),
    ('subcategories_names.json', subcategories_names),
    ('categories.json', categories),
    ('subcategories.json', subcategories),
)
for json_path, payload in json_exports:
    with open(json_path, 'w') as out_file:
        json.dump(payload, out_file)

In [11]:
# Step 5: export the basic dataset (derived columns only, no row index).
header = ["description", "cat_sub", "category", "subcategory"]
data_train[header].to_csv('basico.csv', index=False)

In [12]:
# Step 6: count the descriptions in each cat_sub, largest group first,
# and keep the result as a plain {cat_sub: count} dictionary.
descriptions_per_cat_sub = data_train.groupby('cat_sub')['description'].count()
df_count = descriptions_per_cat_sub.sort_values(ascending=False).to_dict()

In [13]:
# Write the cat_sub counter dictionary to a JSON file.
with open('cat_sub_counter.json', 'w') as counter_file:
    counter_file.write(json.dumps(df_count))