# INPUT FILES FORMATTING

In [1]:

import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

## Replacing Portuguese Categories with English Translations

In [2]:
products_df = pd.read_csv("./data/olist_products_dataset.csv")
translations_df = pd.read_csv("./data/product_category_name_translation.csv")

translations_dict = dict(zip(translations_df["product_category_name"], translations_df["product_category_name_english"]))
products_df["product_category_name"] = products_df["product_category_name"].apply(lambda x: translations_dict[x] if x in translations_dict else x)

products_df.to_csv("./data/olist_products_dataset.csv", index=False)

## Removing All Double Quotes

In [3]:
def remove_double_quotes_from_file(file_path):
    try:
        df = pd.read_csv(file_path)
        df = df.applymap(lambda x: str(x).replace('"', ''))
        df.to_csv(file_path, index=False)
        print(f"Double quotes removed from {file_path}.")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

data_folder = "data/"
files = [os.path.join(data_folder, file) for file in os.listdir(data_folder) if os.path.isfile(os.path.join(data_folder, file))]
for file in files:
    if file.endswith(".csv"):
        remove_double_quotes_from_file(file)


Double quotes removed from data/olist_customers_dataset.csv.
Double quotes removed from data/olist_geolocation_dataset.csv.
Double quotes removed from data/olist_orders_dataset.csv.
Double quotes removed from data/olist_order_items_dataset.csv.
Double quotes removed from data/olist_order_payments_dataset.csv.
Double quotes removed from data/olist_order_reviews_dataset.csv.
Double quotes removed from data/olist_products_dataset.csv.
Double quotes removed from data/olist_sellers_dataset.csv.
Double quotes removed from data/product_category_name_translation.csv.


## Formatting input files for the algorithm
### Formatting Transaction database
In the **transaction database**, each line represents a transaction. Each line is composed of three sections, as follows.

1. First, the **items** contained in the transaction are listed. An item is represented by a positive integer. Each item is separated from the next item by a single space. It is assumed that all items within a same transaction are sorted according to a total order (e.g. ascending order) and that no item can appear twice within the same transaction.
2. Second, the symbol “:” appears and is followed by the **transaction utility** (an integer).
3. Third, the symbol “:” appears and is followed by the **utility of each item** in this transaction (an integer), separated by single spaces.

For example, for the previous example, the input file for the transaction is defined as follows:

1 3:6:5 1\
5:3:3\
1 2 3 4 5:25:5 10 1 6 3\
2 3 4 5:20:8 3 6 3\
1 3 4:8:5 1 2\
1 3 5:22:10 6 6\
2 3 5:9:4 2 3

Consider the first line. It means that the transaction {1, 3} has a total utility of 6 and that items 1 and 3 respectively have a utility of 5 and 1 in this transaction. The following lines follow the same format.


### Formatting Taxonomy database

In the **taxonomy database**, each line represents a category relationship. Each line is composed of two sections, as follows.

1. First, the **item**.
2. Second, the symbol “,” appears and is followed by a **generalized item**, representing the category that the item belongs.

For example, for the previous example, the input file for the **taxonomy** is defined as follows:

1,6\
2,6\
3,7\
4,8\
5,8\
6,7

Consider the first line. It means that the item 1 is belongs to the generalized item 6.



In [4]:
order_items_df = pd.read_csv("./data/olist_order_items_dataset.csv")
products_df = pd.read_csv("./data/olist_products_dataset.csv")
merged_df = pd.merge(order_items_df, products_df, on="product_id", how="inner")
print("----Dataframe shapes----")
print(f'Order items: {order_items_df.shape}')
print(f'Products: {products_df.shape}')
print(f'Merged: {merged_df.shape}')
print()

merged_df = merged_df[['order_id', 'product_id', 'price', 'product_category_name']]
merged_df.sample(10)
# print merged_df without index 

----Dataframe shapes----
Order items: (112650, 7)
Products: (32951, 9)
Merged: (112650, 15)



Unnamed: 0,order_id,product_id,price,product_category_name
90084,7ff76bc285fc3b21099d0ed9007fd489,201842e46ecc7b96047e78835d973548,104.9,consoles_games
55049,9211fd3895bb47e2540f63577341f7b3,37bc5c7b7dca15cfcbe282fb0dd0bab3,13.99,housewares
45611,b9bb7dc5b3014d158c4d61d500c97a18,ba4bfbf74dbe7ab37e263b9326da0523,33.9,sports_leisure
3293,25d054bcaafc7ed0a9b503a55ae9f2f1,19f0917ba5781b978298ffcd9ae225c1,89.99,consoles_games
18996,d8691426e29a4206ab5076fcdbc1736d,1613b819ab5dae53aead2dbb4ebdb378,27.9,auto
69465,42dd28e31692200f58f7ab447d6c8eae,e766d04f4dd2f3b88523f2f67ab80e2d,119.0,furniture_decor
72014,e6ab8245f96b5c84ade5edfb8e0c2753,1cc61b32763a4d816212b3507b6b6c59,524.9,books_general_interest
74591,308b7e21ad330aa50c1c7a9987437a53,774ad2570c29b52d50433d7bb89273e7,69.0,home_appliances
6486,0d88b2392b18e01d23c582190ada0f24,aca2eb7d00ea1a7b8ebd4e68314663af,69.9,furniture_decor
3612,c48c006cc34f94c1fb2aa8afd2c1cc9f,389d119b48cf3043d311335e499d9c6b,49.0,garden_tools


In [5]:
print("--------Missing values-------")
print(merged_df.isnull().sum())

print("\n-Removing all missing values-")
merged_df = merged_df.dropna()
print(merged_df.isnull().sum())


merged_df.sample(10)

--------Missing values-------
order_id                    0
product_id                  0
price                       0
product_category_name    1603
dtype: int64

-Removing all missing values-
order_id                 0
product_id               0
price                    0
product_category_name    0
dtype: int64


Unnamed: 0,order_id,product_id,price,product_category_name
96737,7e41e7372ce6842cf380d475bf0927b4,b84186a036c7f3b9c2e4701800a02e45,113.47,construction_tools_construction
16043,3eaf0488f419dab6c6d582b1d28ab37e,423a6644f0aa529e8828ff1f91003690,465.0,agro_industry_and_commerce
79486,3c33046864727b932e3c4087844f091c,b095c6b329584cdf2a7e0986de9f3898,63.9,home_construction
91460,64dc7842b184ebd280aef040ab951134,a1c4bbc86f12628bb57430973e680129,159.9,perfumery
72208,ef2bf4a6c31539b683224ca3ac767cad,f63729cf4441fe64282cd1b82281792d,209.9,sports_leisure
78396,77831afddf2a5ebdda899a6adc91f88f,f88d1685109f08578f9cc5a9cd4c770a,89.9,toys
36583,07c8d74971a3302f6e7d2b0017ffb295,3e03619d360cd44f29e094aae3c4a76b,37.8,sports_leisure
90355,c95fd0fc2b4b423d8ded594da8d1a398,e43311c5b307c31b994c47588c59b168,24.9,furniture_decor
62634,c887294d29ae18c8f8cdfa478a4a36b2,bcad5b050c241af2fdc24c51556b297c,329.9,garden_tools
104519,b2664e0e967b1c146e574185530e813d,1b6cecc37e5cf78922520ccb14860e63,109.0,housewares


In [6]:
encoder = LabelEncoder()
merged_df['order_id_encoded'] = encoder.fit_transform(merged_df['order_id'])
merged_df['product_id_encoded'] = encoder.fit_transform(merged_df['product_id'])
merged_df['product_category_name_encoded'] = encoder.fit_transform(merged_df['product_category_name'])
merged_df.sample(10)

Unnamed: 0,order_id,product_id,price,product_category_name,order_id_encoded,product_id_encoded,product_category_name_encoded
61731,effbd7f83c0f10ef73b5704c87b1d168,f788dd44fc883f0664693a6dea50437f,39.0,toys,91222,31282,71
22831,f25bbb036033d9fdbce59f6858f0bcd2,6413f7a28e149a324c4a914000399fb2,144.99,cool_stuff,92099,12696,20
29890,9b1feddaf70fe81f5182faabb45bede6,e3af4b7c220aca1535b34416de24dfb7,45.99,toys,58580,28780,71
100420,caebb338f575e41c6ff53ac124033ccb,c551ffaf36e5c1d8444b44bb7c78123b,99.0,toys,77132,24912,71
83151,4631c1e3bc2f88c99b9ffcb6a8a62e37,1c6c35cdf70ee1c3515cd1527f4aa6d8,79.9,stationery,26655,3697,68
38318,fca0397489a3333e988ec1218efe40b6,42ec84ace63b58b8c5a7ba7be01d5fb8,89.9,cool_stuff,96001,8559,20
49661,6414e4e32a19c462fab11da3e3a03ac4,f51dde119b9a90e3c7464f0cc45c0953,18.9,electronics,37969,30981,26
87264,5460a05579eef48f6c8d54ca6c3f2249,073f139ce5bceceb73c9f287e2540eb3,229.0,furniture_decor,32057,917,39
41875,0a671290b0be76ffb379a3c0031d2f44,d7e7b1fc76e781313a0f5137d801951d,319.49,sports_leisure,3990,27335,67
43892,ff38e180176d38e46f43488a0571d6c2,24c66f106f642621e524291a895c9032,159.9,health_beauty,97010,4771,43


In [7]:
taxonomy_df = merged_df[['product_id_encoded', 'product_category_name_encoded']].drop_duplicates()
taxonomy_df.to_csv('./data/taxonomy.csv', index=False, header=False)
taxonomy_df.sample(10)

Unnamed: 0,product_id_encoded,product_category_name_encoded
57831,10734,7
84112,29586,39
103088,7943,68
91405,21477,72
99044,22178,26
27989,21922,49
62234,27133,7
69389,26387,67
72174,15968,70
104295,12060,49


In [8]:
with open('./data/transaction.csv', 'w') as f:
    for order_id, group_df in merged_df.groupby('order_id'):
        product_quantities = {}
        
        for index, row in group_df.iterrows():
            product_id = row['product_id_encoded']
            if product_id in product_quantities:
                product_quantities[product_id] += 1
            else:
                product_quantities[product_id] = 1
        
        product_ids = [str(product_id) for product_id in product_quantities.keys()]
        total_price = sum(row['price'] * quantity for product_id, quantity in product_quantities.items())
        subtotals = [str(row['price'] * quantity) for product_id, quantity in product_quantities.items()]
        formatted_line = f"{' '.join(product_ids)}:{total_price}:{' '.join(subtotals)}\n"
        
        f.write(formatted_line)