# INPUT FILES FORMATTING

In [1]:

import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

## Replacing Portuguese Categories with English Translations

In [2]:
products_df = pd.read_csv("./data/olist_products_dataset.csv")
translations_df = pd.read_csv("./data/product_category_name_translation.csv")

translations_dict = dict(zip(translations_df["product_category_name"], translations_df["product_category_name_english"]))
products_df["product_category_name"] = products_df["product_category_name"].apply(lambda x: translations_dict[x] if x in translations_dict else x)

products_df.to_csv("./data/olist_products_dataset.csv", index=False)

## Removing All Double Quotes

In [3]:
def remove_double_quotes_from_file(file_path):
    try:
        df = pd.read_csv(file_path)
        df = df.applymap(lambda x: str(x).replace('"', ''))
        df.to_csv(file_path, index=False)
        print(f"Double quotes removed from {file_path}.")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

data_folder = "data/"
files = [os.path.join(data_folder, file) for file in os.listdir(data_folder) if os.path.isfile(os.path.join(data_folder, file))]
for file in files:
    if file.endswith(".csv"):
        remove_double_quotes_from_file(file)


Double quotes removed from data/olist_order_items_dataset.csv.
Double quotes removed from data/olist_sellers_dataset.csv.
Double quotes removed from data/olist_customers_dataset.csv.
Double quotes removed from data/olist_products_dataset.csv.
Double quotes removed from data/olist_geolocation_dataset.csv.
Double quotes removed from data/olist_orders_dataset.csv.
Double quotes removed from data/product_category_name_translation.csv.
Double quotes removed from data/olist_order_reviews_dataset.csv.
Double quotes removed from data/olist_order_payments_dataset.csv.


## Formatting input files for the algorithm
### Formatting Transaction database
In the **transaction database**, each line represents a transaction. Each line is composed of three sections, as follows.

1. First, the **items** contained in the transaction are listed. An item is represented by a positive integer. Each item is separated from the next item by a single space. It is assumed that all items within a same transaction are sorted according to a total order (e.g. ascending order) and that no item can appear twice within the same transaction.
2. Second, the symbol “:” appears and is followed by the **transaction utility** (an integer).
3. Third, the symbol “:” appears and is followed by the **utility of each item** in this transaction (an integer), separated by single spaces.

For example, for the previous example, the input file for the transaction is defined as follows:

1 3:6:5 1\
5:3:3\
1 2 3 4 5:25:5 10 1 6 3\
2 3 4 5:20:8 3 6 3\
1 3 4:8:5 1 2\
1 3 5:22:10 6 6\
2 3 5:9:4 2 3

Consider the first line. It means that the transaction {1, 3} has a total utility of 6 and that items 1 and 3 respectively have a utility of 5 and 1 in this transaction. The following lines follow the same format.


### Formatting Taxonomy database

In the **taxonomy database**, each line represents a category relationship. Each line is composed of two sections, as follows.

1. First, the **item**.
2. Second, the symbol “,” appears and is followed by a **generalized item**, representing the category that the item belongs.

For example, for the previous example, the input file for the **taxonomy** is defined as follows:

1,6\
2,6\
3,7\
4,8\
5,8\
6,7

Consider the first line. It means that the item 1 is belongs to the generalized item 6.



In [4]:
order_items_df = pd.read_csv("./data/olist_order_items_dataset.csv")
products_df = pd.read_csv("./data/olist_products_dataset.csv")
merged_df = pd.merge(order_items_df, products_df, on="product_id", how="inner")
print("----Dataframe shapes----")
print(f'Order items: {order_items_df.shape}')
print(f'Products: {products_df.shape}')
print(f'Merged: {merged_df.shape}')
print()

merged_df = merged_df[['order_id', 'product_id', 'price', 'product_category_name']]
merged_df.sample(10)
# print merged_df without index 

----Dataframe shapes----
Order items: (112650, 7)
Products: (32951, 9)
Merged: (112650, 15)



Unnamed: 0,order_id,product_id,price,product_category_name
45665,b449aa0c8526754fe10deb8c4596029e,cad2db5e318d7283a234c517b2f28a42,105.9,bed_bath_table
42548,b0e2da4433a10c624a9f71b187e93915,acdd9eaa79720ca045ce3768250b8e47,99.9,health_beauty
18587,8c389c12aa23b689926d20a01729206d,fb55982be901439613a95940feefd9ee,79.0,stationery
48840,28a1f9cca638114ce8fc6ddfce2b7d52,7fa9cd17cea7ecbac9778cc86b7f9033,27.0,housewares
82528,fa01cb3e41fcdf8cf3c55e01a5477d46,44053ed0088d0655ed338810cd2d4d51,132.9,bed_bath_table
44201,0bf736fd0fd5169d60de3699fcbcf986,0009406fd7479715e4bef61dd91f2462,229.0,bed_bath_table
37930,dedfae6510ab02ac08526276e245e006,251876e7ba013b87b7262fb2524e2d60,64.89,baby
5269,3e36ac046e031f1d3e1f8ce6767c018c,779559842fc122d55edbd03153f35e81,26.9,sports_leisure
45293,35e1000125ea64ee7fcffec9d5e110b8,c72edc98e7d77f0c4c04ec80730c05f5,79.0,sports_leisure
79441,56e11716ba370a51d5795ecf34f9510d,1f3c4e3bcb3d2d1c4da4692dbf20de5a,49.0,perfumery


In [5]:
print("--------Missing values-------")
print(merged_df.isnull().sum())

print("\n-Removing all missing values-")
merged_df = merged_df.dropna()
print(merged_df.isnull().sum())


merged_df.sample(10)

--------Missing values-------
order_id                    0
product_id                  0
price                       0
product_category_name    1603
dtype: int64

-Removing all missing values-
order_id                 0
product_id               0
price                    0
product_category_name    0
dtype: int64


Unnamed: 0,order_id,product_id,price,product_category_name
68154,24aa32ff7b966959313509b567cb68be,6921371f065f148b890f1007f4520c14,40.0,auto
46616,31cb8821ab778cd23ebb6ce6f9e2bce0,a0fe1efb855f3e786f0650268cd77f44,21.99,agro_industry_and_commerce
68888,c0fc5c6702f93ec8646c5404b4147864,0f91b6a95a57a43203a9b17de98960d9,116.9,health_beauty
73813,a4a495e7d0758f5523480d2f4cd12515,640dcd3e549f736cd39e1f943ff14c3b,24.99,fashion_bags_accessories
100746,9798002938de9acb5883e90411ded034,88d1ed18fd38fa0ffeeb4b517ca42a9d,36.9,housewares
28422,34d7c2c94f8f4d4f2153adcb60b22145,6c3effec7c8ddba466d4f03f982c7aa3,19.9,consoles_games
53518,12c8a9b7b11a8922911c9ecf62dc0aa7,be75efb07aff79cef2eec7ddc5de3952,99.9,sports_leisure
35422,088858a852d62de0c175c9cc4bc0ddb6,bfc275f6de912665e4dcd8da32f43c10,97.0,housewares
25376,510a5c60f473c07f3cb21d6506c0b4ba,b38b25d838ae0b8385e8cc68b9017644,180.0,health_beauty
4395,c77338848d4979e1c775831208262849,d1c427060a0f73f6b889a5c7c61f2ac4,149.0,computers_accessories


In [6]:
# encode starting with 1 not 0

encoder = LabelEncoder()
merged_df['order_id_encoded'] = encoder.fit_transform(merged_df['order_id'])
merged_df['order_id_encoded'] = merged_df['order_id_encoded'] + 1
merged_df['product_id_encoded'] = encoder.fit_transform(merged_df['product_id'])
merged_df['product_id_encoded'] = merged_df['product_id_encoded'] + 1
merged_df['product_category_name_encoded'] = encoder.fit_transform(merged_df['product_category_name'])
merged_df['price'] = merged_df['price'].astype(int)
merged_df.sample(10)

Unnamed: 0,order_id,product_id,price,product_category_name,order_id_encoded,product_id_encoded,product_category_name_encoded
10868,93e7c3691fb66f8e336d76b73d006cc1,79c0ace2ccf0da036e787875a49151a4,379,kitchen_dining_laundry_garden_furniture,55905,15515,51
21621,3569dcadd0374718b87bead0d7241d6d,84d2057bc1f88332d0892eea0e997d28,27,telephony,20218,16849,70
44701,30dfb5b33bf44683be3e2f7df4bdd769,759c85bda80286f647f1f71b847c6457,33,housewares,18438,14973,49
58473,c1712cb5d91f5d05267fea1773f3aec9,55b71bf300a2765a382eaec566df531f,74,sports_leisure,73492,10947,67
50240,97e63289be97b0d9bfd3fad99000f662,d41d8cd98f00b204e9800998ecf8427e,229,auto,57387,26849,5
73781,b0ef7f818e22c209f99d74eef200d2fa,78120320a8a3d2656f159d513fcb83b6,40,furniture_decor,67061,15303,39
71020,ccecb43e46d11aa9ed8c92440657995a,5457c026a643626249213e1e39c31d10,247,perfumery,77879,10758,60
28232,5928d420527d13a50a884bc80c6ea676,0fc6914383c2bea767a9562c9d8b43a1,48,computers_accessories,33836,2031,15
7295,636fac4217a01ca1f468baf22a52e945,aba86c093ccdbac75b09111d57e50004,119,office_furniture,37736,21588,57
110929,ec451fb3a1ee5782fef4937473a9ffac,1377fd796465e86c4306ad7afc6a6dfe,269,computers_accessories,89830,2508,15


In [7]:
taxonomy_df = merged_df[['product_id_encoded', 'product_category_name_encoded']].drop_duplicates()
taxonomy_df = taxonomy_df.sort_values(by='product_id_encoded')
taxonomy_df.to_csv('./data/taxonomy.txt', index=False, header=False)
taxonomy_df.head(10)

Unnamed: 0,product_id_encoded,product_category_name_encoded
111565,1,60
111787,2,5
44201,3,7
79011,4,49
102982,5,72
105784,6,5
107382,7,20
60429,8,16
102030,9,7
36133,10,39


In [8]:
with open('./data/transaction.txt', 'w') as f:
    for order_id, group_df in merged_df.groupby('order_id'):
        product_quantities = {}
        
        if not group_df.empty:
            for index, row in group_df.iterrows():
                product_id = row['product_id_encoded']
                if product_id in product_quantities:
                    product_quantities[product_id] += 1
                else:
                    product_quantities[product_id] = 1
                    
                product_ids = [str(product_id) for product_id in product_quantities.keys()]
                total_price = sum(row['price'] * quantity for product_id, quantity in product_quantities.items())
                
                subtotals = [str(row['price'] * quantity) for product_id, quantity in product_quantities.items()]
                formatted_line = f"{' '.join(product_ids)}:{total_price}:{' '.join(subtotals)}\n"
                f.write(formatted_line)

