# INPUT FILES FORMATTING

In [1]:

import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

## Replacing Portuguese Categories with English Translations

In [2]:
products_df = pd.read_csv("./data/olist_products_dataset.csv")
translations_df = pd.read_csv("./data/product_category_name_translation.csv")

translations_dict = dict(zip(translations_df["product_category_name"], translations_df["product_category_name_english"]))
products_df["product_category_name"] = products_df["product_category_name"].apply(lambda x: translations_dict[x] if x in translations_dict else x)

products_df.to_csv("./data/olist_products_dataset.csv", index=False)

## Removing All Double Quotes

In [3]:
def remove_double_quotes_from_file(file_path):
    try:
        df = pd.read_csv(file_path)
        df = df.applymap(lambda x: str(x).replace('"', ''))
        df.to_csv(file_path, index=False)
        print(f"Double quotes removed from {file_path}.")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

data_folder = "data/"
files = [os.path.join(data_folder, file) for file in os.listdir(data_folder) if os.path.isfile(os.path.join(data_folder, file))]
for file in files:
    if file.endswith(".csv"):
        remove_double_quotes_from_file(file)


Double quotes removed from data/olist_order_items_dataset.csv.
Double quotes removed from data/olist_sellers_dataset.csv.
Double quotes removed from data/olist_customers_dataset.csv.
Double quotes removed from data/olist_products_dataset.csv.
Double quotes removed from data/olist_geolocation_dataset.csv.
Double quotes removed from data/olist_orders_dataset.csv.
Double quotes removed from data/product_category_name_translation.csv.
Double quotes removed from data/olist_order_reviews_dataset.csv.
Double quotes removed from data/olist_order_payments_dataset.csv.


## Formatting input files for the algorithm
### Formatting Transaction database
In the **transaction database**, each line represents a transaction. Each line is composed of three sections, as follows.

1. First, the **items** contained in the transaction are listed. An item is represented by a positive integer. Each item is separated from the next item by a single space. It is assumed that all items within a same transaction are sorted according to a total order (e.g. ascending order) and that no item can appear twice within the same transaction.
2. Second, the symbol “:” appears and is followed by the **transaction utility** (an integer).
3. Third, the symbol “:” appears and is followed by the **utility of each item** in this transaction (an integer), separated by single spaces.

For example, for the previous example, the input file for the transaction is defined as follows:

1 3:6:5 1\
5:3:3\
1 2 3 4 5:25:5 10 1 6 3\
2 3 4 5:20:8 3 6 3\
1 3 4:8:5 1 2\
1 3 5:22:10 6 6\
2 3 5:9:4 2 3

Consider the first line. It means that the transaction {1, 3} has a total utility of 6 and that items 1 and 3 respectively have a utility of 5 and 1 in this transaction. The following lines follow the same format.


### Formatting Taxonomy database

In the **taxonomy database**, each line represents a category relationship. Each line is composed of two sections, as follows.

1. First, the **item**.
2. Second, the symbol “,” appears and is followed by a **generalized item**, representing the category that the item belongs.

For example, for the previous example, the input file for the **taxonomy** is defined as follows:

1,6\
2,6\
3,7\
4,8\
5,8\
6,7

Consider the first line. It means that the item 1 is belongs to the generalized item 6.



In [4]:
order_items_df = pd.read_csv("./data/olist_order_items_dataset.csv")
products_df = pd.read_csv("./data/olist_products_dataset.csv")
merged_df = pd.merge(order_items_df, products_df, on="product_id", how="inner")
print("----Dataframe shapes----")
print(f'Order items: {order_items_df.shape}')
print(f'Products: {products_df.shape}')
print(f'Merged: {merged_df.shape}')
print()

merged_df = merged_df[['order_id', 'product_id', 'price', 'product_category_name']]
merged_df.sample(10)
# print merged_df without index 

----Dataframe shapes----
Order items: (112650, 7)
Products: (32951, 9)
Merged: (112650, 15)



Unnamed: 0,order_id,product_id,price,product_category_name
39226,a41222506c789f70f25201004288819a,2f13d1dc8b4e1d9d8027be50339546a9,105.0,furniture_decor
35684,0758eac62489b47a99c64b7d67c5ff18,b9023d686fca0dca3ac844814e92e92b,21.9,electronics
28691,9e07481935df5f97ce05a674b3165bd5,d017a2151d543a9885604dc62a3d9dcc,49.0,fashion_bags_accessories
58698,463f34a9bee899d2d1f5f08af0c86d06,ee5329ffbe604bebcf84b2fd7792024e,233.0,health_beauty
46756,d8fd983079230409820f11f044e4fe4a,a00d11a2119bd70d658fc7cdcf1f59d3,99.9,furniture_decor
24823,c1b6c1248aa41e66a84d6c6d8bcd9225,5aad359a63cbb4fc35ead44346a3eeb4,10.99,furniture_decor
42994,7c07f56b7d198e14ff2e6a757644d55c,59fe488ea6ac9439bc86663f4a564c23,17.0,telephony
60587,eb1340135e81ffed22e5703c6fbbe879,db6796aa712a2626572bb3d0b9a039cf,52.9,bed_bath_table
41520,2c1580970bae47b697ca0ee56495f27e,6777865be5ee937f7deb9c390e9d920f,34.9,furniture_decor
98920,8b388d845a3e8700444b971fa2bef902,35bd74b6319ba18ae4f30d8b54144fd2,39.9,baby


In [5]:
print("--------Missing values-------")
print(merged_df.isnull().sum())

print("\n-Removing all missing values-")
merged_df = merged_df.dropna()
print(merged_df.isnull().sum())


merged_df.sample(10)

--------Missing values-------
order_id                    0
product_id                  0
price                       0
product_category_name    1603
dtype: int64

-Removing all missing values-
order_id                 0
product_id               0
price                    0
product_category_name    0
dtype: int64


Unnamed: 0,order_id,product_id,price,product_category_name
8430,6a233c2059af7138039e43722cde5874,5d66715cc928aadd0074f61332698593,19.9,electronics
111507,f26d4637c4f560f72ba7feac1675c100,37244e10d8b612acfba950f0ef0be093,399.0,construction_tools_construction
95918,7a189472050d185af07c5550a7dfc5ad,ffcfaba393e8ef71937c6e8421bc2868,288.0,home_construction
83665,47c5b42ecbd216813dc0dbcdfb964a9e,f2844deda444d0208297c6d614d1493f,225.75,home_appliances
105922,bcc2b6a2acc5f49f9ed5ea31410be28a,83f2c628d86dde5369cde6ffb51d919a,85.9,sports_leisure
67272,3981ab3fc085c2a1d4dcca88197fe7fd,4eba76ab901d06e619d8d7ac69b3579a,43.0,baby
15154,64c233aafb76618a84c4a3af54e79bb9,ddceb6eef6af67e9fbbd4c148dbe5fd9,209.99,cool_stuff
20281,881b1412b8cff1e5c88e905fdedd7605,ecbe401f10ecd3b822d940d63cc3311e,79.0,electronics
67238,8c9cc9d7e2dfc4cb091854203914cade,33d26f298db59a201858e0248d9cee6c,149.99,toys
79356,3bddb2b41050ba947411a0fc9143fcbd,660b5f38348490e8e14f63deb7389bf2,119.9,baby


In [6]:
encoder = LabelEncoder()
merged_df['order_id_encoded'] = encoder.fit_transform(merged_df['order_id'])
merged_df['product_id_encoded'] = encoder.fit_transform(merged_df['product_id'])
merged_df['product_category_name_encoded'] = encoder.fit_transform(merged_df['product_category_name'])
merged_df.sample(10)

Unnamed: 0,order_id,product_id,price,product_category_name,order_id_encoded,product_id_encoded,product_category_name_encoded
81009,40654581b4282e70a554c9eb51fc8a91,78ae9099d0b8c74be6cb8deb9bd84f8a,39.99,baby,24382,15376,6
21376,d891fcff7f5720b6a87ecd0a3dbf0c27,764292b2b0f73f77a0272be03fdd45f3,89.9,furniture_decor,82406,15056,39
11392,6b6e84189c512720324071e9a7769ceb,629e019a6f298a83aeecc7877964f935,93.9,auto,40804,12521,5
92830,a46e7f3658f8b1ede78bcb17c859655b,70501750103046be6103489889ccb96e,210.0,garden_tools,62180,14267,42
45575,1f99a2d6f950889eb7d7a61f4d97c70c,dd6a505f83dd3c6326aa9856519e0978,49.9,toys,11988,28029,71
89699,5dba08d21f1386085c1bdd47548f34a4,7c1ac37ea884ec9c46ebf155ffc53d50,40.16,health_beauty,35485,15800,43
101479,9c990d5aff064d220ea8962a5d97f4a9,4bc67f0e0c4badffd66ddb6641122a33,48.0,watches_gifts,59147,9661,72
16809,01d3d4cb8c553d507dfd26fbd8b41e95,33e77eecc23c25a65d10f627e5053617,114.94,office_furniture,685,6637,57
25892,9436e03932021844bcaa6fb20dfc4968,e5ae72c62ebfa708624f5029d609b160,61.9,computers_accessories,56015,29024,15
20723,567b8d2e08305bf8754eb7f1a3ad7351,2c7f01e4f39f8ca21caddb86374d7b4b,249.0,electronics,32861,5736,26


In [7]:
taxonomy_df = merged_df[['product_id_encoded', 'product_category_name_encoded']].drop_duplicates()
taxonomy_df.to_csv('./data/taxonomy.csv', index=False, header=False)
taxonomy_df.sample(10)

Unnamed: 0,product_id_encoded,product_category_name_encoded
112236,29407,30
53864,5151,43
90589,3611,7
85431,31557,43
92637,10811,15
21023,1306,6
76464,11884,70
108062,2702,15
92194,13326,67
99626,711,7


In [9]:
with open('./data/transaction.csv', 'w') as f:
    for order_id, group_df in merged_df.groupby('order_id'):
        product_quantities = {}
        
        for index, row in group_df.iterrows():
            product_id = row['product_id_encoded']
            if product_id in product_quantities:
                product_quantities[product_id] += 1
            else:
                product_quantities[product_id] = 1
        
        product_ids = [str(product_id) for product_id in product_quantities.keys()]
        total_price = sum(row['price'] * quantity for product_id, quantity in product_quantities.items())
        subtotals = [str(row['price'] * quantity) for product_id, quantity in product_quantities.items()]
        formatted_line = f"{' '.join(product_ids)}:{total_price}:{' '.join(subtotals)}\n"
        
        f.write(formatted_line)