# INPUT FILES FORMATTING

In [1]:

import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

## Replacing Portuguese Categories with English Translations

In [5]:
products_df = pd.read_csv("./data/olist_products_dataset.csv")
translations_df = pd.read_csv("./data/product_category_name_translation.csv")

translations_dict = dict(zip(translations_df["product_category_name"], translations_df["product_category_name_english"]))
products_df["product_category_name"] = products_df["product_category_name"].apply(lambda x: translations_dict[x] if x in translations_dict else x)

products_df.to_csv("./data/olist_products_dataset.csv", index=False)

## Removing All Double Quotes

In [13]:
def remove_double_quotes_from_file(file_path):
    try:
        df = pd.read_csv(file_path)
        df = df.applymap(lambda x: str(x).replace('"', ''))
        df.to_csv(file_path, index=False)
        print(f"Double quotes removed from {file_path}.")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

data_folder = "data/"
files = [os.path.join(data_folder, file) for file in os.listdir(data_folder) if os.path.isfile(os.path.join(data_folder, file))]
for file in files:
    if file.endswith(".csv"):
        remove_double_quotes_from_file(file)


Double quotes removed from data/olist_customers_dataset.csv.
Double quotes removed from data/olist_geolocation_dataset.csv.
Double quotes removed from data/olist_orders_dataset.csv.
Double quotes removed from data/olist_order_items_dataset.csv.
Double quotes removed from data/olist_order_payments_dataset.csv.
Double quotes removed from data/olist_order_reviews_dataset.csv.
Double quotes removed from data/olist_products_dataset.csv.
Double quotes removed from data/olist_sellers_dataset.csv.
Double quotes removed from data/product_category_name_translation.csv.
Double quotes removed from data/taxonomy.csv.
Double quotes removed from data/transaction.csv.


## Formatting input files for the algorithm
### Formatting Transaction database
In the **transaction database**, each line represents a transaction. Each line is composed of three sections, as follows.

1. First, the **items** contained in the transaction are listed. An item is represented by a positive integer. Each item is separated from the next item by a single space. It is assumed that all items within a same transaction are sorted according to a total order (e.g. ascending order) and that no item can appear twice within the same transaction.
2. Second, the symbol “:” appears and is followed by the **transaction utility** (an integer).
3. Third, the symbol “:” appears and is followed by the **utility of each item** in this transaction (an integer), separated by single spaces.

For example, for the previous example, the input file for the transaction is defined as follows:

1 3:6:5 1\
5:3:3\
1 2 3 4 5:25:5 10 1 6 3\
2 3 4 5:20:8 3 6 3\
1 3 4:8:5 1 2\
1 3 5:22:10 6 6\
2 3 5:9:4 2 3

Consider the first line. It means that the transaction {1, 3} has a total utility of 6 and that items 1 and 3 respectively have a utility of 5 and 1 in this transaction. The following lines follow the same format.


### Formatting Taxonomy database

In the **taxonomy database**, each line represents a category relationship. Each line is composed of two sections, as follows.

1. First, the **item**.
2. Second, the symbol “,” appears and is followed by a **generalized item**, representing the category that the item belongs.

For example, for the previous example, the input file for the **taxonomy** is defined as follows:

1,6\
2,6\
3,7\
4,8\
5,8\
6,7

Consider the first line. It means that the item 1 is belongs to the generalized item 6.



In [14]:
order_items_df = pd.read_csv("./data/olist_order_items_dataset.csv")
products_df = pd.read_csv("./data/olist_products_dataset.csv")
merged_df = pd.merge(order_items_df, products_df, on="product_id", how="inner")
print("----Dataframe shapes----")
print(f'Order items: {order_items_df.shape}')
print(f'Products: {products_df.shape}')
print(f'Merged: {merged_df.shape}')
print()

merged_df = merged_df[['order_id', 'product_id', 'price', 'product_category_name']]
merged_df.sample(10)
# print merged_df without index 

----Dataframe shapes----
Order items: (112650, 7)
Products: (32951, 9)
Merged: (112650, 15)



Unnamed: 0,order_id,product_id,price,product_category_name
50068,be382a9e1ed25128148b97d6bfdb21af,6dde44b4172999f35f08654d06bad633,194.99,office_furniture
48955,35b2be75ac71d838d1743e201de6bcaa,937c825572d26b9e5a723527c6d625c6,44.57,bed_bath_table
99277,8d91588bf6ce83db4198f80c56776af6,32f48d8080ba3d1d0cf00475f68cdb07,209.99,furniture_decor
41493,e98fdad1f9ba500a9aecfaa90e1662cd,e8316a4667e5870c85e906b1f062bde1,78.99,office_furniture
108914,d746b7265b49f240491cad6ed27e20be,93e27ca75901cf1b9d641cd11dd9f6e5,45.9,housewares
15465,bdbe430ba78a7cff2db4ac139372d5b5,2028bf1b01cafb2d2b1901fca4083222,56.99,perfumery
106598,c297f4fecff2d1898ed5266cd2c22b29,5ba865074da5eba1460c298b574f9927,160.0,fashion_bags_accessories
39842,a17b0f920f100411c57496d41de7aa12,16679d68bdbed3fb97af806424d7f2b7,29.0,health_beauty
105847,bc2875d103c239620c7d56924ce991df,a50b1500d1f7ec62d2f3fba260974613,86.3,furniture_decor
22036,a2749402701871292b1ca325a1b9f6ff,44d097d59e8430f88a67517cd0c4f865,69.0,fashion_bags_accessories


In [15]:
print("--------Missing values-------")
print(merged_df.isnull().sum())

print("\n-Removing all missing values-")
merged_df = merged_df.dropna()
print(merged_df.isnull().sum())


merged_df.sample(10)

--------Missing values-------
order_id                    0
product_id                  0
price                       0
product_category_name    1603
dtype: int64

-Removing all missing values-
order_id                 0
product_id               0
price                    0
product_category_name    0
dtype: int64


Unnamed: 0,order_id,product_id,price,product_category_name
105663,baa821bd1dbd5c6d31c9b1c68f760b8b,1c6771d60124b84ada98e8f67ba111db,3.98,housewares
79521,a02f84215d0352b46e21cd5c8bba24f8,af827351cc8287dd464466e8793046b0,66.9,health_beauty
63164,1d35be28af536ed23f7e3ae4daa91aaf,c71a9c53596bf311acd9e298a0b3dc42,10.9,electronics
34441,0dca3fc331e8c1be1dd82ecb2689f3af,362b773250263786dd58670d2df42c3b,59.9,sports_leisure
62473,42c85bf5583375774d0e2a73b4ba4d7e,44a6e4ecf7b48f87481c243383b1c4cf,42.9,health_beauty
79687,3cc541809872382b28df3d51b420c807,a12d9f996527844ba7beaa1abe2bcfcb,39.99,fashion_bags_accessories
7196,54bc2449fbe0368b74470e4280fe494e,78efe838c04bbc568be034082200ac20,99.9,furniture_decor
12504,33315f815f0bf4c95cf6b4dbc3c89024,c4baedd846ed09b85f78a781b522f126,89.99,garden_tools
53727,c8ddaf4d1bb162951e8e76d02c160867,7c24503a64637f3ab756ed1b44826747,35.9,toys
88478,bdabf4d21213c2844d0b672d8fd72054,216ac797639b7734f0b125861ea845ae,44.0,stationery


In [16]:
encoder = LabelEncoder()
merged_df['order_id_encoded'] = encoder.fit_transform(merged_df['order_id'])
merged_df['product_id_encoded'] = encoder.fit_transform(merged_df['product_id'])
merged_df['product_category_name_encoded'] = encoder.fit_transform(merged_df['product_category_name'])
merged_df.sample(10)

Unnamed: 0,order_id,product_id,price,product_category_name,order_id_encoded,product_id_encoded,product_category_name_encoded
69115,38173930252d680d26301b01fbc43de4,3871b7939245ed35ab97c7d3924c6862,17.99,computers_accessories,21223,7201,15
78683,39fc0202a3b70b2619a6f3e579beedeb,8f8cb7e4a7f16d339f87f8aa2711a003,79.9,toys,21994,18158,71
105619,ba57df46d6c7801e304a57084265bd86,55b22991373bc1fc844500be10faf60d,110.9,sports_leisure,70751,10945,67
70856,a5376eacc65c47eeceb1e796b6c2600d,269616df4c79febdae020f2d978594ed,89.0,air_conditioning,62488,5007,1
47728,cbec458163d85abfb23e8838fd9196e6,5a57a59c44429be19e1ce8e69e15c473,120.0,housewares,77507,11508,49
56975,4f04661f0e0a1a23f6d87956884f4a02,f32415d23c358ef1e387a7d329d9ce9f,36.9,stationery,30072,30735,68
11620,7df9173a1f3284d95a5ca6ac5aad32e2,0bcc3eeca39e1064258aa1e932269894,49.0,garden_tools,47789,1483,42
33886,93a5d882408bb90dde6eaf5159ac5adb,371fa349d92646bbeea5873df0bcdc5a,39.0,housewares,55788,7025,49
33301,9bdb93ca8e16e9436136b6ed68b6d507,2d063994ae4667cff3d6841e9d441d1d,119.99,home_appliances_2,58861,5799,45
102940,a685d016c8a26f71a0bb67821070e398,ebd7c847c1e1cb69ec374ae0ebee1f4c,84.9,furniture_decor,63016,29793,39


In [17]:
taxonomy_df = merged_df[['product_id_encoded', 'product_category_name_encoded']].drop_duplicates()
taxonomy_df.to_csv('./data/taxonomy.csv', index=False, header=False)
taxonomy_df.sample(10)

Unnamed: 0,product_id_encoded,product_category_name_encoded
69403,20640,15
100177,21786,57
108807,9809,17
107773,7524,72
46583,20045,70
109971,15970,7
109021,28479,39
40842,25658,39
60128,9292,50
108403,11842,71


In [11]:
with open('./data/transaction.csv', 'w') as f:
    for order_id, group_df in merged_df.groupby('order_id'):
        product_quantities = {}
        
        for index, row in group_df.iterrows():
            product_id = row['product_id_encoded']
            if product_id in product_quantities:
                product_quantities[product_id] += 1
            else:
                product_quantities[product_id] = 1
        
        product_ids = [str(product_id) for product_id in product_quantities.keys()]
        total_price = sum(row['price'] * quantity for product_id, quantity in product_quantities.items())
        subtotals = [str(row['price'] * quantity) for product_id, quantity in product_quantities.items()]
        formatted_line = f"{' '.join(product_ids)}:{total_price}:{' '.join(subtotals)}\n"
        
        f.write(formatted_line)