# INPUT FILES FORMATTING

In [1]:

import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

## Replacing Portuguese Categories with English Translations

In [2]:
# Load the product catalogue and the category-name lookup table.
products_df = pd.read_csv("./data/olist_products_dataset.csv")
translations_df = pd.read_csv("./data/product_category_name_translation.csv")

# Lookup: original category name -> English category name.
translations_dict = dict(
    zip(
        translations_df["product_category_name"],
        translations_df["product_category_name_english"],
    )
)

# Names that are absent from the lookup (missing values included) pass through unchanged.
products_df["product_category_name"] = products_df["product_category_name"].map(
    lambda name: translations_dict.get(name, name)
)

# The translated catalogue overwrites the file it was read from.
products_df.to_csv("./data/olist_products_dataset.csv", index=False)

## Removing All Double Quotes

In [3]:
def remove_double_quotes_from_file(file_path):
    """Remove every double-quote character from the text cells of a CSV file, in place.

    The file is parsed with ``pd.read_csv``, each string value has its ``"``
    characters removed, and the result is written back to the same path
    without the index column.

    Only genuine ``str`` values are touched. Numeric columns are skipped and
    missing values stay missing, so they are written as empty fields rather
    than as the literal text ``nan``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to rewrite.

    Returns
    -------
    None
        Success or failure is reported on stdout. Any exception raised while
        reading, cleaning or writing is caught and printed, so one bad file
        does not stop a batch run.
    """
    def _strip_quotes(value):
        # Leave NaN / non-text objects alone instead of stringifying them.
        return value.replace('"', '') if isinstance(value, str) else value

    try:
        df = pd.read_csv(file_path)
        for column in df.columns:
            # Numeric (and boolean) columns cannot contain quote characters.
            if pd.api.types.is_numeric_dtype(df[column]):
                continue
            df[column] = df[column].map(_strip_quotes)
        df.to_csv(file_path, index=False)
        print(f"Double quotes removed from {file_path}.")
    except Exception as e:
        # Deliberate best-effort: report the problem and carry on.
        print(f"Error processing {file_path}: {e}")

# Collect every regular file that sits directly inside the data folder.
data_folder = "data/"
files = list(
    filter(
        os.path.isfile,
        (os.path.join(data_folder, name) for name in os.listdir(data_folder)),
    )
)

# Only CSV files are cleaned; anything else is skipped.
for file in files:
    if not file.endswith(".csv"):
        continue
    remove_double_quotes_from_file(file)


Double quotes removed from data/olist_order_items_dataset.csv.
Double quotes removed from data/olist_sellers_dataset.csv.
Double quotes removed from data/olist_customers_dataset.csv.
Double quotes removed from data/olist_products_dataset.csv.
Double quotes removed from data/olist_geolocation_dataset.csv.
Double quotes removed from data/olist_orders_dataset.csv.
Double quotes removed from data/product_category_name_translation.csv.
Double quotes removed from data/olist_order_reviews_dataset.csv.
Double quotes removed from data/DATA.csv.
Double quotes removed from data/olist_order_payments_dataset.csv.


## Formatting input files for the algorithm
### Formatting Transaction database
In the **transaction database**, each line represents a transaction. Each line is composed of three sections, as follows.

1. First, the **items** contained in the transaction are listed. An item is represented by a positive integer. Each item is separated from the next item by a single space. It is assumed that all items within the same transaction are sorted according to a total order (e.g. ascending order) and that no item can appear twice within the same transaction.
2. Second, the symbol “:” appears and is followed by the **transaction utility** (an integer).
3. Third, the symbol “:” appears and is followed by the **utility of each item** in this transaction (an integer), separated by single spaces.

For example, the input file for a transaction database could be defined as follows:

1 3:6:5 1\
5:3:3\
1 2 3 4 5:25:5 10 1 6 3\
2 3 4 5:20:8 3 6 3\
1 3 4:8:5 1 2\
1 3 5:22:10 6 6\
2 3 5:9:4 2 3

Consider the first line. It means that the transaction {1, 3} has a total utility of 6 and that items 1 and 3 respectively have a utility of 5 and 1 in this transaction. The following lines follow the same format.


### Formatting Taxonomy database

In the **taxonomy database**, each line represents a category relationship. Each line is composed of two sections, as follows.

1. First, the **item**.
2. Second, the symbol “,” appears and is followed by a **generalized item**, representing the category that the item belongs to.

For example, the input file for the **taxonomy** matching the transaction example above is defined as follows:

1,6\
2,6\
3,7\
4,8\
5,8\
6,7

Consider the first line. It means that item 1 belongs to the generalized item 6.



In [4]:
# Load the order lines and the product catalogue, then attach product
# attributes to each order line (rows without a matching product are dropped).
order_items_df = pd.read_csv("./data/olist_order_items_dataset.csv")
products_df = pd.read_csv("./data/olist_products_dataset.csv")
merged_df = order_items_df.merge(products_df, on="product_id", how="inner")

# Shape report, followed by one blank line.
print(
    "----Dataframe shapes----",
    f'Order items: {order_items_df.shape}',
    f'Products: {products_df.shape}',
    f'Merged: {merged_df.shape}',
    "",
    sep="\n",
)

# Keep only the columns used by the rest of the notebook.
merged_df = merged_df.loc[:, ['order_id', 'product_id', 'price', 'product_category_name']]

# Preview ten random rows.
merged_df.sample(10)

----Dataframe shapes----
Order items: (112650, 7)
Products: (32951, 9)
Merged: (112650, 15)



Unnamed: 0,order_id,product_id,price,product_category_name
32339,53f5a7f622d498ff3eeb334b8efa7ae7,eff955ba97941dc6837a770367d66944,43.9,furniture_decor
88903,5a92b60d387be04eb53698b2f8b9b043,11165ade430c6cc010bb0042fe88d150,189.0,toys
37292,3ac72d6b4d6ecee25955544a54657e3b,781afe929e3016a667f5f439afd55fce,107.7,sports_leisure
103603,ab30819331b8ee6115370bcfa01a8350,edb5629baae4be4447033a2d14b1d2d3,198.0,bed_bath_table
64230,ff7ea91a98e1590d290e99bcffefa149,1c21c1ed398eae61184065889ca3d4b0,69.0,bed_bath_table
12169,9bdc00acb04f677adc8329b932122fd6,422879e10f46682990de24d770e7f83d,49.91,garden_tools
16773,50564f05e21e6d61f68d7a65c57c8623,57fcacc3434a1f2f2b039c1b4e61f5e1,18.9,telephony
64011,1e54e64066c85e4c0b929c0dde6bec3b,03940e741e919e2c258e177d875f8423,59.9,auto
1908,383e295c58b1b5e923ba15932359ad38,bdcf6a834e8faa30dac3886c7a58e92e,35.9,health_beauty
29650,333c2d7fa97da12f6523d992565cc8a7,cfb763496d9fc48751a27db4fd02aa2d,49.95,bed_bath_table


In [5]:
# Per-column count of missing values before cleaning.
print("--------Missing values-------")
print(merged_df.isna().sum())

# Drop every row that has at least one missing value, then show the
# per-column counts again to confirm nothing is left.
print("\n-Removing all missing values-")
merged_df = merged_df.dropna(axis=0, how="any")
print(merged_df.isna().sum())

# Preview ten random rows of the cleaned frame.
merged_df.sample(10)

--------Missing values-------
order_id                    0
product_id                  0
price                       0
product_category_name    1603
dtype: int64

-Removing all missing values-
order_id                 0
product_id               0
price                    0
product_category_name    0
dtype: int64


Unnamed: 0,order_id,product_id,price,product_category_name
13184,9e6491dcf30125187adc8a3b5b5a571b,29427de7f8a9ee983d9dbc51cec569b4,99.99,cool_stuff
87895,567ca196ed2fefd9a66f852f8fd67058,39fbb313b6c8c6cc28e358f37e3068c1,179.9,watches_gifts
101078,99d3df1714b20b9eb9734e731927379a,42d2add84cc80443a8128adc87b6a958,299.9,perfumery
65045,1fcbf9da01b8072e053902e28b52ac9f,d1ff1a1223a2739d17005c9f4c4a40b6,99.0,stationery
24627,92723afaec3525daf0ecfb20994aafa0,dc404a1496a08f9f5540c8b5d4b92925,299.99,toys
20699,7df81d6dcc2ba63195a6197f6362c4bb,25e2023ed83352bde98dc1490d14c3d8,99.0,toys
74437,d34c74f081e266474543cef1597b1e60,b570f2a8228eeaa87e224f25264576b5,84.9,bed_bath_table
79430,5486a5d84a1d6dcc94887f6f2821a4be,5bdfe8e8b816fda68f9f8eb7c0c08e62,37.0,bed_bath_table
9477,4753fcd70058320b38a45cc68f6de6c7,53759a2ecddad2bb87a079a1f1519f73,59.0,garden_tools
24360,9183e2fd9515c57925a0251ccfc21c63,4ae634441e444ca4bc85903cafe98d73,13.98,telephony


In [6]:
# One encoder object is re-fitted for each column in turn, so after this cell
# it only remembers the classes of the last column (the category names).
encoder = LabelEncoder()

# Orders: integer codes as produced by the encoder.
merged_df['order_id_encoded'] = encoder.fit_transform(merged_df['order_id'])

# Products: codes shifted by one so that item ids are strictly positive.
merged_df['product_id_encoded'] = encoder.fit_transform(merged_df['product_id']) + 1

# Categories: numbered directly after the largest product id, so product and
# category ids never collide.
merged_df['product_category_name_encoded'] = (
    encoder.fit_transform(merged_df['product_category_name'])
    + merged_df['product_id_encoded'].max()
    + 1
)

# NOTE(review): astype(int) truncates the fractional part of the price, so a
# price below 1 becomes 0 -- confirm this is the intended integer utility.
merged_df['price'] = merged_df['price'].astype(int)

# Preview ten random rows.
merged_df.sample(10)

Unnamed: 0,order_id,product_id,price,product_category_name,order_id_encoded,product_id_encoded,product_category_name_encoded
79448,3c20f6cdfc62556d4c0ee421cbff6d76,bfd4d65a528678737bcb523e5a3c5f58,105,garden_tools,22796,24201,32384
28726,d626131b771ffd6c88169da39f702900,d017a2151d543a9885604dc62a3d9dcc,49,fashion_bags_accessories,81511,26340,32370
26666,cbc956fc2a27bbc8dc18bd3a27ce7127,ee406bf28024d97771c4b1e8b7e8e219,144,cool_stuff,77456,30104,32362
51540,4b1104831c2d0c40dc17dc5c9d5cfb35,0eda6e310c91549299a2f97d0354f7fb,104,health_beauty,28548,1912,32385
16623,57ebffcbf30e5b8850f0e4c8713fd2dc,7d50930301709838d8e90dcac5cfaa84,59,sports_leisure,33387,15955,32409
101115,9a0b2a68507680a6e151319e4b6e68f7,01e20e6604216c8adb31d463214ba00c,319,home_construction,58164,247,32390
110753,ea461a2421a4be04b0d69e11e12788ca,969c37a70b5e0bec58e389978297e72f,39,costruction_tools_garden,89070,18991,32363
46345,b52d29abb97a3978df1d8922bfe36fbe,6463e442310f2f4f321c83a32d8c662b,138,health_beauty,68738,12733,32385
57777,2cd7c5efeb90335265680de1f5a82a44,a02d0123079f4ae96001ba2010d1a2df,230,construction_tools_lights,16888,20234,32360
88235,57e4fc8f20507381b3f7abbe4e2ccda3,6ca11e79ed1acb748ccbb8e6b821967d,199,perfumery,33382,13778,32402


In [7]:
# One (product id, category id) pair per distinct combination, ordered by product id.
taxonomy_df = (
    merged_df.loc[:, ['product_id_encoded', 'product_category_name_encoded']]
    .drop_duplicates()
    .sort_values(by='product_id_encoded')
)

# Written as bare "item,category" lines: no header row, no index column.
taxonomy_df.to_csv('./data/taxonomy.txt', index=False, header=False)

# Preview the first ten pairs.
taxonomy_df.head(10)

Unnamed: 0,product_id_encoded,product_category_name_encoded
111565,1,32402
111787,2,32347
44201,3,32349
79011,4,32391
102982,5,32414
105784,6,32347
107382,7,32362
60429,8,32358
102030,9,32349
36133,10,32381


In [8]:
def build_transaction_lines(order_items):
    """Build one ``items:total:utilities`` line per order.

    Parameters
    ----------
    order_items : pandas.DataFrame
        One row per purchased unit, with the columns ``order_id``,
        ``product_id_encoded`` and ``price``. ``price`` is expected to be an
        integer column already, so the utilities are rendered without a
        decimal point.

    Returns
    -------
    list of str
        One line per order, without a trailing newline, ordered by
        ``order_id``. Within a line:

        * items are the distinct product ids of the order in ascending order,
          separated by single spaces;
        * the transaction utility is the sum of all prices in the order;
        * the item utilities are the summed prices of each product (a product
          bought several times contributes each of its rows), listed in the
          same order as the items.
    """
    if order_items.empty:
        return []

    # Utility of each product inside each order. The default sorted groupby
    # yields rows ordered by order_id, then by ascending product id.
    per_item = (
        order_items
        .groupby(['order_id', 'product_id_encoded'], sort=True)['price']
        .sum()
        .reset_index()
    )
    per_item['item_text'] = per_item['product_id_encoded'].astype(str)
    per_item['utility_text'] = per_item['price'].astype(str)

    # Collapse to one row per order; joins keep the ascending item order.
    per_order = per_item.groupby('order_id', sort=True).agg(
        items=('item_text', ' '.join),
        total=('price', 'sum'),
        utilities=('utility_text', ' '.join),
    )

    return [
        f"{items}:{total}:{utilities}"
        for items, total, utilities in zip(
            per_order['items'], per_order['total'], per_order['utilities']
        )
    ]


# Exactly one line is written per order.
with open('./data/transaction.txt', 'w') as f:
    f.writelines(f"{line}\n" for line in build_transaction_lines(merged_df))

