In [22]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import folium

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None) # display all columns, without limits

In [23]:
# Load the dataset
products_df = pd.read_csv('dataset/olist_products_dataset.csv')
products_df

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,moveis_decoracao,45.0,67.0,2.0,12300.0,40.0,40.0,40.0
32947,bf4538d88321d0fd4412a93c974510e6,construcao_ferramentas_iluminacao,41.0,971.0,1.0,1700.0,16.0,19.0,16.0
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,cama_mesa_banho,50.0,799.0,1.0,1400.0,27.0,7.0,27.0
32949,83808703fc0706a22e264b9d75f04a2e,informatica_acessorios,60.0,156.0,2.0,700.0,31.0,13.0,20.0


In [24]:
# translate product categories
pcateg_translation_df = pd.read_csv('dataset/product_category_name_translation.csv')

final_products_df = products_df.merge(pcateg_translation_df, on='product_category_name', how='left')
final_products_df = final_products_df.drop(columns=['product_category_name'])
final_products_df = final_products_df.rename(columns={'product_category_name_english': 'product_category_name'})
final_products_df

Unnamed: 0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name
0,1e9e8ef04dbcff4541ed26657ea517e5,40.0,287.0,1.0,225.0,16.0,10.0,14.0,perfumery
1,3aa071139cb16b67ca9e5dea641aaa2f,44.0,276.0,1.0,1000.0,30.0,18.0,20.0,art
2,96bd76ec8810374ed1b65e291975717f,46.0,250.0,1.0,154.0,18.0,9.0,15.0,sports_leisure
3,cef67bcfe19066a932b7673e239eb23d,27.0,261.0,1.0,371.0,26.0,4.0,26.0,baby
4,9dc1a7de274444849c219cff195d0b71,37.0,402.0,4.0,625.0,20.0,17.0,13.0,housewares
...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,45.0,67.0,2.0,12300.0,40.0,40.0,40.0,furniture_decor
32947,bf4538d88321d0fd4412a93c974510e6,41.0,971.0,1.0,1700.0,16.0,19.0,16.0,construction_tools_lights
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,50.0,799.0,1.0,1400.0,27.0,7.0,27.0,bed_bath_table
32949,83808703fc0706a22e264b9d75f04a2e,60.0,156.0,2.0,700.0,31.0,13.0,20.0,computers_accessories


### **Missing Values**

In [25]:
final_products_df.isna().sum() # check for missing values

product_id                      0
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
product_category_name         623
dtype: int64

In [30]:
final_products_df[final_products_df['product_category_name'].isna()]

Unnamed: 0,product_id,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name
105,a41e356c76fab66334f36de622ecbd3a,,,,650.0,17.0,14.0,12.0,
128,d8dee61c2034d6d075997acef1870e9b,,,,300.0,16.0,7.0,20.0,
145,56139431d72cd51f19eb9f7dae4d1617,,,,200.0,20.0,20.0,20.0,
154,46b48281eb6d663ced748f324108c733,,,,18500.0,41.0,30.0,41.0,
197,5fb61f482620cb672f5e586bb132eae9,,,,300.0,35.0,7.0,12.0,
...,...,...,...,...,...,...,...,...,...
32515,b0a0c5dd78e644373b199380612c350a,,,,1800.0,30.0,20.0,70.0,
32589,10dbe0fbaa2c505123c17fdc34a63c56,,,,800.0,30.0,10.0,23.0,
32616,bd2ada37b58ae94cc838b9c0569fecd8,,,,200.0,21.0,8.0,16.0,
32772,fa51e914046aab32764c41356b9d4ea4,,,,1300.0,45.0,16.0,45.0,


### **Duplicate Values**

In [27]:
final_products_df.duplicated().sum() # check for duplicates

0

### **Unique Values**

In [28]:
pd.set_option('display.max_colwidth', None)

listItem = []
for col in final_products_df.columns :
    listItem.append([col, final_products_df[col].nunique(), final_products_df[col].sort_values().unique()])

df_uniques_per_column = pd.DataFrame(columns=['Column Name', 'Number of Unique', 'Unique Sample'], data=listItem)
df_uniques_per_column.sort_values('Number of Unique', ascending=False)

Unnamed: 0,Column Name,Number of Unique,Unique Sample
0,product_id,32951,"[00066f42aeeb9f3007548bb9d3f33c38, 00088930e925c41fd95ebfe695fd2655, 0009406fd7479715e4bef61dd91f2462, 000b8f95fcb9e0096488278317764d19, 000d9be29b5207b54e86aa1b1ac54872, 0011c512eb256aa0dbbb544d8dffcf6e, 00126f27c813603687e6ce486d909d01, 001795ec6f1b187d37335e1c4704762e, 001b237c0e9bb435f2e54071129237e9, 001b72dfd63e9833e8c02742adf472e3, 001c5d71ac6ad696d22315953758fa04, 00210e41887c2a8ef9f791ebc780cc36, 002159fe700ed3521f46cfcf6e941c76, 0021a87d4997a48b6cef1665602be0f5, 00250175f79f584c14ab5cecd80553cd, 002552c0663708129c0019cc97552d7d, 002959d7a0b0990fe2d69988affcbc80, 002af88741ba70c7b5cf4e4a0ad7ef85, 002c6dab60557c48cfd6c2222ef7fd76, 002d4ea7c04739c130bb74d7e7cd1694, 002ec297b1b00fb9dde7ee6ac24b6771, 0030026a6ddb3b2d1d4bc225b4b4c4da, 0030e635639c898b323826589761cf23, 003128f981470c3e5a2e7445e4a771cd, 0036bb031e69d915cd384d1b3838b5d3, 003938452c98ff9ab28e9e7b4bfe97ab, 003962cb74a8b43cf1034fed541a76f0, 003a31970fea14fffe92ac856b8a9b97, 003c0b8f6580c850bd2e32044d2ac307, 003dbcabcf8e3231de657c7d9f9a5eba, 004154251837f6ac124ad4374b3a8148, 0042f1a9a7e0edd1400c6cd0fda065f8, 0043c62d00db47eff6a6bc4cf6bfaeda, 0043d1a25ef08fb6f41b8fa6f91742ab, 0044d70d4e53450c0fbb8255446a797b, 004552d98c5d3653af8b4dbe8def0048, 004636c889c7c3dad6631f136b7fa082, 004ffcbfa5aac82212a95bc972ea8a85, 005030ef108f58b46b78116f754d8d38, 005c6b24cc96dca3e2c01e824401030e, 0060b415594c5e1200324ef1a18493c4, 00636f9286f69c9f1bdabe76e670fb50, 006508eabcc19ae52ed87323124ad0e9, 006619bbed68b000c8ba3f8725d5409e, 0067f97995b0c3a86739734dc87ee6b6, 006baa9a5b8f95895f15273a35bc2664, 006c67546bfe73c33b83f6bd1ad58c36, 00716e5593e8eea55dbe7a29b72a70bc, 00738f5a17ec4450e88915357b085c1e, 0075c14603d80515e20908c156ceb47b, 007c63ae4b346920756b5adcad8095de, 0081787b18cce4acc2d1adec4520d7ba, 0082684bb4a60a862baaf7a60a5845ed, 0085dddda27567189f1dfd1eda7c286e, 008686d56f4c85b987a4c24a286834c1, 00878d953636afec00d3e85d55a12e7f, 008b0ec3de3638637099f97add750136, 008b51e6be49ee039854b5e35c331b02, 008cff0e5792219fae03e570f980b330, 00905d58c87afcbce21420b3712cacaa, 00917855135d67fb060dcd81863166a4, 0091d85023824de2d33252310c952307, 00929aaa7751a77220db9caa1ae6d3ab, 0092a9e1c8a471753ed7ffa7eef5086e, 00989337a1916a0055eedd1fdb35eb53, 009af1277432f1a05742df69cdf72d1e, 009c09f439988bc06a93d6b8186dce73, 009df2b0bc078648fc4f5898de8cabff, 00a0db09dc7c94d512ec51900141ed69, 00a8c742ae1da97e9064205459977e6c, 00a985c524adbb97a4211e4ce17aabec, 00aa70fd749f3b9ce8ca4bc914a9e7b1, 00ab8a8b9fe219511dc3f178c6d79698, 00ae7076313576f94d9107599d79a978, 00b264091d1c8df03976c3f3b176b35c, 00b4155166f994ba9da3cf001eb80505, 00b782074a5ec523bf31951b4935d572, 00ba6d766f0b1d7b78a5ce3e1e033263, 00baba5b58e274d0332a0c8a0a66f877, 00bb62ea3729537a687c3fddcd123662, 00bc6e6d4ceb98ec7ed058b0a243a8b9, 00be617b58175bf207fd35910d5097a4, 00c1e6d55fe174bfbb179257255e49bb, 00c3e35b00dd00ab864d4a94c1b97e66, 00c407e056c74eb8040a1233ca6da16f, 00c744ca2f3b0e76ce227b146095d3f9, 00c794371af4dc7c5b6ba3ef913556bb, 00cc15e0cef551848c797ab75ba904b1, 00cf06147ed4880ec5fbba2adbb20e1d, 00d2add85b1f5aba6bb3d5d977314e25, 00d2fa6f2a87633084c8e0400d41a65a, 00d363b8c14bb9a6c73febc4c42acf47, 00d51a45854ebce28acda79f414388ee, 00d62b338366db4c4aec8547ea8f928e, 00d62d0388b0786d730a1aaed91c0ae0, 00d93a09990b319a7c946ed7a8c67656, 00dc8a1b2de45a7fca5921544e5f351d, 00ddaddbe03b7103d80daee96892363f, 00de606a5dda99c2499f94ef18282977, 00de7f393d962717eeeb2d7131a40dba, ...]"
2,product_description_lenght,2960,"[4.0, 8.0, 15.0, 20.0, 23.0, 26.0, 27.0, 28.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, ...]"
4,product_weight_g,2204,"[0.0, 2.0, 25.0, 50.0, 53.0, 54.0, 55.0, 58.0, 60.0, 61.0, 63.0, 65.0, 67.0, 70.0, 75.0, 76.0, 77.0, 78.0, 80.0, 82.0, 83.0, 85.0, 87.0, 88.0, 90.0, 91.0, 92.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 102.0, 103.0, 104.0, 105.0, 107.0, 108.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 132.0, 133.0, 135.0, 136.0, 137.0, 138.0, 139.0, 140.0, 143.0, 144.0, 145.0, 146.0, 147.0, 148.0, 150.0, 151.0, 152.0, 153.0, 154.0, 155.0, 156.0, 157.0, 158.0, 160.0, 161.0, 162.0, 163.0, 165.0, 166.0, 167.0, 169.0, 170.0, 171.0, 172.0, 173.0, 175.0, 177.0, 178.0, 179.0, 180.0, ...]"
6,product_height_cm,102,"[2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 102.0, 103.0, ...]"
5,product_length_cm,99,"[7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, nan]"
7,product_width_cm,95,"[6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 90.0, 91.0, 92.0, 93.0, 95.0, 97.0, 98.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 118.0, nan]"
8,product_category_name,71,"[agro_industry_and_commerce, air_conditioning, art, arts_and_craftmanship, audio, auto, baby, bed_bath_table, books_general_interest, books_imported, books_technical, cds_dvds_musicals, christmas_supplies, cine_photo, computers, computers_accessories, consoles_games, construction_tools_construction, construction_tools_lights, construction_tools_safety, cool_stuff, costruction_tools_garden, costruction_tools_tools, diapers_and_hygiene, drinks, dvds_blu_ray, electronics, fashio_female_clothing, fashion_bags_accessories, fashion_childrens_clothes, fashion_male_clothing, fashion_shoes, fashion_sport, fashion_underwear_beach, fixed_telephony, flowers, food, food_drink, furniture_bedroom, furniture_decor, furniture_living_room, furniture_mattress_and_upholstery, garden_tools, health_beauty, home_appliances, home_appliances_2, home_comfort_2, home_confort, home_construction, housewares, industry_commerce_and_business, kitchen_dining_laundry_garden_furniture, la_cuisine, luggage_accessories, market_place, music, musical_instruments, office_furniture, party_supplies, perfumery, pet_shop, security_and_services, signaling_and_security, small_appliances, small_appliances_home_oven_and_coffee, sports_leisure, stationery, tablets_printing_image, telephony, toys, watches_gifts, nan]"
1,product_name_lenght,66,"[5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 66.0, 67.0, 68.0, 69.0, 72.0, 76.0, nan]"
3,product_photos_qty,19,"[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 17.0, 18.0, 19.0, 20.0, nan]"
