In [1]:
%load_ext sql
import urllib.parse

In [4]:
host = 'localhost'
user = 'postgres'
dbname = 'olist'
password = urllib.parse.quote_from_bytes('CHiheb 10'.encode())
conn_string = f"postgresql://{user}:{password}@{host}/{dbname}"

In [5]:
%sql $conn_string

#  Exploration

In [6]:
%%sql 
select * from products limit 10;

 * postgresql://postgres:***@localhost/olist
10 rows affected.


product_id,product_category,product_name_length,product_desc_length,product_photos_qty,product_weight_grams,product_length_cm,product_height_cm,product_width_cm
1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40,287,1,225,16,10,14
3aa071139cb16b67ca9e5dea641aaa2f,artes,44,276,1,1000,30,18,20
96bd76ec8810374ed1b65e291975717f,esporte_lazer,46,250,1,154,18,9,15
cef67bcfe19066a932b7673e239eb23d,bebes,27,261,1,371,26,4,26
9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37,402,4,625,20,17,13
41d3672d4792049fa1779bb35283ed13,instrumentos_musicais,60,745,1,200,38,5,11
732bd381ad09e530fe0a5f457d81becb,cool_stuff,56,1272,4,18350,70,24,44
2548af3e6e77a690cf3eb6368e9ab61e,moveis_decoracao,56,184,2,900,40,8,40
37cc742be07708b53a98702e77a21a02,eletrodomesticos,57,163,1,400,27,13,17
8c92109888e8cdf9d66dc7e463025574,brinquedos,36,1156,1,600,17,10,12


In [24]:
%%sql
select count(*) as total_products,
        sum(case when product_name_length is null then 1 else 0 end) as nbr_null_name,
        ((sum(case when product_name_length is null then 1 else 0 end)::real/count(*))*100)::real as percentage_null_value
from products;

 * postgresql://postgres:***@localhost/olist
1 rows affected.


total_products,nbr_null_name,percentage_null_value
32951,610,1.8512336


# Average length of name and description by category


In [28]:
%%sql 

select product_category,avg(product_name_length)::int as avg_name_length , avg(product_desc_length)::int as avg_desc_length
from products 
where product_category is not null
group by product_category
order by avg_name_length asc;

 * postgresql://postgres:***@localhost/olist
73 rows affected.


product_category,avg_name_length,avg_desc_length
fashion_roupa_masculina,40,627
fashion_roupa_feminina,40,639
livros_interesse_geral,40,989
livros_tecnicos,42,1352
moveis_colchao_e_estofado,43,1111
market_place,44,829
artes_e_artesanato,44,620
artigos_de_natal,44,412
consoles_games,44,850
bebidas,44,1048


#  Average weight, length, height, width by  product category


In [35]:
%%sql

select product_category, ROUND((avg(product_weight_grams)/1000)::decimal,3) as avg_weight_kg, avg(product_length_cm)::real as avg_length_cm, avg(product_height_cm)::real as avg_height_cm, avg(product_width_cm)::real as avg_width_cm
from products
where product_category is not null
group by product_category;

 * postgresql://postgres:***@localhost/olist
73 rows affected.


product_category,avg_weight_kg,avg_length_cm,avg_height_cm,avg_width_cm
climatizacao,4.46,36.467743,23.887096,26.088709
livros_importados,0.597,29.741936,3.451613,21.225807
artigos_de_natal,1.85,28.23077,16.215385,22.830769
livros_tecnicos,1.108,27.325203,5.869919,18.463415
ferramentas_jardim,3.104,30.936255,19.204515,23.504648
cine_foto,0.796,27.642857,11.571428,18.178572
dvds_blu_ray,0.382,21.270834,4.4166665,14.875
fashion_roupa_feminina,0.572,23.296297,11.481482,18.333334
beleza_saude,1.435,23.800737,15.712357,17.997545
livros_interesse_geral,0.747,23.481482,9.773149,19.328703


# Average Volume of Box for each Product Category


In [48]:
%%sql 

with products_volume
as 
( select product_category, avg(volume)::real as avg_volume
 from
    (select product_category, (product_length_cm::real *product_height_cm::real *product_width_cm::real ) as volume
    from products 
    where product_category is not null) as t
group by product_category
)
select product_category, avg_volume, rank() over(order by avg_volume desc) as rank
from products_volume;

 * postgresql://postgres:***@localhost/olist
73 rows affected.


product_category,avg_volume,rank
moveis_colchao_e_estofado,77244.3,1
moveis_escritorio,75468.47,2
moveis_cozinha_area_de_servico_jantar_e_jardim,69406.09,3
eletrodomesticos_2,55476.312,4
moveis_sala,54486.13,5
moveis_quarto,51038.844,6
pcs,44635.168,7
agro_industria_e_comercio,37604.23,8
industria_comercio_e_negocios,37372.31,9
malas_acessorios,32950.336,10


# Correlation between freight value and product weight, length, height, volume and price



In [63]:
%%sql 

with products_data
as 
(
    select oi.freight_value,
           oi.price,
            product_weight_grams,
            product_height_cm,
            product_width_cm,
            product_length_cm,
            (product_height_cm::real * product_width_cm::real * product_length_cm::real) as volume
    from order_items oi
    join products p using(product_id)    
)
select corr(freight_value,product_length_cm)::real as corr_length,
        corr(freight_value,product_width_cm)::real as corr_width,
        corr(freight_value,product_height_cm)::real as corr_height,
        corr(freight_value,volume) as corr_volume,
        corr(freight_value,product_weight_grams)::real as corr_weight,
        corr(freight_value,price) as worr_price
from products_data;

 * postgresql://postgres:***@localhost/olist
1 rows affected.


corr_length,corr_width,corr_height,corr_volume,corr_weight,worr_price
0.30908597,0.32377744,0.39183104,0.5872700708999319,0.6104202,0.4142043091015094


# Linear Relationship between freight value and product weight



In [71]:
%%sql

with products_data
as (
        select oi.freight_value, (product_weight_grams/1000)::real as weight
        from products p 
        join order_items oi using(product_id)
          
)
select regr_slope(freight_value,weight)::real as slope_reg , regr_intercept(freight_value,weight)::real as intercept_reg
from products_data;

 * postgresql://postgres:***@localhost/olist
1 rows affected.


slope_reg,intercept_reg
2.575215,15.649705
