In [10]:
import pandas as pd
import numpy as np
import os
import bisect 
import pandas as pd
from sklearn.model_selection import train_test_split
from pycaret.classification import *

In [59]:
# Load the eight raw tables from the Data/ directory, one DataFrame per CSV,
# in the same order the names are listed.
(
    itens,
    ordens,
    produtos,
    geolocal,
    clientes,
    avaliacoes,
    vendedores,
    pagamentos,
) = map(
    pd.read_csv,
    (
        "Data/itens.csv",
        "Data/ordens.csv",
        "Data/produtos.csv",
        "Data/geolocal.csv",
        "Data/clientes.csv",
        "Data/avaliacoes.csv",
        "Data/vendedores.csv",
        "Data/pagamentos.csv",
    ),
)

## Lendo dados

## Clientes

In [36]:
# Flag repeat rows: 1 when the row's customer_unique_id already appeared on an
# earlier row, 0 for the first occurrence of each id.
seen_before = clientes.duplicated(subset=['customer_unique_id'])
clientes['customer_return'] = seen_before.astype(int)

# Show how many rows fall in each class.
clientes['customer_return'].value_counts()

0    96096
1     3345
Name: customer_return, dtype: int64

In [37]:
# Remove the city and the unique-id columns from the customers table.
clientes.drop(columns=['customer_city', 'customer_unique_id'], inplace=True)

# NOTE(review): rename() silently ignores keys that are not existing column
# names, so this is a no-op unless a "clientes_estado" column is present —
# confirm the intended source column name.
clientes.rename(columns={"clientes_estado": "clientes_code"}, inplace=True)

In [66]:
# Persist the cleaned customers table. index=False keeps the positional row
# index out of the file; written by default, it comes back as an extra
# "Unnamed: 0" column the next time the CSV is read.
clientes.to_csv('clientes.csv', index=False)

## Geolocalização

Este conjunto de dados contém CEPs brasileiros e suas coordenadas lat/lng. 

In [38]:
# Display the (rows, columns) dimensions of the geolocation table.
geolocal.shape

(738332, 6)


Eliminação de colunas indesejadas e duplicatas para manter apenas um par de latitude/longitude por prefixo de código postal.

In [39]:
# Discard the city and state columns, then keep only the first row seen for
# each zip-code prefix and renumber the index from 0.
geolocal = (
    geolocal
    .drop(columns=['geolocation_city', 'geolocation_state'])
    .drop_duplicates(subset=['geolocation_zip_code_prefix'], ignore_index=True)
)


Criar dois novos dataframes (um para clientes, outro para vendedores) para juntar posteriormente com os dados de clientes e de vendedores.

In [40]:
# Column mappings that relabel the zip-code lookup for each side of an order.
customer_geo_columns = {
    "geolocation_zip_code_prefix": "customer_zip_code_prefix",
    "geolocation_lat": "customer_lat",
    "geolocation_lng": "customer_lng",
}
seller_geo_columns = {
    "geolocation_zip_code_prefix": "seller_zip_code_prefix",
    "geolocation_lat": "seller_lat",
    "geolocation_lng": "seller_lng",
}

# Two renamed copies of the same lookup table; geolocal itself is unchanged.
geo_customer = geolocal.rename(columns=customer_geo_columns)
geo_seller = geolocal.rename(columns=seller_geo_columns)

geo_customer.head(3)

Unnamed: 0.1,Unnamed: 0,customer_zip_code_prefix,customer_lat,customer_lng
0,0,1037,-23.545621,-46.639292
1,1,1046,-23.546081,-46.64482
2,3,1041,-23.544392,-46.639499


In [67]:
# Persist the de-duplicated geolocation table. index=False keeps the row index
# out of the file; written by default, it reappears as an extra "Unnamed: 0"
# column whenever the CSV is read back.
geolocal.to_csv('geolocal.csv', index=False)

## Itens

Este conjunto de dados contém informações sobre os produtos em cada pedido. Os preços estão em reais. O valor do frete é calculado de acordo com as medidas e peso de cada item.

In [42]:
# Print the table dimensions, then display the first three rows.
print(itens.shape)
itens.head(3)

(112650, 9)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,0,0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,1,1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,2,2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87


### Avaliação

In [43]:
# Print the table dimensions, then display the first three rows.
print(avaliacoes.shape)
avaliacoes.head(3)

(99224, 6)


Unnamed: 0.1,Unnamed: 0,review_id,order_id,review_score,review_creation_date,review_answer_timestamp
0,0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,2018-01-18 00:00:00,2018-01-18 21:46:59
1,1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,2018-03-10 00:00:00,2018-03-11 03:05:13
2,2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,2018-02-17 00:00:00,2018-02-18 14:36:24


Retira as duplicatas para o mesmo order_id selecionando a avaliação mais recente com base em review_answer_timestamp, pois os dados já estão classificados por hora. Em seguida, retire avaliações duplicadas devido ao problema mencionado acima com pedidos de vários itens e várias avaliações.


In [44]:
# Keep one review per order — the one answered most recently — and then drop
# every review_id that is still attached to more than one order.
#
# keep='last' only selects the most recent review when the rows are ordered by
# answer time, which the raw table does not guarantee, so the rows are sorted
# explicitly first. Missing timestamps are placed first so they never win.
# sort_index() restores the original row order before the index is renumbered.
avaliacoes = (
    avaliacoes
    .sort_values('review_answer_timestamp', key=pd.to_datetime,
                 kind='stable', na_position='first')
    .drop_duplicates(subset=['order_id'], keep='last')
    .drop_duplicates(subset=['review_id'], keep=False)
    .sort_index()
    .reset_index(drop=True)
)

Criar um novo atributo para capturar o tempo (em dias) que cada cliente leva para responder à pesquisa após a sua criação (review_creation_date). Em seguida, remova as colunas de carimbo de data/hora.


In [45]:
# Parse both review timestamps. No explicit `format` is passed: the previous
# '%Y/%m/%d' did not describe the stored values (hyphen-separated, with a time
# part, e.g. '2018-01-18 21:46:59') and is rejected by the strict parser of
# pandas >= 2.0. Letting pandas infer the format works on old and new versions.
avaliacoes['review_answer_timestamp'] = pd.to_datetime(avaliacoes['review_answer_timestamp'])
avaliacoes['review_creation_date'] = pd.to_datetime(avaliacoes['review_creation_date'])

# Whole days elapsed between the creation of the review and the answer.
avaliacoes['review_answer_delay'] = (
    avaliacoes['review_answer_timestamp'] - avaliacoes['review_creation_date']
).dt.days

In [46]:
# Fraction of missing values in each column (missing count / number of rows).
avaliacoes.isna().sum() / len(avaliacoes)

Unnamed: 0                 0.0
review_id                  0.0
order_id                   0.0
review_score               0.0
review_creation_date       0.0
review_answer_timestamp    0.0
review_answer_delay        0.0
dtype: float64

In [47]:
# Preview the first rows of the reviews table.
avaliacoes.head()

Unnamed: 0.1,Unnamed: 0,review_id,order_id,review_score,review_creation_date,review_answer_timestamp,review_answer_delay
0,0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,2018-01-18,2018-01-18 21:46:59,0
1,1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,2018-03-10,2018-03-11 03:05:13,1
2,2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,2018-02-17,2018-02-18 14:36:24,1
3,3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,2017-04-21,2017-04-21 22:02:06,0
4,4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,2018-03-01,2018-03-02 10:26:53,1


In [68]:
# Persist the cleaned reviews table. index=False keeps the row index out of
# the file; written by default, it reappears as an extra "Unnamed: 0" column
# whenever the CSV is read back.
avaliacoes.to_csv('avaliacoes.csv', index=False)

## Ordens

In [50]:
# Print the table dimensions, then display the first three rows.
print(ordens.shape)
ordens.head(3)

(99441, 9)


Unnamed: 0.1,Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18
1,1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13
2,2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04


In [60]:
# Parse the three order dates and truncate them to midnight so differences are
# whole calendar days. No explicit `format` is passed: the previous '%Y/%m/%d'
# did not describe the hyphen-separated values (e.g. '2017-10-02 10:56:33')
# and is rejected by the strict parser of pandas >= 2.0.
_order_days = {
    _col: pd.to_datetime(ordens[_col]).dt.normalize()
    for _col in ('order_purchase_timestamp',
                 'order_delivered_customer_date',
                 'order_estimated_delivery_date')
}

# Days from purchase to the estimated / actual delivery, stored as floats
# (NaN where a date is missing). `.dt.days` replaces
# `.astype('timedelta64[D]')`, which pandas >= 2.0 no longer supports.
ordens['data_estimada_entrega'] = (
    _order_days['order_estimated_delivery_date'] - _order_days['order_purchase_timestamp']
).dt.days.astype(float)
ordens['data_real_entrega'] = (
    _order_days['order_delivered_customer_date'] - _order_days['order_purchase_timestamp']
).dt.days.astype(float)

# Keep the date columns as plain calendar dates, as before.
for _col, _days in _order_days.items():
    ordens[_col] = _days.dt.date

In [61]:
# Calendar year of the purchase. The '%Y/%m/%d' format was dropped because it
# does not describe the hyphen-separated purchase dates; pandas >= 2.0 raises
# on such a mismatch when the column still holds strings.
ordens['year'] = pd.to_datetime(ordens['order_purchase_timestamp']).dt.year

In [62]:
# Discard orders whose status is 'canceled' or 'unavailable'.
excluded_status = ['canceled', 'unavailable']
is_kept = ~ordens['order_status'].isin(excluded_status)

# Timestamp columns that are removed from the table at this point.
cols_to_drop = [
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

ordens = ordens.loc[is_kept].drop(columns=cols_to_drop)
ordens.head(3)

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,data_estimada_entrega,data_real_entrega,year
0,0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02,16.0,8.0,2017
1,1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24,20.0,14.0,2018
2,2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08,27.0,9.0,2018


In [69]:
# Persist the cleaned orders table. index=False keeps the row index (only the
# positional labels left over after filtering) out of the file; written by
# default, it reappears as an extra "Unnamed: 0" column on the next read.
ordens.to_csv('ordens.csv', index=False)

## Produto

In [64]:
# Print the table dimensions, then display the first three rows.
print(produtos.shape)
produtos.head(3)

(32340, 7)


Unnamed: 0.1,Unnamed: 0,product_id,product_category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,225.0,16.0,10.0,14.0
1,1,3aa071139cb16b67ca9e5dea641aaa2f,artes,1000.0,30.0,18.0,20.0
2,2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,154.0,18.0,9.0,15.0


In [65]:
# Number of missing values in each column.
produtos.isna().sum()

Unnamed: 0               0
product_id               0
product_category_name    0
product_weight_g         0
product_length_cm        0
product_height_cm        0
product_width_cm         0
dtype: int64

In [None]:
# Replace any missing category name with the placeholder 'not_reported'.
category_col = 'product_category_name'
produtos[category_col] = produtos[category_col].fillna(value='not_reported')

produtos.head(n=3)
