# Imports, leitura e estruturação dos dados

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

Neste notebook vou resolver um problema de previsão de perda de clientes, portanto o primeiro passo para tal é entender o negócio, ou seja, compreender fatores que levam o cliente a abandonar a empresa. Portanto, começarei pensando em dados relevantes encontrados no [dataset da olist no kaggle](https://www.kaggle.com/datasets/olistbr/brazilian-ecommerce), criarei minha ABT (analytical base table), farei a análise exploratória de dados e criarei meu modelo.

Variáveis que vou usar:
* olist_order_items_dataset(order_id, seller_id, product_id, price, *freight_value) *talvez vendedores com fretes mais altos tendem a abandonar a empresa
* olist_order_reviews_dataset(review_score, review_creation_date, review_answer_timestamp). Pretendo criar uma variável representando o tempo para resposta e outra indicando se houve resposta ou não.
* olist_orders_dataset(order_status, order_approved_at)
* olist_products_dataset(product_category_name). Pretendo criar variáveis, como variedade de produtos de um vendedor.

Além disso, manipularei esses dados para criar variáveis que segmentam os clientes através da classificação RFV (recência, frequência e valor), como quantidade de produtos vendidos, periodicidade do vendedor, ticket médio, etc.

In [2]:
FILE_PATH = os.path.abspath("__file__")
PROJECT_PATH = os.path.dirname(FILE_PATH)
DATA_PATH = os.path.join(PROJECT_PATH, 'data')

In [3]:
df_oi = pd.read_csv(os.path.join(DATA_PATH,'olist_order_items_dataset.csv'))
df_or = pd.read_csv(os.path.join(DATA_PATH,'olist_order_reviews_dataset.csv'))
df_orders = pd.read_csv(os.path.join(DATA_PATH,'olist_orders_dataset.csv'))
df_products = pd.read_csv(os.path.join(DATA_PATH,'olist_products_dataset.csv'))

In [76]:
df = pd.concat([df_oi, df_or, df_orders, df_products], axis=1)
df.head()

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,review_id,order_id.1,review_score,...,order_estimated_delivery_date,product_id.1,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4.0,...,2017-10-18 00:00:00,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5.0,...,2018-08-13 00:00:00,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5.0,...,2018-09-04 00:00:00,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5.0,...,2017-12-15 00:00:00,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5.0,...,2018-02-26 00:00:00,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0


In [77]:
COLS_TO_USE = ['order_id', 'seller_id', 'product_id', 'price', 'freight_value','review_score', 'review_creation_date',
               'review_answer_timestamp', 'order_status', 'order_delivered_carrier_date', 'order_delivered_customer_date', 
               'order_estimated_delivery_date','product_category_name']
df = df.loc[:,~df.columns.duplicated()][COLS_TO_USE]
df.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name
0,00010242fe8c5a6d1ba2dd792cb16214,48436dade18ac8b2bce089ec2a041202,4244733e06e7ecb4970a6e2683c13e61,58.9,13.29,4.0,2018-01-18 00:00:00,2018-01-18 21:46:59,delivered,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,perfumaria
1,00018f77f2f0320c557190d7a144bdd3,dd7ddc04e1b6c2c614352b383efe2d36,e5f2d52b802189ee658865ca93d83a8f,239.9,19.93,5.0,2018-03-10 00:00:00,2018-03-11 03:05:13,delivered,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,artes
2,000229ec398224ef6ca0657da4fc703e,5b51032eddd242adc84c38acab88f23d,c777355d18b72b67abbeef9df44fd0fd,199.0,17.87,5.0,2018-02-17 00:00:00,2018-02-18 14:36:24,delivered,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,esporte_lazer
3,00024acbcdf0a6daa1e931b038114c75,9d7a1d34a5052409006425275ba1c2b4,7634da152a4610f1595efa32f14722fc,12.99,12.79,5.0,2017-04-21 00:00:00,2017-04-21 22:02:06,delivered,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,bebes
4,00042b26cf59d7ce69dfabb4e55b4fd9,df560393f3a51e74553ab94004ba5c87,ac6c3623068f30de03045865e4e10089,199.9,18.14,5.0,2018-03-01 00:00:00,2018-03-02 10:26:53,delivered,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,utilidades_domesticas


In [78]:
DATE_COLS = ['review_creation_date', 'review_answer_timestamp', 'order_delivered_carrier_date', 'order_delivered_customer_date', 
             'order_estimated_delivery_date']
df[DATE_COLS] = pd.to_datetime(df[DATE_COLS].stack()).unstack()
df.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name
0,00010242fe8c5a6d1ba2dd792cb16214,48436dade18ac8b2bce089ec2a041202,4244733e06e7ecb4970a6e2683c13e61,58.9,13.29,4.0,2018-01-18,2018-01-18 21:46:59,delivered,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,perfumaria
1,00018f77f2f0320c557190d7a144bdd3,dd7ddc04e1b6c2c614352b383efe2d36,e5f2d52b802189ee658865ca93d83a8f,239.9,19.93,5.0,2018-03-10,2018-03-11 03:05:13,delivered,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,artes
2,000229ec398224ef6ca0657da4fc703e,5b51032eddd242adc84c38acab88f23d,c777355d18b72b67abbeef9df44fd0fd,199.0,17.87,5.0,2018-02-17,2018-02-18 14:36:24,delivered,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,esporte_lazer
3,00024acbcdf0a6daa1e931b038114c75,9d7a1d34a5052409006425275ba1c2b4,7634da152a4610f1595efa32f14722fc,12.99,12.79,5.0,2017-04-21,2017-04-21 22:02:06,delivered,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,bebes
4,00042b26cf59d7ce69dfabb4e55b4fd9,df560393f3a51e74553ab94004ba5c87,ac6c3623068f30de03045865e4e10089,199.9,18.14,5.0,2018-03-01,2018-03-02 10:26:53,delivered,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,utilidades_domesticas


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112650 entries, 0 to 112649
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   order_id                 112650 non-null  object        
 1   seller_id                112650 non-null  object        
 2   product_id               112650 non-null  object        
 3   price                    112650 non-null  float64       
 4   freight_value            112650 non-null  float64       
 5   review_score             99224 non-null   float64       
 6   review_creation_date     99224 non-null   datetime64[ns]
 7   review_answer_timestamp  99224 non-null   datetime64[ns]
 8   order_status             99441 non-null   object        
 9   order_approved_at        99281 non-null   datetime64[ns]
 10  product_category_name    32341 non-null   object        
dtypes: datetime64[ns](3), float64(3), object(5)
memory usage: 14.3+ MB


In [8]:
df['order_status'].value_counts()

delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: order_status, dtype: int64

In [9]:
df = df.drop(df.loc[df['order_status'] != 'delivered'].index)

In [10]:
df['order_status'].value_counts()

delivered    96478
Name: order_status, dtype: int64

Próximos passos:
* Criar as novas variáveis.
    * quantidade de vendas
    * dias sem vender
    * frequência de vendas (quantidade de dias que um vendedor leva para vender novamente)
    * total de dinheiro em vendas
    * preço médio por venda
    * média de vendas mensais
    * média de vendas nos meses em que o vendedor estava ativo
    * etc.

# Criação de novas variáveis

## Quantidade de vendas de cada vendedor

In [89]:
df['qtd_vendas'] = df.groupby('seller_id')['order_id'].transform('count')
df.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas
0,00010242fe8c5a6d1ba2dd792cb16214,48436dade18ac8b2bce089ec2a041202,4244733e06e7ecb4970a6e2683c13e61,58.9,13.29,4.0,2018-01-18,2018-01-18 21:46:59,delivered,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,perfumaria,151
1,00018f77f2f0320c557190d7a144bdd3,dd7ddc04e1b6c2c614352b383efe2d36,e5f2d52b802189ee658865ca93d83a8f,239.9,19.93,5.0,2018-03-10,2018-03-11 03:05:13,delivered,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,artes,143
2,000229ec398224ef6ca0657da4fc703e,5b51032eddd242adc84c38acab88f23d,c777355d18b72b67abbeef9df44fd0fd,199.0,17.87,5.0,2018-02-17,2018-02-18 14:36:24,delivered,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,esporte_lazer,14
3,00024acbcdf0a6daa1e931b038114c75,9d7a1d34a5052409006425275ba1c2b4,7634da152a4610f1595efa32f14722fc,12.99,12.79,5.0,2017-04-21,2017-04-21 22:02:06,delivered,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,bebes,16
4,00042b26cf59d7ce69dfabb4e55b4fd9,df560393f3a51e74553ab94004ba5c87,ac6c3623068f30de03045865e4e10089,199.9,18.14,5.0,2018-03-01,2018-03-02 10:26:53,delivered,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,utilidades_domesticas,29


In [90]:
df.loc[df['seller_id'] == '001cca7ae9ae17fb1caed9dfb1094831'].head(1)

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas
176,006e43460a55bc60c0a437521e426529,001cca7ae9ae17fb1caed9dfb1094831,08574b074924071f4e201e151b152b4e,99.0,43.06,4.0,2017-07-25,2017-08-02 18:15:52,delivered,2018-04-07 00:49:39,2018-04-16 23:35:26,2018-05-02,livros_interesse_geral,239


## Dias sem vender

In [91]:
date = df.sort_values(by='order_delivered_carrier_date', 
                      ascending=False)['order_delivered_carrier_date'].iloc[0] - pd.DateOffset(months=3)
date

Timestamp('2018-06-11 19:48:28')

In [92]:
df2 = df[df['order_delivered_carrier_date'] < date].copy()
df2 = df2.sort_values(by=['seller_id', 'order_delivered_carrier_date'])
df2.tail()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas
83803,be580c71d638ce682e9792c27c7aacb2,ffff564a4f9085cd26170f4732393726,c5897f6f2d995196dbb40542439da9b9,34.65,11.74,5.0,2017-06-02,2017-06-09 16:52:23,delivered,2018-01-10 14:20:28,2018-01-23 22:19:33,2018-03-07,,20
91715,d01c5b46e00bd214519fe9f64bbb2649,ffff564a4f9085cd26170f4732393726,96aca2f53bcaed6f466449f7fb18ae75,79.0,14.72,4.0,2017-05-13,2017-05-14 19:59:39,delivered,2018-02-07 18:17:11,2018-02-16 12:59:02,2018-03-02,,20
53977,7ab9c55c59eaeea579d047e2d8aaed81,ffff564a4f9085cd26170f4732393726,c4b925e40f11289063a854c47aaef129,11.5,10.96,5.0,2018-08-22,2018-08-22 21:45:42,delivered,2018-03-12 14:36:52,2018-04-26 19:37:26,2018-03-28,,20
42778,616b813dbea8acc9de0ca0380cd89b83,ffff564a4f9085cd26170f4732393726,dbd024d4182504993ad1e3cd2ee9d9e9,29.4,16.05,3.0,2018-02-21,2018-02-23 23:24:50,delivered,2018-03-21 01:58:24,2018-04-16 21:37:53,2018-04-10,,20
93638,d437ec1ece70f3e35d2695adfeb8a272,ffff564a4f9085cd26170f4732393726,8f7a3322e1abfed89ac080b0f7364779,52.5,18.96,5.0,2018-02-06,2018-02-07 12:22:50,delivered,2018-04-10 00:02:22,2018-04-23 22:38:42,2018-04-27,,20


In [93]:
df2['days_without_sell'] = df2.groupby(['seller_id'])['order_delivered_carrier_date'].diff()
df2['days_without_sell'] = df2['days_without_sell'].dt.days
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-07-28,2018-07-30 23:59:38,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,2018-06-20,2018-06-23 12:33:50,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-06-19,2018-06-21 01:37:12,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,2017-12-13,2017-12-13 09:11:13,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,2017-12-12,2017-12-18 04:28:50,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0


## Média de dias sem vender (dias para vender)

In [94]:
df2['days_to_sell'] = df2.groupby('seller_id')['days_without_sell'].transform('mean')
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell,days_to_sell
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-07-28,2018-07-30 23:59:38,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,,256.0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,2018-06-20,2018-06-23 12:33:50,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0,256.0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-06-19,2018-06-21 01:37:12,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0,256.0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,2017-12-13,2017-12-13 09:11:13,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,,3.301887
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,2017-12-12,2017-12-18 04:28:50,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0,3.301887


## Total de receitas em vendas

In [95]:
df2['income'] = df2.groupby('seller_id')['price'].transform('sum')
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell,days_to_sell,income
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-07-28,2018-07-30 23:59:38,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,,256.0,2685.0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,2018-06-20,2018-06-23 12:33:50,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0,256.0,2685.0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-06-19,2018-06-21 01:37:12,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0,256.0,2685.0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,2017-12-13,2017-12-13 09:11:13,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,,3.301887,16859.16
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,2017-12-12,2017-12-18 04:28:50,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0,3.301887,16859.16


## Ticket médio

In [96]:
df2['average_ticket'] = df2.income/df2.qtd_vendas
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell,days_to_sell,income,average_ticket
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-07-28,2018-07-30 23:59:38,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,,256.0,2685.0,895.0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,2018-06-20,2018-06-23 12:33:50,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0,256.0,2685.0,895.0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-06-19,2018-06-21 01:37:12,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0,256.0,2685.0,895.0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,2017-12-13,2017-12-13 09:11:13,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,,3.301887,16859.16,70.540418
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,2017-12-12,2017-12-18 04:28:50,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0,3.301887,16859.16,70.540418


## Média de receitas de vendas por mês

In [97]:
df2['avg_month_income'] = (df2['income']/
                         round(((df2['order_delivered_carrier_date'].iloc[-1] - df2['order_delivered_carrier_date'].
                                 iloc[0])/np.timedelta64(1, 'M'))))
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,review_creation_date,review_answer_timestamp,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell,days_to_sell,income,average_ticket,avg_month_income
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-07-28,2018-07-30 23:59:38,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,,256.0,2685.0,895.0,149.166667
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,2018-06-20,2018-06-23 12:33:50,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0,256.0,2685.0,895.0,149.166667
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,2018-06-19,2018-06-21 01:37:12,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0,256.0,2685.0,895.0,149.166667
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,2017-12-13,2017-12-13 09:11:13,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,,3.301887,16859.16,70.540418,936.62
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,2017-12-12,2017-12-18 04:28:50,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0,3.301887,16859.16,70.540418,936.62


## Tempo para responder

In [98]:
df2['time_to_answer'] = (df2['review_answer_timestamp'] - df2['review_creation_date']).dt.days
df2 = df2.drop(columns=['review_answer_timestamp', 'review_creation_date'])
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell,days_to_sell,income,average_ticket,avg_month_income,time_to_answer
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,,256.0,2685.0,895.0,149.166667,2.0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0,256.0,2685.0,895.0,149.166667,3.0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0,256.0,2685.0,895.0,149.166667,2.0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,,3.301887,16859.16,70.540418,936.62,0.0
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0,3.301887,16859.16,70.540418,936.62,6.0


## Vendas por mês

In [99]:
df2['avg_month_sell'] = (df2['qtd_vendas']/
                         round(((df2['order_delivered_carrier_date'].iloc[-1] - df2['order_delivered_carrier_date'].
                                 iloc[0])/np.timedelta64(1, 'M'))))
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell,days_to_sell,income,average_ticket,avg_month_income,time_to_answer,avg_month_sell
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,,256.0,2685.0,895.0,149.166667,2.0,0.166667
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0,256.0,2685.0,895.0,149.166667,3.0,0.166667
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0,256.0,2685.0,895.0,149.166667,2.0,0.166667
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,,3.301887,16859.16,70.540418,936.62,0.0,13.277778
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0,3.301887,16859.16,70.540418,936.62,6.0,13.277778


## Idade em meses

In [101]:
df2['age'] = round(((df2['order_delivered_carrier_date'].iloc[-1] - df2.groupby('seller_id')['order_delivered_carrier_date']
                     .transform('min'))/np.timedelta64(1, 'M')))
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,product_category_name,qtd_vendas,days_without_sell,days_to_sell,income,average_ticket,avg_month_income,time_to_answer,avg_month_sell,age
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,,3,,256.0,2685.0,895.0,149.166667,2.0,0.166667,18.0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,,3,465.0,256.0,2685.0,895.0,149.166667,3.0,0.166667,18.0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,,3,47.0,256.0,2685.0,895.0,149.166667,2.0,0.166667,18.0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,,239,,3.301887,16859.16,70.540418,936.62,0.0,13.277778,18.0
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,informatica_acessorios,239,13.0,3.301887,16859.16,70.540418,936.62,6.0,13.277778,18.0


## Média de frete

In [102]:
df2['avg_freight'] = df2.groupby('seller_id')['freight_value'].transform('mean')
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,qtd_vendas,days_without_sell,days_to_sell,income,average_ticket,avg_month_income,time_to_answer,avg_month_sell,age,avg_freight
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,...,3,,256.0,2685.0,895.0,149.166667,2.0,0.166667,18.0,21.02
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,...,3,465.0,256.0,2685.0,895.0,149.166667,3.0,0.166667,18.0,21.02
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,...,3,47.0,256.0,2685.0,895.0,149.166667,2.0,0.166667,18.0,21.02
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,...,239,,3.301887,16859.16,70.540418,936.62,0.0,13.277778,18.0,37.369125
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,...,239,13.0,3.301887,16859.16,70.540418,936.62,6.0,13.277778,18.0,37.369125


## Quantidade de categorias distintas

In [103]:
df2['distinct_cat'] = df2.groupby('seller_id')['product_category_name'].transform('count')
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,days_without_sell,days_to_sell,income,average_ticket,avg_month_income,time_to_answer,avg_month_sell,age,avg_freight,distinct_cat
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,...,,256.0,2685.0,895.0,149.166667,2.0,0.166667,18.0,21.02,0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,...,465.0,256.0,2685.0,895.0,149.166667,3.0,0.166667,18.0,21.02,0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,...,47.0,256.0,2685.0,895.0,149.166667,2.0,0.166667,18.0,21.02,0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,...,,3.301887,16859.16,70.540418,936.62,0.0,13.277778,18.0,37.369125,62
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,...,13.0,3.301887,16859.16,70.540418,936.62,6.0,13.277778,18.0,37.369125,62


## Prazo estimado

In [112]:
df2['avg_estimated_time'] = (df2['order_estimated_delivery_date'] - df2['order_delivered_carrier_date']).dt.days
df2['avg_estimated_time'] = df2.groupby('seller_id')['avg_estimated_time'].transform('mean')
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,time_to_answer,avg_month_sell,age,avg_freight,distinct_cat,estimated_time,real_delevery_time,avg_real_delevery_time,avg_real_delivery_time,avg_estimated_time
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,...,2.0,0.166667,18.0,21.02,0,56,8.0,8.0,8.0,29.333333
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,...,3.0,0.166667,18.0,21.02,0,13,3.0,3.0,8.0,29.333333
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,...,2.0,0.166667,18.0,21.02,0,19,13.0,13.0,8.0,29.333333
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,...,0.0,13.277778,18.0,37.369125,62,62,16.0,16.0,9.221519,21.53125
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,...,6.0,13.277778,18.0,37.369125,62,29,1.0,1.0,9.221519,21.53125


## Prazo real

In [111]:
df2['avg_real_delivery_time'] = (df2['order_delivered_customer_date'] - df2['order_delivered_carrier_date']).dt.days
df2['avg_real_delivery_time'] = df2.groupby('seller_id')['avg_real_delivery_time'].transform('mean')
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,avg_month_income,time_to_answer,avg_month_sell,age,avg_freight,distinct_cat,estimated_time,real_delevery_time,avg_real_delevery_time,avg_real_delivery_time
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,...,149.166667,2.0,0.166667,18.0,21.02,0,56,8.0,8.0,8.0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,...,149.166667,3.0,0.166667,18.0,21.02,0,13,3.0,3.0,8.0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,...,149.166667,2.0,0.166667,18.0,21.02,0,19,13.0,13.0,8.0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,...,936.62,0.0,13.277778,18.0,37.369125,62,62,16.0,16.0,9.221519
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,...,936.62,6.0,13.277778,18.0,37.369125,62,29,1.0,1.0,9.221519


## Atrasa?

In [116]:
df2['islate'] = (df2.avg_estimated_time - df2.avg_real_delivery_time).map(lambda x: 1 if x < 0 else 0)
df2.head()

Unnamed: 0,order_id,seller_id,product_id,price,freight_value,review_score,order_status,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,...,avg_month_sell,age,avg_freight,distinct_cat,estimated_time,real_delevery_time,avg_real_delevery_time,avg_real_delivery_time,avg_estimated_time,islate
93696,d455a8cb295653b55abda06d434ab492,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2016-10-18 15:53:51,2016-10-26 16:35:46,2016-12-14,...,0.166667,18.0,21.02,0,56,8.0,8.0,8.0,29.333333,0
69082,9dc8d1a6f16f1b89874c29c9d8d30447,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,3.0,delivered,2018-01-26 17:49:06,2018-01-29 21:19:11,2018-02-09,...,0.166667,18.0,21.02,0,13,3.0,3.0,8.0,29.333333,0
55943,7f39ba4c9052be115350065d07583cac,0015a82c2db000af6aaaf3ae2ecb0532,a2ff5a97bf95719e38ea2e3b4105bce8,895.0,21.02,5.0,delivered,2018-03-14 18:14:50,2018-03-27 20:38:17,2018-04-03,...,0.166667,18.0,21.02,0,19,13.0,13.0,8.0,29.333333,0
62827,8f7176f38b6ac3f5e924b9b05716440c,001cca7ae9ae17fb1caed9dfb1094831,98a8c2fa16d7239c606640f5555768e4,109.0,44.84,5.0,delivered,2016-10-12 11:02:41,2016-10-28 14:11:26,2016-12-14,...,13.277778,18.0,37.369125,62,62,16.0,16.0,9.221519,21.53125,0
32068,48ac23662de1f4a94e29e7f3452a85d9,001cca7ae9ae17fb1caed9dfb1094831,0da9ffd92214425d880de3f94e74ce39,112.0,46.08,5.0,delivered,2016-10-25 14:16:48,2016-10-26 16:41:21,2016-11-24,...,13.277778,18.0,37.369125,62,29,1.0,1.0,9.221519,21.53125,0
