# Regras de associação - Case Pizzaria

## Importação das bibliotecas

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

## Importação do Dataset

In [2]:
## Read the pizzeria orders from the CSV file into the working DataFrame
df = pd.read_csv(filepath_or_buffer='pizzaria.csv')

## Preview the first ten orders
df.head(n=10)

Unnamed: 0,data_pedido,hora_pedido,tipo_entrega,valor_borda,valor_refrigerante,valor_total,tempo
0,2011-10-07,18:45:00,Buscar,0.0,0.0,32.0,00:30:00
1,2011-10-07,18:47:00,Buscar,0.0,0.0,30.0,00:43:00
2,2011-10-07,18:49:00,Buscar,0.0,0.0,27.0,00:42:00
3,2011-10-07,18:50:00,Buscar,0.0,0.0,30.0,00:40:00
4,2011-10-07,18:52:00,Buscar,0.0,0.0,30.0,00:23:00
5,2011-10-07,18:57:00,Entrega,0.0,4.5,30.0,00:38:00
6,2011-10-07,18:59:00,Buscar,2.5,0.0,30.5,00:21:00
7,2011-10-07,20:00:00,Entrega,0.0,4.5,35.0,00:32:00
8,2011-10-07,20:05:00,Buscar,0.0,0.0,23.0,00:35:00
9,2011-10-07,20:10:00,Entrega,0.0,0.0,25.5,00:55:00


## Exploração do dataset

In [3]:
## Show the dtype pandas assigned to each column when the CSV was read
df.dtypes

data_pedido            object
hora_pedido            object
tipo_entrega           object
valor_borda           float64
valor_refrigerante    float64
valor_total           float64
tempo                  object
dtype: object

In [4]:
## Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

Unnamed: 0,valor_borda,valor_refrigerante,valor_total
count,1000.0,1000.0,1000.0
mean,0.2765,0.741,28.1833
std,0.801524,1.692124,5.767485
min,0.0,0.0,10.0
25%,0.0,0.0,23.5
50%,0.0,0.0,29.0
75%,0.0,0.0,32.5
max,3.0,9.0,43.0


In [5]:
## Number of orders per order date, most frequent date first
df['data_pedido'].value_counts()

2011-11-18    50
2011-11-19    39
2011-11-26    38
2011-10-22    36
2011-11-16    35
2011-12-03    33
2011-11-12    29
2011-10-08    27
2011-10-29    25
2011-12-02    25
2011-10-28    24
2011-11-05    23
2011-11-13    23
2011-12-09    23
2011-12-04    23
2011-10-07    23
2011-10-15    22
2011-11-27    22
2011-12-10    20
2011-11-02    20
2011-10-30    20
2011-12-06    20
2011-12-08    18
2011-12-07    18
2011-11-04    18
2011-11-24    18
2011-11-20    18
2011-10-12    17
2011-11-25    16
2011-10-21    15
2011-10-14    15
2011-11-15    15
2011-10-23    15
2011-10-09    14
2011-12-01    14
2011-10-27    14
2011-11-11    14
2011-11-01    13
2011-11-29    13
2011-11-10    12
2011-11-06    11
2011-10-20    11
2011-11-22    10
2011-10-11    10
2011-11-03     8
2011-10-25     8
2011-11-08     8
2011-10-26     8
2011-10-19     8
2011-11-17     7
2011-11-09     7
2011-10-13     6
2011-11-30     6
2011-11-23     6
2011-10-18     5
2011-10-16     4
Name: data_pedido, dtype: int64

In [6]:
## Count the missing values in each column
df.isna().sum()

data_pedido           0
hora_pedido           0
tipo_entrega          0
valor_borda           0
valor_refrigerante    0
valor_total           0
tempo                 0
dtype: int64

## Formatando as variáveis

### Formatando 'data_pedido'

In [7]:
## Convert 'data_pedido' from string to datetime64[ns].
## pd.to_datetime is used instead of astype('datetime64') because the unit-less
## 'datetime64' dtype is rejected with a TypeError by pandas >= 2.0.
## The dates in the file are written as YYYY-MM-DD, so the format is given explicitly.
df['data_pedido'] = pd.to_datetime(df['data_pedido'], format='%Y-%m-%d')

In [8]:
## Add a 'mes' column holding the month number taken from the order date
df = df.assign(mes=df['data_pedido'].dt.month)

## Look at three random rows to check the new column
df.sample(n=3)

Unnamed: 0,data_pedido,hora_pedido,tipo_entrega,valor_borda,valor_refrigerante,valor_total,tempo,mes
849,2011-12-03,18:35:00,Entrega,0.0,0.0,24.5,00:45:00,12
737,2011-11-26,19:28:00,Entrega,3.0,2.5,41.5,00:52:00,11
665,2011-11-20,20:29:00,Entrega,0.0,0.0,30.5,00:35:00,11


In [9]:
## dt.day_name() returns the weekday names in English, so this dictionary
## translates them to Portuguese labels.
## All seven days are listed: .map() turns any key missing from the dict into NaN,
## so leaving a day out would silently blank the weekday of those orders.
dia_da_semana_pt = {
    'Monday': 'segunda',
    'Tuesday': 'terca',
    'Wednesday': 'quarta',
    'Thursday': 'quinta',
    'Friday': 'sexta',
    'Saturday': 'sabado',
    'Sunday': 'domingo',
}

## Create the 'dia_semana' column: weekday name of the order date, translated
df['dia_semana'] = df['data_pedido'].dt.day_name().map(dia_da_semana_pt)

## Inspect a random sample of rows
df.sample(5)

Unnamed: 0,data_pedido,hora_pedido,tipo_entrega,valor_borda,valor_refrigerante,valor_total,tempo,mes,dia_semana
565,2011-11-17,20:23:00,Entrega,0.0,0.0,23.5,00:37:00,11,quinta
856,2011-12-03,20:13:00,Buscar,2.5,0.0,32.5,00:47:00,12,sabado
344,2011-11-02,19:36:00,Buscar,0.0,0.0,30.0,00:29:00,11,quarta
987,2011-12-10,19:57:00,Buscar,0.0,0.0,32.0,00:53:00,12,sabado
870,2011-12-03,21:44:00,Entrega,0.0,0.0,34.5,00:46:00,12,sabado


### Formatando a variável 'hora_pedido'

In [10]:
## Number of orders per exact order time (HH:MM:SS), most frequent first
df['hora_pedido'].value_counts()

18:00:00    17
20:45:00    15
20:25:00    14
21:10:00    14
20:20:00    13
            ..
19:27:00     1
22:46:00     1
23:03:00     1
22:41:00     1
19:41:00     1
Name: hora_pedido, Length: 279, dtype: int64

In [11]:
## Find out which hours of the day appear in the data

## The first two characters of 'hora_pedido' (HH:MM:SS) are the hour
df['horario_pico'] = df['hora_pedido'].str[:2]

## Number of orders placed in each hour
horarios_pico = df['horario_pico'].value_counts()

In [12]:
## Turn the per-hour order counts into a two-column frame.
## The columns are named explicitly ('index' = hour, 'horario_pico' = number of
## orders) so the layout is the same on every pandas version: since pandas 2.0 a
## bare reset_index() on a value_counts() result yields 'horario_pico'/'count' instead.
horarios_pico = horarios_pico.rename_axis('index').reset_index(name='horario_pico')

## Sort by number of orders, busiest hour first.
## sort_values returns a new frame, so the result must be assigned back to take effect.
horarios_pico = (
    horarios_pico
    .sort_values('horario_pico', ascending=False)
    .reset_index(drop=True)
)

## Rank the hours by number of orders (rank 1 = fewest orders)
horarios_pico['rank'] = horarios_pico['horario_pico'].rank()

horarios_pico

Unnamed: 0,index,horario_pico,rank
0,20,315,6.0
1,21,241,5.0
2,19,166,4.0
3,22,130,3.0
4,18,101,2.0
5,23,47,1.0


In [13]:
## Classify each hour by how busy it is:
##   'alto'  -> hours with the most orders
##   'medio' -> hours with an intermediate number of orders
##   'baixo' -> hours with few orders

## Lower rank bound (exclusive) for each label, checked from busiest to least busy;
## an hour gets the first label whose bound its rank exceeds
limite_por_rotulo = {'alto': 4, 'medio': 2}

## Labels and their matching boolean conditions, in the same order
rank = list(limite_por_rotulo)
condicoes = [(horarios_pico['rank'] > limite) for limite in limite_por_rotulo.values()]

## Hours matching no condition fall back to 'baixo'
horarios_pico['rank2'] = np.select(condlist=condicoes, choicelist=rank, default='baixo')

horarios_pico

Unnamed: 0,index,horario_pico,rank,rank2
0,20,315,6.0,alto
1,21,241,5.0,alto
2,19,166,4.0,medio
3,22,130,3.0,medio
4,18,101,2.0,baixo
5,23,47,1.0,baixo


In [14]:
## Keep only the hour and its classification, renaming both columns in one step:
## 'index' -> 'horario_pico' (the hour) and 'rank2' -> 'interval_pico' (the label)
novos_nomes = {'index' : 'horario_pico', 'rank2' : 'interval_pico'}
horarios_pico = horarios_pico[['index', 'rank2']].rename(columns=novos_nomes)

horarios_pico

Unnamed: 0,horario_pico,interval_pico
0,20,alto
1,21,alto
2,19,medio
3,22,medio
4,18,baixo
5,23,baixo


In [15]:
## Cast the hour column to int64 in both frames so they share the same key dtype

## Orders frame
df = df.astype({'horario_pico': 'int64'})

## Hour classification frame
horarios_pico = horarios_pico.astype({'horario_pico': 'int64'})

In [16]:
## Attach the 'baixo' / 'medio' / 'alto' classification to every order.
## Left join on the hour, so every row of df is kept.
df = df.merge(horarios_pico, how='left', on='horario_pico')

In [17]:
## Show ten random rows to check the columns added by the merge
df.sample(10)

Unnamed: 0,data_pedido,hora_pedido,tipo_entrega,valor_borda,valor_refrigerante,valor_total,tempo,mes,dia_semana,horario_pico,interval_pico
701,2011-11-24,21:05:00,Entrega,0.0,0.0,19.5,00:35:00,11,quinta,21,alto
707,2011-11-24,22:45:00,Entrega,0.0,0.0,21.5,00:45:00,11,quinta,22,medio
841,2011-12-02,23:16:00,Entrega,0.0,0.0,22.5,00:34:00,12,sexta,23,baixo
532,2011-11-16,20:10:00,Buscar,0.0,4.0,34.0,00:50:00,11,quarta,20,alto
416,2011-11-06,20:27:00,Entrega,0.0,0.0,34.5,00:38:00,11,domingo,20,alto
572,2011-11-18,18:35:00,Entrega,2.5,0.0,27.0,00:45:00,11,sexta,18,baixo
170,2011-10-21,21:13:00,Entrega,0.0,0.0,25.0,00:35:00,10,sexta,21,alto
90,2011-10-12,22:23:00,Entrega,0.0,0.0,25.5,00:21:00,10,quarta,22,medio
951,2011-12-08,21:59:00,Entrega,0.0,4.5,37.0,00:41:00,12,quinta,21,alto
804,2011-11-30,21:24:00,Entrega,0.0,0.0,29.0,00:46:00,11,quarta,21,alto


### Formatando a variável 'tempo'