## Analisando e tratando os dados do arquivo Sales.SalesOrderDetail.csv que estão na pasta RAW
### Ao final desse notebook os dados serão carregados na pasta REFINED em formato parquet

In [1]:
# importando as bibliotecas necessárias
import pandas as pd
import numpy as np

In [2]:
# Path of the CSV file to load: the SalesOrderDetail export stored under the
# 01.RAW folder of the bike-factory-datalake bucket
path_orderdetail = "gs://bike-factory-datalake/01.RAW/Sales.SalesOrderDetail.csv"

In [3]:
# Read the raw CSV into memory; the file uses ";" as the field delimiter
order_detail = pd.read_csv(filepath_or_buffer=path_orderdetail, sep=";")

In [4]:
# pd.read_csv already returns a DataFrame, so no constructor call is needed.
# Take an explicit copy as the working frame so that the cleaning steps applied
# to df_order_detail can never write through to the raw `order_detail` frame.
df_order_detail = order_detail.copy()

In [5]:
# Inspect the dtype pandas inferred for each column when reading the CSV
df_order_detail.dtypes

SalesOrderID               int64
SalesOrderDetailID         int64
CarrierTrackingNumber     object
OrderQty                   int64
ProductID                  int64
SpecialOfferID             int64
UnitPrice                 object
UnitPriceDiscount         object
LineTotal                float64
rowguid                   object
ModifiedDate              object
dtype: object

In [6]:
# Preview the first 100 rows of the DataFrame
df_order_detail.head(100)

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,UnitPriceDiscount,LineTotal,rowguid,ModifiedDate
0,43659,1,4911-403C-98,1,776,1,2024994,000,2024.9940,B207C96D-D9E6-402B-8470-2CC176C42283,2011-05-31 00:00:00.000
1,43659,2,4911-403C-98,3,777,1,2024994,000,6074.9820,7ABB600D-1E77-41BE-9FE5-B9142CFC08FA,2011-05-31 00:00:00.000
2,43659,3,4911-403C-98,1,778,1,2024994,000,2024.9940,475CF8C6-49F6-486E-B0AD-AFC6A50CDD2F,2011-05-31 00:00:00.000
3,43659,4,4911-403C-98,1,771,1,2039994,000,2039.9940,04C4DE91-5815-45D6-8670-F462719FBCE3,2011-05-31 00:00:00.000
4,43659,5,4911-403C-98,1,772,1,2039994,000,2039.9940,5A74C7D2-E641-438E-A7AC-37BF23280301,2011-05-31 00:00:00.000
...,...,...,...,...,...,...,...,...,...,...,...
95,43668,96,365D-4C9A-BE,2,764,1,4194589,000,838.9178,FFFB02E3-1B08-44A7-8DC4-1EE6B24253D4,2011-05-31 00:00:00.000
96,43668,97,365D-4C9A-BE,2,766,1,4194589,000,838.9178,B21F0EC7-4B3B-43EF-A6F9-DC2F4CA1B71A,2011-05-31 00:00:00.000
97,43668,98,365D-4C9A-BE,2,725,1,1839382,000,367.8764,0AAAD124-8C88-4B30-9533-65399B6390E2,2011-05-31 00:00:00.000
98,43668,99,365D-4C9A-BE,1,716,1,288404,000,28.8404,7C20E15E-66C1-45FA-A10F-C495DDB52BF4,2011-05-31 00:00:00.000


In [7]:
# Basic size information about the DataFrame.
# The result is a tuple in the form (number of rows, number of columns).
df_order_detail.shape

(121317, 11)

In [8]:
# Inspect the row index of the DataFrame
df_order_detail.index

RangeIndex(start=0, stop=121317, step=1)

In [9]:
# List the column labels of the DataFrame
df_order_detail.columns

Index(['SalesOrderID', 'SalesOrderDetailID', 'CarrierTrackingNumber',
       'OrderQty', 'ProductID', 'SpecialOfferID', 'UnitPrice',
       'UnitPriceDiscount', 'LineTotal', 'rowguid', 'ModifiedDate'],
      dtype='object')

In [10]:
# Count the non-null values in each column
df_order_detail.count()

SalesOrderID             121317
SalesOrderDetailID       121317
CarrierTrackingNumber     60919
OrderQty                 121317
ProductID                121317
SpecialOfferID           121317
UnitPrice                121317
UnitPriceDiscount        121317
LineTotal                121317
rowguid                  121317
ModifiedDate             121317
dtype: int64

In [11]:
# Number of missing (null) values per column
df_order_detail.isna().sum()

SalesOrderID                 0
SalesOrderDetailID           0
CarrierTrackingNumber    60398
OrderQty                     0
ProductID                    0
SpecialOfferID               0
UnitPrice                    0
UnitPriceDiscount            0
LineTotal                    0
rowguid                      0
ModifiedDate                 0
dtype: int64

Nesse DataFrame vemos os seguintes problemas para serem tratados:

- Coluna CarrierTrackingNumber possui muitos valores nulos e não será utilizada em nenhuma análise, portanto iremos dropar ela.
- Coluna ModifiedDate está como tipo object; teremos que convertê-la para o tipo datetime.
- Colunas UnitPrice e UnitPriceDiscount estão com o tipo object (os valores usam "," como separador decimal) e iremos convertê-las para float.

In [12]:
# Remove the CarrierTrackingNumber column from the working frame
df_order_detail = df_order_detail.drop(columns=["CarrierTrackingNumber"])

In [13]:
# Parse the ModifiedDate strings into pandas datetime values
df_order_detail["ModifiedDate"] = pd.to_datetime(df_order_detail["ModifiedDate"])

In [14]:
# UnitPrice and UnitPriceDiscount are strings that use "," as the decimal
# separator, so it must be swapped for "." before they can be cast to float.
# The vectorised .str accessor replaces the row-by-row lambda; regex=False makes
# "," a literal match, and missing values pass through instead of raising.
df_order_detail["UnitPrice"] = df_order_detail["UnitPrice"].str.replace(",", ".", regex=False)
df_order_detail["UnitPriceDiscount"] = df_order_detail["UnitPriceDiscount"].str.replace(",", ".", regex=False)

In [15]:
# Preview the first rows to confirm the separator replacement was applied
df_order_detail.head()

Unnamed: 0,SalesOrderID,SalesOrderDetailID,OrderQty,ProductID,SpecialOfferID,UnitPrice,UnitPriceDiscount,LineTotal,rowguid,ModifiedDate
0,43659,1,1,776,1,2024.994,0.0,2024.994,B207C96D-D9E6-402B-8470-2CC176C42283,2011-05-31
1,43659,2,3,777,1,2024.994,0.0,6074.982,7ABB600D-1E77-41BE-9FE5-B9142CFC08FA,2011-05-31
2,43659,3,1,778,1,2024.994,0.0,2024.994,475CF8C6-49F6-486E-B0AD-AFC6A50CDD2F,2011-05-31
3,43659,4,1,771,1,2039.994,0.0,2039.994,04C4DE91-5815-45D6-8670-F462719FBCE3,2011-05-31
4,43659,5,1,772,1,2039.994,0.0,2039.994,5A74C7D2-E641-438E-A7AC-37BF23280301,2011-05-31


In [16]:
# Cast both price columns from string to float in a single astype call
df_order_detail = df_order_detail.astype(
    {
        "UnitPrice": float,
        "UnitPriceDiscount": float,
    }
)

In [17]:
# Confirm that the column dtypes were changed
df_order_detail.dtypes

SalesOrderID                   int64
SalesOrderDetailID             int64
OrderQty                       int64
ProductID                      int64
SpecialOfferID                 int64
UnitPrice                    float64
UnitPriceDiscount            float64
LineTotal                    float64
rowguid                       object
ModifiedDate          datetime64[ns]
dtype: object

Por fim, já temos todas as nossas colunas tratadas, então iremos salvar o DF na pasta REFINED do nosso Lake em um arquivo parquet que é otimizado para o modelo analítico.

In [18]:
# Destination of the cleaned data: a parquet file under the 02.REFINED folder
path_refined = 'gs://bike-factory-datalake/02.REFINED/sales.salesorderdetail.parquet'

# Write the treated DataFrame to the lake in parquet format
df_order_detail.to_parquet(path_refined)