## Analisando e tratando os dados do arquivo Sales.SalesOrderHeader.csv que estão na pasta RAW
### Ao final desse notebook os dados serão carregados na pasta REFINED em formato parquet

In [1]:
# importando as bibliotecas necessárias
import pandas as pd
import numpy as np

In [2]:
# Path (gs:// URI) of the raw SalesOrderHeader CSV file that this notebook loads
path_orderheader = "gs://bike-factory-datalake/01.RAW/Sales.SalesOrderHeader.csv"

In [3]:
# Loading the CSV file; the fields in this file are separated by ';'
order_header = pd.read_csv(path_orderheader,sep=';')

In [4]:
# Creating the working pandas DataFrame used by the rest of the notebook.
# NOTE(review): pd.read_csv already returns a DataFrame, so this constructor call
# only re-wraps it under a new name — it looks redundant; confirm before removing.
df_order_header = pd.DataFrame(order_header)

In [5]:
# Inspecting the dtype pandas inferred for each column of the raw file
df_order_header.dtypes

SalesOrderID                int64
RevisionNumber              int64
OrderDate                  object
DueDate                    object
ShipDate                   object
Status                      int64
OnlineOrderFlag             int64
SalesOrderNumber           object
PurchaseOrderNumber        object
AccountNumber              object
CustomerID                  int64
SalesPersonID             float64
TerritoryID                 int64
BillToAddressID             int64
ShipToAddressID             int64
ShipMethodID                int64
CreditCardID              float64
CreditCardApprovalCode     object
CurrencyRateID            float64
SubTotal                   object
TaxAmt                     object
Freight                    object
TotalDue                   object
Comment                   float64
rowguid                    object
ModifiedDate               object
dtype: object

In [6]:
# Checking the first 100 rows of the DataFrame
df_order_header.head(100)

Unnamed: 0,SalesOrderID,RevisionNumber,OrderDate,DueDate,ShipDate,Status,OnlineOrderFlag,SalesOrderNumber,PurchaseOrderNumber,AccountNumber,...,CreditCardID,CreditCardApprovalCode,CurrencyRateID,SubTotal,TaxAmt,Freight,TotalDue,Comment,rowguid,ModifiedDate
0,43659,8,2011-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,5,0,SO43659,PO522145787,10-4020-000676,...,16281.0,105041Vi84182,,205656206,19715149,6160984,231532339,,79B65321-39CA-4115-9CBA-8FE0903E12E6,2011-06-07 00:00:00.000
1,43660,8,2011-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,5,0,SO43660,PO18850127500,10-4020-000117,...,5618.0,115213Vi29411,,12942529,1242483,388276,14573288,,738DC42D-D03B-48A1-9822-F95A67EA7389,2011-06-07 00:00:00.000
2,43661,8,2011-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,5,0,SO43661,PO18473189620,10-4020-000442,...,1346.0,85274Vi6854,4.0,327264786,31537696,985553,368658012,,D91B9131-18A4-4A11-BC3A-90B6F53E9D74,2011-06-07 00:00:00.000
3,43662,8,2011-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,5,0,SO43662,PO18444174044,10-4020-000227,...,10456.0,125295Vi53935,4.0,288325289,27751646,8672389,324749324,,4A1ECFC0-CC3A-4740-B028-1C50BB48711C,2011-06-07 00:00:00.000
4,43663,8,2011-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,5,0,SO43663,PO18009186470,10-4020-000510,...,4322.0,45303Vi22691,,4194589,402681,125838,4723108,,9B1E7A40-6AE0-4AD3-811C-A64951857C4B,2011-06-07 00:00:00.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,43754,8,2011-06-14 00:00:00.000,2011-06-26 00:00:00.000,2011-06-21 00:00:00.000,5,1,SO43754,,10-4030-027617,...,9734.0,630188Vi50171,,357827,2862616,894568,39539884,,86B0C2B9-6ABD-4B69-9C76-2B702B604D77,2011-06-21 00:00:00.000
96,43755,8,2011-06-14 00:00:00.000,2011-06-26 00:00:00.000,2011-06-21 00:00:00.000,5,1,SO43755,,10-4030-027670,...,14614.0,1030205Vi75785,,357827,2862616,894568,39539884,,98337979-2598-4E10-8666-4823D1EB0564,2011-06-21 00:00:00.000
97,43756,8,2011-06-14 00:00:00.000,2011-06-26 00:00:00.000,2011-06-21 00:00:00.000,5,1,SO43756,,10-4030-019941,...,9528.0,830399Vi49119,191.0,6990982,559279,174775,7725036,,6D3EE374-D017-4007-9BA3-8966F2B3CD00,2011-06-21 00:00:00.000
98,43757,8,2011-06-14 00:00:00.000,2011-06-26 00:00:00.000,2011-06-21 00:00:00.000,5,1,SO43757,,10-4030-011017,...,12801.0,635102Vi66203,184.0,337499,2699992,843748,3729364,,238FD676-5C9C-44A6-8C10-F890887887B6,2011-06-21 00:00:00.000


In [7]:
# Collecting basic information about the DataFrame
# The output of this command is the number of rows and columns as (rows, columns)
df_order_header.shape

(31465, 26)

In [8]:
# Checking the index of the DataFrame
df_order_header.index

RangeIndex(start=0, stop=31465, step=1)

In [9]:
# Checking the column labels of the DataFrame
df_order_header.columns

Index(['SalesOrderID', 'RevisionNumber', 'OrderDate', 'DueDate', 'ShipDate',
       'Status', 'OnlineOrderFlag', 'SalesOrderNumber', 'PurchaseOrderNumber',
       'AccountNumber', 'CustomerID', 'SalesPersonID', 'TerritoryID',
       'BillToAddressID', 'ShipToAddressID', 'ShipMethodID', 'CreditCardID',
       'CreditCardApprovalCode', 'CurrencyRateID', 'SubTotal', 'TaxAmt',
       'Freight', 'TotalDue', 'Comment', 'rowguid', 'ModifiedDate'],
      dtype='object')

In [10]:
# Counting the non-null values in each column
df_order_header.count()

SalesOrderID              31465
RevisionNumber            31465
OrderDate                 31465
DueDate                   31465
ShipDate                  31465
Status                    31465
OnlineOrderFlag           31465
SalesOrderNumber          31465
PurchaseOrderNumber        3806
AccountNumber             31465
CustomerID                31465
SalesPersonID              3806
TerritoryID               31465
BillToAddressID           31465
ShipToAddressID           31465
ShipMethodID              31465
CreditCardID              30334
CreditCardApprovalCode    30334
CurrencyRateID            13976
SubTotal                  31465
TaxAmt                    31465
Freight                   31465
TotalDue                  31465
Comment                       0
rowguid                   31465
ModifiedDate              31465
dtype: int64

In [11]:
# Counting how many null values each column holds
df_order_header.isna().sum()

SalesOrderID                  0
RevisionNumber                0
OrderDate                     0
DueDate                       0
ShipDate                      0
Status                        0
OnlineOrderFlag               0
SalesOrderNumber              0
PurchaseOrderNumber       27659
AccountNumber                 0
CustomerID                    0
SalesPersonID             27659
TerritoryID                   0
BillToAddressID               0
ShipToAddressID               0
ShipMethodID                  0
CreditCardID               1131
CreditCardApprovalCode     1131
CurrencyRateID            17489
SubTotal                      0
TaxAmt                        0
Freight                       0
TotalDue                      0
Comment                   31465
rowguid                       0
ModifiedDate                  0
dtype: int64

Nesse DataFrame vemos os seguintes problemas para serem tratados:

- Coluna Comment com todos os dados nulos, portanto iremos dropar;
- Coluna CurrencyRateID com 17.489 dados nulos e, apesar de ser um ID, não será usada nas próximas etapas, portanto será dropada;
- Colunas CreditCardID e CreditCardApprovalCode com 1.131 dados nulos; porém, esses dados são sensíveis e não estão no escopo do problema de negócio que precisa ser resolvido, portanto essas colunas serão dropadas;
- Colunas PurchaseOrderNumber e SalesPersonID com 27.659 dados nulos; elas não são chaves essenciais para nossa análise, portanto droparemos essas colunas;
- As colunas OrderDate, DueDate, ShipDate e ModifiedDate estão com o tipo object e precisam ser convertidas para datetime;
- As colunas SubTotal, TaxAmt, Freight, TotalDue estão com o tipo object mas são do tipo float e estão com o separador da casa decimal com vírgula, então antes de converter em float devemos trocar o separador de casa decimal.

In [12]:
# Dropping the columns that will not be needed in the next steps
unused_columns = [
    'Comment',
    'CurrencyRateID',
    'CreditCardID',
    'CreditCardApprovalCode',
    'PurchaseOrderNumber',
    'SalesPersonID',
]
df_order_header = df_order_header.drop(columns=unused_columns)

In [13]:
# Counting the null values per column of the current DataFrame
df_order_header.isna().sum()

SalesOrderID        0
RevisionNumber      0
OrderDate           0
DueDate             0
ShipDate            0
Status              0
OnlineOrderFlag     0
SalesOrderNumber    0
AccountNumber       0
CustomerID          0
TerritoryID         0
BillToAddressID     0
ShipToAddressID     0
ShipMethodID        0
SubTotal            0
TaxAmt              0
Freight             0
TotalDue            0
rowguid             0
ModifiedDate        0
dtype: int64

In [14]:
# Converting the date columns from object to datetime
for date_col in ['OrderDate', 'DueDate', 'ShipDate', 'ModifiedDate']:
    df_order_header[date_col] = pd.to_datetime(df_order_header[date_col])

In [15]:
# Inspecting the column dtypes of the current DataFrame
df_order_header.dtypes

SalesOrderID                 int64
RevisionNumber               int64
OrderDate           datetime64[ns]
DueDate             datetime64[ns]
ShipDate            datetime64[ns]
Status                       int64
OnlineOrderFlag              int64
SalesOrderNumber            object
AccountNumber               object
CustomerID                   int64
TerritoryID                  int64
BillToAddressID              int64
ShipToAddressID              int64
ShipMethodID                 int64
SubTotal                    object
TaxAmt                      object
Freight                     object
TotalDue                    object
rowguid                     object
ModifiedDate        datetime64[ns]
dtype: object

In [16]:
# Swapping the decimal separator from ',' to '.' in the monetary columns so they
# can later be cast to float.
# The vectorized .str accessor replaces the per-row lambda: it propagates missing
# values as NaN instead of raising AttributeError on a non-string cell, and
# regex=False makes the replacement a plain literal substitution.
for money_col in ['SubTotal', 'TaxAmt', 'Freight', 'TotalDue']:
    df_order_header[money_col] = df_order_header[money_col].str.replace(',', '.', regex=False)

In [17]:
# Checking the first 10 rows of the current DataFrame
df_order_header.head(10)

Unnamed: 0,SalesOrderID,RevisionNumber,OrderDate,DueDate,ShipDate,Status,OnlineOrderFlag,SalesOrderNumber,AccountNumber,CustomerID,TerritoryID,BillToAddressID,ShipToAddressID,ShipMethodID,SubTotal,TaxAmt,Freight,TotalDue,rowguid,ModifiedDate
0,43659,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43659,10-4020-000676,29825,5,985,985,5,20565.6206,1971.5149,616.0984,23153.2339,79B65321-39CA-4115-9CBA-8FE0903E12E6,2011-06-07
1,43660,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43660,10-4020-000117,29672,5,921,921,5,1294.2529,124.2483,38.8276,1457.3288,738DC42D-D03B-48A1-9822-F95A67EA7389,2011-06-07
2,43661,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43661,10-4020-000442,29734,6,517,517,5,32726.4786,3153.7696,985.553,36865.8012,D91B9131-18A4-4A11-BC3A-90B6F53E9D74,2011-06-07
3,43662,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43662,10-4020-000227,29994,6,482,482,5,28832.5289,2775.1646,867.2389,32474.9324,4A1ECFC0-CC3A-4740-B028-1C50BB48711C,2011-06-07
4,43663,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43663,10-4020-000510,29565,4,1073,1073,5,419.4589,40.2681,12.5838,472.3108,9B1E7A40-6AE0-4AD3-811C-A64951857C4B,2011-06-07
5,43664,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43664,10-4020-000397,29898,1,876,876,5,24432.6088,2344.9921,732.81,27510.4109,22A8A5DA-8C22-42AD-9241-839489B6EF0D,2011-06-07
6,43665,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43665,10-4020-000146,29580,1,849,849,5,14352.7713,1375.9427,429.9821,16158.6961,5602C304-853C-43D7-9E79-76E320D476CF,2011-06-07
7,43666,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43666,10-4020-000511,30052,4,1074,1074,5,5056.4896,486.3747,151.9921,5694.8564,E2A90057-1366-4487-8A7E-8085845FF770,2011-06-07
8,43667,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43667,10-4020-000646,29974,3,629,629,5,6107.082,586.1203,183.1626,6876.3649,86D5237D-432D-4B21-8ABC-671942F5789D,2011-06-07
9,43668,8,2011-05-31,2011-06-12,2011-06-07,5,0,SO43668,10-4020-000514,29614,6,529,529,5,35944.1562,3461.7654,1081.8017,40487.7233,281CC355-D538-494E-9B44-461B36A826C6,2011-06-07


In [18]:
# Casting the monetary columns to float in a single astype call
money_dtypes = {
    'SubTotal': float,
    'TaxAmt': float,
    'Freight': float,
    'TotalDue': float,
}
df_order_header = df_order_header.astype(money_dtypes)

In [19]:
# Inspecting the column dtypes of the current DataFrame
df_order_header.dtypes

SalesOrderID                 int64
RevisionNumber               int64
OrderDate           datetime64[ns]
DueDate             datetime64[ns]
ShipDate            datetime64[ns]
Status                       int64
OnlineOrderFlag              int64
SalesOrderNumber            object
AccountNumber               object
CustomerID                   int64
TerritoryID                  int64
BillToAddressID              int64
ShipToAddressID              int64
ShipMethodID                 int64
SubTotal                   float64
TaxAmt                     float64
Freight                    float64
TotalDue                   float64
rowguid                     object
ModifiedDate        datetime64[ns]
dtype: object

Por fim, já temos todas as nossas colunas tratadas, então iremos salvar o DF na pasta REFINED do nosso Lake em um arquivo parquet que é otimizado para o modelo analítico.

In [20]:
# Writing the DataFrame to a parquet file at the REFINED path
path_refined_orderheader = 'gs://bike-factory-datalake/02.REFINED/sales.salesorderheader.parquet'
df_order_header.to_parquet(path_refined_orderheader)