### 2RP Net - Data Engineer Test

#### Extração e validação de dados

In [1]:
# Import package
try:
    import pandas as pd  # Import pandas
    import numpy as np
    import os
    import sys
    import glob
    import time
    import urllib.request
    from urllib.request import urlretrieve
except Exception as e:
    print("Error : {'Falha Imports'} ".format(e))

In [2]:
# Marca o horário do início do programa
s_time_control = time.time()

#### Extração dos dados dos 3 ultimos meses de prescrição (english-prescribing-data-epd) sem contar o ultimo

In [3]:
# hiperlinks dos dados de referência
url1 = 'https://opendata.nhsbsa.net/dataset/65050ec0-5abd-48ce-989d-defc08ed837e/resource/fbed03dd-df68-46dc-a283-8e5beda931a3/download/epd_202205.csv'  
url2 = 'https://opendata.nhsbsa.net/dataset/65050ec0-5abd-48ce-989d-defc08ed837e/resource/fbed03dd-df68-46dc-a283-8e5beda931a3/download/epd_202206.csv'  
url3 = 'https://opendata.nhsbsa.net/dataset/65050ec0-5abd-48ce-989d-defc08ed837e/resource/fbed03dd-df68-46dc-a283-8e5beda931a3/download/epd_202207.csv'  

#### 3 ultimos meses de prescrição (english-prescribing-data-epd)
#### Salvar os dados localmente

In [4]:
# time taken to read data
s_time_dask = time.time()

In [5]:
# English Prescribing Dataset (EPD) - May 2022
urlretrieve(url1, 'epd_202205.csv')

('epd_202205.csv', <http.client.HTTPMessage at 0x296462764f0>)

In [6]:
e_time_dask = time.time()
print("Tempo de download (epd_202205.csv): ", round(e_time_dask-s_time_dask)/60, "minutes")

Tempo de download (epd_202205.csv):  3.2 minutes


In [7]:
# time taken to read data
s_time_dask = time.time()

In [8]:
# English Prescribing Dataset (EPD) - Jun 2022
urlretrieve(url2, 'epd_202206.csv')

('epd_202206.csv', <http.client.HTTPMessage at 0x296462ac5b0>)

In [9]:
e_time_dask = time.time()
print("Tempo de download (epd_202206.csv): ", round(e_time_dask-s_time_dask)/60, "minutes")

Tempo de download (epd_202206.csv):  3.0 minutes


In [10]:
# time taken to read data
s_time_dask = time.time()

In [11]:
# English Prescribing Dataset (EPD) - Jul 2022
urlretrieve(url3, 'epd_202207.csv')

('epd_202207.csv', <http.client.HTTPMessage at 0x296462ac520>)

In [12]:
e_time_dask = time.time()
print("Tempo de download (epd_202207.csv): ", round(e_time_dask-s_time_dask)/60, "minutes")

Tempo de download (epd_202207.csv):  3.05 minutes


#### df1 carregar e realização de conferencias dos dados baixados.

In [13]:
# Cria Dataset do arquivo epd_202205.csv
df1 = pd.read_csv(r'C://Jupyter//2RP//epd_202205.csv', sep=',')

In [14]:
# Número de caracteres
df1.size

457701400

In [15]:
# Validação de dados - Info Check
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17603900 entries, 0 to 17603899
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   YEAR_MONTH                    int64  
 1   REGIONAL_OFFICE_NAME          object 
 2   REGIONAL_OFFICE_CODE          object 
 3   ICB_NAME                      object 
 4   ICB_CODE                      object 
 5   PCO_NAME                      object 
 6   PCO_CODE                      object 
 7   PRACTICE_NAME                 object 
 8   PRACTICE_CODE                 object 
 9   ADDRESS_1                     object 
 10  ADDRESS_2                     object 
 11  ADDRESS_3                     object 
 12  ADDRESS_4                     object 
 13  POSTCODE                      object 
 14  BNF_CHEMICAL_SUBSTANCE        object 
 15  CHEMICAL_SUBSTANCE_BNF_DESCR  object 
 16  BNF_CODE                      object 
 17  BNF_DESCRIPTION               object 
 18  BNF_CHAPTER_PLUS_COD

In [16]:
# Descreve valores estatísticos do dataset com arrendodamento(round)
round(df1.describe())

Unnamed: 0,YEAR_MONTH,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST
count,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0
mean,202206.0,172.0,5.0,434.0,122.0,45.0,42.0
std,0.0,1203.0,19.0,2237.0,737.0,165.0,155.0
min,202206.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,202206.0,20.0,1.0,30.0,0.0,4.0,4.0
50%,202206.0,45.0,2.0,84.0,4.0,12.0,11.0
75%,202206.0,90.0,4.0,224.0,56.0,35.0,33.0
max,202206.0,168000.0,3455.0,1288000.0,136080.0,33824.0,31643.0


In [17]:
# Validação de dados - Length Check
len(df1)  # Mostra o número de linhas do dataset epd_202205.csv

17603900

In [18]:
# Validação de dados - Consistency Check
df1.index  # Valida o número de linhas do dataset epd_202205.csv

RangeIndex(start=0, stop=17603900, step=1)

In [19]:
# Validação de dados - Consistency Check
df1.shape  # Mostra o numero de linhas e de colunas do dataset epd_202205.csv

(17603900, 26)

In [20]:
df1.keys()

Index(['YEAR_MONTH', 'REGIONAL_OFFICE_NAME', 'REGIONAL_OFFICE_CODE',
       'ICB_NAME', 'ICB_CODE', 'PCO_NAME', 'PCO_CODE', 'PRACTICE_NAME',
       'PRACTICE_CODE', 'ADDRESS_1', 'ADDRESS_2', 'ADDRESS_3', 'ADDRESS_4',
       'POSTCODE', 'BNF_CHEMICAL_SUBSTANCE', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
       'BNF_CODE', 'BNF_DESCRIPTION', 'BNF_CHAPTER_PLUS_CODE', 'QUANTITY',
       'ITEMS', 'TOTAL_QUANTITY', 'ADQUSAGE', 'NIC', 'ACTUAL_COST',
       'UNIDENTIFIED'],
      dtype='object')

In [21]:
# Validação de dados - Uniqueness Check
df1.columns  #  Mostra o index das colunas do dataset epd_202205.csv

Index(['YEAR_MONTH', 'REGIONAL_OFFICE_NAME', 'REGIONAL_OFFICE_CODE',
       'ICB_NAME', 'ICB_CODE', 'PCO_NAME', 'PCO_CODE', 'PRACTICE_NAME',
       'PRACTICE_CODE', 'ADDRESS_1', 'ADDRESS_2', 'ADDRESS_3', 'ADDRESS_4',
       'POSTCODE', 'BNF_CHEMICAL_SUBSTANCE', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
       'BNF_CODE', 'BNF_DESCRIPTION', 'BNF_CHAPTER_PLUS_CODE', 'QUANTITY',
       'ITEMS', 'TOTAL_QUANTITY', 'ADQUSAGE', 'NIC', 'ACTUAL_COST',
       'UNIDENTIFIED'],
      dtype='object')

In [22]:
# Validação de dados - Data Type Check
df1.dtypes  # Mostra o tipo de cada dados por suas respectivas colunas

YEAR_MONTH                        int64
REGIONAL_OFFICE_NAME             object
REGIONAL_OFFICE_CODE             object
ICB_NAME                         object
ICB_CODE                         object
PCO_NAME                         object
PCO_CODE                         object
PRACTICE_NAME                    object
PRACTICE_CODE                    object
ADDRESS_1                        object
ADDRESS_2                        object
ADDRESS_3                        object
ADDRESS_4                        object
POSTCODE                         object
BNF_CHEMICAL_SUBSTANCE           object
CHEMICAL_SUBSTANCE_BNF_DESCR     object
BNF_CODE                         object
BNF_DESCRIPTION                  object
BNF_CHAPTER_PLUS_CODE            object
QUANTITY                        float64
ITEMS                             int64
TOTAL_QUANTITY                  float64
ADQUSAGE                        float64
NIC                             float64
ACTUAL_COST                     float64


In [23]:
# Imprime o dataset df1 carregado do arquivo epd_202205.csv (EPD) - May 2022
df1

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
0,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL COMMUNITY NMP,Y03836,ST CATHERINE'S HC,...,20020200701,Viscopaste PB7 bandage 7.5cm x 6m,20: Dressings,10.0,1,10.0,0.0,38.90,36.40326,N
1,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL WIC (APH)_WIC APH,N85645,ARROWE PARK HOSPITAL,...,20030100079,Mepore dressing 11cm x 15cm,20: Dressings,5.0,1,5.0,0.0,1.85,1.74307,N
2,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,10.0,6,60.0,0.0,41.40,38.75440,N
3,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,20.0,2,40.0,0.0,27.60,25.81973,N
4,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030600027,Allevyn Adhesive dressing 10cm x 10cm square,20: Dressings,10.0,2,20.0,0.0,46.60,43.60659,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17603895,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,1502010J0BDAABQ,Instillagel gel,15: Anaesthesia,55.0,1,55.0,0.0,5.50,5.25764,N
17603896,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190201000AABLBL,Exception Handler Unspecified Item,19: Other Drugs and Preparations,10.0,1,10.0,0.0,8.49,7.95477,N
17603897,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190205500BCCPA0,Neutrogena T/Gel shampoo for dry hair,19: Other Drugs and Preparations,250.0,1,250.0,0.0,5.52,5.17635,N
17603898,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190605000AACACA,Olive oil liquid,19: Other Drugs and Preparations,10.0,1,10.0,0.0,1.40,1.32210,N


In [24]:
# Imprime o dataset df1 com suas 3 primeiras linhas
df1.head(3)

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
0,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL COMMUNITY NMP,Y03836,ST CATHERINE'S HC,...,20020200701,Viscopaste PB7 bandage 7.5cm x 6m,20: Dressings,10.0,1,10.0,0.0,38.9,36.40326,N
1,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL WIC (APH)_WIC APH,N85645,ARROWE PARK HOSPITAL,...,20030100079,Mepore dressing 11cm x 15cm,20: Dressings,5.0,1,5.0,0.0,1.85,1.74307,N
2,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,10.0,6,60.0,0.0,41.4,38.7544,N


In [25]:
# Imprime o dataset df1 com suas 3 últimas linhas
df1.tail(3)

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
17603897,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190205500BCCPA0,Neutrogena T/Gel shampoo for dry hair,19: Other Drugs and Preparations,250.0,1,250.0,0.0,5.52,5.17635,N
17603898,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190605000AACACA,Olive oil liquid,19: Other Drugs and Preparations,10.0,1,10.0,0.0,1.4,1.3221,N
17603899,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190700000BBCJA0,Resource ThickenUp Clear powder,19: Other Drugs and Preparations,127.0,1,127.0,0.0,8.46,7.92671,N


#### Data Format Valitation

In [27]:
# Imprime a soma de dados nulos
(df1.isnull().sum())

YEAR_MONTH                            0
REGIONAL_OFFICE_NAME                  0
REGIONAL_OFFICE_CODE                  0
ICB_NAME                              0
ICB_CODE                              0
PCO_NAME                              0
PCO_CODE                              0
PRACTICE_NAME                         0
PRACTICE_CODE                         0
ADDRESS_1                         15513
ADDRESS_2                        978557
ADDRESS_3                        470271
ADDRESS_4                       2490847
POSTCODE                          15513
BNF_CHEMICAL_SUBSTANCE                0
CHEMICAL_SUBSTANCE_BNF_DESCR          0
BNF_CODE                              0
BNF_DESCRIPTION                       0
BNF_CHAPTER_PLUS_CODE                 0
QUANTITY                              0
ITEMS                                 0
TOTAL_QUANTITY                        0
ADQUSAGE                              0
NIC                                   0
ACTUAL_COST                           0


In [28]:
# Imprime a(s) linha(s) duplicadas
df1[df1.duplicated()]

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
6582591,202206,UNIDENTIFIED,-,UNIDENTIFIED,-,UNIDENTIFIED,-,UNIDENTIFIED DOCTORS,-,-,...,0301011R0BWAABZ,Easyhaler Salbutamol sulfate 100micrograms/dos...,03: Respiratory System,1.0,1,1.0,50.0,3.31,3.1089,Y


#### Range check

In [29]:
# Drop dados dublicados e carrega em um novo dataset df1A
df1A = df1.drop_duplicates()

In [30]:
# Validação de dados - Info Check
df1A.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17603899 entries, 0 to 17603899
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   YEAR_MONTH                    int64  
 1   REGIONAL_OFFICE_NAME          object 
 2   REGIONAL_OFFICE_CODE          object 
 3   ICB_NAME                      object 
 4   ICB_CODE                      object 
 5   PCO_NAME                      object 
 6   PCO_CODE                      object 
 7   PRACTICE_NAME                 object 
 8   PRACTICE_CODE                 object 
 9   ADDRESS_1                     object 
 10  ADDRESS_2                     object 
 11  ADDRESS_3                     object 
 12  ADDRESS_4                     object 
 13  POSTCODE                      object 
 14  BNF_CHEMICAL_SUBSTANCE        object 
 15  CHEMICAL_SUBSTANCE_BNF_DESCR  object 
 16  BNF_CODE                      object 
 17  BNF_DESCRIPTION               object 
 18  BNF_CHAPTER_PLUS_COD

In [31]:
# Números de caracteres.
df1A_caracter = df1A.size

In [32]:
# Números de caracteres.
df1_caracter = df1.size

In [33]:
# Número de caracteres depois das interações
print("Dados retirados", (df1_caracter-df1A_caracter), "caracteres")

Dados retirados 26 caracteres


In [34]:
# Salvar dataset df1A
df1A.to_parquet("/Jupyter/2RP/epd_202205_validado.parquet")

In [35]:
# Reset Datasets
df1 = 0
df1A = 0

#### df2 carregar e realização de conferencias dos dados baixados.

In [36]:
# Cria Dataset do arquivo epd_202206.csv
df2 = pd.read_csv(r'C://Jupyter//2RP//epd_202206.csv', sep=',')

In [37]:
# Número de caracteres
df2.size

457701400

In [38]:
# Validação de dados - Info Check
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17603900 entries, 0 to 17603899
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   YEAR_MONTH                    int64  
 1   REGIONAL_OFFICE_NAME          object 
 2   REGIONAL_OFFICE_CODE          object 
 3   ICB_NAME                      object 
 4   ICB_CODE                      object 
 5   PCO_NAME                      object 
 6   PCO_CODE                      object 
 7   PRACTICE_NAME                 object 
 8   PRACTICE_CODE                 object 
 9   ADDRESS_1                     object 
 10  ADDRESS_2                     object 
 11  ADDRESS_3                     object 
 12  ADDRESS_4                     object 
 13  POSTCODE                      object 
 14  BNF_CHEMICAL_SUBSTANCE        object 
 15  CHEMICAL_SUBSTANCE_BNF_DESCR  object 
 16  BNF_CODE                      object 
 17  BNF_DESCRIPTION               object 
 18  BNF_CHAPTER_PLUS_COD

In [39]:
# Descreve valores estatísticos do dataset com arrendodamento(round)
round(df2.describe())

Unnamed: 0,YEAR_MONTH,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST
count,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0
mean,202206.0,172.0,5.0,434.0,122.0,45.0,42.0
std,0.0,1203.0,19.0,2237.0,737.0,165.0,155.0
min,202206.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,202206.0,20.0,1.0,30.0,0.0,4.0,4.0
50%,202206.0,45.0,2.0,84.0,4.0,12.0,11.0
75%,202206.0,90.0,4.0,224.0,56.0,35.0,33.0
max,202206.0,168000.0,3455.0,1288000.0,136080.0,33824.0,31643.0


In [40]:
# Validação de dados - Length Check
len(df2)  # Mostra o número de linhas do dataset epd_202206.csv

17603900

In [41]:
# Validação de dados - Consistency Check
df2.index  # Valida o número de linhas do dataset epd_202206.csv

RangeIndex(start=0, stop=17603900, step=1)

In [42]:
# Validação de dados - Consistency Check
df2.shape  # Mostra o numero de linhas e de colunas do dataset epd_202206.csv

(17603900, 26)

In [43]:
df2.keys()

Index(['YEAR_MONTH', 'REGIONAL_OFFICE_NAME', 'REGIONAL_OFFICE_CODE',
       'ICB_NAME', 'ICB_CODE', 'PCO_NAME', 'PCO_CODE', 'PRACTICE_NAME',
       'PRACTICE_CODE', 'ADDRESS_1', 'ADDRESS_2', 'ADDRESS_3', 'ADDRESS_4',
       'POSTCODE', 'BNF_CHEMICAL_SUBSTANCE', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
       'BNF_CODE', 'BNF_DESCRIPTION', 'BNF_CHAPTER_PLUS_CODE', 'QUANTITY',
       'ITEMS', 'TOTAL_QUANTITY', 'ADQUSAGE', 'NIC', 'ACTUAL_COST',
       'UNIDENTIFIED'],
      dtype='object')

In [44]:
# Validação de dados - Uniqueness Check
df2.columns  #  Mostra o index das colunas do dataset epd_202206.csv

Index(['YEAR_MONTH', 'REGIONAL_OFFICE_NAME', 'REGIONAL_OFFICE_CODE',
       'ICB_NAME', 'ICB_CODE', 'PCO_NAME', 'PCO_CODE', 'PRACTICE_NAME',
       'PRACTICE_CODE', 'ADDRESS_1', 'ADDRESS_2', 'ADDRESS_3', 'ADDRESS_4',
       'POSTCODE', 'BNF_CHEMICAL_SUBSTANCE', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
       'BNF_CODE', 'BNF_DESCRIPTION', 'BNF_CHAPTER_PLUS_CODE', 'QUANTITY',
       'ITEMS', 'TOTAL_QUANTITY', 'ADQUSAGE', 'NIC', 'ACTUAL_COST',
       'UNIDENTIFIED'],
      dtype='object')

In [45]:
# Validação de dados - Data Type Check
df2.dtypes  # Mostra o tipo de cada dados por suas respectivas colunas

YEAR_MONTH                        int64
REGIONAL_OFFICE_NAME             object
REGIONAL_OFFICE_CODE             object
ICB_NAME                         object
ICB_CODE                         object
PCO_NAME                         object
PCO_CODE                         object
PRACTICE_NAME                    object
PRACTICE_CODE                    object
ADDRESS_1                        object
ADDRESS_2                        object
ADDRESS_3                        object
ADDRESS_4                        object
POSTCODE                         object
BNF_CHEMICAL_SUBSTANCE           object
CHEMICAL_SUBSTANCE_BNF_DESCR     object
BNF_CODE                         object
BNF_DESCRIPTION                  object
BNF_CHAPTER_PLUS_CODE            object
QUANTITY                        float64
ITEMS                             int64
TOTAL_QUANTITY                  float64
ADQUSAGE                        float64
NIC                             float64
ACTUAL_COST                     float64


In [46]:
# Imprime o dataset df1 carregado do arquivo epd_202206.csv (EPD) - Jun 2022
df2

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
0,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL COMMUNITY NMP,Y03836,ST CATHERINE'S HC,...,20020200701,Viscopaste PB7 bandage 7.5cm x 6m,20: Dressings,10.0,1,10.0,0.0,38.90,36.40326,N
1,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL WIC (APH)_WIC APH,N85645,ARROWE PARK HOSPITAL,...,20030100079,Mepore dressing 11cm x 15cm,20: Dressings,5.0,1,5.0,0.0,1.85,1.74307,N
2,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,10.0,6,60.0,0.0,41.40,38.75440,N
3,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,20.0,2,40.0,0.0,27.60,25.81973,N
4,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030600027,Allevyn Adhesive dressing 10cm x 10cm square,20: Dressings,10.0,2,20.0,0.0,46.60,43.60659,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17603895,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,1502010J0BDAABQ,Instillagel gel,15: Anaesthesia,55.0,1,55.0,0.0,5.50,5.25764,N
17603896,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190201000AABLBL,Exception Handler Unspecified Item,19: Other Drugs and Preparations,10.0,1,10.0,0.0,8.49,7.95477,N
17603897,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190205500BCCPA0,Neutrogena T/Gel shampoo for dry hair,19: Other Drugs and Preparations,250.0,1,250.0,0.0,5.52,5.17635,N
17603898,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190605000AACACA,Olive oil liquid,19: Other Drugs and Preparations,10.0,1,10.0,0.0,1.40,1.32210,N


In [47]:
# Imprime o dataset df1 com suas 3 primeiras linhas
df2.head(3)

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
0,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL COMMUNITY NMP,Y03836,ST CATHERINE'S HC,...,20020200701,Viscopaste PB7 bandage 7.5cm x 6m,20: Dressings,10.0,1,10.0,0.0,38.9,36.40326,N
1,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL WIC (APH)_WIC APH,N85645,ARROWE PARK HOSPITAL,...,20030100079,Mepore dressing 11cm x 15cm,20: Dressings,5.0,1,5.0,0.0,1.85,1.74307,N
2,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,10.0,6,60.0,0.0,41.4,38.7544,N


In [48]:
# Imprime o dataset df1 com suas 3 últimas linhas
df2.tail(3)

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
17603897,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190205500BCCPA0,Neutrogena T/Gel shampoo for dry hair,19: Other Drugs and Preparations,250.0,1,250.0,0.0,5.52,5.17635,N
17603898,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190605000AACACA,Olive oil liquid,19: Other Drugs and Preparations,10.0,1,10.0,0.0,1.4,1.3221,N
17603899,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190700000BBCJA0,Resource ThickenUp Clear powder,19: Other Drugs and Preparations,127.0,1,127.0,0.0,8.46,7.92671,N


#### Data Format Valitation

In [50]:
# Imprime a soma de dados nulos
(df2.isnull().sum())

YEAR_MONTH                            0
REGIONAL_OFFICE_NAME                  0
REGIONAL_OFFICE_CODE                  0
ICB_NAME                              0
ICB_CODE                              0
PCO_NAME                              0
PCO_CODE                              0
PRACTICE_NAME                         0
PRACTICE_CODE                         0
ADDRESS_1                         15513
ADDRESS_2                        978557
ADDRESS_3                        470271
ADDRESS_4                       2490847
POSTCODE                          15513
BNF_CHEMICAL_SUBSTANCE                0
CHEMICAL_SUBSTANCE_BNF_DESCR          0
BNF_CODE                              0
BNF_DESCRIPTION                       0
BNF_CHAPTER_PLUS_CODE                 0
QUANTITY                              0
ITEMS                                 0
TOTAL_QUANTITY                        0
ADQUSAGE                              0
NIC                                   0
ACTUAL_COST                           0


In [51]:
# Imprime a(s) linha(s) duplicadas
df2[df2.duplicated()]

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
6582591,202206,UNIDENTIFIED,-,UNIDENTIFIED,-,UNIDENTIFIED,-,UNIDENTIFIED DOCTORS,-,-,...,0301011R0BWAABZ,Easyhaler Salbutamol sulfate 100micrograms/dos...,03: Respiratory System,1.0,1,1.0,50.0,3.31,3.1089,Y


#### Range check

In [52]:
# Drop dados dublicados e carrega em um novo dataset df2A
df2A = df2.drop_duplicates()

In [53]:
# Validação de dados - Info Check
df2A.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17603899 entries, 0 to 17603899
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   YEAR_MONTH                    int64  
 1   REGIONAL_OFFICE_NAME          object 
 2   REGIONAL_OFFICE_CODE          object 
 3   ICB_NAME                      object 
 4   ICB_CODE                      object 
 5   PCO_NAME                      object 
 6   PCO_CODE                      object 
 7   PRACTICE_NAME                 object 
 8   PRACTICE_CODE                 object 
 9   ADDRESS_1                     object 
 10  ADDRESS_2                     object 
 11  ADDRESS_3                     object 
 12  ADDRESS_4                     object 
 13  POSTCODE                      object 
 14  BNF_CHEMICAL_SUBSTANCE        object 
 15  CHEMICAL_SUBSTANCE_BNF_DESCR  object 
 16  BNF_CODE                      object 
 17  BNF_DESCRIPTION               object 
 18  BNF_CHAPTER_PLUS_COD

In [54]:
# Números de caracteres.
df2A_caracter = df2A.size

In [55]:
# Números de caracteres.
df2_caracter = df2.size

In [56]:
# Número de caracteres depois das interações
print("Dados retirados", (df2_caracter-df2A_caracter), "caracteres")

Dados retirados 26 caracteres


In [57]:
# Salvar dataset df1A
df2A.to_parquet("/Jupyter/2RP/epd_202206_validado.parquet")

In [58]:
# Reset Datasets
df2 = 0
df2A = 0

#### df3 carregar e realização de conferencias dos dados baixados.

In [59]:
# Cria Dataset do arquivo epd_202207.csv
df3 = pd.read_csv(r'C://Jupyter//2RP//epd_202207.csv', sep=',')

In [60]:
# Número de caracteres
df3.size

457701400

In [61]:
# Validação de dados - Info Check
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17603900 entries, 0 to 17603899
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   YEAR_MONTH                    int64  
 1   REGIONAL_OFFICE_NAME          object 
 2   REGIONAL_OFFICE_CODE          object 
 3   ICB_NAME                      object 
 4   ICB_CODE                      object 
 5   PCO_NAME                      object 
 6   PCO_CODE                      object 
 7   PRACTICE_NAME                 object 
 8   PRACTICE_CODE                 object 
 9   ADDRESS_1                     object 
 10  ADDRESS_2                     object 
 11  ADDRESS_3                     object 
 12  ADDRESS_4                     object 
 13  POSTCODE                      object 
 14  BNF_CHEMICAL_SUBSTANCE        object 
 15  CHEMICAL_SUBSTANCE_BNF_DESCR  object 
 16  BNF_CODE                      object 
 17  BNF_DESCRIPTION               object 
 18  BNF_CHAPTER_PLUS_COD

In [62]:
# Descreve valores estatísticos do dataset com arrendodamento(round)
round(df3.describe())

Unnamed: 0,YEAR_MONTH,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST
count,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0,17603900.0
mean,202206.0,172.0,5.0,434.0,122.0,45.0,42.0
std,0.0,1203.0,19.0,2237.0,737.0,165.0,155.0
min,202206.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,202206.0,20.0,1.0,30.0,0.0,4.0,4.0
50%,202206.0,45.0,2.0,84.0,4.0,12.0,11.0
75%,202206.0,90.0,4.0,224.0,56.0,35.0,33.0
max,202206.0,168000.0,3455.0,1288000.0,136080.0,33824.0,31643.0


In [63]:
# Validação de dados - Length Check
len(df3)  # Mostra o número de linhas do dataset epd_202207.csv

17603900

In [64]:
# Validação de dados - Consistency Check
df3.index  # Valida o número de linhas do dataset epd_202207.csv

RangeIndex(start=0, stop=17603900, step=1)

In [65]:
# Validação de dados - Consistency Check
df3.shape  # Mostra o numero de linhas e de colunas do dataset epd_202207.csv

(17603900, 26)

In [66]:
df3.keys()

Index(['YEAR_MONTH', 'REGIONAL_OFFICE_NAME', 'REGIONAL_OFFICE_CODE',
       'ICB_NAME', 'ICB_CODE', 'PCO_NAME', 'PCO_CODE', 'PRACTICE_NAME',
       'PRACTICE_CODE', 'ADDRESS_1', 'ADDRESS_2', 'ADDRESS_3', 'ADDRESS_4',
       'POSTCODE', 'BNF_CHEMICAL_SUBSTANCE', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
       'BNF_CODE', 'BNF_DESCRIPTION', 'BNF_CHAPTER_PLUS_CODE', 'QUANTITY',
       'ITEMS', 'TOTAL_QUANTITY', 'ADQUSAGE', 'NIC', 'ACTUAL_COST',
       'UNIDENTIFIED'],
      dtype='object')

In [67]:
# Validação de dados - Uniqueness Check
df3.columns  #  Mostra o index das colunas do dataset epd_202207.csv

Index(['YEAR_MONTH', 'REGIONAL_OFFICE_NAME', 'REGIONAL_OFFICE_CODE',
       'ICB_NAME', 'ICB_CODE', 'PCO_NAME', 'PCO_CODE', 'PRACTICE_NAME',
       'PRACTICE_CODE', 'ADDRESS_1', 'ADDRESS_2', 'ADDRESS_3', 'ADDRESS_4',
       'POSTCODE', 'BNF_CHEMICAL_SUBSTANCE', 'CHEMICAL_SUBSTANCE_BNF_DESCR',
       'BNF_CODE', 'BNF_DESCRIPTION', 'BNF_CHAPTER_PLUS_CODE', 'QUANTITY',
       'ITEMS', 'TOTAL_QUANTITY', 'ADQUSAGE', 'NIC', 'ACTUAL_COST',
       'UNIDENTIFIED'],
      dtype='object')

In [68]:
# Validação de dados - Data Type Check
df3.dtypes  # Mostra o tipo de cada dados por suas respectivas colunas

YEAR_MONTH                        int64
REGIONAL_OFFICE_NAME             object
REGIONAL_OFFICE_CODE             object
ICB_NAME                         object
ICB_CODE                         object
PCO_NAME                         object
PCO_CODE                         object
PRACTICE_NAME                    object
PRACTICE_CODE                    object
ADDRESS_1                        object
ADDRESS_2                        object
ADDRESS_3                        object
ADDRESS_4                        object
POSTCODE                         object
BNF_CHEMICAL_SUBSTANCE           object
CHEMICAL_SUBSTANCE_BNF_DESCR     object
BNF_CODE                         object
BNF_DESCRIPTION                  object
BNF_CHAPTER_PLUS_CODE            object
QUANTITY                        float64
ITEMS                             int64
TOTAL_QUANTITY                  float64
ADQUSAGE                        float64
NIC                             float64
ACTUAL_COST                     float64


In [69]:
# Imprime o dataset df1 carregado do arquivo epd_202207.csv (EPD) - Jul 2022
df3

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
0,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL COMMUNITY NMP,Y03836,ST CATHERINE'S HC,...,20020200701,Viscopaste PB7 bandage 7.5cm x 6m,20: Dressings,10.0,1,10.0,0.0,38.90,36.40326,N
1,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL WIC (APH)_WIC APH,N85645,ARROWE PARK HOSPITAL,...,20030100079,Mepore dressing 11cm x 15cm,20: Dressings,5.0,1,5.0,0.0,1.85,1.74307,N
2,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,10.0,6,60.0,0.0,41.40,38.75440,N
3,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,20.0,2,40.0,0.0,27.60,25.81973,N
4,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030600027,Allevyn Adhesive dressing 10cm x 10cm square,20: Dressings,10.0,2,20.0,0.0,46.60,43.60659,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17603895,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,1502010J0BDAABQ,Instillagel gel,15: Anaesthesia,55.0,1,55.0,0.0,5.50,5.25764,N
17603896,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190201000AABLBL,Exception Handler Unspecified Item,19: Other Drugs and Preparations,10.0,1,10.0,0.0,8.49,7.95477,N
17603897,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190205500BCCPA0,Neutrogena T/Gel shampoo for dry hair,19: Other Drugs and Preparations,250.0,1,250.0,0.0,5.52,5.17635,N
17603898,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190605000AACACA,Olive oil liquid,19: Other Drugs and Preparations,10.0,1,10.0,0.0,1.40,1.32210,N


In [70]:
# Imprime o dataset df3 com suas 3 primeiras linhas
df3.head(3)

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
0,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL COMMUNITY NMP,Y03836,ST CATHERINE'S HC,...,20020200701,Viscopaste PB7 bandage 7.5cm x 6m,20: Dressings,10.0,1,10.0,0.0,38.9,36.40326,N
1,202206,NORTH WEST,Y62,NHS CHESHIRE AND MERSEYSIDE INTEGRATED C,QYG,WIRRAL COMMUNITY HEALTH AND CARE NHS FOU,RY700,WIRRAL WIC (APH)_WIC APH,N85645,ARROWE PARK HOSPITAL,...,20030100079,Mepore dressing 11cm x 15cm,20: Dressings,5.0,1,5.0,0.0,1.85,1.74307,N
2,202206,NORTH EAST AND YORKSHIRE,Y63,NHS SOUTH YORKSHIRE INTEGRATED CARE BOAR,QF7,NHS NOTTINGHAM AND NOTTINGHAMSHIRE ICB -,02Q00,BASSETLAW HEALTH PARTNERSHIP,Y03762,C/O RETFORD HOSPITAL,...,20030100167,Dressit sterile dressing pack with gloves,20: Dressings,10.0,6,60.0,0.0,41.4,38.7544,N


In [71]:
# Imprime o dataset df3 com suas 3 últimas linhas
df3.tail(3)

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
17603897,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190205500BCCPA0,Neutrogena T/Gel shampoo for dry hair,19: Other Drugs and Preparations,250.0,1,250.0,0.0,5.52,5.17635,N
17603898,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190605000AACACA,Olive oil liquid,19: Other Drugs and Preparations,10.0,1,10.0,0.0,1.4,1.3221,N
17603899,202206,LONDON,Y56,NHS NORTH CENTRAL LONDON INTEGRATED CARE,QMJ,NHS NORTH CENTRAL LONDON ICB - 93C,93C00,THE RISE GROUP PRACTICE,F83039,HORNSEY RISE HEALTH CTR,...,190700000BBCJA0,Resource ThickenUp Clear powder,19: Other Drugs and Preparations,127.0,1,127.0,0.0,8.46,7.92671,N


#### Data Format Valitation

In [73]:
# Imprime a soma de dados nulos
(df3.isnull().sum())

YEAR_MONTH                            0
REGIONAL_OFFICE_NAME                  0
REGIONAL_OFFICE_CODE                  0
ICB_NAME                              0
ICB_CODE                              0
PCO_NAME                              0
PCO_CODE                              0
PRACTICE_NAME                         0
PRACTICE_CODE                         0
ADDRESS_1                         15513
ADDRESS_2                        978557
ADDRESS_3                        470271
ADDRESS_4                       2490847
POSTCODE                          15513
BNF_CHEMICAL_SUBSTANCE                0
CHEMICAL_SUBSTANCE_BNF_DESCR          0
BNF_CODE                              0
BNF_DESCRIPTION                       0
BNF_CHAPTER_PLUS_CODE                 0
QUANTITY                              0
ITEMS                                 0
TOTAL_QUANTITY                        0
ADQUSAGE                              0
NIC                                   0
ACTUAL_COST                           0


In [74]:
# Imprime a(s) linha(s) duplicadas
df3[df3.duplicated()]

Unnamed: 0,YEAR_MONTH,REGIONAL_OFFICE_NAME,REGIONAL_OFFICE_CODE,ICB_NAME,ICB_CODE,PCO_NAME,PCO_CODE,PRACTICE_NAME,PRACTICE_CODE,ADDRESS_1,...,BNF_CODE,BNF_DESCRIPTION,BNF_CHAPTER_PLUS_CODE,QUANTITY,ITEMS,TOTAL_QUANTITY,ADQUSAGE,NIC,ACTUAL_COST,UNIDENTIFIED
6582591,202206,UNIDENTIFIED,-,UNIDENTIFIED,-,UNIDENTIFIED,-,UNIDENTIFIED DOCTORS,-,-,...,0301011R0BWAABZ,Easyhaler Salbutamol sulfate 100micrograms/dos...,03: Respiratory System,1.0,1,1.0,50.0,3.31,3.1089,Y


#### Range check

In [75]:
# Drop dados dublicados e carrega em um novo dataset df3A
df3A = df3.drop_duplicates()

In [76]:
# Validação de dados - Info Check
df3A.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17603899 entries, 0 to 17603899
Data columns (total 26 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   YEAR_MONTH                    int64  
 1   REGIONAL_OFFICE_NAME          object 
 2   REGIONAL_OFFICE_CODE          object 
 3   ICB_NAME                      object 
 4   ICB_CODE                      object 
 5   PCO_NAME                      object 
 6   PCO_CODE                      object 
 7   PRACTICE_NAME                 object 
 8   PRACTICE_CODE                 object 
 9   ADDRESS_1                     object 
 10  ADDRESS_2                     object 
 11  ADDRESS_3                     object 
 12  ADDRESS_4                     object 
 13  POSTCODE                      object 
 14  BNF_CHEMICAL_SUBSTANCE        object 
 15  CHEMICAL_SUBSTANCE_BNF_DESCR  object 
 16  BNF_CODE                      object 
 17  BNF_DESCRIPTION               object 
 18  BNF_CHAPTER_PLUS_COD

In [77]:
# Números de caracteres.
df3A_caracter = df3A.size

In [78]:
# Números de caracteres.
df3_caracter = df3.size

In [79]:
# Número de caracteres depois das interações
print("Dados retirados", (df3_caracter-df3A_caracter), "caracteres")

Dados retirados 26 caracteres


In [80]:
# Salvar dataset df3A
df3A.to_parquet("/Jupyter/2RP/epd_202207_validado.parquet")

In [81]:
# Reset Datasets
df3 = 0
df3A = 0

In [82]:
# Marca o horário do final do programa
e_time_control = time.time()
print("O tempo total da execução do programa foi", round(e_time_control-s_time_control)/60, "minutes")

O tempo total da execução do programa foi 26.783333333333335 minutes
