# Clean Excel data from Dalinde Hospital

In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np
import sklearn
import os
from tqdm.auto import tqdm, trange

_________________

## Informes de consumo 2018-2019

In [2]:
# Retrieve names of files
data_files = os.listdir('data/')
data_files = [file for file in data_files if 'Informe de consumo por areas Tuxpan' in file]
data_files

['Informe de consumo por areas Tuxpan enero 2018.xlsx',
 'Informe de consumo por areas Tuxpan Febrero 2018.xlsx',
 'Informe de consumo por areas Tuxpan Diciembre 2018.xlsx',
 'Informe de consumo por areas Tuxpan Agosto 2019.xlsx',
 'Informe de consumo por areas Tuxpan Septiembre 2019.xlsx',
 'Informe de consumo por areas Tuxpan Noviembre 2018.xlsx',
 'Informe de consumo por areas Tuxpan Junio 2019.xlsx',
 'Informe de consumo por areas Tuxpan mayo 2018.xlsx',
 'Informe de consumo por areas Tuxpan Mayo 2019.xlsx',
 'Informe de consumo por areas Tuxpan Marzo 2019.xlsx',
 'Informe de consumo por areas Tuxpan Noviembre 2019.xlsx',
 'Informe de consumo por areas Tuxpan Julio 2019.xlsx',
 'Informe de consumo por areas Tuxpan marzo 2018.xlsx',
 'Informe de consumo por areas Tuxpan junio 2018.xlsx',
 'Informe de consumo por areas Tuxpan Octubre 2019.xlsx',
 'Informe de consumo por areas Tuxpan Agosto 2018.xlsx',
 'Informe de consumo por areas Tuxpan Abril 2019.xlsx',
 'Informe de consumo por ar

In [3]:
# Months dictionary
months = {'ENERO':'01', 'FEBRERO':'02', 'MARZO':'03', 'ABRIL':'04', 'MAYO':'05', 'JUNIO':'06', 'JULIO':'07', 'AGOSTO':'08', 'SEPTIEMBRE':'09', 'OCTUBRE':'10', 'NOVIEMBRE':'11', 'DICIEMBRE':'12'}
# Empty DataFrame
data = pd.DataFrame()

for i in trange(len(data_files), desc = 'Processing files'):
      
    filename = data_files[i] # Name of file
    
    # Retreive MONTH and YEAR from filename
    month = months[filename.split(' ')[6].upper()]
    year = filename.split(' ')[7].split('.')[0]
    # Set date with first day of the month
    date = pd.to_datetime(year+month, format='%Y%m')
    # Assign last day of the month
    date = pd.to_datetime(year+month+str(date.days_in_month), format='%Y%m%d')
    
    print('Processing data from '+str(date)) # For tracking progress
    
    # Load file
    raw_df = pd.read_excel('data/'+filename, header=5)
    
    # Fill ÁREA and DEPARTAMENTO columns
    df = raw_df.copy()
    df[["ÁREA","DEPARTAMENTO"]] = df[["ÁREA","DEPARTAMENTO"]].fillna(method='ffill')
    # Drop all rows without a CAPTURA id
    df = df.loc[df['CAPTURA'].dropna().index, :]
    # Retrieve items which DEPARTAMENTO contains 'PATENTE'
    df = df.loc[df['DEPARTAMENTO'].str.contains('PATENTE'), :]
    # Add FECHA column
    df["FECHA"] = date
    df = df.loc[:, ["FECHA", "DEPARTAMENTO", "CAPTURA", "CÓDIGO", "ARTÍCULO", "CANTIDAD", "COSTO UNITARIO","TOTAL NETO"]]
    
    # Concatenate df
    data = pd.concat([data, df], ignore_index=True)

# Sort values by date
data = data.sort_values(['FECHA','DEPARTAMENTO','ARTÍCULO'], ignore_index=True)
# Write data to CSV file
data.to_csv('Informe de consumo 2018-2019.csv', index=False)
data

HBox(children=(FloatProgress(value=0.0, description='Processing files', max=22.0, style=ProgressStyle(descript…

Processed data from 2018-01-31 00:00:00
Processed data from 2018-02-28 00:00:00
Processed data from 2018-12-31 00:00:00
Processed data from 2019-08-31 00:00:00
Processed data from 2019-09-30 00:00:00
Processed data from 2018-11-30 00:00:00
Processed data from 2019-06-30 00:00:00
Processed data from 2018-05-31 00:00:00
Processed data from 2019-05-31 00:00:00
Processed data from 2019-03-31 00:00:00
Processed data from 2019-11-30 00:00:00
Processed data from 2019-07-31 00:00:00
Processed data from 2018-03-31 00:00:00
Processed data from 2018-06-30 00:00:00
Processed data from 2019-10-31 00:00:00
Processed data from 2018-08-31 00:00:00
Processed data from 2019-04-30 00:00:00
Processed data from 2019-01-31 00:00:00
Processed data from 2018-10-31 00:00:00
Processed data from 2018-04-30 00:00:00
Processed data from 2019-02-28 00:00:00
Processed data from 2018-10-31 00:00:00



_________________

## Requisiciones vs órdenes de compra

In [53]:
# Retrieve names of files
data_files = os.listdir('data/')
data_files = [file for file in data_files if 'requisiciones vs ordenes de compra' in file]
data_files

['requisiciones vs ordenes de compra 2019.xlsx',
 'requisiciones vs ordenes de compra 2018 (1).xlsx']

In [58]:
# Empty DataFrame
data = pd.DataFrame()

for i in trange(len(data_files), desc = 'Processing files'):

    filename = data_files[i] # Name of file
    print('Processing data from '+filename) # For tracking progress
    
    # Load file
    raw_df = pd.read_excel('data/'+filename, header=[4,5])

    df = raw_df.copy()
    # Restore columns names
    df.columns = [(col[0] if 'Unnamed' in col[1] else col[0]+' '+col[1]) for col in df.columns.to_flat_index()]
    # Drop unused columns
    df = df.loc[:, df.columns[:-2]]
    # Standirize date format
    df['FECHA'] = pd.to_datetime(df['FECHA'], format='%d-%m-%Y')
    df['FECHA PEDIDO'] = pd.to_datetime(df['FECHA PEDIDO'], format='%d-%m-%Y')
    df['FECHA ENTREGA'] = pd.to_datetime(df['FECHA ENTREGA'], format='%d-%m-%Y')
    
    # Concatenate df
    data = pd.concat([data, df], ignore_index=True)

# Sort values by REQUISICIÓN id
data = data.sort_values(['REQUISICIÓN','FECHA'], ignore_index=True)
# Write data to CSV file
data.to_csv('Requisiciones vs ordenes de compra 2018-2019.csv', index=False)
data

HBox(children=(FloatProgress(value=0.0, description='Processing files', max=2.0, style=ProgressStyle(descripti…

Processing data from requisiciones vs ordenes de compra 2019.xlsx
Processing data from requisiciones vs ordenes de compra 2018 (1).xlsx



Unnamed: 0,REQUISICIÓN,FECHA,ORDEN COMPRA,FECHA PEDIDO,FECHA ENTREGA,PROVEEDOR,ARTÍCULO,DESCRIPCIÓN,DESCRIPCIÓN SALIDAS,CANTIDAD REQUERIDA,CANTIDAD PEDIDA
0,52666,2018-01-02,257941.0,2018-01-03,2018-01-10,"NADRO, S.A.P.I. DE C.V.",2900110,#DEXERYL CREMA EMOLIENTE 250G,199.0,1.0,1
1,52666,2018-01-02,257941.0,2018-01-03,2018-01-10,"NADRO, S.A.P.I. DE C.V.",2900233,#ISODINE 8G BUCOFARINGEO SOL 120ML,198.0,2.0,2
2,52666,2018-01-02,257941.0,2018-01-03,2018-01-10,"NADRO, S.A.P.I. DE C.V.",2900274,#VASELINE PURO 100G,196.0,2.0,2
3,52666,2018-01-02,257941.0,2018-01-03,2018-01-10,"NADRO, S.A.P.I. DE C.V.",2900493,VIVERA 2G 8 SOBRES LACTOBACILLUS RHAMNOSUS GG,197.0,2.0,2
4,52666,2018-01-02,257942.0,2018-01-03,2018-01-10,FARMACOS NACIONALES S.A. DE C.V. (FANASA),2900150,#BICARBONATO DE SODIO 100G,198.0,2.0,2
...,...,...,...,...,...,...,...,...,...,...,...
64167,66178,2019-12-30,315609.0,2019-12-30,2020-01-03,DISTRIBUIDORA MEDICA CODIMED S.A. DE C.V.,3800068,FISIOLOGICA 50 ML FCO. PISA,1282.0,1200.0,1200
64168,66178,2019-12-30,315609.0,2019-12-30,2020-01-03,DISTRIBUIDORA MEDICA CODIMED S.A. DE C.V.,3800066,FISIOLOGICA 500ML FCO. PISA,166.0,168.0,168
64169,66178,2019-12-30,315609.0,2019-12-30,2020-01-03,DISTRIBUIDORA MEDICA CODIMED S.A. DE C.V.,3800064,HARTMAN 1000ML FRASCO PISA,143.0,120.0,120
64170,66178,2019-12-30,315609.0,2019-12-30,2020-01-03,DISTRIBUIDORA MEDICA CODIMED S.A. DE C.V.,3800077,HARTMAN 500 ML FLEXOVAL SOLUCION PISA 4000088,78.0,72.0,72


_________________

## Informe órdenes incompletas

In [64]:
# Retrieve names of files
data_files = os.listdir('data/')
data_files = [file for file in data_files if 'Requisiciones ordenes incompletas' in file]
data_files

['Requisiciones ordenes incompletas 2018.xlsx',
 'Requisiciones ordenes incompletas 2019.xlsx']

In [65]:
filename = data_files[0] # Name of file
print('Processing data from '+filename) # For tracking progress

# Load file
raw_df = pd.read_excel('data/'+filename, header=[4,5])
raw_df

Processing data from Requisiciones ordenes incompletas 2018.xlsx


Unnamed: 0_level_0,PROVEEDOR,PEDIDO,FECHA,FECHA,REQUISICION,FECHA,C. COSTOS,ARTÍCULO,DESCRIPCIÓN,FACTURA,FECHA,CANTIDAD,CANTIDAD,CANTIDAD,USUARIO,%,SEGUIMIENTO
Unnamed: 0_level_1,Unnamed: 0_level_1,Unnamed: 1_level_1,PEDIDO,ENTREGA,Unnamed: 4_level_1,SOLICITUD,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,RECEPCION,REQUISICION,SOLICITADA,RECIBIDA,COMPRAS,Unnamed: 15_level_1,Unnamed: 16_level_1
0,"ABASTECEDOR TERAPEUTICO, S.A. DE C.V. (ATSA)",258679.0,15-01-2018,22-01-2018,52889.0,15-01-2018,MATERIAL DE CURACION,1900240,#EXTENSION K/750 50CM MEDEX MX450FL,,,250.0,250.0,0,jigonzalez,0.0,
1,"ABASTECEDOR TERAPEUTICO, S.A. DE C.V. (ATSA)",262080.0,26-02-2018,05-03-2018,53711.0,26-02-2018,MATERIAL DE CURACION,1900947,#EXTENSION MINIVOLUMEN 536035 1.0ML VOL RESIDU...,,,50.0,50.0,0,jigonzalez,0.0,
2,,,,,,,,1900979,#KIT P/MONITOREO MX9505T TRANSDUCTOR MEDEX,,,10.0,10.0,0,jigonzalez,0.0,
3,,,,,,,,1901413,#KIT P/MONITOREO MX9604A TRANSDUCTOR MEDEX,,,20.0,20.0,0,jigonzalez,0.0,
4,"ABASTECEDOR TERAPEUTICO, S.A. DE C.V. (ATSA)",268297.0,21-05-2018,28-05-2018,55285.0,21-05-2018,MATERIAL DE CURACION,1900313,#LLAVE 3 VIAS S/EXTENSION MX5311L MEDEX,,,250.0,250.0,0,jigonzalez,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773,VITA INGENII S.A. DE C.V.,278608.0,24-09-2018,01-10-2018,57670.0,24-09-2018,MATERIAL DE CURACION,1901828,#CUBIERTA P/TERMOMETRO C/25 R.05031-110,,,100.0,100.0,0,jigonzalez,0.0,
1774,VITASANITAS S.A. DE .C.V.,262389.0,02-03-2018,06-03-2018,53801.0,02-03-2018,MEDICAMENTOS,3305045,ALBUMINA HUMANA AL 20% 50ML GRIFOLS,,,20.0,270.0,0,latapiad,0.0,
1775,"VITASANITAS, S.A. DE C.V.",260043.0,01-02-2018,05-02-2018,53227.0,01-02-2018,MEDICAMENTOS,3800003,ALBUMINA HUMANA AL 20% FCO 50ML,,,5.0,5.0,0,jigonzalez,0.0,
1776,"VITASANITAS, S.A. DE C.V.",260191.0,02-02-2018,05-02-2018,53250.0,02-02-2018,MEDICAMENTOS,3800003,ALBUMINA HUMANA AL 20% FCO 50ML,,,10.0,10.0,0,jigonzalez,0.0,


In [88]:
df = raw_df.copy()
# Restore columns names
df.columns = [(col[0] if 'Unnamed' in col[1] else col[0]+' '+col[1]) for col in df.columns.to_flat_index()]
# Drop unused columns
df = df.loc[:, np.append(df.columns[:-3], df.columns[-2])]
# Standirize date format
df['FECHA PEDIDO'] = pd.to_datetime(df['FECHA PEDIDO'], format='%d-%m-%Y', errors='coerce')
df['FECHA ENTREGA'] = pd.to_datetime(df['FECHA ENTREGA'], format='%d-%m-%Y', errors='coerce')
df['FECHA ENTREGA'] = pd.to_datetime(df['FECHA ENTREGA'], format='%d-%m-%Y', errors='coerce')
df['FECHA SOLICITUD'] = pd.to_datetime(df['FECHA SOLICITUD'], format='%d-%m-%Y', errors='coerce')
df

Unnamed: 0,PROVEEDOR,PEDIDO,FECHA PEDIDO,FECHA ENTREGA,REQUISICION,FECHA SOLICITUD,C. COSTOS,ARTÍCULO,DESCRIPCIÓN,FACTURA,FECHA RECEPCION,CANTIDAD REQUISICION,CANTIDAD SOLICITADA,CANTIDAD RECIBIDA,%
0,"ABASTECEDOR TERAPEUTICO, S.A. DE C.V. (ATSA)",258679.0,2018-01-15,2018-01-22,52889.0,2018-01-15,MATERIAL DE CURACION,1900240,#EXTENSION K/750 50CM MEDEX MX450FL,,,250.0,250.0,0,0.0
1,"ABASTECEDOR TERAPEUTICO, S.A. DE C.V. (ATSA)",262080.0,2018-02-26,2018-03-05,53711.0,2018-02-26,MATERIAL DE CURACION,1900947,#EXTENSION MINIVOLUMEN 536035 1.0ML VOL RESIDU...,,,50.0,50.0,0,0.0
2,,,NaT,NaT,,NaT,,1900979,#KIT P/MONITOREO MX9505T TRANSDUCTOR MEDEX,,,10.0,10.0,0,0.0
3,,,NaT,NaT,,NaT,,1901413,#KIT P/MONITOREO MX9604A TRANSDUCTOR MEDEX,,,20.0,20.0,0,0.0
4,"ABASTECEDOR TERAPEUTICO, S.A. DE C.V. (ATSA)",268297.0,2018-05-21,2018-05-28,55285.0,2018-05-21,MATERIAL DE CURACION,1900313,#LLAVE 3 VIAS S/EXTENSION MX5311L MEDEX,,,250.0,250.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773,VITA INGENII S.A. DE C.V.,278608.0,2018-09-24,2018-10-01,57670.0,2018-09-24,MATERIAL DE CURACION,1901828,#CUBIERTA P/TERMOMETRO C/25 R.05031-110,,,100.0,100.0,0,0.0
1774,VITASANITAS S.A. DE .C.V.,262389.0,2018-03-02,2018-03-06,53801.0,2018-03-02,MEDICAMENTOS,3305045,ALBUMINA HUMANA AL 20% 50ML GRIFOLS,,,20.0,270.0,0,0.0
1775,"VITASANITAS, S.A. DE C.V.",260043.0,2018-02-01,2018-02-05,53227.0,2018-02-01,MEDICAMENTOS,3800003,ALBUMINA HUMANA AL 20% FCO 50ML,,,5.0,5.0,0,0.0
1776,"VITASANITAS, S.A. DE C.V.",260191.0,2018-02-02,2018-02-05,53250.0,2018-02-02,MEDICAMENTOS,3800003,ALBUMINA HUMANA AL 20% FCO 50ML,,,10.0,10.0,0,0.0


In [89]:
df['C. COSTOS'].value_counts()

MEDICAMENTOS                              533
MATERIAL DE CURACION                      263
MATERIAL DE CURACION/ENFERMERIA            43
ORTOPEDIA                                  20
MATERIAL DE CURACION/QUINTANA ROO           8
NUTRICIONES PARENTERALES                    3
MATERIAL DE CURACION/TERAPIA INTENSIVA      2
MENSAJERIA                                  1
FARMACIA                                    1
Name: C. COSTOS, dtype: int64