In [1]:
import pandas as pd
from typing import List

# FUNCTIONS
def get_sheet_names(file_path: str) -> List[str]:
    """
    Get the names of all sheets in an Excel file.

    Args:
    file_path (str): The path to the Excel file from which the sheet names are to be retrieved.

    Returns:
    List[str]: A list of strings, where each string is the name of a sheet in the Excel file.

    Functionality:
    - Reads an Excel file using pandas.
    - Retrieves the names of all sheets in the Excel file.
    """
    # Read the excel file using pandas
    xls = pd.ExcelFile(file_path)
    
    # Get the names of all the sheets in the excel file
    sheet_names = xls.sheet_names
    
    # Return the list of sheet names
    return sheet_names


# 1. Extract

In [2]:
# 1. Get sheet names from xlsx file
file_path = "data/ingreso_joyas_acero.xlsx"
sheet_names = get_sheet_names(file_path)
print("Sheet names:")
print(sheet_names)

Sheet names:
['aretes', 'aretes_pistola', 'cadenas acero', 'dijes', 'pulseras', 'pulseras_varios', 'collares', 'collares_varios', 'anillos', 'juegos', 'Varios acero', 'proveedores']


In [3]:
# 2. Use pop method to exclude last sheet (proveedores) from the data consolidation process
# Then read provedores sheet in adifferent df
proveedores_name = sheet_names.pop()
proveedores_df = pd.read_excel(file_path, sheet_name=proveedores_name)
proveedores_df.shape 

(51, 3)

In [4]:
# 3. Use a for loop to read spreadsheets in sheet_names and append them in a list
# Replace the word "inventario" in column names with "ingreso" to simplify consolidation
list_of_df = []
for s in sheet_names:
    print(f"Sheet Name: {s}")
    # Read sheet and add new column with sheet name
    df = pd.read_excel(file_path, sheet_name=s)
    df["source"] = s

    # Replace "invetario" in column names with "ingreso"
    df.columns = [c.replace("inventario", "ingreso") for c in df.columns]
    
    # Append and print info
    list_of_df.append(df)
    print(f"Shape: {df.shape}")
    print(f"Column Names: {df.columns}\n")

Sheet Name: aretes
Shape: (374, 15)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_14/03/23', 'ingreso_02/05/23',
       'ingreso_19/07/23', 'ingreso_22/08/23', 'ingreso_22/02/24',
       'ingreso_27/05/24', 'ingreso_21/06/24', 'costo', 'pvp', 'detalle',
       'proveedor', 'nota', 'source'],
      dtype='object')

Sheet Name: aretes_pistola
Shape: (1, 11)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_14/03/23', 'ingreso_09/05/23',
       'ingreso_30/05/24', 'costo', 'pvp', 'detalle', 'proveedor', 'nota',
       'source'],
      dtype='object')

Sheet Name: cadenas acero
Shape: (45, 11)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_14/03/23', 'ingreso_17/07/23',
       'ingreso_20/02/24', 'costo', 'pvp', 'detalle', 'proveedor', 'nota',
       'source'],
      dtype='object')

Sheet Name: dijes
Shape: (57, 12)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_14/03/23', 'ingreso_17/07/23',
       'ingreso_20/02/24', 'ingreso_27/05/2024', 'costo', 'pvp'

In [5]:
# Consolidate (concat) spreadsheets in one dataframe
all = pd.concat(list_of_df)
print(f"Sheet Name: all")
print(f"Shape: {all.shape}")
all.info() 

Sheet Name: all
Shape: (852, 29)
<class 'pandas.core.frame.DataFrame'>
Index: 852 entries, 0 to 3
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fecha_compra        852 non-null    object 
 1   codigo              852 non-null    object 
 2   ingreso_14/03/23    357 non-null    float64
 3   ingreso_02/05/23    100 non-null    float64
 4   ingreso_19/07/23    21 non-null     float64
 5   ingreso_22/08/23    29 non-null     float64
 6   ingreso_22/02/24    129 non-null    float64
 7   ingreso_27/05/24    19 non-null     float64
 8   ingreso_21/06/24    49 non-null     float64
 9   costo               852 non-null    float64
 10  pvp                 852 non-null    float64
 11  detalle             852 non-null    object 
 12  proveedor           852 non-null    object 
 13  nota                49 non-null     object 
 14  source              852 non-null    object 
 15  ingreso_09/05/23    18 non-null

In [6]:
# la columna fecha compra esta definida como objeto y deberia ser tipo date 
cont_string = all["fecha_compra"].apply(lambda x: isinstance(x, str))
print("Rows with fecha_compra as string:")
print(all[cont_string][["fecha_compra","codigo", "source"]])


# Forzamos la transformación a tipo fecha con pd.to_datetime
all["fecha_compra"] = pd.to_datetime(all.fecha_compra, infer_datetime_format=True)

# Chequeamos nuevamen
print("\nNew dtype of fecha_compra:")
all.fecha_compra.info()

Rows with fecha_compra as string:
    fecha_compra  codigo        source
10    14/12/2023  SSA011        aretes
11    14/12/2023  SSA012        aretes
320   27/05/2024  SSA321        aretes
321   27/05/2024  SSA322        aretes
322   27/05/2024  SSA323        aretes
..           ...     ...           ...
117    15/6/2024  SSJ118        juegos
0     19/11/2023  SSV001  Varios acero
1     15/06/2024  SSV002  Varios acero
2     15/06/2024  SSV003  Varios acero
3     15/06/2024  SSV004  Varios acero

[108 rows x 3 columns]

New dtype of fecha_compra:
<class 'pandas.core.series.Series'>
Index: 852 entries, 0 to 3
Series name: fecha_compra
Non-Null Count  Dtype         
--------------  -----         
852 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 13.3 KB


  all["fecha_compra"] = pd.to_datetime(all.fecha_compra, infer_datetime_format=True)


In [7]:
all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 852 entries, 0 to 3
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   fecha_compra        852 non-null    datetime64[ns]
 1   codigo              852 non-null    object        
 2   ingreso_14/03/23    357 non-null    float64       
 3   ingreso_02/05/23    100 non-null    float64       
 4   ingreso_19/07/23    21 non-null     float64       
 5   ingreso_22/08/23    29 non-null     float64       
 6   ingreso_22/02/24    129 non-null    float64       
 7   ingreso_27/05/24    19 non-null     float64       
 8   ingreso_21/06/24    49 non-null     float64       
 9   costo               852 non-null    float64       
 10  pvp                 852 non-null    float64       
 11  detalle             852 non-null    object        
 12  proveedor           852 non-null    object        
 13  nota                49 non-null     object        
 14  s

# 2. Transform (data quality)

- 2.1 Verify data types of every column (manual)
- 2.2 Check missing values for each column
- 2.3 Check ```costo``` or ```pvp``` equal to ```0```
- 2.4 Rename provedores using ```proveedores_df```
- 2.5 Melt: Dataframe from wide to long format


In [8]:
# 2.2 Check missing values per columns
print(all.isnull().sum())

# A. RESULTS
# COLNAME: FINDINGS / ACTIONS:
# fecha_compra: 4 missings/ Inspect and correct spreadsheet
# talla: only anillos should have non missings / No action needed
# ingreso_: missings are expected / No action needed
# peso: 9 missings / Inspect and corrected in file
# costo_gramo: 11 missings / Inspect and corrected in file

# B. NOTES:
# For juegos_piedras_perlas is ok to have missings in peso and costo gramo

fecha_compra            0
codigo                  0
ingreso_14/03/23      495
ingreso_02/05/23      752
ingreso_19/07/23      831
ingreso_22/08/23      823
ingreso_22/02/24      723
ingreso_27/05/24      833
ingreso_21/06/24      803
costo                   0
pvp                     0
detalle                 0
proveedor               0
nota                  803
source                  0
ingreso_09/05/23      834
ingreso_30/05/24      851
ingreso_17/07/23      837
ingreso_20/02/24      798
ingreso_27/05/2024    843
ingreso_12/03/24      847
ingreso_31/05/24      836
ingreso_20/06/24      850
ingreso_31/01/24      817
ingreso_15/07/23      838
ingreso 27/05/24      822
ingreso_24/09/22      851
ingreso_05/06/24      825
ingreso_25/06/24      848
dtype: int64


In [9]:
# INSPECT:
# fecha_compra
print('Missings in fecha_compra:')
print(all.loc[all.fecha_compra.isnull(), ['codigo', 'source']])

# costo
print('\nMissings in costo:')
print(all.loc[all.costo.isnull(), ['codigo', 'source']])

# costo
print('\nMissings pvp :')
print(all.loc[all.pvp.isnull(), ['codigo', 'source']])

Missings in fecha_compra:
Empty DataFrame
Columns: [codigo, source]
Index: []

Missings in costo:
Empty DataFrame
Columns: [codigo, source]
Index: []

Missings pvp :
Empty DataFrame
Columns: [codigo, source]
Index: []


In [10]:
# 2.3 Check costo or pvp <= 0
# ACTION NEEDED: Manually fix the file with non-zero values
print('Rows with costo <= 0')
print(all[all.costo<=0][['codigo', 'source', 'costo']])

print('\nRows with pvp <= 0')
print(all[all.pvp<=0][['codigo', 'source', 'pvp', 'costo']])



Rows with costo <= 0
Empty DataFrame
Columns: [codigo, source, costo]
Index: []

Rows with pvp <= 0
Empty DataFrame
Columns: [codigo, source, pvp, costo]
Index: []


In [11]:
# 2.4 Rename proveedores
# Create dictionary where key is old name and values are new name
dict_of_proveedores_1 = {k:v for k,v in zip(proveedores_df.Proveedor, proveedores_df['Nuevo Nombre'])}

# Use the dictionary to replace old values
all['proveedor'] = all.proveedor.replace(dict_of_proveedores_1)

# Validate unuique values. Create a list of values that need manual changes
print(f'Unique providers after 1st iteration: \n{all.proveedor.unique()} \n')



Unique providers after 1st iteration: 
['FREDY VARGAS' 'IMPORTADORA PADILLA' 'FABIAN CARVAJAL'
 'IMPORTADORA MALES' 'PALACIO JOYAS' 'CIRKON' 'SOLEDAD SAENZ' 'P&K'
 'BODY JEWELZ' 'CHINA' 'ANGELICA ANRANGO' 'FABIAN GUZMAN'
 'CASA DEL INSUMO' 'IMPORTADORA NUÑEZ' 'XUPING ' 'MILANUS USA'
 'FABIAN GUZMAN ' 'CECILIA CACHIMUEL' 'CASA DEL SUMINISTRO'
 'JOYERIA MARTHITA CHORDE' 'VENEZIA' 'ANTONIO CALASANZ' 'KEN ' 'ZULENIS'
 'VOGA' 'OTAVALO CENTRO' 'CENTRO ' 'XAVIER CASTRO' 'CECILIA CHACHIMUEL'] 



## 2.5 Melt

The pd.melt function in pandas is used to transform a DataFrame from a wide format to a long format. In the wide format, data is typically spread across multiple columns, while in the long format, data is stacked in a single column with an additional column indicating the original variable name (typically the column name in the wide format).

Parameters of pd.melt:

	• id_vars: Specifies the columns to keep unchanged (identifier variables). These columns remain unpivoted.
	• value_vars (optional): Specifies the columns to unpivot. If not provided, all columns not specified in id_vars are used.
	• var_name: The name to use for the ‘variable’ column in the resulting DataFrame.
	• value_name: The name to use for the ‘value’ column in the resulting DataFrame.

In [12]:
all.columns

Index(['fecha_compra', 'codigo', 'ingreso_14/03/23', 'ingreso_02/05/23',
       'ingreso_19/07/23', 'ingreso_22/08/23', 'ingreso_22/02/24',
       'ingreso_27/05/24', 'ingreso_21/06/24', 'costo', 'pvp', 'detalle',
       'proveedor', 'nota', 'source', 'ingreso_09/05/23', 'ingreso_30/05/24',
       'ingreso_17/07/23', 'ingreso_20/02/24', 'ingreso_27/05/2024',
       'ingreso_12/03/24', 'ingreso_31/05/24', 'ingreso_20/06/24',
       'ingreso_31/01/24', 'ingreso_15/07/23', 'ingreso 27/05/24',
       'ingreso_24/09/22', 'ingreso_05/06/24', 'ingreso_25/06/24'],
      dtype='object')

In [30]:
# Especificar las columnas que no deben ser melted
id_vars = ['fecha_compra', 'codigo','costo', 'pvp', 'detalle', 'proveedor', 'nota', "source"]

# Realizar el melt del dataframe
all_melted = pd.melt(all, id_vars=id_vars, var_name='fecha_ingreso', value_name='count_items')

# Limpiar la columna 'fecha' para extraer la fecha en el formato correcto
all_melted['fecha_ingreso'] = all_melted['fecha_ingreso'].str.replace('ingreso_', '')

# Eliminar las filas con valores perdidos en la columna 'items'
# Esto corresponde a items que no tienen ingresos registrados
all_melted.dropna(subset=['count_items'], inplace=True)

#transformar "count items" a entero
all_melted["count_items"] = all_melted['count_items'].astype(int)

all_melted.info()



<class 'pandas.core.frame.DataFrame'>
Index: 935 entries, 2 to 17891
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   fecha_compra   935 non-null    datetime64[ns]
 1   codigo         935 non-null    object        
 2   costo          935 non-null    float64       
 3   pvp            935 non-null    float64       
 4   detalle        935 non-null    object        
 5   proveedor      935 non-null    object        
 6   nota           71 non-null     object        
 7   source         935 non-null    object        
 8   fecha_ingreso  935 non-null    object        
 9   count_items    935 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(6)
memory usage: 80.4+ KB


In [32]:
all_melted.head()

Unnamed: 0,fecha_compra,codigo,costo,pvp,detalle,proveedor,nota,source,fecha_ingreso,count_items
2,2021-11-23,SSA003,2.0,14.0,dorado semilarg piedras,IMPORTADORA PADILLA,0FERTA,aretes,14/03/23,1
3,2021-11-23,SSA004,2.5,16.8,dorado indú,IMPORTADORA PADILLA,0FERTA,aretes,14/03/23,3
8,2023-07-25,SSA009,1.25,8.0,"dorado serpiente sircon, cabza verde/unidad",FABIAN CARVAJAL,,aretes,14/03/23,2
10,2023-12-14,SSA011,0.1,2.0,plateado sircon transp. para nariz,IMPORTADORA MALES,,aretes,14/03/23,39
11,2023-12-14,SSA012,1.0,8.75,G.filled dorado ortopedico niña,PALACIO JOYAS,,aretes,14/03/23,36


# 3. Load

In [33]:
import os

# Obtener el directorio de trabajo actual
directorio_actual = os.getcwd()

# Imprimir el directorio de trabajo actual
print("El directorio de trabajo actual es:", directorio_actual)

El directorio de trabajo actual es: /workspaces/Voga/data_migration


In [35]:
# 4. Save dataframe as Excel
all_melted.to_excel("data/clean_files/all_acero_melted.xlsx", index=False)