In [2]:
import pandas as pd
from typing import List

# FUNCTIONS
def get_sheet_names(file_path: str) -> List[str]:
    """
    Get the names of all sheets in an Excel file.

    Args:
    file_path (str): The path to the Excel file from which the sheet names are to be retrieved.

    Returns:
    List[str]: A list of strings, where each string is the name of a sheet in the Excel file.

    Functionality:
    - Reads an Excel file using pandas.
    - Retrieves the names of all sheets in the Excel file.
    """
    # Read the excel file using pandas
    xls = pd.ExcelFile(file_path)
    
    # Get the names of all the sheets in the excel file
    sheet_names = xls.sheet_names
    
    # Return the list of sheet names
    return sheet_names



# 1. Extract

In [3]:
# 1. Get sheet names from xlsx file
file_path = "data/ingreso_joyas_plata.xlsx"
sheet_names = get_sheet_names(file_path)
print("Sheet names:")
print(sheet_names)

Sheet names:
['anillos_plata', 'dijes_plata', 'juegos_plata', 'collares_plata', 'aretes_plata', 'cadena_plata', 'pulseras_plata', 'varios_plata', 'juegos_piedras_perlas']


In [4]:
# 2. Read spreadsheets and consolidate files
list_of_df = []
for s in sheet_names:
    print(f"Sheet Name: {s}")
    df = pd.read_excel(file_path, sheet_name=s)
    df["source"] = s.replace(' ','_')
    list_of_df.append(df)
    print(f"Shape: {df.shape}")
    print(f"Column Names: {df.columns}\n")

Sheet Name: anillos_plata
Shape: (601, 15)
Column Names: Index(['fecha_compra', 'codigo', 'talla', 'inventario_14/03/23',
       'ingreso_25/04/23', 'ingreso_27/03/24', 'ingreso_09/05/24', 'peso',
       'costo_gramo', 'costo', 'pvp', 'detalle', 'proveedor', 'nota',
       'source'],
      dtype='object')

Sheet Name: dijes_plata
Shape: (719, 19)
Column Names: Index(['fecha_compra', 'codigo', 'inventario_14/03/23', 'ingreso_28/04/23',
       'ingreso_15/06/23', 'ingreso_18/08/23', 'ingreso_28/09/23',
       'ingreso_19/12/23', 'ingreso_07/03/24', 'ingreso_10/05/24',
       'ingreso_20/06/24', 'peso', 'costo_gramo', 'costo', 'pvp', 'detalle',
       'proveedor', 'nota', 'source'],
      dtype='object')

Sheet Name: juegos_plata
Shape: (215, 18)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_14/03/23', 'ingreso_28/04/23',
       'ingreso_20/06/23', 'ingreso_28/09/23', 'ingreso_18/11/23',
       'ingreso_15/12/23', 'ingreso_07/03/24', 'ingreso_16/05/24', 'peso',
       'costo_gra

In [5]:
# Consolidate
all = pd.concat(list_of_df)
print(f"Sheet Name: all")
print(f"Shape: {all.shape}")
all.info()

Sheet Name: all
Shape: (2568, 37)
<class 'pandas.core.frame.DataFrame'>
Index: 2568 entries, 0 to 8
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   fecha_compra         2564 non-null   datetime64[ns]
 1   codigo               2567 non-null   object        
 2   talla                599 non-null    object        
 3   inventario_14/03/23  1875 non-null   float64       
 4   ingreso_25/04/23     74 non-null     float64       
 5   ingreso_27/03/24     1 non-null      float64       
 6   ingreso_09/05/24     51 non-null     float64       
 7   peso                 2501 non-null   float64       
 8   costo_gramo          2556 non-null   float64       
 9   costo                2568 non-null   float64       
 10  pvp                  2567 non-null   float64       
 11  detalle              2567 non-null   object        
 12  proveedor            2568 non-null   object        
 13  nota   

# 2. Transform (data quality)

In [8]:
# missing código
all[all.codigo.isnull()][["source", "proveedor"]]



Unnamed: 0,source,proveedor
495,anillos_plata,CAMBIO CODIGOS VIEJOS


costo missing
costo = 0
missing peso
peso 0

## Consolidate provider name

In [18]:
## b. Clean
all.proveedor.value_counts(dropna=False)
rename_provedor_dict = {
    's.s.' : 'Soledad PRUEBA',
}
all.proveedor.value_counts(dropna=True)



proveedor
cirkon                   506
s.s.                     403
CAMBIO CODIGOS VIEJOS    251
alpha                    178
Cirkon                   177
                        ... 
panam                      1
taller                     1
sempertegui 2,60           1
Palac.Joyas                1
G.Parra                    1
Name: count, Length: 66, dtype: int64

# 3. Load

In [None]:
# 4. Save dataframe as Excel
all.to_excel("/workspaces/Voga/data_migration/data/all_acero.xlsx", index=False)