In [23]:
import pandas as pd
from typing import List

# FUNCTIONS
def get_sheet_names(file_path: str) -> List[str]:
    """
    Get the names of all sheets in an Excel file.

    Args:
    file_path (str): The path to the Excel file from which the sheet names are to be retrieved.

    Returns:
    List[str]: A new list of strings, where each string is the name of a sheet in the Excel file.

    Functionality:
    - Opens the workbook with pandas.ExcelFile inside a ``with`` block, so the
      workbook is closed both on normal return and when an error is raised.
    - Copies the sheet names into a fresh list before the workbook is closed.
    """
    with pd.ExcelFile(file_path) as xls:
        # Copy so the caller does not hold a reference into the closed workbook object.
        return list(xls.sheet_names)



# 1. Extract

In [24]:
# 1. Get sheet names from xlsx file
# `file_path` and `sheet_names` are module-level names that the sheet-reading
# cell iterates over, so they are defined once here.
file_path = "data/ingreso_joyas_acero.xlsx"  # relative path: resolved against the kernel's working directory
sheet_names = get_sheet_names(file_path)
print(sheet_names)

['aretes', 'aretes_pistola', 'cadenas', 'dijes', 'pulseras', 'pulseras_varios', 'collares', 'collares_varios', 'anillos', 'juegos', 'varios']


In [29]:
# 2. Read spreadsheets and consolidate files
# Open the workbook once and parse every sheet from that single handle, instead
# of having pd.read_excel re-open the file for each sheet name.
list_of_df = []
with pd.ExcelFile(file_path) as xls:
    for s in sheet_names:
        print(f"Sheet Name: {s}")
        df = pd.read_excel(xls, sheet_name=s)
        # Tag every row with its sheet of origin so it stays traceable after concatenation.
        df["source"] = s
        list_of_df.append(df)
        print(f"Shape: {df.shape}")
        print(f"Column Names: {df.columns}\n")

Sheet Name: aretes
Shape: (333, 15)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_14/03/23', 'ingreso_02/05/23',
       'ingreso_19/07/23', 'ingreso_22/08/23', 'ingreso_22/02/24',
       'ingreso_27/05/24', 'ingreso_20/06/24', 'costo', 'pvp', 'detalle',
       'proveedor', 'nota', 'source'],
      dtype='object')

Sheet Name: aretes_pistola
Shape: (1, 12)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_09/03/22', 'ingreso_14/03/23',
       'ingreso_09/05/23', 'ingreso_30/05/24', 'costo', 'pvp', 'detalle',
       'proveedor', 'nota', 'source'],
      dtype='object')

Sheet Name: cadenas
Shape: (45, 12)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_14/12/21', 'ingreso_14/03/23',
       'ingreso_17/07/23', 'ingreso_20/02/24', 'costo', 'pvp', 'detalle',
       'proveedor', 'nota', 'source'],
      dtype='object')

Sheet Name: dijes
Shape: (56, 13)
Column Names: Index(['fecha_compra', 'codigo', 'ingreso_13/12/21', 'ingreso_14/03/23',
       'ingreso_17/07/23', 'i

In [30]:
# Consolidate
# Stack the per-sheet frames row-wise into a single frame and report its shape and dtypes.
# NOTE(review): `all` shadows the Python builtin `all()`. Later cells reference this
# name, so a rename has to be applied across the whole notebook at once.
# NOTE(review): no `ignore_index` is passed, so each sheet keeps its own row labels and
# they repeat across sheets ("801 entries, 0 to 0" in the info() output) — confirm this
# is intended before relying on label-based lookups.
all = pd.concat(list_of_df)
print(f"Sheet Name: all")
print(f"Shape: {all.shape}")
all.info()

Sheet Name: all
Shape: (801, 30)
<class 'pandas.core.frame.DataFrame'>
Index: 801 entries, 0 to 0
Data columns (total 30 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   fecha_compra      801 non-null    object 
 1   codigo            801 non-null    object 
 2   ingreso_14/03/23  493 non-null    float64
 3   ingreso_02/05/23  100 non-null    float64
 4   ingreso_19/07/23  21 non-null     float64
 5   ingreso_22/08/23  29 non-null     float64
 6   ingreso_22/02/24  129 non-null    float64
 7   ingreso_27/05/24  27 non-null     float64
 8   ingreso_20/06/24  7 non-null      float64
 9   costo             748 non-null    object 
 10  pvp               748 non-null    float64
 11  detalle           748 non-null    object 
 12  proveedor         793 non-null    object 
 13  nota              49 non-null     object 
 14  source            801 non-null    object 
 15  ingreso_09/03/22  1 non-null      float64
 16  ingreso_09/05/23  

# 2. Transform (data quality)

## Invalid column types

Sheet Name: all — columns read as `object` instead of a date or numeric dtype:

| Column Index | Column Name       | Non-null Count | Data Type |
|--------------|-------------------|----------------|-----------|
| 0            | fecha_compra      | 801 non-null   | object    |
| 9            | costo             | 748 non-null   | object    |
| 28           | ingreso_05/06/24  | 27 non-null    | object    |



## Missing detalle

In [34]:
# Item code and originating sheet of every row that has no `detalle` value.
# Original note: "Fix in file" — presumably these rows are to be corrected in the source workbook; confirm.
all.loc[all["detalle"].isna(), ["codigo", "source"]]

Unnamed: 0,codigo,source
3,PV004,pulseras_varios
4,PV005,pulseras_varios
5,PV006,pulseras_varios
6,PV007,pulseras_varios
7,PV008,pulseras_varios
8,PV009,pulseras_varios
9,PV010,pulseras_varios
10,PV011,pulseras_varios
11,PV012,pulseras_varios
12,PV013,pulseras_varios


## Missing costo

In [31]:
# Item code and originating sheet of every row that has no `costo` value.
# Original note: "Fix in file" — presumably these rows are to be corrected in the source workbook; confirm.
all.loc[all["costo"].isna(), ["codigo", "source"]]

Unnamed: 0,codigo,source
3,PV004,pulseras_varios
4,PV005,pulseras_varios
5,PV006,pulseras_varios
6,PV007,pulseras_varios
7,PV008,pulseras_varios
8,PV009,pulseras_varios
9,PV010,pulseras_varios
10,PV011,pulseras_varios
11,PV012,pulseras_varios
12,PV013,pulseras_varios


## Missing pvp (price)


In [33]:
# Item code and originating sheet of every row that has no `pvp` value.
# Original note: "Fix in file" — presumably these rows are to be corrected in the source workbook; confirm.
all.loc[all["pvp"].isna(), ["codigo", "source"]]

Unnamed: 0,codigo,source
3,PV004,pulseras_varios
4,PV005,pulseras_varios
5,PV006,pulseras_varios
6,PV007,pulseras_varios
7,PV008,pulseras_varios
8,PV009,pulseras_varios
9,PV010,pulseras_varios
10,PV011,pulseras_varios
11,PV012,pulseras_varios
12,PV013,pulseras_varios


## Missing proveedor

In [32]:
# Item code and originating sheet of every row that has no `proveedor` value.
# Original note: "Fix in file" — presumably these rows are to be corrected in the source workbook; confirm.
all.loc[all["proveedor"].isna(), ["codigo", "source"]]

Unnamed: 0,codigo,source
283,SSA284,aretes
284,SSA285,aretes
285,SSA286,aretes
331,SSA332,aretes
332,SSA333,aretes
114,SSJ115,juegos
115,SSJ116,juegos
116,SSJ117,juegos


## Consolidate provider name

In [None]:
## b. Clean
# Provider counts before normalisation (NaN included). Printed explicitly because a
# bare expression in the middle of a cell is evaluated but never displayed.
print(all.proveedor.value_counts(dropna=False))

# Mapping from the raw provider spelling to the name it should be replaced with.
# NOTE(review): 'Soledad PRUEBA' looks like a placeholder value — confirm before enabling the cleanup.
rename_provedor_dict = {
    's.s.' : 'Soledad PRUEBA',
}

# Preview of the counts after applying the mapping; `all` itself is not modified here.
all.proveedor.replace(rename_provedor_dict).value_counts(dropna=False)

# Uncomment when cleanup is ready.
# Assign back to the column only: `all = all.proveedor.replace(...)` would rebind `all`
# to a single Series and lose every other column before the save step.
#all["proveedor"] = all["proveedor"].replace(rename_provedor_dict)

# 3. Load

In [None]:
# 4. Save dataframe as Excel
# Uses a path relative to the working directory, like the input `file_path`
# ("data/ingreso_joyas_acero.xlsx"), instead of a machine-specific absolute path.
# index=False: the row labels repeat across sheets and are not written to the file.
all.to_excel("data/all_acero.xlsx", index=False)