In [1]:
import pandas as pd
from typing import List

# FUNCTIONS
def get_sheet_names(file_path: str) -> List[str]:
    """
    Get the names of all sheets in an Excel file.

    Args:
    file_path (str): The path to the Excel file from which the sheet names are to be retrieved.

    Returns:
    List[str]: A list of strings, where each string is the name of a sheet in the Excel file.

    Functionality:
    - Opens the Excel file with pandas.ExcelFile.
    - Retrieves the names of all sheets in the Excel file.
    - Closes the workbook before returning, so no file handle is left open.
    """
    # Open the workbook as a context manager so it is closed even if
    # reading the sheet names raises.
    with pd.ExcelFile(file_path) as xls:
        # Copy into a plain list so the result does not reference the closed workbook.
        return list(xls.sheet_names)



# 1. Extract

In [2]:
# 1. Read the sheet names of the source workbook and show them
file_path = "data/ingreso_joyas_acero.xlsx"
sheet_names = get_sheet_names(file_path)
print("Sheet names:", sheet_names, sep="\n")

Sheet names:
['aretes', 'aretes_pistola', 'cadenas acero', 'dijes', 'pulseras', 'pulseras_varios', 'collares', 'collares_varios', 'anillos', 'juegos', 'Varios acero', 'proveedores']


In [None]:
# 2. Exclude the "proveedores" sheet from the data consolidation process
# and read it into a different dataframe.
# The sheet is removed by name instead of with list.pop(): pop() mutates
# sheet_names, so re-running the cell would drop one more (data) sheet each time.
proveedores_name = "proveedores"
sheet_names = [name for name in sheet_names if name != proveedores_name]
proveedores_df = pd.read_excel(file_path, sheet_name=proveedores_name)
proveedores_df.shape

In [None]:
# 3. Read the spreadsheets listed in sheet_names and collect them in a list
# Replace the word "inventario" in column names with "ingreso" to simplify consolidation

# Passing the list of names parses the workbook once and returns a dict of
# {sheet name: DataFrame}, instead of re-opening the file for every sheet.
sheets_by_name = pd.read_excel(file_path, sheet_name=sheet_names)

list_of_df = []
for s in sheet_names:
    print(f"Sheet Name: {s}")
    # Take the sheet's dataframe and add a new column with the sheet name
    df = sheets_by_name[s]
    df["source"] = s

    # Replace "inventario" in column names with "ingreso"
    df.columns = [c.replace("inventario", "ingreso") for c in df.columns]

    # Append and print info
    list_of_df.append(df)
    print(f"Shape: {df.shape}")
    print(f"Column Names: {df.columns}\n")

In [None]:
# Stack the per-sheet dataframes into one dataframe.
# NOTE: the name `all` shadows Python's builtin all(); it is kept because the
# following cells refer to the consolidated dataframe by this name.
all = pd.concat(list_of_df)
print("Sheet Name: all", f"Shape: {all.shape}", sep="\n")
all.info()

# 2. Transform (data quality)

- 2.1 Verify data types of every column (manual)
- 2.2 Check missing values for each column
- 2.3 Check `peso` or `costo_gramo` less than or equal to `0`
- 2.4 Rename proveedores using `proveedores_df`
- 2.5 Melt: Dataframe from wide to long format

In [None]:
# 2.2 Count the missing values in each column
missing_per_column = all.isna().sum()
print(missing_per_column)

# A. RESULTS
# COLUMN NAME: FINDINGS / ACTIONS
# fecha_compra: 4 missing values / Inspect and correct spreadsheet
# talla: only anillos should have non-missing values / No action needed
# ingreso_ columns: missing values are expected / No action needed
# peso: 9 missing values / Inspect and correct in file
# costo_gramo: 11 missing values / Inspect and correct in file

# B. NOTES:
# For juegos_piedras_perlas it is OK to have missing values in peso and costo_gramo

In [None]:
# INSPECT: for each column below, list the codigo and source sheet of the
# rows where that column is missing.
missing_checks = [
    ('Missings in fecha_compra:', 'fecha_compra'),
    ('\nMissings in peso:', 'peso'),
    ('\nMissings in costo_gramo:', 'costo_gramo'),
]
for header, column in missing_checks:
    print(header)
    print(all.loc[all[column].isnull(), ['codigo', 'source']])


In [None]:
# 2.3 Check peso or costo_gramo <= 0
# ACTION NEEDED: Manually fix the file with non-zero values
report_columns = ['codigo', 'source', 'costo_gramo', 'peso', 'costo']
non_positive_checks = [
    ('Rows with costo_gramo <= 0', 'costo_gramo'),
    ('\nRows with peso <= 0', 'peso'),
    ('\nRows with costo <= 0 as consequense of peso <= 0 OR costo_gramo <= 0', 'costo'),
]
for header, column in non_positive_checks:
    print(header)
    # Select the offending rows and the report columns in a single .loc call
    print(all.loc[all[column] <= 0, report_columns])

In [None]:
# 2.4 Rename proveedores
# Map each old name (Proveedor column) to its new name (Nuevo Nombre column)
dict_of_proveedores_1 = dict(zip(proveedores_df['Proveedor'], proveedores_df['Nuevo Nombre']))

# Apply the mapping to the proveedor column
all['proveedor'] = all['proveedor'].replace(dict_of_proveedores_1)

# Validate unique values to find the names that still need manual changes
providers_after_first_pass = all['proveedor'].unique()
print(f'Unique providers after 1st iteration: \n{providers_after_first_pass} \n')

# Second mapping for the remaining values {old_name: new_name}
# (keys keep their exact spelling, including trailing spaces)
dict_of_proveedores_2 = {
    'China': 'CHINA',
    'Cirkon ': 'CIRKON',
    'Cirkon  ': 'CIRKON',
    'flavio jara': 'FLAVIO JARA',
    'CAMBIO DE CODIGO': 'VOGA',
    ' ': 'VOGA',
    '*': 'VOGA',
    'LX, N.Y.': 'LX USA',
    'Alina ': 'ALINA PAZ',
    'Andres ': 'ANDRES CADAVID',
    'Feria Mia': 'MIAMI',
    'pedir': 'VOGA',
}

# Apply the second mapping to the remaining values
all['proveedor'] = all['proveedor'].replace(dict_of_proveedores_2)

# Validate unique names again
providers_after_second_pass = all['proveedor'].unique()
print(f'Unique providers after 2nd iteration: \n{providers_after_second_pass}')


## 2.5 Melt

The pd.melt function in pandas is used to transform a DataFrame from a wide format to a long format. In the wide format, data is typically spread across multiple columns, while in the long format, data is stacked in a single column with an additional column indicating the original variable name (typically the column name in the wide format).

Parameters of pd.melt:

- `id_vars`: Specifies the columns to keep unchanged (identifier variables). These columns remain unpivoted.
- `value_vars` (optional): Specifies the columns to unpivot. If not provided, all columns not specified in `id_vars` are used.
- `var_name`: The name to use for the 'variable' column in the resulting DataFrame.
- `value_name`: The name to use for the 'value' column in the resulting DataFrame.

In [None]:
# Columns that must not be melted (kept as identifier columns)
id_vars = [
    'fecha_compra', 'codigo', 'talla', 'peso', 'costo_gramo', 'costo',
    'pvp', 'detalle', 'proveedor', 'nota', 'source',
]

# Melt the dataframe: every column not listed in id_vars becomes a
# (fecha_ingreso, count_items) pair. Then:
# - remove the text 'ingreso_' from fecha_ingreso (presumably leaving the date), and
# - drop the rows with missing count_items, i.e. items with no recorded
#   ingreso for that column.
all_melted = (
    pd.melt(all, id_vars=id_vars, var_name='fecha_ingreso', value_name='count_items')
    .assign(fecha_ingreso=lambda melted: melted['fecha_ingreso'].str.replace('ingreso_', ''))
    .dropna(subset=['count_items'])
)

all_melted

In [None]:



# Print a summary of the melted dataframe (DataFrame.info)
all_melted.info()

In [None]:
# TODO: Fix data types

In [None]:
# Frequency of each count_items value, ordered by the value itself
all_melted['count_items'].value_counts().sort_index()

In [None]:
# Catch rows with count_items == 0 so they can be cleaned in the file
zero_count_rows = all_melted.loc[
    all_melted['count_items'] == 0,
    ['codigo', 'source', 'fecha_ingreso', 'count_items'],
]
print(zero_count_rows)

# 3. Load

In [None]:
# Save the melted dataframe as an Excel file, without the index column
# NOTE(review): the output file name says "plata" while the input workbook is
# "ingreso_joyas_acero.xlsx" — confirm this is the intended output file.
output_path = "/workspaces/Voga/data_migration/data/all_plata_melted.xlsx"
all_melted.to_excel(output_path, index=False)