In [None]:
import pandas as pd
import os

In [None]:
# The project root is the parent of the current working directory (the
# notebook is expected to be launched from a sub-folder of the project).
BASE_DIR = os.path.dirname(os.path.abspath(os.curdir))
# Folder holding the untouched input files.
DATA_RAW = os.path.join(BASE_DIR, 'data', 'raw')

In [None]:
# Load the 'EC DATA' worksheet of the macro-enabled workbook into a DataFrame.
excel_path = os.path.join(DATA_RAW, 'alba_mf.xlsm')
df = pd.read_excel(io=excel_path, sheet_name='EC DATA')

In [None]:
# Dump the sheet exactly as read (before any clean-up) next to the workbook,
# without writing the row index.
alba_mf_path = os.path.join(DATA_RAW, 'alba_mf.csv')
df.to_csv(path_or_buf=alba_mf_path, index=False)

In [None]:
# Show the second row (position 1), which is used as the header row in the next cell.
print(df.iloc[1, :])

In [None]:
# Positions of the columns that are renamed to date_0, date_1, ... in this order.
columns_to_rename = [1, 18, 35, 47, 55]

# Spaces and hyphens in a label are both turned into underscores.
separator_table = str.maketrans(' -', '__')

# The second row (position 1) supplies the column labels: each value is
# converted to a string, lower-cased and has its separators normalised.
# Missing cells therefore end up as the literal label 'nan'.
column_names = [
    str(label).lower().translate(separator_table)
    for label in df.iloc[1].values.tolist()
]

# Overwrite the labels at the listed positions with sequential date_<n> names.
for suffix, position in enumerate(columns_to_rename):
    column_names[position] = f'date_{suffix}'

# Install the new labels on the frame.
df.columns = column_names

# Discard the first two rows (positions 0 and 1) and renumber the index from 0.
df = df.iloc[2:].reset_index(drop=True)
print(df)


In [None]:
# Keep only the columns at positions 1 through 49; column 0 and everything
# from position 50 onward are discarded.
df = df.iloc[:, slice(1, 50)]

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

In [None]:
# Gather every column whose label contains 'date' (case-insensitive),
# preserving the column order of the frame.
date_columns = []
for name in df.columns:
    if 'date' in str(name).lower():
        date_columns.append(name)
print(date_columns)

In [None]:
# Parse each date column; unparseable values become NaT, and the time of day
# is removed by flooring to the day.
for date_col in date_columns:
    parsed = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = parsed.dt.floor('D')
    print(df[date_col].dtype)

In [None]:
# Inspect the dtype of every column after the date conversion.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Collect the columns whose label is literally the text "nan".
columnas_nan = list(filter(lambda name: name == "nan", df.columns))
print(f"Columnas con nombre 'nan': {len(columnas_nan)}")

# Remove them, but only when at least one was found.
if len(columnas_nan) > 0:
    df = df.drop(columnas_nan, axis='columns')
    print(f"Se eliminaron {len(columnas_nan)} columnas llamadas 'nan'")

In [None]:
# Columns that must not be coerced to numbers: the date columns plus two
# text columns.
columns_to_exclude = [*date_columns, 'tank_name', 'product']
print(df.columns)

In [None]:
# Every column outside the exclusion list is converted to a numeric dtype;
# values that cannot be parsed are replaced by NaN (errors='coerce').
numeric_candidates = [name for name in df.columns if name not in columns_to_exclude]
for name in numeric_candidates:
    df[name] = pd.to_numeric(df[name], errors='coerce')

print(df.dtypes)

In [None]:
# Split the wide frame into one frame per section, by column position.
# First section: the first eleven columns (positions 0-10).
df_liquid_hydrocarbons_cached = df.iloc[:, :11]

In [None]:
# Preview the first five rows of the section.
print(df_liquid_hydrocarbons_cached.head(5))

In [None]:
# Second section: columns at positions 12-23.
df_gas_production = df.iloc[:, slice(12, 24)]

In [None]:
# Preview the first five rows of the section.
print(df_gas_production.head(n=5))

In [None]:
# Third section: columns at positions 27-34.
df_tank_data = df.iloc[:, slice(27, 35)]

In [None]:
# Preview the first five rows of the section.
print(df_tank_data.head(5))

In [None]:
# Fourth section: columns at positions 35-37.
df_daily_lifting_data = df.iloc[:, slice(35, 38)]

In [None]:
# Preview the first five rows of the section.
print(df_daily_lifting_data.head(n=5))

In [None]:
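# NOTE(review): disabled code kept for reference only. If re-enabled as written, position 12
# is out of range for the (at most) 11-column slice taken with df.iloc[:, 0:11] and raises IndexError.
# df_liquid_hydrocarbons_cached = df_liquid_hydrocarbons_cached.drop(df_liquid_hydrocarbons_cached.columns[[0, 5, 6, 10, 12]], axis=1)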

In [None]:
# Print the complete DataFrame with pandas' row-truncation limit lifted.
# option_context restores whatever 'display.max_rows' was set to before the
# block (not just the library default) and does so even if printing raises.
with pd.option_context('display.max_rows', None):
    print(df_liquid_hydrocarbons_cached)

In [None]:
# Remove a row only when every one of its values is NaN.
df_liquid_hydrocarbons_cached = df_liquid_hydrocarbons_cached.dropna(axis='index', how='all')

In [None]:
# Report the (rows, columns) shape, then the per-column summary.
print(df_liquid_hydrocarbons_cached.shape)
# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line.
df_liquid_hydrocarbons_cached.info()

In [None]:
# Columns removed from the liquid-hydrocarbons frame in the next cell.
lhc_columns_to_delete = [
    'eglng_propane_sales',
    'llc_share_of_secondary_condensate',
    'psc_share_of_secondary_condensate',
]

In [None]:
# Drop the unwanted columns and give the date column its final name.
df_liquid_hydrocarbons_cached = (
    df_liquid_hydrocarbons_cached
    .drop(lhc_columns_to_delete, axis='columns')
    .rename({'date_0': 'date'}, axis='columns')
)

In [None]:
# Preview the first five rows after trimming and renaming.
print(df_liquid_hydrocarbons_cached.head(5))

In [None]:
# Remove a row only when every one of its values is NaN.
df_gas_production = df_gas_production.dropna(axis='index', how='all')

In [None]:
# Report the (rows, columns) shape, then the per-column summary.
print(df_gas_production.shape)
# info() prints its own report and returns None; wrapping it in print()
# would add a stray "None" line.
df_gas_production.info()

In [None]:
# Columns retained in the gas-production frame, in output order.
gp_columns = [
    'date_1',
    'ampco_gas_sales',
    'eglng_gas_sales',
    'gas_sales',
    'offshore_gas',
]
# Keep only those columns, then give the date column its final name.
df_gas_production = df_gas_production.loc[:, gp_columns].rename({'date_1': 'date'}, axis='columns')

In [None]:
# Preview the first five rows after selecting and renaming.
print(df_gas_production.head(n=5))

In [None]:
# Remove a row only when every one of its values is NaN.
df_tank_data = df_tank_data.dropna(axis='index', how='all')

In [None]:
# Report the shape as a (rows, columns) tuple, then the per-column summary.
tank_rows, tank_cols = df_tank_data.shape
print((tank_rows, tank_cols))
df_tank_data.info()

In [None]:
# Columns retained in the tank frame, in output order.
td_columns = [
    'date_2',
    'tank_name',
    'standard_net_oil_volume_(bbls)',
]
# Keep only those columns, then give the date column its final name.
df_tank_data = df_tank_data.loc[:, td_columns].rename({'date_2': 'date'}, axis='columns')

In [None]:
# Preview the first five rows after selecting and renaming.
print(df_tank_data.head(5))

In [None]:
# Remove a row only when every one of its values is NaN.
df_daily_lifting_data = df_daily_lifting_data.dropna(axis='index', how='all')

In [None]:
# Report the shape as a (rows, columns) tuple, then the per-column summary.
lifting_rows, lifting_cols = df_daily_lifting_data.shape
print((lifting_rows, lifting_cols))
df_daily_lifting_data.info()

In [None]:
# Give the date column its final name.
df_daily_lifting_data = df_daily_lifting_data.rename({'date_3': 'date'}, axis='columns')