In [6]:
import os
import pandas as pd
import numpy as np

# Ruta de la carpeta que contiene los archivos Excel
folder_path = '../../data/raw/'

# Lista para almacenar los dataframes
dfs = []

# Iterar sobre los archivos en la carpeta
print("Iniciando lectura de archivos...")
for filename in os.listdir(folder_path):
    # Verificar si el archivo es un archivo Excel de un biorreactor y no acaba con ":Zone.Identifier"
    if filename.startswith("Biorreactor") and filename.endswith(".xlsx") and not filename.endswith(":Zone.Identifier"):
        print(f"Leyendo archivo {filename}...")
        # Leer el archivo Excel
        print("Leyendo archivo...")
        df = pd.read_excel(os.path.join(folder_path, filename), sheet_name="Datos")
        
        # Extraer el número del biorreactor del nombre del archivo
        print("Extrayendo número del biorreactor...")
        num_biorreactor = filename.split()[1].split('.')[0]
        
        # Añadir la columna "Num_biorreactor"
        print("Añadiendo columna 'Num_biorreactor'...")
        df['Num_biorreactor'] = float(num_biorreactor)
        
        # Renombrar columnas para quitar el "XXXXX_" del principio
        print("Renombrando columnas...")
        df.columns = ("DateTime", "Agitation_PV",	"Air_Sparge_PV", "Biocontainer_Pressure_PV", "DO_1_PV", "DO_2_PV",	"Gas_Overlay_PV", "Load_Cell_Net_PV", 
                          "pH_1_PV", "pH_2_PV",  "PUMP_1_PV","PUMP_1_TOTAL", "PUMP_2_PV", "PUMP_2_TOTAL", "Single_Use_DO_PV", "Single_Use_pH_PV", "Temperatura_PV", "Num_Biorreactor")
        
        # Añadir el dataframe a la lista
        print("Añadiendo dataframe a la lista...")
        dfs.append(df)

# Concatenar todos los dataframes en uno solo (uniendo filas)
print("Concatenando dataframes...")
df_total = pd.concat(dfs, ignore_index=True)

# Guardar el dataframe resultante en un nuevo archivo Excel
print("Guardando archivo Excel...")
df_total.to_excel('../../data/processed/Biorreactor_Total.xlsx', index=False)

Iniciando lectura de archivos...
Leyendo archivo Biorreactor 14614.xlsx...
Leyendo archivo...
Extrayendo número del biorreactor...
Añadiendo columna 'Num_biorreactor'...
Renombrando columnas...
Añadiendo dataframe a la lista...
Leyendo archivo Biorreactor 13171.xlsx...
Leyendo archivo...
Extrayendo número del biorreactor...
Añadiendo columna 'Num_biorreactor'...
Renombrando columnas...
Añadiendo dataframe a la lista...
Leyendo archivo Biorreactor 14615.xlsx...
Leyendo archivo...
Extrayendo número del biorreactor...
Añadiendo columna 'Num_biorreactor'...
Renombrando columnas...
Añadiendo dataframe a la lista...
Leyendo archivo Biorreactor 13172.xlsx...
Leyendo archivo...
Extrayendo número del biorreactor...
Añadiendo columna 'Num_biorreactor'...
Renombrando columnas...
Añadiendo dataframe a la lista...
Leyendo archivo Biorreactor 14618.xlsx...
Leyendo archivo...
Extrayendo número del biorreactor...
Añadiendo columna 'Num_biorreactor'...
Renombrando columnas...
Añadiendo dataframe a la l

In [7]:
#print row with missing values

print(df_total[df_total.isnull().any(axis=1)])

                       DateTime  Agitation_PV  Air_Sparge_PV  \
12238   2023-07-20 12:30:00.000           NaN            NaN   
12239   2023-07-20 12:45:00.000           NaN            NaN   
12240   2023-07-20 13:00:00.000           NaN            NaN   
12241   2023-07-20 13:15:00.000           NaN            NaN   
12242   2023-07-20 13:30:00.000           NaN            NaN   
...                         ...           ...            ...   
470308  2024-08-27 08:00:00.000           NaN            NaN   
470309  2024-08-27 08:15:00.000           NaN            NaN   
470310  2024-08-27 08:30:00.000           NaN            NaN   
470311  2024-08-27 08:45:00.000           NaN            NaN   
470312  2024-08-27 09:00:00.000           NaN            NaN   

        Biocontainer_Pressure_PV  DO_1_PV  DO_2_PV  Gas_Overlay_PV  \
12238                        NaN      NaN      NaN             NaN   
12239                        NaN      NaN      NaN             NaN   
12240                

In [9]:
# Guardar el dataframe resultante en un nuevo archivo Excel
print("Guardando archivo Excel...")
df_total.to_excel('../../data/processed/Biorreactor_Total.xlsx', index=False)

Guardando archivo Excel...


In [10]:
df_total

Unnamed: 0,DateTime,Agitation_PV,Air_Sparge_PV,Biocontainer_Pressure_PV,DO_1_PV,DO_2_PV,Gas_Overlay_PV,Load_Cell_Net_PV,pH_1_PV,pH_2_PV,PUMP_1_PV,PUMP_1_TOTAL,PUMP_2_PV,PUMP_2_TOTAL,Single_Use_DO_PV,Single_Use_pH_PV,Temperatura_PV,Num_Biorreactor
0,2023-03-15 00:00:00.000,80.0,0.0,0.572660,0.0,-0.00553,4.000087,1576.8,-0.156925,5.888288,0.0,14.880000,0.0,191.200293,799.991992,799.967969,30.216161,14614.0
1,2023-03-15 00:15:00.000,80.0,0.0,0.491942,0.0,-0.00553,4.000084,1576.8,-0.156925,5.896104,0.0,14.880000,0.0,191.200293,799.991992,799.967969,30.135999,14614.0
2,2023-03-15 00:30:00.000,80.0,0.0,0.552688,0.0,-0.00553,4.000161,1576.8,-0.156925,5.896104,0.0,14.880000,0.0,191.200293,799.991992,799.967969,29.823988,14614.0
3,2023-03-15 00:45:00.000,80.0,0.0,0.613318,0.0,-0.00553,4.000088,1576.8,-0.156925,5.896104,0.0,14.880000,0.0,191.200293,799.991992,799.967969,29.408039,14614.0
4,2023-03-15 01:00:00.000,80.0,0.0,0.552421,0.0,-0.00553,3.999864,1577.2,-0.156925,5.904210,0.0,14.880000,0.0,191.200293,799.991992,799.967969,29.191720,14614.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471712,2024-09-10 23:00:00.000,0.0,0.0,480.000000,0.0,0.00000,0.000000,-17.2,1.306835,-0.232956,0.0,22.320001,0.0,0.000000,799.991992,799.919971,15.268666,14616.0
471713,2024-09-10 23:15:00.000,0.0,0.0,480.000000,0.0,0.00000,0.000000,-17.2,1.306835,-0.232956,0.0,22.320001,0.0,0.000000,799.991992,799.919971,15.216003,14616.0
471714,2024-09-10 23:30:00.000,0.0,0.0,480.000000,0.0,0.00000,0.000000,-17.2,1.306835,-0.232956,0.0,22.320001,0.0,0.000000,799.991992,799.919971,15.169230,14616.0
471715,2024-09-10 23:45:00.000,0.0,0.0,480.000000,0.0,0.00000,0.000000,-17.2,1.306835,-0.232956,0.0,22.320001,0.0,0.000000,799.991992,799.919971,15.171162,14616.0


In [11]:
#Tipo de dato de cada columna
print("Tipo de dato de cada columna:")
print(df_total.dtypes)


Tipo de dato de cada columna:
DateTime                     object
Agitation_PV                float64
Air_Sparge_PV               float64
Biocontainer_Pressure_PV    float64
DO_1_PV                     float64
DO_2_PV                     float64
Gas_Overlay_PV              float64
Load_Cell_Net_PV            float64
pH_1_PV                     float64
pH_2_PV                     float64
PUMP_1_PV                   float64
PUMP_1_TOTAL                float64
PUMP_2_PV                   float64
PUMP_2_TOTAL                float64
Single_Use_DO_PV            float64
Single_Use_pH_PV            float64
Temperatura_PV              float64
Num_Biorreactor             float64
dtype: object
