# Join all files and tables for train into a single dataframe by lote, fecha and orden

In [587]:
import pandas as pd

In [588]:
# Load the pkl files
path = '../../data/processed/'
ct = pd.read_pickle(path + 'centrifuga_total.pkl')
cf = pd.read_pickle(path + 'cf.pkl')
h = pd.read_pickle(path + 'horas.pkl')
mc = pd.read_pickle(path + 'mov_componentes.pkl')
th = pd.read_pickle(path + 'th.pkl')
ino = pd.read_pickle(path + 'inoculo.pkl')
pino = pd.read_pickle(path + 'preinoculo.pkl')
of = pd.read_pickle(path + 'of.pkl')
bt = pd.read_pickle(path + 'biorreactor_total.pkl')


## Indexed by Lote

### cf + of

In [589]:
# Merge cf and of on lote using left join
cf_of = pd.merge(cf, of, on='lote', how='left')

### cf_ino

In [590]:
# Merge
cf_ino = cf.merge(ino, on=['lote'], how='left')

### cf_pino 

In [591]:
# Merge
cf_pino = cf.merge(pino, on=['lote'], how='left')

### MC

In [592]:
# Max material unique values
mc['material_mc'].nunique()

13

In [593]:
# Do all lotes have the same material?
mc.groupby('lote')['material_mc'].nunique().max()

np.int64(13)

We could add 13 mc columns

In [594]:
# For each material, we can add a column
mc_copy = mc.copy()
for m in mc['material_mc'].unique():
    mc_copy[m + '_mc'] = 0
mc_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   lote            4920 non-null   object             
 1   material_mc     4920 non-null   object             
 2   qty_mc          4920 non-null   float64            
 3   f_recepcion_mc  4920 non-null   datetime64[ns, UTC]
 4   f_traslado_mc   4920 non-null   datetime64[ns, UTC]
 5   duracion_mc     4920 non-null   float64            
 6   100001_mc       4920 non-null   int64              
 7   100002_mc       4920 non-null   int64              
 8   100003_mc       4920 non-null   int64              
 9   100004_mc       4920 non-null   int64              
 10  100005_mc       4920 non-null   int64              
 11  100006_mc       4920 non-null   int64              
 12  100007_mc       4920 non-null   int64              
 13  100008_mc       4920 non-null   i

In [595]:
# For each if the material is in the row, we set the value to dur_mc
for i, row in mc.iterrows():
    for m in mc['material_mc'].unique():
        if row['material_mc'] == m:
            mc_copy.at[i, m + '_mc'] = row['duracion_mc']

In [596]:

# Perform average for each material and lote combination on duracion_mc
mc_copy = mc_copy.groupby(['lote', 'material_mc']).mean().reset_index()
mc_copy.head(2)

Unnamed: 0,lote,material_mc,qty_mc,f_recepcion_mc,f_traslado_mc,duracion_mc,100001_mc,100002_mc,100003_mc,100004_mc,100005_mc,100006_mc,100007_mc,100008_mc,100009_mc,100010_mc,100011_mc,100012_mc,100013_mc
0,23019,100001,45.616,2023-01-11 23:00:00+00:00,2023-03-08 15:00:00+00:00,4809600.0,4809600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23019,100002,168.768,2022-03-06 23:00:00+00:00,2023-03-14 23:00:00+00:00,32227200.0,0.0,32227200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [597]:
# Drop material_mc, duracion_mc, f_recepcion_mc, f_entrega_mc, qty_mc
mc_copy.drop(columns=['material_mc', 'duracion_mc', 'f_recepcion_mc', 'f_traslado_mc', 'qty_mc'], inplace=True)

In [598]:
mc_copy.head(2)


Unnamed: 0,lote,100001_mc,100002_mc,100003_mc,100004_mc,100005_mc,100006_mc,100007_mc,100008_mc,100009_mc,100010_mc,100011_mc,100012_mc,100013_mc
0,23019,4809600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23019,0.0,32227200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [599]:
# zeros to nan
mc_copy.replace(0, pd.NA, inplace=True)

In [600]:
# Groupby lote and sum
mc_copy = mc_copy.groupby(['lote']).sum().reset_index()
mc_copy.head(1)

Unnamed: 0,lote,100001_mc,100002_mc,100003_mc,100004_mc,100005_mc,100006_mc,100007_mc,100008_mc,100009_mc,100010_mc,100011_mc,100012_mc,100013_mc
0,23019,4809600.0,32227200.0,48643200.0,40608000.0,10281600.0,27302400.0,2937600.0,15379200.0,35035200.0,14947200.0,6134400.0,172800.0,0


In [601]:
# Merge with cf
cf_mc = cf.merge(mc_copy, on='lote', how='left')

In [602]:
# Merge everything
cf_mc_ino = cf_mc.merge(ino, on=['lote'], how='left')

In [603]:
# Merge everything
cf_mc_ino_pino = cf_mc_ino.merge(pino, on=['lote'], how='left')

In [604]:
# Convert materials to float
for m in mc['material_mc'].unique():
    cf_mc_ino_pino[m + '_mc'] = cf_mc_ino_pino[m + '_mc'].astype(float)

In [605]:
# Merge of
cf_mc_ino_pino_of = cf_mc_ino_pino.merge(of, on=['lote'], how='left')


In [606]:
# to pkl
cf_mc_ino_pino_of.to_pickle(path + 'cf_mc_ino_pino_of.pkl')

## Add Bio

In [607]:
# Merge Bio with cf_mc_ino_pino_of
from losca.tools.utils import mean_values_biorreactor

# Iterrows
cf_mc_ino_pino_of_bt = cf_mc_ino_pino_of.copy()
cf_mc_ino_pino_of_bt.iloc[10]['f_h_inicio_ino']
for i, row in cf_mc_ino_pino_of.iterrows():
    # Mean_valus_biorreactor returns a row with 4 columns
    bt_mean_row = mean_values_biorreactor(bt, th, row['f_h_inicio_ino'], row['f_h_fin_ino'], row['id_bio_ino'])
    # For each column returned, we set the value in the row
    # Print col 0 values
    for col in bt_mean_row.columns:
        cf_mc_ino_pino_of_bt.at[i, col + '_ino'] = bt_mean_row[col].values[0]

cf_mc_ino_pino_of_bt.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Columns: 132 entries, lote to h_bio_th_ino
dtypes: datetime64[ns, UTC](6), float64(119), int64(1), object(6)
memory usage: 156.9+ KB


In [608]:
# Merge Bio with cf_mc_ino_pino_of
from losca.tools.utils import mean_values_biorreactor
# Iterrows
cf_mc_ino_pino_of_bt = cf_mc_ino_pino_of.copy()
cf_mc_ino_pino_of_bt.iloc[10]['f_h_inicio_ino']
for i, row in cf_mc_ino_pino_of.iterrows():
    # Mean_valus_biorreactor returns a row with 4 columns
    bt_mean_row = mean_values_biorreactor(bt, th, row['f_h_inicio_cf'], row['f_h_fin_cf'], row['id_bio'])
    # For each column returned, we set the value in the row
    for col in bt_mean_row.columns:
        cf_mc_ino_pino_of_bt.at[i, col + '_cf'] = bt_mean_row[col].values[0]

cf_mc_ino_pino_of_bt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Columns: 132 entries, lote to h_bio_th_cf
dtypes: datetime64[ns, UTC](6), float64(119), int64(1), object(6)
memory usage: 156.9+ KB


In [609]:
# To pkl
cf_mc_ino_pino_of_bt.to_pickle(path + 'cf_mc_ino_pino_of_bt.pkl')

## Add Centr

In [610]:
from losca.tools.utils import mean_values_centrifuga

# Iterrows
cf_mc_ino_pino_of_bt_ct = cf_mc_ino_pino_of_bt.copy()
cf_mc_ino_pino_of_bt_ct.iloc[10]['f_h_inicio_ino']
for i, row in cf_mc_ino_pino_of_bt.iterrows():
    # Mean_valus_biorreactor returns a row with 4 columns
    ct_mean_row = mean_values_centrifuga(ct, th, row['f_h_inicio_cf'], row['f_h_fin_cf'], row['id_centr'])
    # For each column returned, we set the value in the row
    for col in ct_mean_row.columns:
        cf_mc_ino_pino_of_bt_ct.at[i, col + '_cf'] = ct_mean_row[col].values[0]



In [611]:
# For those rows with orden_encadenado_cf == 2 or 3 we need to get the values from the 'lote_parental_cf'
# We could do this with merge and being efficient etc but we are pressed on time
# Get those rows with orden_encadenado_cf == 2 or 3
cf_mc_ino_pino_of_bt_ct[(cf_mc_ino_pino_of_bt_ct['orden_encadenado_cf'] == 2) | (cf_mc_ino_pino_of_bt_ct['orden_encadenado_cf'] == 3)]

# For each row with orden_encadenado_cf == 2 or 3, we get the values from the row with lote == lote_parental_cf
for i, row in cf_mc_ino_pino_of_bt_ct[(cf_mc_ino_pino_of_bt_ct['orden_encadenado_cf'] == 2) | (cf_mc_ino_pino_of_bt_ct['orden_encadenado_cf'] == 3)].iterrows():
    # Get the row with lote == lote_parental_cf
    row_parental = cf_mc_ino_pino_of_bt_ct[cf_mc_ino_pino_of_bt_ct['lote'] == str(row['lote_parental_cf']).split('.')[0]]
    # If lote_parental is null
    if row_parental.empty:
        continue
    # Get the index of the row
    ind = row_parental.index[0]
    cols = list(cf_mc_ino_pino_of_bt_ct.filter(like='_ino').columns)
    cols += list(cf_mc_ino_pino_of_bt_ct.filter(like='_pino').columns)
    cols += list(cf_mc_ino_pino_of_bt_ct.filter(like='_mc').columns)
    cols.pop(cols.index('vol_ino_util_cf'))
    # Change the values of the row with the values of the row with lote == lote_parental_cf
    for col in cols:
        cf_mc_ino_pino_of_bt_ct.at[i, col] = row_parental.at[ind, col]



In [612]:
# To pkl
cf_mc_ino_pino_of_bt_ct.to_pickle(path + 'cf_mc_ino_pino_of_bt_ct.pkl')