In [593]:
import pandas as pd
import numpy as np

In [594]:
# Load pino
pino = pd.read_csv('../../data/processed/preinoculo.csv')

In [595]:
pino.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Lote        161 non-null    object 
 1   f_h_inicio  161 non-null    object 
 2   f_h_fin     161 non-null    object 
 3   ph_1        159 non-null    float64
 4   ph_2        148 non-null    float64
 5   ph_3        130 non-null    float64
 6   turb_1      159 non-null    float64
 7   turb_2      148 non-null    float64
 8   turb_3      131 non-null    float64
 9   usada_1     161 non-null    int64  
 10  usada_2     161 non-null    int64  
 11  usada_3     161 non-null    int64  
 12  duracion    161 non-null    object 
dtypes: float64(6), int64(3), object(4)
memory usage: 16.5+ KB


We do not need all these columns. Lets make remake pino so it has one value per row and drop not used values.

In [596]:
# We should repeat Lote, for each row, but also duracion and f_h_inicio and f_h_fin

# We will create a new dataframe for each usada_1==1 value and from that we need 
# ['ph_1']['turb_1']['f_h_inicio']['f_h_fin']['duracion']['lote']

# For each usada 
lineas = [1,2,3]
new_pino = pd.DataFrame()
for linea in lineas:
    usada = pino[pino['usada_'+str(linea)]==1]
    usada = usada[['ph_'+str(linea),'turb_'+str(linea),'f_h_inicio','f_h_fin','duracion','Lote']].reset_index(drop=True)
    usada.rename(columns={'ph_'+str(linea):'ph','turb_'+str(linea):'turb'},inplace=True)
    new_pino = pd.concat([new_pino,usada],ignore_index=True)
new_pino

Unnamed: 0,ph,turb,f_h_inicio,f_h_fin,duracion,Lote
0,5.496,28.32,2023-03-26 05:00:00,2023-03-27 07:21:00,1 days 02:21:00,23023
1,5.496,28.32,2023-03-26 05:00:00,2023-03-27 07:21:00,1 days 02:21:00,23024
2,5.480,26.56,2023-03-17 06:00:00,2023-03-28 07:42:00,11 days 01:42:00,23025
3,5.480,26.56,2023-03-17 06:00:00,2023-03-28 07:42:00,11 days 01:42:00,23026
4,5.384,33.84,2023-04-02 05:00:00,2023-04-03 13:30:00,1 days 08:30:00,23027
...,...,...,...,...,...,...
299,5.480,29.68,2024-05-30 00:30:00,2024-05-31 07:46:00,1 days 07:16:00,24089
300,5.392,26.64,2024-06-30 00:30:00,2024-07-01 07:01:00,1 days 06:31:00,24104
301,5.392,26.64,2024-06-30 00:30:00,2024-07-01 07:01:00,1 days 06:31:00,24105
302,5.504,25.60,2024-07-04 00:30:00,2024-07-05 07:04:00,1 days 06:34:00,24108


In [597]:
# Lote to str
new_pino['Lote'] = new_pino['Lote'].astype(str)
# f_h_inicio and f_h_fin to datetime
new_pino['f_h_inicio'] = pd.to_datetime(new_pino['f_h_inicio'])
new_pino['f_h_fin'] = pd.to_datetime(new_pino['f_h_fin'])
# Convert them from Europe/Madrid to UTC
new_pino['f_h_inicio'] = new_pino['f_h_inicio'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC')
new_pino['f_h_fin'] = new_pino['f_h_fin'].dt.tz_localize('Europe/Madrid').dt.tz_convert('UTC')

# duracion to timedelta
new_pino['duracion'] = pd.to_timedelta(new_pino['duracion'])
# from timedelta to int 
new_pino['duracion'] = new_pino['duracion'].dt.total_seconds().astype(float)
new_pino.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304 entries, 0 to 303
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   ph          303 non-null    float64            
 1   turb        304 non-null    float64            
 2   f_h_inicio  304 non-null    datetime64[ns, UTC]
 3   f_h_fin     304 non-null    datetime64[ns, UTC]
 4   duracion    304 non-null    float64            
 5   Lote        304 non-null    object             
dtypes: datetime64[ns, UTC](2), float64(3), object(1)
memory usage: 14.4+ KB


In [598]:
# Rename Lote to lote
new_pino.rename(columns={'Lote':'lote', 'ph': 'ph_pino', 'turb': 'turb_pino', 
                         'f_h_inicio': 'f_h_inicio_pino', 'f_h_fin': 'f_h_fin_pino', 
                         'duracion': 'dur_pino'}, inplace=True)

In [599]:
# Show the nan row
new_pino[new_pino.isnull().any(axis=1)]

Unnamed: 0,ph_pino,turb_pino,f_h_inicio_pino,f_h_fin_pino,dur_pino,lote
297,,27.84,2024-05-11 22:30:00+00:00,2024-05-13 05:54:00+00:00,113040.0,24075


In [600]:
# get lote where 23023
new_pino[new_pino['lote']=='23023']

Unnamed: 0,ph_pino,turb_pino,f_h_inicio_pino,f_h_fin_pino,dur_pino,lote
0,5.496,28.32,2023-03-26 03:00:00+00:00,2023-03-27 05:21:00+00:00,94860.0,23023
129,5.504,27.92,2023-03-26 03:00:00+00:00,2023-03-27 05:21:00+00:00,94860.0,23023


In [601]:
pino_orig = new_pino.copy()
# Groupby by lote size == 1 values

In [602]:
# Convert the df back to ph_1, turb_1, ph_2, turb_2
# We should group by lote and then split the rows into columns in a new dataframe
pino_orig = new_pino.copy()
# Now for each lote, we will have 2 rows. Now use groupby and apply
# Group by lote
pino_orig = pino_orig.groupby('lote').apply(lambda x: x.reset_index(drop=True))
pino_orig['ph_1'] = pino_orig['ph_pino']
pino_orig['turb_1'] = pino_orig['turb_pino']
pino_orig['ph_2'] = pino_orig['ph_pino'].shift(-1)
pino_orig['turb_2'] = pino_orig['turb_pino'].shift(-1)

pino_orig = pino_orig.reset_index(drop=True)
# Drop dupes
pino_orig = pino_orig.drop_duplicates(subset=['lote'])
pino_orig

  pino_orig = pino_orig.groupby('lote').apply(lambda x: x.reset_index(drop=True))


Unnamed: 0,ph_pino,turb_pino,f_h_inicio_pino,f_h_fin_pino,dur_pino,lote,ph_1,turb_1,ph_2,turb_2
0,5.496,28.32,2023-03-26 03:00:00+00:00,2023-03-27 05:21:00+00:00,94860.0,23023,5.496,28.32,5.504,27.92
2,5.496,28.32,2023-03-26 03:00:00+00:00,2023-03-27 05:21:00+00:00,94860.0,23024,5.496,28.32,5.504,27.92
4,5.480,26.56,2023-03-17 05:00:00+00:00,2023-03-28 05:42:00+00:00,956520.0,23025,5.480,26.56,5.520,27.52
6,5.480,26.56,2023-03-17 05:00:00+00:00,2023-03-28 05:42:00+00:00,956520.0,23026,5.480,26.56,5.520,27.52
8,5.384,33.84,2023-04-02 03:00:00+00:00,2023-04-03 11:30:00+00:00,117000.0,23027,5.384,33.84,5.400,32.48
...,...,...,...,...,...,...,...,...,...,...
295,5.408,27.44,2024-06-29 22:30:00+00:00,2024-07-01 05:01:00+00:00,109860.0,24104,5.408,27.44,5.392,26.64
297,5.408,27.44,2024-06-29 22:30:00+00:00,2024-07-01 05:01:00+00:00,109860.0,24105,5.408,27.44,5.392,26.64
299,5.432,28.80,2024-07-03 22:30:00+00:00,2024-07-05 05:04:00+00:00,110040.0,24108,5.432,28.80,5.504,25.60
301,5.432,28.80,2024-07-03 22:30:00+00:00,2024-07-05 05:04:00+00:00,110040.0,24111,5.432,28.80,5.504,25.60


In [603]:
# ph_2 null
pino_orig[pino_orig['ph_2'].isnull()]

Unnamed: 0,ph_pino,turb_pino,f_h_inicio_pino,f_h_fin_pino,dur_pino,lote,ph_1,turb_1,ph_2,turb_2
262,5.424,26.96,2024-05-11 22:30:00+00:00,2024-05-13 05:54:00+00:00,113040.0,24075,5.424,26.96,,27.84
303,5.352,26.8,2023-11-11 23:30:00+00:00,2023-11-13 06:25:00+00:00,111300.0,P23462,5.352,26.8,,


# Original

In [604]:
# as pkl
new_pino.to_pickle('../../data/processed/preinoculo_alternate.pkl')

In [605]:
# as pkl
pino_orig.to_pickle('../../data/processed/preinoculo.pkl')

# Harsh method

In [606]:
# Drop duplicates
print(new_pino.shape)
new_pino.drop_duplicates(inplace=True)
print(new_pino.shape)

(304, 6)
(304, 6)


In [607]:
# Drop na
print(new_pino.shape)
new_pino.dropna(inplace=True)
print(new_pino.shape)


(304, 6)
(303, 6)


In [608]:
# as pkl
#new_pino.to_pickle('../../data/processed/preinoculo.pkl')

# Mega harsh method: drop all which only have 1 línea (value per lote)

In [609]:
# Drop all which only have 1 value per lote
print(new_pino.shape)
new_pino = new_pino.groupby('lote').filter(lambda x: len(x)>1)
print(new_pino.shape)


(303, 6)
(284, 6)


In [610]:
# Save as pkl
#new_pino.to_pickle('../../data/processed/preinoculo.pkl')