# Centrifuga: anomaly detection and analysis

In [24]:
import pandas as pd

In [25]:
# Load the processed centrifuge workbook (relative path) into a DataFrame
ct = pd.read_excel('../../data/processed/Centrifuga_Total.xlsx')
# Column dtypes and non-null counts, for a first look at missing data
ct.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157239 entries, 0 to 157238
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   DateTime               157239 non-null  object 
 1   EN_parcial             154983 non-null  float64
 2   EN_total               155199 non-null  float64
 3   Apertura valvula agua  156224 non-null  float64
 4   Caudal                 156223 non-null  float64
 5   Contrapresion          156223 non-null  float64
 6   Presion agua           111674 non-null  float64
 7   Velocidad separacion   156224 non-null  float64
 8   Num_centrifuga         157239 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 10.8+ MB


In [26]:
# Preview the first rows of the frame
ct.head()

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
0,2023-03-15 00:00:00.000,0.0,0.0,100.0,0.0,0.411225,,0.0,17825
1,2023-03-15 00:15:00.000,0.0,0.0,100.0,0.0,0.411225,,0.0,17825
2,2023-03-15 00:30:00.000,0.0,0.0,100.0,0.0,0.410938,,0.0,17825
3,2023-03-15 00:45:00.000,0.0,0.0,100.0,0.0,0.411229,,0.0,17825
4,2023-03-15 01:00:00.000,0.0,0.0,100.0,0.0,0.411227,,0.0,17825


In [27]:
# Are there any fully duplicated rows (every column equal to an earlier row)?
ct.duplicated().sum()

np.int64(0)

In [28]:
# Summary statistics for every numeric column
# (the float measurements plus the integer Num_centrifuga identifier)
ct.describe()

Unnamed: 0,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
count,154983.0,155199.0,156224.0,156223.0,156223.0,111674.0,156224.0,157239.0
mean,0.001361,0.05089,95.838774,24.764024,0.319711,0.402061,358.151126,14994.333333
std,0.097877,1.084302,16.723563,136.241759,0.709665,0.451891,1486.449974,2074.35699
min,0.0,0.0,17.0,0.0,-0.8,-0.14253,0.0,12912.0
25%,0.0,0.0,100.0,0.0,0.00208,0.024688,0.0,12912.0
50%,0.0,0.0,100.0,0.0,0.406304,0.412799,0.0,14246.0
75%,0.0,0.0,100.0,0.0,0.428968,0.453626,0.0,17825.0
max,12.0,66.0,100.0,2400.0,7.074342,4.0,6694.48742,17825.0


Interesting values, especially in the columns whose mean is near 0 even though their max is far above 0. There are a lot of 0 values, and a lot of nulls.

In [29]:
# Count the missing (NaN) entries in every column
ct.isna().sum()

DateTime                     0
EN_parcial                2256
EN_total                  2040
Apertura valvula agua     1015
Caudal                    1016
Contrapresion             1016
Presion agua             45565
Velocidad separacion      1015
Num_centrifuga               0
dtype: int64

In [30]:
# Count the entries that are exactly 0 in every column
ct.eq(0).sum()

DateTime                      0
EN_parcial               154939
EN_total                 153787
Apertura valvula agua         0
Caudal                   148744
Contrapresion                 0
Presion agua                 15
Velocidad separacion     147338
Num_centrifuga                0
dtype: int64

Why does the data exhibit this pattern? Some columns are never zero, while others are almost always zero.

In [31]:
# Summary statistics of EN_parcial, one row per centrifuge (Num_centrifuga)
print(ct['EN_parcial'].groupby(ct['Num_centrifuga']).describe())

                  count      mean       std  min  25%  50%  75%   max
Num_centrifuga                                                       
12912           51327.0  0.000156  0.016515  0.0  0.0  0.0  0.0   3.0
14246           51828.0  0.003859  0.168256  0.0  0.0  0.0  0.0  12.0
17825           51828.0  0.000058  0.007608  0.0  0.0  0.0  0.0   1.0


In [32]:
# Summary statistics of EN_total, one row per centrifuge (Num_centrifuga)
print(ct['EN_total'].groupby(ct['Num_centrifuga']).describe())

                  count      mean       std  min  25%  50%  75%   max
Num_centrifuga                                                       
12912           51058.0  0.055310  1.093471  0.0  0.0  0.0  0.0  62.0
14246           51828.0  0.035251  0.814950  0.0  0.0  0.0  0.0  62.0
17825           52313.0  0.062069  1.289459  0.0  0.0  0.0  0.0  66.0


In [33]:
# Summary statistics of Apertura valvula agua, one row per centrifuge (Num_centrifuga)
print(ct['Apertura valvula agua'].groupby(ct['Num_centrifuga']).describe())

                  count       mean        std        min    25%    50%    75%  \
Num_centrifuga                                                                  
12912           51487.0  96.426133  15.408582  17.000000  100.0  100.0  100.0   
14246           52368.0  95.374782  17.798649  22.863174  100.0  100.0  100.0   
17825           52369.0  95.725292  16.838906  23.652878  100.0  100.0  100.0   

                  max  
Num_centrifuga         
12912           100.0  
14246           100.0  
17825           100.0  


In [34]:
# Summary statistics of Caudal, one row per centrifuge (Num_centrifuga)
print(ct['Caudal'].groupby(ct['Num_centrifuga']).describe())

                  count       mean         std  min  25%  50%  75%  \
Num_centrifuga                                                       
12912           51487.0  22.781724  131.454222  0.0  0.0  0.0  0.0   
14246           52368.0  26.400984  137.303909  0.0  0.0  0.0  0.0   
17825           52368.0  25.076015  139.733254  0.0  0.0  0.0  0.0   

                        max  
Num_centrifuga               
12912           2309.516897  
14246           2400.000000  
17825           2181.597266  


In [35]:
# Summary statistics of Contrapresion, one row per centrifuge (Num_centrifuga)
print(ct['Contrapresion'].groupby(ct['Num_centrifuga']).describe())

                  count      mean       std       min       25%       50%  \
Num_centrifuga                                                              
12912           51487.0  0.138600  0.783119 -0.800000 -0.005302 -0.001100   
14246           52368.0  0.412692  0.668566 -0.051759  0.008449  0.416701   
17825           52368.0  0.404795  0.635882 -0.800000  0.002105  0.407345   

                     75%       max  
Num_centrifuga                      
12912           0.418577  7.074342  
14246           0.428009  6.895808  
17825           0.457419  6.853874  


In [36]:
# Summary statistics of Presion agua, one row per centrifuge (Num_centrifuga)
print(ct['Presion agua'].groupby(ct['Num_centrifuga']).describe())

                  count      mean       std       min       25%       50%  \
Num_centrifuga                                                              
12912           36637.0  0.452688  0.577266 -0.142530  0.018148  0.401221   
14246           37518.0  0.386954  0.382353 -0.136785  0.083663  0.418113   
17825           37519.0  0.367732  0.363757 -0.026177  0.021007  0.452082   

                     75%       max  
Num_centrifuga                      
12912           0.419799  4.000000  
14246           0.423275  3.227609  
17825           0.464062  2.595497  


In [37]:
# Summary statistics of Velocidad separacion, one row per centrifuge (Num_centrifuga)
print(ct['Velocidad separacion'].groupby(ct['Num_centrifuga']).describe())

                  count        mean          std  min  25%  50%  75%  \
Num_centrifuga                                                         
12912           51487.0  317.621721  1404.263199  0.0  0.0  0.0  0.0   
14246           52368.0  396.126960  1558.386114  0.0  0.0  0.0  0.0   
17825           52369.0  360.022824  1490.337105  0.0  0.0  0.0  0.0   

                        max  
Num_centrifuga               
12912           6689.361345  
14246           6694.331250  
17825           6694.487420  


In [38]:
# Parse the DateTime column (loaded with object dtype) into pandas datetimes,
# replacing the column in place so later cells can diff and range-filter it
ct['DateTime'] = pd.to_datetime(ct['DateTime'])

In [39]:
# Does every centrifuge keep the 15-minute cadence?
# Per centrifuge: order the timestamps, measure the gap in seconds to the
# previous reading, and keep only the gaps longer than 900 s (15 min).
weird_minutes = (
    ct.groupby('Num_centrifuga')['DateTime']
    .apply(lambda stamps: stamps.sort_values().diff().dt.total_seconds())
    .loc[lambda gaps: gaps > 900]
)
weird_minutes


Num_centrifuga        
12912           53477     4500.0
                89093     4500.0
14246           105890    4500.0
                141506    4500.0
17825           1064      4500.0
                36680     4500.0
Name: DateTime, dtype: float64

In [40]:
# The inner (last) index level of weird_minutes carries the row labels of ct;
# pull those labels out and show the complete records they point to.
ind = weird_minutes.index.get_level_values(-1)
ct.loc[ind]

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
53477,2023-03-26 03:00:00,0.0,0.0,100.0,0.0,0.427999,,0.0,12912
89093,2024-03-31 03:00:00,0.0,0.0,100.0,0.0,0.410648,0.403475,0.0,12912
105890,2023-03-26 03:00:00,0.0,0.0,100.0,0.0,0.006434,,0.0,14246
141506,2024-03-31 03:00:00,0.0,0.0,100.0,0.0,0.428031,0.423958,0.0,14246
1064,2023-03-26 03:00:00,0.0,0.0,100.0,0.0,0.000652,,0.0,17825
36680,2024-03-31 03:00:00,0.0,0.0,100.0,0.0,0.457228,0.460594,0.0,17825


In [41]:
# Opposite check: per centrifuge, find readings that arrive less than
# 900 s (15 min) after the previous timestamp, e.g. repeated timestamps.
weird_minutes_oct = (
    ct.groupby('Num_centrifuga')['DateTime']
    .apply(lambda stamps: stamps.sort_values().diff().dt.total_seconds())
    .loc[lambda gaps: gaps < 900]
)
weird_minutes_oct

Num_centrifuga        
12912           74309     0.0
                74306     0.0
                74311     0.0
                74308     0.0
14246           126722    0.0
                126719    0.0
                126724    0.0
                126721    0.0
17825           21896     0.0
                21893     0.0
                21898     0.0
                21895     0.0
Name: DateTime, dtype: float64

In [42]:
# The inner (last) index level of weird_minutes_oct carries the row labels of ct;
# pull those labels out and show the complete records they point to.
ind = weird_minutes_oct.index.get_level_values(-1)
ct.loc[ind]

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
74309,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,-0.004572,-0.021802,0.0,12912
74306,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,-0.004687,-0.017789,0.0,12912
74311,2023-10-29 02:30:00,0.0,0.0,100.0,0.0,-0.004128,-0.018124,0.0,12912
74308,2023-10-29 02:45:00,0.0,0.0,100.0,0.0,-0.00515,-0.021354,0.0,12912
126722,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,0.007718,0.052668,0.0,14246
126719,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,0.007716,0.072119,0.0,14246
126724,2023-10-29 02:30:00,0.0,0.0,100.0,0.0,0.007736,0.056925,0.0,14246
126721,2023-10-29 02:45:00,0.0,0.0,100.0,0.0,0.007581,0.038081,0.0,14246
21896,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,0.002373,0.02066,0.0,17825
21893,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,0.002103,0.022384,0.0,17825


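### What happened in March and October? The summer-time (daylight saving time) change
#### In March, +1h: the clock skips an hour, leaving a gap. Other data are not affected.
#### In October, -1h: the clock repeats an hour, which makes repeated timestamps appear.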

We fix this in rebuild_centrifuga by converting the timestamps to UTC.

In [43]:
# Print the October rows flagged in weird_minutes_oct.
# The inner index level of weird_minutes_oct holds ct's row *labels*, not
# positions, so the lookup must be label-based (.loc). Position-based .iloc
# only gives the same rows while ct keeps its default 0..n-1 index and would
# silently return the wrong rows after any filtering or re-indexing of ct.
ct.loc[weird_minutes_oct.index.get_level_values(1)]

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
74309,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,-0.004572,-0.021802,0.0,12912
74306,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,-0.004687,-0.017789,0.0,12912
74311,2023-10-29 02:30:00,0.0,0.0,100.0,0.0,-0.004128,-0.018124,0.0,12912
74308,2023-10-29 02:45:00,0.0,0.0,100.0,0.0,-0.00515,-0.021354,0.0,12912
126722,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,0.007718,0.052668,0.0,14246
126719,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,0.007716,0.072119,0.0,14246
126724,2023-10-29 02:30:00,0.0,0.0,100.0,0.0,0.007736,0.056925,0.0,14246
126721,2023-10-29 02:45:00,0.0,0.0,100.0,0.0,0.007581,0.038081,0.0,14246
21896,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,0.002373,0.02066,0.0,17825
21893,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,0.002103,0.022384,0.0,17825


In [44]:
# Print the rows around the March clock change (2023-03-26, 01:00 up to 04:00),
# comparing the plain wall-clock timestamps without accounting for the time change
ct[(ct['DateTime'] >= '2023-03-26 01:00:00') & (ct['DateTime'] < '2023-03-26 04:00:00')]

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
1060,2023-03-26 01:00:00,0.0,0.0,100.0,0.0,0.000637,,0.0,17825
1061,2023-03-26 01:15:00,0.0,0.0,100.0,0.0,0.000349,,0.0,17825
1062,2023-03-26 01:30:00,0.0,0.0,100.0,0.0,0.000926,,0.0,17825
1063,2023-03-26 01:45:00,0.0,0.0,100.0,0.0,0.001487,,0.0,17825
1064,2023-03-26 03:00:00,0.0,0.0,100.0,0.0,0.000652,,0.0,17825
1065,2023-03-26 03:15:00,0.0,0.0,100.0,0.0,0.000637,,0.0,17825
1066,2023-03-26 03:30:00,0.0,0.0,100.0,0.0,0.00036,,0.0,17825
1067,2023-03-26 03:45:00,0.0,0.0,100.0,0.0,0.000637,,0.0,17825
53473,2023-03-26 01:00:00,0.0,0.0,100.0,0.0,0.426297,,0.0,12912
53474,2023-03-26 01:15:00,0.0,0.0,100.0,0.0,0.429164,,0.0,12912


In [45]:
# Same inspection for the October clock change:
# rows from 01:00 (inclusive) up to 04:00 (exclusive) on 2023-10-29
ct[ct['DateTime'].between('2023-10-29 01:00:00', '2023-10-29 04:00:00', inclusive='left')]

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
21888,2023-10-29 01:00:00,0.0,0.0,100.0,0.0,0.001928,0.021875,0.0,17825
21889,2023-10-29 01:15:00,0.0,0.0,100.0,0.0,0.002373,0.02146,0.0,17825
21890,2023-10-29 01:30:00,0.0,0.0,100.0,0.0,0.001794,0.020486,0.0,17825
21891,2023-10-29 01:45:00,0.0,0.0,100.0,0.0,0.002083,0.020074,0.0,17825
21892,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,0.002216,0.02158,0.0,17825
21893,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,0.002103,0.022384,0.0,17825
21894,2023-10-29 02:30:00,0.0,0.0,100.0,0.0,0.002238,0.021088,0.0,17825
21895,2023-10-29 02:45:00,0.0,0.0,100.0,0.0,0.002237,0.020405,0.0,17825
21896,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,0.002373,0.02066,0.0,17825
21897,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,0.001794,0.022569,0.0,17825


In [46]:
# Wider March window: rows from 00:00 (inclusive) up to 04:00 (exclusive) on 2023-03-26
ct[ct['DateTime'].between('2023-03-26 00:00:00', '2023-03-26 04:00:00', inclusive='left')]

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
1056,2023-03-26 00:00:00,0.0,0.0,100.0,0.0,0.000933,,0.0,17825
1057,2023-03-26 00:15:00,0.0,0.0,100.0,0.0,0.000926,,0.0,17825
1058,2023-03-26 00:30:00,0.0,0.0,100.0,0.0,0.000926,,0.0,17825
1059,2023-03-26 00:45:00,0.0,0.0,100.0,0.0,0.001791,,0.0,17825
1060,2023-03-26 01:00:00,0.0,0.0,100.0,0.0,0.000637,,0.0,17825
1061,2023-03-26 01:15:00,0.0,0.0,100.0,0.0,0.000349,,0.0,17825
1062,2023-03-26 01:30:00,0.0,0.0,100.0,0.0,0.000926,,0.0,17825
1063,2023-03-26 01:45:00,0.0,0.0,100.0,0.0,0.001487,,0.0,17825
1064,2023-03-26 03:00:00,0.0,0.0,100.0,0.0,0.000652,,0.0,17825
1065,2023-03-26 03:15:00,0.0,0.0,100.0,0.0,0.000637,,0.0,17825


In [47]:
# Re-check: are there still gaps longer than 900 s (15 min) between
# consecutive readings of the same centrifuge?
weird_minutes = (
    ct.groupby('Num_centrifuga')['DateTime']
    .apply(lambda stamps: stamps.sort_values().diff().dt.total_seconds())
    .loc[lambda gaps: gaps > 900]
)
weird_minutes

Num_centrifuga        
12912           53477     4500.0
                89093     4500.0
14246           105890    4500.0
                141506    4500.0
17825           1064      4500.0
                36680     4500.0
Name: DateTime, dtype: float64

In [48]:
# And the October case: are there still readings less than 900 s (15 min)
# after the previous timestamp of the same centrifuge?
weird_minutes_oct = (
    ct.groupby('Num_centrifuga')['DateTime']
    .apply(lambda stamps: stamps.sort_values().diff().dt.total_seconds())
    .loc[lambda gaps: gaps < 900]
)
weird_minutes_oct

Num_centrifuga        
12912           74309     0.0
                74306     0.0
                74311     0.0
                74308     0.0
14246           126722    0.0
                126719    0.0
                126724    0.0
                126721    0.0
17825           21896     0.0
                21893     0.0
                21898     0.0
                21895     0.0
Name: DateTime, dtype: float64

In [49]:
# Are there any fully duplicated rows now that DateTime has been parsed?
# (whole-row comparison: every column must match for a row to count)
ct.duplicated().sum()

np.int64(0)

In [50]:

# Wider October window: rows from 00:00 (inclusive) up to 04:00 (exclusive) on 2023-10-29
ct[ct['DateTime'].between('2023-10-29 00:00:00', '2023-10-29 04:00:00', inclusive='left')]

Unnamed: 0,DateTime,EN_parcial,EN_total,Apertura valvula agua,Caudal,Contrapresion,Presion agua,Velocidad separacion,Num_centrifuga
21884,2023-10-29 00:00:00,0.0,0.0,100.0,0.0,0.002063,0.021701,0.0,17825
21885,2023-10-29 00:15:00,0.0,0.0,100.0,0.0,0.002083,0.020313,0.0,17825
21886,2023-10-29 00:30:00,0.0,0.0,100.0,0.0,0.002083,0.019874,0.0,17825
21887,2023-10-29 00:45:00,0.0,0.0,100.0,0.0,0.002649,0.019875,0.0,17825
21888,2023-10-29 01:00:00,0.0,0.0,100.0,0.0,0.001928,0.021875,0.0,17825
21889,2023-10-29 01:15:00,0.0,0.0,100.0,0.0,0.002373,0.02146,0.0,17825
21890,2023-10-29 01:30:00,0.0,0.0,100.0,0.0,0.001794,0.020486,0.0,17825
21891,2023-10-29 01:45:00,0.0,0.0,100.0,0.0,0.002083,0.020074,0.0,17825
21892,2023-10-29 02:00:00,0.0,0.0,100.0,0.0,0.002216,0.02158,0.0,17825
21893,2023-10-29 02:15:00,0.0,0.0,100.0,0.0,0.002103,0.022384,0.0,17825


In [51]:
# Summary of the DateTime column: count, mean, min, quartiles and max timestamp
ct['DateTime'].describe()

count                           157239
mean     2023-12-13 00:11:51.476160512
min                2023-03-15 00:00:00
25%                2023-07-29 12:45:00
50%                2023-12-12 23:30:00
75%                2024-04-27 12:15:00
max                2024-09-11 00:00:00
Name: DateTime, dtype: object

In [52]:
# Save the frame as CSV in the same processed-data folder as the source workbook;
# index=False leaves the row index out of the file
ct.to_csv('../../data/processed/Centrifuga_Total.csv', index=False)