# LC3 DATA INTEGRITY CHECK

In [1]:
# Libraries for general data management
import pandas as pd
import numpy  as np

In [23]:
# Pandas display limits: show at most 100 rows and 12 columns before truncating
pd.options.display.max_rows = 100
pd.options.display.max_columns = 12
# pd.options.display.width = 200   (left disabled)

In [3]:
# Input spreadsheet paths, relative to the notebook's working directory
# (OpenDocument .ods files here; an Excel workbook can be used as well)
DATA_MERGED_PATH = './data/data_merged.ods'
DATA_FULL_PATH   = './data/data_full.ods'

In [4]:
# Load the merged sheet: the first spreadsheet row is skipped and
# cells containing '-' are parsed as missing values (NaN)
data_merged = pd.read_excel(
    DATA_MERGED_PATH,
    skiprows=[0],
    na_values=['-'],
)

In [5]:
# Load the 'All data in rows' sheet and discard rows in which every cell is
# empty; the cleaned frame is bound in one expression rather than mutated
# in place after loading
data_full = (
    pd.read_excel(DATA_FULL_PATH, sheet_name='All data in rows')
    .dropna(how="all")
)

In [6]:
# Mean of measurements for each clay
# data_full.groupby('Clay').mean()

In [7]:
# Std of measurements for each clay
# data_full.groupby('Clay').std()

In [8]:
# Number of measurements for each clay and each day/total for each day
# data_full.groupby('Clay').count()
# data_full.groupby('Clay').count().sum()

### Points that fail the mean coherence check

In [33]:
# Largest absolute difference tolerated between a per-clay mean recomputed
# from data_full and the corresponding value stored in data_merged; rows that
# exceed it are listed by the mean checks in this section
MAX_MEAN_DEVIATION = 0.2

In [34]:
# Keep only the clay name and the five per-age value columns of the merged data
data_merged_mean = data_merged.loc[:, ['Clay', '1D', '3D', '7D', '28D', '90D']]

In [35]:
# Per-clay mean of the numeric measurement columns of data_full, joined to the
# values reported in data_merged so both can be compared side by side.
# numeric_only=True keeps the aggregation from raising on non-numeric columns:
# pandas >= 2.0 no longer drops them silently.
# NOTE(review): the inner join keeps only clays present in both tables, so a
# clay missing from either source is not examined by the mean checks.
mean_check = pd.merge(
    data_full.groupby('Clay').mean(numeric_only=True),
    data_merged_mean,
    on='Clay',
    how='inner',
)

In [36]:
# Clays whose 'day_1' mean (from data_full) and '1D' value (from data_merged)
# differ by more than MAX_MEAN_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
mean_check.loc[lambda df: (df['day_1'] - df['1D']).abs() > MAX_MEAN_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,1D,3D,7D,28D,90D
0,Argex,9.78125,20.3375,28.475,38.734375,41.35,10.406129,20.74941,29.287255,41.275596,42.096173
6,B45 India1 750°C,7.930208,19.34375,23.430208,29.898958,31.226562,13.18375,30.17375,52.09125,64.03125,63.31875
7,B45 India3 750°C,8.780208,20.02125,26.027083,35.973958,37.582813,9.403125,21.560417,29.1575,37.515625,40.345312
8,B45 Loma Sur,16.076667,28.808333,37.821667,61.768333,79.115,10.08125,23.61875,38.52875,49.446875,51.429688
14,Chile,9.996875,23.978125,40.90625,56.909375,59.38125,10.635529,24.463771,42.073108,60.642992,60.452803
15,China Screened,9.857812,21.265625,31.801562,38.234375,39.921875,11.273438,21.265625,31.801562,38.234375,39.921875
40,Iran G1,9.225,23.4375,35.50625,53.81875,53.83125,9.814343,23.912196,36.519072,57.349602,54.802651
42,Iran Z1,10.15,20.54375,34.875,53.209375,57.896875,10.798437,20.959837,35.869816,56.700249,58.941642
43,Iran Z2,9.765625,19.725,32.81875,43.009375,44.021875,10.389506,20.124504,33.754911,45.831064,44.816263
53,Pontezuela,7.451172,17.079687,22.784375,39.085938,50.440625,10.620313,22.784375,39.085938,50.440625,51.70625


In [37]:
# Clays whose 'day_3' mean (from data_full) and '3D' value (from data_merged)
# differ by more than MAX_MEAN_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
mean_check.loc[lambda df: (df['day_3'] - df['3D']).abs() > MAX_MEAN_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,1D,3D,7D,28D,90D
0,Argex,9.78125,20.3375,28.475,38.734375,41.35,10.406129,20.74941,29.287255,41.275596,42.096173
6,B45 India1 750°C,7.930208,19.34375,23.430208,29.898958,31.226562,13.18375,30.17375,52.09125,64.03125,63.31875
7,B45 India3 750°C,8.780208,20.02125,26.027083,35.973958,37.582813,9.403125,21.560417,29.1575,37.515625,40.345312
8,B45 Loma Sur,16.076667,28.808333,37.821667,61.768333,79.115,10.08125,23.61875,38.52875,49.446875,51.429688
14,Chile,9.996875,23.978125,40.90625,56.909375,59.38125,10.635529,24.463771,42.073108,60.642992,60.452803
34,Holcim 4 Brazil,11.620313,24.524219,36.934375,43.610937,44.936458,11.820312,26.196875,37.764063,42.16875,44.18125
40,Iran G1,9.225,23.4375,35.50625,53.81875,53.83125,9.814343,23.912196,36.519072,57.349602,54.802651
41,Iran G2,11.934375,21.065625,27.103125,32.578125,35.0875,11.934375,21.492282,27.876247,34.715457,35.720665
42,Iran Z1,10.15,20.54375,34.875,53.209375,57.896875,10.798437,20.959837,35.869816,56.700249,58.941642
43,Iran Z2,9.765625,19.725,32.81875,43.009375,44.021875,10.389506,20.124504,33.754911,45.831064,44.816263


In [38]:
# Clays whose 'day_7' mean (from data_full) and '7D' value (from data_merged)
# differ by more than MAX_MEAN_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
mean_check.loc[lambda df: (df['day_7'] - df['7D']).abs() > MAX_MEAN_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,1D,3D,7D,28D,90D
0,Argex,9.78125,20.3375,28.475,38.734375,41.35,10.406129,20.74941,29.287255,41.275596,42.096173
6,B45 India1 750°C,7.930208,19.34375,23.430208,29.898958,31.226562,13.18375,30.17375,52.09125,64.03125,63.31875
7,B45 India3 750°C,8.780208,20.02125,26.027083,35.973958,37.582813,9.403125,21.560417,29.1575,37.515625,40.345312
8,B45 Loma Sur,16.076667,28.808333,37.821667,61.768333,79.115,10.08125,23.61875,38.52875,49.446875,51.429688
14,Chile,9.996875,23.978125,40.90625,56.909375,59.38125,10.635529,24.463771,42.073108,60.642992,60.452803
19,F1,9.815625,22.275,35.8,48.515212,,9.864583,22.360417,35.027083,48.460963,48.111333
34,Holcim 4 Brazil,11.620313,24.524219,36.934375,43.610937,44.936458,11.820312,26.196875,37.764063,42.16875,44.18125
40,Iran G1,9.225,23.4375,35.50625,53.81875,53.83125,9.814343,23.912196,36.519072,57.349602,54.802651
41,Iran G2,11.934375,21.065625,27.103125,32.578125,35.0875,11.934375,21.492282,27.876247,34.715457,35.720665
42,Iran Z1,10.15,20.54375,34.875,53.209375,57.896875,10.798437,20.959837,35.869816,56.700249,58.941642


In [39]:
# Clays whose 'day_28' mean (from data_full) and '28D' value (from data_merged)
# differ by more than MAX_MEAN_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
mean_check.loc[lambda df: (df['day_28'] - df['28D']).abs() > MAX_MEAN_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,1D,3D,7D,28D,90D
0,Argex,9.78125,20.3375,28.475,38.734375,41.35,10.406129,20.74941,29.287255,41.275596,42.096173
6,B45 India1 750°C,7.930208,19.34375,23.430208,29.898958,31.226562,13.18375,30.17375,52.09125,64.03125,63.31875
7,B45 India3 750°C,8.780208,20.02125,26.027083,35.973958,37.582813,9.403125,21.560417,29.1575,37.515625,40.345312
8,B45 Loma Sur,16.076667,28.808333,37.821667,61.768333,79.115,10.08125,23.61875,38.52875,49.446875,51.429688
14,Chile,9.996875,23.978125,40.90625,56.909375,59.38125,10.635529,24.463771,42.073108,60.642992,60.452803
34,Holcim 4 Brazil,11.620313,24.524219,36.934375,43.610937,44.936458,11.820312,26.196875,37.764063,42.16875,44.18125
40,Iran G1,9.225,23.4375,35.50625,53.81875,53.83125,9.814343,23.912196,36.519072,57.349602,54.802651
41,Iran G2,11.934375,21.065625,27.103125,32.578125,35.0875,11.934375,21.492282,27.876247,34.715457,35.720665
42,Iran Z1,10.15,20.54375,34.875,53.209375,57.896875,10.798437,20.959837,35.869816,56.700249,58.941642
43,Iran Z2,9.765625,19.725,32.81875,43.009375,44.021875,10.389506,20.124504,33.754911,45.831064,44.816263


In [40]:
# Clays whose 'day_90' mean (from data_full) and '90D' value (from data_merged)
# differ by more than MAX_MEAN_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
mean_check.loc[lambda df: (df['day_90'] - df['90D']).abs() > MAX_MEAN_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,1D,3D,7D,28D,90D
0,Argex,9.78125,20.3375,28.475,38.734375,41.35,10.406129,20.74941,29.287255,41.275596,42.096173
6,B45 India1 750°C,7.930208,19.34375,23.430208,29.898958,31.226562,13.18375,30.17375,52.09125,64.03125,63.31875
7,B45 India3 750°C,8.780208,20.02125,26.027083,35.973958,37.582813,9.403125,21.560417,29.1575,37.515625,40.345312
8,B45 Loma Sur,16.076667,28.808333,37.821667,61.768333,79.115,10.08125,23.61875,38.52875,49.446875,51.429688
9,B45 Suriname 2,13.8375,33.832813,49.710938,57.896875,59.929167,13.8375,33.832813,49.710938,57.896875,60.13152
14,Chile,9.996875,23.978125,40.90625,56.909375,59.38125,10.635529,24.463771,42.073108,60.642992,60.452803
34,Holcim 4 Brazil,11.620313,24.524219,36.934375,43.610937,44.936458,11.820312,26.196875,37.764063,42.16875,44.18125
39,India 2,9.854688,23.3875,39.75,54.789583,60.047917,9.854688,23.3875,39.75,54.789583,60.35
40,Iran G1,9.225,23.4375,35.50625,53.81875,53.83125,9.814343,23.912196,36.519072,57.349602,54.802651
41,Iran G2,11.934375,21.065625,27.103125,32.578125,35.0875,11.934375,21.492282,27.876247,34.715457,35.720665


### Points that fail the std coherence check

In [43]:
# Largest absolute difference tolerated between a per-clay standard deviation
# recomputed from data_full and the corresponding value stored in data_merged;
# rows that exceed it are listed by the std checks in this section
MAX_STD_DEVIATION = 0.1

In [44]:
# Keep only the clay name and the five 'STD*' columns of the merged data
# (presumably one per testing age, in the order 1/3/7/28/90 days -- confirm
# against the spreadsheet header)
data_merged_std = data_merged.loc[:, ['Clay', 'STD', 'STD.1', 'STD.2', 'STD.3', 'STD.4']]

In [45]:
# Per-clay standard deviation of the numeric measurement columns of data_full,
# joined to the values reported in data_merged so both can be compared.
# numeric_only=True keeps the aggregation from raising on non-numeric columns:
# pandas >= 2.0 no longer drops them silently (the keyword is accepted by
# GroupBy.std from pandas 1.5 onwards).
# NOTE(review): the inner join keeps only clays present in both tables, so a
# clay missing from either source is not examined by the std checks.
std_check = pd.merge(
    data_full.groupby('Clay').std(numeric_only=True),
    data_merged_std,
    on='Clay',
    how='inner',
)

In [51]:
# Clays whose 'day_1' std (from data_full) and 'STD' value (from data_merged)
# differ by more than MAX_STD_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
std_check.loc[lambda df: (df['day_1'] - df['STD']).abs() > MAX_STD_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,STD,STD.1,STD.2,STD.3,STD.4
6,B45 India1 750°C,0.096454,0.446217,0.86954,0.677327,1.133698,0.566334,0.726647,1.00678,4.819113,1.785445
7,B45 India3 750°C,0.038205,0.599065,1.241865,0.580173,0.329669,0.18952,1.963092,0.868001,0.555407,0.845567
8,B45 Loma Sur,0.255943,0.944233,1.852279,0.817249,2.212548,0.153793,1.293184,0.531298,1.382842,2.214808
34,Holcim 4 Brazil,0.422323,1.85502,1.560871,2.083866,1.091626,0.299234,0.428007,1.260419,0.830333,0.420147


In [52]:
# Clays whose 'day_3' std (from data_full) and 'STD.1' value (from data_merged)
# differ by more than MAX_STD_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
std_check.loc[lambda df: (df['day_3'] - df['STD.1']).abs() > MAX_STD_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,STD,STD.1,STD.2,STD.3,STD.4
6,B45 India1 750°C,0.096454,0.446217,0.86954,0.677327,1.133698,0.566334,0.726647,1.00678,4.819113,1.785445
7,B45 India3 750°C,0.038205,0.599065,1.241865,0.580173,0.329669,0.18952,1.963092,0.868001,0.555407,0.845567
8,B45 Loma Sur,0.255943,0.944233,1.852279,0.817249,2.212548,0.153793,1.293184,0.531298,1.382842,2.214808
34,Holcim 4 Brazil,0.422323,1.85502,1.560871,2.083866,1.091626,0.299234,0.428007,1.260419,0.830333,0.420147


In [53]:
# Clays whose 'day_7' std (from data_full) and 'STD.2' value (from data_merged)
# differ by more than MAX_STD_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
std_check.loc[lambda df: (df['day_7'] - df['STD.2']).abs() > MAX_STD_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,STD,STD.1,STD.2,STD.3,STD.4
6,B45 India1 750°C,0.096454,0.446217,0.86954,0.677327,1.133698,0.566334,0.726647,1.00678,4.819113,1.785445
7,B45 India3 750°C,0.038205,0.599065,1.241865,0.580173,0.329669,0.18952,1.963092,0.868001,0.555407,0.845567
8,B45 Loma Sur,0.255943,0.944233,1.852279,0.817249,2.212548,0.153793,1.293184,0.531298,1.382842,2.214808
19,F1,0.119324,0.335876,0.141421,0.230156,,0.119624,0.279811,1.342461,0.187921,
34,Holcim 4 Brazil,0.422323,1.85502,1.560871,2.083866,1.091626,0.299234,0.428007,1.260419,0.830333,0.420147


In [54]:
# Clays whose 'day_28' std (from data_full) and 'STD.3' value (from data_merged)
# differ by more than MAX_STD_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
std_check.loc[lambda df: (df['day_28'] - df['STD.3']).abs() > MAX_STD_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,STD,STD.1,STD.2,STD.3,STD.4
6,B45 India1 750°C,0.096454,0.446217,0.86954,0.677327,1.133698,0.566334,0.726647,1.00678,4.819113,1.785445
8,B45 Loma Sur,0.255943,0.944233,1.852279,0.817249,2.212548,0.153793,1.293184,0.531298,1.382842,2.214808
34,Holcim 4 Brazil,0.422323,1.85502,1.560871,2.083866,1.091626,0.299234,0.428007,1.260419,0.830333,0.420147
53,Pontezuela,0.208573,0.174581,0.236979,0.299016,0.583218,0.150033,0.236979,0.299016,0.583218,1.266177
58,South Africa,0.260358,0.397891,0.414056,0.406823,0.971274,0.260358,0.414056,0.406823,0.971274,0.46392


In [55]:
# Clays whose 'day_90' std (from data_full) and 'STD.4' value (from data_merged)
# differ by more than MAX_STD_DEVIATION in absolute value.
# A NaN on either side makes the comparison False, so such rows are not listed.
std_check.loc[lambda df: (df['day_90'] - df['STD.4']).abs() > MAX_STD_DEVIATION]

Unnamed: 0,Clay,day_1,day_3,day_7,day_28,day_90,STD,STD.1,STD.2,STD.3,STD.4
6,B45 India1 750°C,0.096454,0.446217,0.86954,0.677327,1.133698,0.566334,0.726647,1.00678,4.819113,1.785445
7,B45 India3 750°C,0.038205,0.599065,1.241865,0.580173,0.329669,0.18952,1.963092,0.868001,0.555407,0.845567
34,Holcim 4 Brazil,0.422323,1.85502,1.560871,2.083866,1.091626,0.299234,0.428007,1.260419,0.830333,0.420147
39,India 2,0.516205,1.201139,0.608383,0.768936,2.351415,0.516205,1.201139,0.608383,0.768936,2.012739
53,Pontezuela,0.208573,0.174581,0.236979,0.299016,0.583218,0.150033,0.236979,0.299016,0.583218,1.266177
58,South Africa,0.260358,0.397891,0.414056,0.406823,0.971274,0.260358,0.414056,0.406823,0.971274,0.46392
