In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

import datetime as dt

<IPython.core.display.Javascript object>

## Reading the dataset

In [3]:
xlsx = pd.ExcelFile(
    "../../../../data/raw/partner_iv/DB_Master_CP1_latest.xlsx", engine="openpyxl"
)

<IPython.core.display.Javascript object>

In [4]:
df_cem_a = pd.read_excel(xlsx, sheet_name="Cem A exp control", skiprows=[1, 2])
df_cem_a = df_cem_a[
    [col for col in df_cem_a.columns.fillna("Unnamed") if "Unnamed" not in col]
]

<IPython.core.display.Javascript object>

In [5]:
df_cem_b = pd.read_excel(xlsx, sheet_name="Cem B exp control", skiprows=[1, 2])
df_cem_b = df_cem_b[
    [col for col in df_cem_b.columns.fillna("Unnamed") if "Unnamed" not in col]
]

<IPython.core.display.Javascript object>

In [6]:
df_cem_c = pd.read_excel(xlsx, sheet_name="Cem c exp control", skiprows=[1, 2])
df_cem_c = df_cem_c[
    [col for col in df_cem_c.columns.fillna("Unnamed") if "Unnamed" not in col]
]

df_cem_c["data"] = pd.TimedeltaIndex(df_cem_c["data"], unit="d") + dt.datetime(
    1900, 1, 1
)

<IPython.core.display.Javascript object>

In [7]:
df = pd.concat([df_cem_a, df_cem_b, df_cem_c], ignore_index=True, axis=0)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [8]:
df["cem_type"] = (
    df["CEM A expedition control"]
    .combine_first(df["CEM B  expedition control"])
    .combine_first(df["CEM C  expedition control"])
)

<IPython.core.display.Javascript object>

### Removing features

Here we will drop some features that may not contribute to the performance of the models that will be fitted on this data.

1. Removing features with features having 60% or more of missing values
2. Removing features related to the properties of Cement
3. Removing features with zero variance
4. Removing features with more than 70% of zeros

<h3>Percentage of missing values:</h3>
    
<table>
    <th>Full Dataset:</th>
    <th></th>
    <tr>
        <td>silo</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 63 µm</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 40 µm</td>
        <td>100</td>
    </tr>
    <tr>
    <td>tratt. 32 µm</td>
    <td>100</td>
    </tr>
    <tr>    
    <td>7 days strenght</td>
    <td>97.952917</td>
    <tr/>
    <tr>
    <td>rihcl</td>
    <td>93.551689</td>
    <tr/>
    <tr>
    <td>LOI</td>
    <td>20.266121</td>
    <tr/>
</table>

<table>
    <th>CEM A</th>
    <th></th>
        <tr>
        <td>silo</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 63 µm</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 40 µm</td>
        <td>100</td>
    </tr>
    <tr>
    <td>tratt. 32 µm</td>
    <td>100</td>
    </tr>
    <tr>    
    <td>7 days strenght</td>
    <td>100.000000</td>
    <tr/>
    <tr>
    <td>rihcl</td>
    <td>80.434783</td>
    <tr/>
    <tr>
    <td>LOI</td>
    <td>19.875776</td>
    <tr/>
</table>

<table>
    <th>CEM B</th>
    <th></th>
        <tr>
        <td>silo</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 63 µm</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 40 µm</td>
        <td>100</td>
    </tr>
    <tr>
    <td>tratt. 32 µm</td>
    <td>100</td>
    </tr>
    <tr>    
    <td>7 days strenght</td>
    <td>100.000000</td>
    <tr/>
    <tr>
    <td>rihcl</td>
    <td>100.000000</td>
    <tr/>
    <tr>
    <td>LOI</td>
    <td>19.756839</td>
    <tr/>
</table>

<table>
    <th>CEM C</th>
    <th></th>
    <th></th>
        <tr>
        <td>silo</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 63 µm</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tratt. 40 µm</td>
        <td>100</td>
    </tr>
    <tr>
    <td>tratt. 32 µm</td>
    <td>100</td>
    </tr>
    <tr>    
    <td>7 days strenght</td>
    <td>93.865031</td>
    <tr/>
    <tr>
    <td>rihcl</td>
    <td>100.000000</td>
    <tr/>
    <tr>
    <td>LOI</td>
    <td>21.165644</td>
    <tr/>
</table>

In [9]:
FEATURES_TO_DROP_ALL_CEMENTS = [
    # Missing Values
    "silo",
    "tratt. 63 µm",
    "tratt. 40 µm",
    "tratt. 32 µm",
    "7 days strenght",
    "rihcl",
    # Missing values that cause troubles with cross validation techniques
    "LOI",
    # Features with more than 70% percent of repeated values (low variance)
    # Useless Features
    "CEM A expedition control",
    "CEM B  expedition control",
    "CEM C  expedition control",
    "sigla",
    "SampleName",
]

FEATURES_TO_DROP_CEM_A = [
    # Missing Values
    "tratt. 63 µm",
    "tratt. 40 µm",
    "tratt. 32 µm",
    "7 days strenght",
    "rihcl",
    # Missing values that cause troubles with cross validation techniques
    "LOI",
    # Useless Features
    "CEM A expedition control",
    "sigla",
    "SampleName",
]

FEATURES_TO_DROP_CEM_B = [
    # Missing Values
    "silo",
    "tratt. 63 µm",
    "tratt. 40 µm",
    "tratt. 32 µm",
    "7 days strenght",
    "rihcl",
    # Missing values that cause troubles with cross validation techniques
    "LOI",
    # Useless Features
    "CEM B  expedition control",
    "sigla",
    "SampleName",
]

FEATURES_TO_DROP_CEM_C = [
    # Missing Values
    "silo",
    "tratt. 63 µm",
    "tratt. 40 µm",
    "tratt. 32 µm",
    "7 days strenght",
    "rihcl",
    # Missing values that cause troubles with cross validation techniques
    "LOI",
    # Useless Features
    "CEM C  expedition control",
    "sigla",
    "SampleName",
]

<IPython.core.display.Javascript object>

In [10]:
df = df.drop(labels=FEATURES_TO_DROP_ALL_CEMENTS, axis=1)
df_cem_a = df_cem_a.drop(labels=FEATURES_TO_DROP_CEM_A, axis=1)
df_cem_b = df_cem_b.drop(labels=FEATURES_TO_DROP_CEM_B, axis=1)
df_cem_c = df_cem_c.drop(labels=FEATURES_TO_DROP_CEM_C, axis=1)

<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [11]:
df = df[df["28 days strenght"].notna()]
df_cem_a = df_cem_a[df_cem_a["28 days strenght"].notna()]
df_cem_b = df_cem_b[df_cem_b["28 days strenght"].notna()]
df_cem_c = df_cem_c[df_cem_c["28 days strenght"].notna()]

<IPython.core.display.Javascript object>

In [12]:
(df.isna().sum().sort_values(ascending=False) / df.shape[0] * 100).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
CO2_XRD,8.455115
SO3_XRD,8.455115
C3A cub,8.350731
Belite_beta,8.350731
Gesso,8.350731
"Aphthitalite – (K,Na)3(SO4)2",8.350731
Langbeinite – MgK2(SO4)2,8.350731
K2SO4,8.350731
Quartz,8.350731
Periclasio (MgO),8.350731


<IPython.core.display.Javascript object>

In [13]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
data,958.0,2021-04-20 12:34:34.321503232,2019-10-07 00:00:00,2020-07-15 12:00:00,2021-04-22 12:00:00,2022-01-21 00:00:00,2022-10-21 00:00:00,
SO3,957.0,2.969927,1.77,2.7,2.96,3.23,3.91,0.353675
Cl-,958.0,0.049674,0.011,0.022,0.054,0.07,0.099,0.023952
Blaine,955.0,3514.092147,2654.0,3335.0,3499.0,3666.0,4771.0,274.100021
ph2oimm,957.0,28.44232,24.6,28.0,28.4,29.0,31.0,0.688486
Initial Setting Time,953.0,214.994753,115.0,190.0,210.0,230.0,390.0,37.538145
Soundness,944.0,0.725636,0.0,0.0,1.0,1.0,3.0,0.617816
flow,951.0,107.711882,78.0,98.0,110.0,118.0,132.0,11.715085
2 days strenght,958.0,28.300939,17.9,24.4,29.4,31.9,37.3,4.574196
28 days strenght,958.0,49.042276,34.4,42.5,50.8,54.1,65.4,6.617852


<IPython.core.display.Javascript object>

In [14]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
data,958,,,,2021-04-20 12:34:34.321503232,2019-10-07 00:00:00,2020-07-15 12:00:00,2021-04-22 12:00:00,2022-01-21 00:00:00,2022-10-21 00:00:00
cem_type,958,3.0,CEM B,323.0,,,,,,


<IPython.core.display.Javascript object>

In [15]:
(df_cem_a.isna().sum().sort_values(ascending=False) / df_cem_a.shape[0] * 100).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
C3A cub,8.571429
K2SO4,8.571429
C4AF,8.571429
CaO,8.571429
Ca(OH)2,8.571429
Calce libera,8.571429
Periclasio (MgO),8.571429
Quartz,8.571429
Langbeinite – MgK2(SO4)2,8.571429
C3A_ortho,8.571429


<IPython.core.display.Javascript object>

In [16]:
df_cem_a.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
data,315.0,2021-04-27 03:53:08.571428608,2019-10-25 00:00:00,2020-07-26 00:00:00,2021-05-03 00:00:00,2022-01-23 12:00:00,2022-10-19 00:00:00,
SO3,315.0,2.612952,1.77,2.48,2.62,2.75,2.98,0.189969
Cl-,315.0,0.022038,0.011,0.016,0.018,0.023,0.07,0.009984
Blaine,314.0,3482.678344,2703.0,3314.75,3488.5,3645.75,4316.0,256.813868
ph2oimm,314.0,28.465605,24.6,28.0,28.4,28.8,31.0,0.674048
Initial Setting Time,313.0,212.891374,140.0,190.0,210.0,230.0,345.0,33.239048
Soundness,312.0,0.615385,0.0,0.0,1.0,1.0,3.0,0.560908
flow,312.0,112.00641,88.0,100.0,115.0,122.0,132.0,12.194299
2 days strenght,315.0,30.630794,24.7,28.9,30.6,32.6,37.2,2.592217
28 days strenght,315.0,54.311111,46.6,51.9,54.2,56.3,65.4,3.394315


<IPython.core.display.Javascript object>

In [17]:
df_cem_a.describe(exclude="number").T

Unnamed: 0,count,mean,min,25%,50%,75%,max
data,315,2021-04-27 03:53:08.571428608,2019-10-25 00:00:00,2020-07-26 00:00:00,2021-05-03 00:00:00,2022-01-23 12:00:00,2022-10-19 00:00:00


<IPython.core.display.Javascript object>

In [18]:
(df_cem_b.isna().sum().sort_values(ascending=False) / df_cem_b.shape[0] * 100).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
C3A cub,8.049536
K2SO4,8.049536
C4AF,8.049536
CaO,8.049536
Ca(OH)2,8.049536
Calce libera,8.049536
Periclasio (MgO),8.049536
Quartz,8.049536
Langbeinite – MgK2(SO4)2,8.049536
C3A_ortho,8.049536


<IPython.core.display.Javascript object>

In [19]:
df_cem_b.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
data,323.0,2021-04-14 02:13:44.767801856,2019-10-07 00:00:00,2020-07-07 00:00:00,2021-04-19 00:00:00,2022-01-18 00:00:00,2022-10-19 00:00:00,
SO3,322.0,3.234224,2.48,3.04,3.255,3.4575,3.91,0.301798
Cl-,323.0,0.059226,0.015,0.05,0.061,0.071,0.094,0.016461
Blaine,321.0,3402.713396,2757.0,3248.0,3378.0,3505.0,4771.0,236.431779
ph2oimm,323.0,28.817028,25.4,28.4,28.8,29.2,30.6,0.575796
Initial Setting Time,322.0,205.295031,115.0,180.0,195.0,225.0,330.0,36.410047
Soundness,316.0,0.813291,0.0,0.0,1.0,1.0,2.0,0.642377
flow,321.0,107.538941,85.0,98.0,110.0,117.0,128.0,11.021536
2 days strenght,323.0,31.477709,23.3,29.8,31.4,33.1,37.3,2.393677
28 days strenght,323.0,52.093808,45.0,50.15,52.0,54.0,59.6,2.625359


<IPython.core.display.Javascript object>

In [20]:
df_cem_b.describe(exclude="number").T

Unnamed: 0,count,mean,min,25%,50%,75%,max
data,323,2021-04-14 02:13:44.767801856,2019-10-07 00:00:00,2020-07-07 00:00:00,2021-04-19 00:00:00,2022-01-18 00:00:00,2022-10-19 00:00:00


<IPython.core.display.Javascript object>

In [21]:
(df_cem_c.isna().sum().sort_values(ascending=False) / df_cem_c.shape[0] * 100).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
CO2_XRD,8.75
SO3_XRD,8.75
K2SO4,8.4375
C4AF,8.4375
CaO,8.4375
Ca(OH)2,8.4375
Calce libera,8.4375
Periclasio (MgO),8.4375
Quartz,8.4375
Langbeinite – MgK2(SO4)2,8.4375


<IPython.core.display.Javascript object>

In [22]:
df_cem_c.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
data,320.0,2021-04-20 11:33:00,2019-10-16 00:00:00,2020-07-16 12:00:00,2021-04-23 12:00:00,2022-01-21 12:00:00,2022-10-21 00:00:00,
SO3,320.0,3.055375,2.5,2.91,3.07,3.2,3.49,0.209815
Cl-,320.0,0.067237,0.021,0.06,0.07,0.076,0.099,0.013911
Blaine,320.0,3656.64375,2654.0,3499.75,3620.5,3785.0,4389.0,264.68623
ph2oimm,320.0,28.04125,26.4,27.6,28.0,28.4,31.0,0.579686
Initial Setting Time,318.0,226.886792,160.0,200.0,220.0,240.0,390.0,39.498749
Soundness,316.0,0.746835,0.0,0.0,1.0,1.0,2.0,0.63182
flow,318.0,103.672956,78.0,93.0,106.0,112.0,125.0,10.403831
2 days strenght,320.0,22.800937,17.9,21.4,22.5,24.4,28.5,2.10916
28 days strenght,320.0,40.775625,34.4,38.6,40.65,42.5,49.1,2.774328


<IPython.core.display.Javascript object>

In [23]:
df_cem_c.describe(exclude="number").T

Unnamed: 0,count,mean,min,25%,50%,75%,max
data,320,2021-04-20 11:33:00,2019-10-16 00:00:00,2020-07-16 12:00:00,2021-04-23 12:00:00,2022-01-21 12:00:00,2022-10-21 00:00:00


<IPython.core.display.Javascript object>

### Sort the dataset by date

In [24]:
df = df.sort_values(by="data")
df_cem_a = df_cem_a.sort_values(by="data")
df_cem_b = df_cem_b.sort_values(by="data")
df_cem_c = df_cem_c.sort_values(by="data")

<IPython.core.display.Javascript object>

### Renaming some features

The key idea here is to create a pattern among different datasets in order to facilitate reusing the same code

In [25]:
features_name_dict = {
    "data": "Date",
    "2 days strenght": "CS2",
    "28 days strenght": "CS28",
}

df = df.rename(features_name_dict, axis=1)
df_cem_a = df_cem_a.rename(features_name_dict, axis=1)
df_cem_b = df_cem_b.rename(features_name_dict, axis=1)
df_cem_c = df_cem_c.rename(features_name_dict, axis=1)

<IPython.core.display.Javascript object>

### Grouping and computing mean daily of features

Datasets with various cements we group by date and cement type
Datasets with a single type of cement we compute the mean only.

In [26]:
print(df.shape)
print(df_cem_a.shape)
print(df_cem_b.shape)
print(df_cem_c.shape)

(958, 37)
(315, 36)
(323, 36)
(320, 36)


<IPython.core.display.Javascript object>

In [27]:
# df = df.groupby(["Date", "cem_type"]).mean().reset_index()
# df_cem_a = df_cem_a.groupby(["Date"]).mean().reset_index()
# df_cem_b = df_cem_b.groupby(["Date"]).mean().reset_index()
# df_cem_c = df_cem_c.groupby(["Date"]).mean().reset_index()

<IPython.core.display.Javascript object>

In [28]:
# print(df.shape)
# print(df_cem_a.shape)
# print(df_cem_b.shape)
# print(df_cem_c.shape)

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [29]:
df.to_csv("../../../../data/interim/partner_iv/cement-shipping.csv", index=False)
df_cem_a.to_csv("../../../../data/interim/partner_iv/cem_a.csv", index=False)
df_cem_b.to_csv("../../../../data/interim/partner_iv/cem_b.csv", index=False)
df_cem_c.to_csv("../../../../data/interim/partner_iv/cem_c.csv", index=False)

<IPython.core.display.Javascript object>