In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

## Reading the dataset

In [3]:
df = pd.read_csv("../../../../data/raw/partner_ii/db3_cement.csv")

<IPython.core.display.Javascript object>

# Removing Useless Features

Here we will drop some features that may not contribute to the performance of the models that will be fitted on this data.

<h3>Percentage of missing values:</h3>

<table>
    <th>Full Dataset:</th>
    <th> </th>
    <tr>
        <td>cl</td>
        <td>99.91</td>
    </tr>
    <tr>
        <td>tio2</td>
        <td>99.91</td>
    </tr>        
</table>

<table>
    <th>Type I-II</th>
    <th></th>
        <tr>
        <td>cl</td>
        <td>99.9</td>
    </tr>
    <tr>
        <td>tio2</td>
        <td>99.9</td>
    </tr>  
    
<table>
    <th>Type III</th>
    <th></th>
        <tr>
        <td>cl</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tio2</td>
        <td>100</td>
    </tr> 
    
<table>
    <th>Type IL</th>
    <th></th>
        <tr>
        <td>cl</td>
        <td>100</td>
    </tr>
    <tr>
        <td>tio2</td>
        <td>100</td>
    </tr> 
</table>

In [4]:
FEATURES_TO_DROP = [
    # Missing Values
    "cl",
    "tio2",
    # Useless Features
    "sample_id_plant",
    "raw_material_code",
]

<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [5]:
df = df[df["strength_28d"].notna()].reset_index(drop=True)

<IPython.core.display.Javascript object>

## Convert Date to format Date Time

In [6]:
df["sample_date"] = pd.to_datetime(df["sample_date"])

<IPython.core.display.Javascript object>

In [7]:
df_type_i_ii = (
    df[df["unified_material_name"] == "Type I-II"].reset_index(drop=True).copy()
)
df_type_iii = (
    df[df["unified_material_name"] == "Type III"].reset_index(drop=True).copy()
)
df_type_il = df[df["unified_material_name"] == "Type IL"].reset_index(drop=True).copy()

<IPython.core.display.Javascript object>

### Removing features with features having 30% or more of missing values

### Removing features with zero variance

### Removing features with more than 70% of zeros

In [8]:
df = df.drop(labels=FEATURES_TO_DROP, axis=1)
df_type_i_ii = df_type_i_ii.drop(
    labels=FEATURES_TO_DROP + ["unified_material_name"], axis=1
)
df_type_iii = df_type_iii.drop(
    labels=FEATURES_TO_DROP + ["unified_material_name"], axis=1
)
df_type_il = df_type_il.drop(
    labels=FEATURES_TO_DROP + ["unified_material_name"], axis=1
)

<IPython.core.display.Javascript object>

In [9]:
(df.isna().sum().sort_values(ascending=False) / df.shape[0] * 100).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
aluminate,2.932829
ferrite,2.932829
belite,2.932829
alite,2.932829
free_lime,2.554399
so3,1.135289
k2o,1.135289
na2o,1.135289
mgo,1.135289
fe2o3,1.135289


<IPython.core.display.Javascript object>

In [10]:
(
    df_type_i_ii.isna().sum().sort_values(ascending=False) / df_type_i_ii.shape[0] * 100
).to_frame(name="Missing (%)").style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
ferrite,2.857143
belite,2.857143
aluminate,2.857143
alite,2.857143
free_lime,2.44898
fe2o3,1.22449
mgo,1.22449
so3,1.22449
k2o,1.22449
na2o,1.22449


<IPython.core.display.Javascript object>

In [11]:
(
    df_type_iii.isna().sum().sort_values(ascending=False) / df_type_iii.shape[0] * 100
).to_frame(name="Missing (%)").style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
ferrite,2.868852
belite,2.868852
aluminate,2.868852
alite,2.868852
free_lime,2.459016
fe2o3,0.819672
mgo,0.819672
so3,0.819672
k2o,0.819672
na2o,0.819672


<IPython.core.display.Javascript object>

In [12]:
(
    df_type_il.isna().sum().sort_values(ascending=False) / df_type_il.shape[0] * 100
).to_frame(name="Missing (%)").style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
ferrite,3.846154
belite,3.846154
free_lime,3.846154
aluminate,3.846154
alite,3.846154
mgo,1.282051
so3,1.282051
k2o,1.282051
na2o,1.282051
fe2o3,1.282051


<IPython.core.display.Javascript object>

In [13]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cao,1045.0,62.160699,0.503725,58.75,61.89,62.15,62.45,64.75
sio2,1045.0,19.142297,0.448662,16.79,19.03,19.23,19.38,20.15
al2o3,1045.0,4.676459,0.162138,3.97,4.57,4.69,4.8,5.04
fe2o3,1045.0,3.223694,0.099771,2.92,3.17,3.22,3.28,4.58
mgo,1045.0,3.005923,0.344269,1.76,2.84,3.01,3.19,3.99
so3,1045.0,3.019579,0.167625,2.57,2.89,2.94,3.11,3.58
k2o,1045.0,0.639053,0.052072,0.5,0.6,0.63,0.68,0.82
na2o,1045.0,0.078584,0.033967,0.0,0.05,0.07,0.11,0.17
alite,1026.0,57.011296,2.886962,32.64,55.37,57.545,58.8775,63.62
belite,1026.0,15.020175,2.089796,10.02,13.71,14.87,16.2775,31.18


<IPython.core.display.Javascript object>

In [14]:
df_type_i_ii.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cao,726.0,62.134077,0.487112,60.36,61.8925,62.15,62.44,64.65
sio2,726.0,19.253526,0.231384,18.37,19.09,19.26,19.39,20.0
al2o3,726.0,4.708994,0.136825,4.36,4.6,4.72,4.82,5.04
fe2o3,726.0,3.237865,0.075523,3.07,3.19,3.23,3.29,3.45
mgo,726.0,2.997603,0.314888,1.85,2.86,3.0,3.17,3.89
so3,726.0,2.932658,0.085731,2.57,2.88,2.91,2.95,3.22
k2o,726.0,0.637011,0.050115,0.53,0.6,0.63,0.67,0.78
na2o,726.0,0.079862,0.033076,0.01,0.05,0.075,0.11,0.17
alite,714.0,57.379916,2.488923,32.64,55.88,57.665,58.9275,63.3
belite,714.0,14.873978,2.069909,10.02,13.5725,14.66,15.9775,31.18


<IPython.core.display.Javascript object>

In [15]:
df_type_iii.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cao,242.0,62.215165,0.517907,60.89,61.88,62.14,62.4775,64.75
sio2,242.0,19.224545,0.297748,18.48,19.0325,19.25,19.41,20.15
al2o3,242.0,4.679628,0.130159,4.3,4.59,4.69,4.78,5.02
fe2o3,242.0,3.227149,0.084748,3.03,3.16,3.23,3.28,3.48
mgo,242.0,3.023347,0.343752,1.76,2.875,3.04,3.22,3.99
so3,242.0,3.257397,0.133685,2.72,3.17,3.22,3.39,3.58
k2o,242.0,0.655289,0.054347,0.52,0.62,0.64,0.69,0.82
na2o,242.0,0.082066,0.036002,0.01,0.05,0.075,0.11,0.16
alite,237.0,57.586245,2.456642,49.56,56.0,57.75,59.1,63.62
belite,237.0,15.359451,2.089176,10.1,14.13,15.36,16.72,21.83


<IPython.core.display.Javascript object>

In [16]:
df_type_il.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cao,77.0,62.240519,0.590615,58.75,62.03,62.23,62.54,63.42
sio2,77.0,17.835065,0.319805,16.79,17.62,17.87,18.08,18.87
al2o3,77.0,4.35974,0.130525,3.97,4.3,4.38,4.43,4.64
fe2o3,77.0,3.079221,0.190917,2.92,3.0,3.04,3.11,4.58
mgo,77.0,3.02961,0.55256,2.02,2.71,2.96,3.47,3.95
so3,77.0,3.091688,0.040827,2.93,3.07,3.09,3.11,3.18
k2o,77.0,0.607273,0.044945,0.5,0.57,0.61,0.64,0.75
na2o,77.0,0.055584,0.02668,0.0,0.04,0.06,0.07,0.13
alite,75.0,51.6852,2.321678,45.11,50.0,52.15,53.355,56.74
belite,75.0,15.339867,2.147599,10.55,13.93,15.12,16.845,21.67


<IPython.core.display.Javascript object>

In [17]:
df.shape

(1057, 25)

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [18]:
df = df.sort_values(by="sample_date")
df_type_i_ii = df_type_i_ii.sort_values(by="sample_date")
df_type_iii = df_type_iii.sort_values(by="sample_date")
df_type_il = df_type_il.sort_values(by="sample_date")

<IPython.core.display.Javascript object>

### Renaming some features

The key idea here is to create a pattern among different datasets in order to facilitate reusing the same code


In [19]:
features_name_dict = {
    "sample_date": "Date",
    "strength_1d": "CS1",
    "strength_3d": "CS3",
    "strength_7d": "CS7",
    "strength_28d": "CS28",
}

df = df.rename(features_name_dict, axis=1)
df_type_i_ii = df_type_i_ii.rename(features_name_dict, axis=1)
df_type_iii = df_type_iii.rename(features_name_dict, axis=1)
df_type_il = df_type_il.rename(features_name_dict, axis=1)

<IPython.core.display.Javascript object>

In [20]:
COLUMNS_TO_DROP = [
    "water_demand",
]

COLUMNS_TO_RENAME = {
    "sample_date": "Date",
    "unified_material_name": "Cement_Type",
    "cao": "CaO",
    "sio2": "SiO2",
    "al2o3": "Al2O3",
    "fe2o3": "Fe2O3",
    "mgo": "MgO",
    "so3": "SO3",
    "k2o": "K2O",
    "na2o": "Na2O",
    # "cl": "Cl-",
    "cl": "Cl",
    "tio2": "TiO2",
    "alite": "Alite",
    "belite": "Belite",
    "aluminate": "Aluminate",
    "ferrite": "Ferrite",
    "free_lime": "Free CaO",
    "loi": "Loss on Ignition",
    "water_demand": "Water Demand",
    "setting_initial": "Initial setting time",
    "strength_1d": "CS1",
    "strength_3d": "CS3",
    "strength_7d": "CS7",
    "strength_28d": "CS28",
    "blaine": "Blaine",
    # "sieve_32um": "#400",
    # "sieve_45um": "#325",
    "sieve_32um": "Sieve 32 um",
    "sieve_45um": "Sieve 45 um",
}

COLUMNS_TO_RENAME_2 = {
    "Alite": "Total C3S",
    "Belite": "Total C2S",
    "Aluminate": "C3A",
    "Ferrite": "C4AF",
}

<IPython.core.display.Javascript object>

In [21]:
df = (
    df.drop(COLUMNS_TO_DROP, axis=1)
    .rename(COLUMNS_TO_RENAME, axis=1)
    .rename(COLUMNS_TO_RENAME_2, axis=1)
    .copy()
)
df = df.drop([0]).reset_index(drop=True)
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [22]:
df.shape

(1056, 24)

<IPython.core.display.Javascript object>

In [23]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [24]:
df.columns

Index(['Date', 'Cement_Type', 'CaO', 'SiO2', 'Al2O3', 'Fe2O3', 'MgO', 'SO3',
       'K2O', 'Na2O', 'Total C3S', 'Total C2S', 'C3A', 'C4AF', 'Free CaO',
       'Loss on Ignition', 'Initial setting time', 'CS1', 'CS3', 'CS7', 'CS28',
       'Blaine', 'Sieve 32 um', 'Sieve 45 um'],
      dtype='object')

<IPython.core.display.Javascript object>

In [25]:
chemical_mineralogical_vars = [
    "CaO",
    "SiO2",
    "Al2O3",
    "Fe2O3",
    "MgO",
    "SO3",
    "K2O",
    "Na2O",
    "Total C3S",
    "Total C2S",
    "C3A",
    "C4AF",
    "Free CaO",
    "Loss on Ignition",
]

chemical_vars = [
    "CaO",
    "SiO2",
    "Al2O3",
    "Fe2O3",
    "MgO",
    "SO3",
    "K2O",
    "Na2O",
    "Free CaO",
    "Loss on Ignition",
]


df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_vars).index
].reset_index(drop=True)


df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_mineralogical_vars).index
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [26]:
df.shape

(1054, 24)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [27]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

CaO                     0.0
SiO2                    0.0
Sieve 32 um             0.0
Blaine                  0.0
CS28                    0.0
CS7                     0.0
CS3                     0.0
CS1                     0.0
Initial setting time    0.0
Loss on Ignition        0.0
Free CaO                0.0
C4AF                    0.0
C3A                     0.0
Total C2S               0.0
Total C3S               0.0
Na2O                    0.0
K2O                     0.0
SO3                     0.0
MgO                     0.0
Fe2O3                   0.0
Al2O3                   0.0
Sieve 45 um             0.0
dtype: float64

<IPython.core.display.Javascript object>

### Removing outliers

In [28]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,std,min,50%,99%,max
CaO,1044.0,62.160536,0.503939,58.75,62.15,63.37,64.75
SiO2,1044.0,19.142031,0.448795,16.79,19.23,19.8057,20.15
Al2O3,1044.0,4.676284,0.162116,3.97,4.69,4.9757,5.04
Fe2O3,1044.0,3.223697,0.099818,2.92,3.22,3.43,4.58
MgO,1044.0,3.00522,0.343682,1.76,3.01,3.8214,3.99
SO3,1044.0,3.019521,0.167695,2.57,2.935,3.44,3.58
K2O,1044.0,0.639013,0.052081,0.5,0.63,0.77,0.82
Na2O,1044.0,0.078592,0.033982,0.0,0.07,0.15,0.17
Total C3S,1025.0,57.01041,2.888232,32.64,57.54,62.6376,63.62
Total C2S,1025.0,15.020195,2.090817,10.02,14.87,20.6344,31.18


<IPython.core.display.Javascript object>

In [29]:
df = df.reset_index(drop=True)
df = df.drop(df["Sieve 32 um"].idxmax())

<IPython.core.display.Javascript object>

In [30]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,std,min,50%,99%,max
CaO,1043.0,62.160585,0.504178,58.75,62.15,63.37,64.75
SiO2,1043.0,19.143375,0.446903,16.79,19.23,19.8058,20.15
Al2O3,1043.0,4.676568,0.161933,3.97,4.69,4.9758,5.04
Fe2O3,1043.0,3.223969,0.099478,2.92,3.22,3.43,4.58
MgO,1043.0,3.005129,0.343835,1.76,3.01,3.8216,3.99
SO3,1043.0,3.019463,0.167765,2.57,2.93,3.44,3.58
K2O,1043.0,0.639012,0.052106,0.5,0.63,0.77,0.82
Na2O,1043.0,0.078591,0.033998,0.0,0.07,0.15,0.17
Total C3S,1024.0,57.017227,2.881382,32.64,57.545,62.6377,63.62
Total C2S,1024.0,15.019033,2.091507,10.02,14.87,20.6363,31.18


<IPython.core.display.Javascript object>

## Standardizing cement type identification

In [31]:
df["Cement_Type"].unique()

array(['Type I-II', 'Type III', 'Type IL'], dtype=object)

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [32]:
df = df.sort_values(by="Date")
df_type_i_ii = df_type_i_ii.sort_values(by="Date")
df_type_iii = df_type_iii.sort_values(by="Date")
df_type_il = df_type_il.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [33]:
df.to_csv("../../../../data/interim/partner_ii/cement-shipping.csv", index=False)
df_type_i_ii.to_csv("../../../../data/interim/partner_ii/Type-I-II.csv", index=False)
df_type_iii.to_csv("../../../../data/interim/partner_ii/Type-III.csv", index=False)
df_type_il.to_csv("../../../../data/interim/partner_ii/Type-IL.csv", index=False)

<IPython.core.display.Javascript object>