In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

## Reading the dataset

In [3]:
xls = pd.ExcelFile(
    "../../../../data/raw/partner_i-Oficial/DB_Master_CP1_latest v03.06.xlsx",
    engine="openpyxl",
)
df = pd.read_excel(xls, "DB3 Cimento Shipping", header=[1, 2])

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [4]:
df = pd.concat([df["Date"], df["Time"], df["Composition"], df["Properties"]], axis=1)
df = df.drop([0, 1, 2], axis=0).reset_index(drop=True)

df = pd.concat(
    [
        df["Production"].iloc[:, 0],
        df["Measurement"].iloc[:, 0],
        df["Production"].iloc[:, 1].rename("Production.1"),
        df["Measurement"].iloc[:, 1].rename("Measurement.1"),
        df.drop(["Production", "Measurement"], axis=1),
    ],
    axis=1,
)

df = df.infer_objects()

<IPython.core.display.Javascript object>

In [5]:
df_cpiif32 = df[df["Remarks"] == "CPIIF32 Expedido"].copy()
df_cpiif40 = df[df["Remarks"] == "CPIIF40 Expedido"].copy()
df_cpvari = df[df["Remarks"] == "CPVARI Expedido"].copy()

<IPython.core.display.Javascript object>

### Removing features

Here we will drop some features that may not contribute to the performance of the models that will be fitted on this data.

1. Removing features with features having 60% or more of missing values
2. Removing features related to the properties of Cement
3. Removing features with zero variance
4. Removing features with more than 70% of zeros

<h3>Percentage of missing values:</h3>
    
    
<table>
    <th>Full Dataset:</th>
    <th></th>
    <tr>
        <td>TiO2</td>
        <td>99.921011</td>
    </tr>
    <tr>
        <td>Ca(OH)2</td>
        <td>99.921011</td>
    </tr>        
    <tr>
        <td>CaCO3</td>
        <td>99.921011</td>
    </tr>
    <tr>
    <td>Langbeinite</td>
    <td>69.273302</td>
    </tr>
    <tr>    
    <td>1 day Compressive strength</td>
    <td>81.753555</td>
    <tr/>
</table>

<table>
    <th>CPIIF40</th>
    <th></th>
    <tr>
        <td>TiO2</td>
        <td>99.836334</td>
    </tr>
    <tr>
        <td>Ca(OH)2</td>
        <td>99.836334</td>
    </tr>
    <tr>        
        <td>CaCO3</td>
        <td>99.836334</td>
    </tr>
    <tr>
    <td>Langbeinite</td>
    <td>69.558101</td>
    </tr>
    <tr>
    <td>1 day Compressive strength</td>
    <td>100.000000</td>
    </tr>
</table>

<table>
    <th>CPIIF32</th>
    <th></th>
    <tr>
        <td>TiO2</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>Ca(OH)2</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>CaCO3</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>Langbeinite</td>
        <td>68.646081</td>
    </tr>
    <tr>
        <td>1 day Compressive strength</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>#400</td>
        <td>100.000000</td>
    </tr>
</table>

<table>
    <th>CPVARI</th>
    <th></th>
    <tr>
        <td>TiO2</td>
        <td>100.00000</td>
    </tr>
    <tr>
        <td>Ca(OH)2</td>
        <td>100.00000</td>
    </tr>
    <tr>
        <td>CaCO3</td>
        <td>100.00000</td>
    </tr>
    <tr>
        <td>Langbeinite</td>
        <td>69.65812</td>
    </tr>        
</table>

In [6]:
FEATURES_TO_DROP_ALL_CEMENTS = [
    # Missing Values
    "TiO2",
    "Ca(OH)2",
    "CaCO3",
    "Langbeinite",
    # Missing values that cause troubles with cross validation techniques
    "#400",
    "Dehydration",
    "Belite total",
    "Total alkali as Na2O",
    # Features with more than 70% percent of repeated values (low variance)
    "Soundness",
    "Muscovite",
    # Useless Features
    "Production.1",
    "Measurement.1",
    "Measurement",  # Both production and Measurement have the same info in this dataset
]

FEATURES_TO_DROP_CPIIF32 = FEATURES_TO_DROP_ALL_CEMENTS.copy() + [
    "Remarks",
    "1 day Compressive strength",
    "#400",
]
FEATURES_TO_DROP_CPIIF40 = FEATURES_TO_DROP_ALL_CEMENTS.copy() + [
    "Remarks",
    "1 day Compressive strength",
]
FEATURES_TO_DROP_CPVARI = FEATURES_TO_DROP_ALL_CEMENTS.copy() + ["Remarks"]
FEATURES_TO_DROP_ALL_CEMENTS += ["1 day Compressive strength"]

<IPython.core.display.Javascript object>

In [7]:
df = df.drop(labels=FEATURES_TO_DROP_ALL_CEMENTS, axis=1)
df_cpiif32 = df_cpiif32.drop(labels=FEATURES_TO_DROP_CPIIF32, axis=1)
df_cpiif40 = df_cpiif40.drop(labels=FEATURES_TO_DROP_CPIIF40, axis=1)
df_cpvari = df_cpvari.drop(labels=FEATURES_TO_DROP_CPVARI, axis=1)

<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [8]:
df = df[df["28 day Compressive strength"].notna()]
df_cpiif32 = df_cpiif32[df_cpiif32["28 day Compressive strength"].notna()]
df_cpiif40 = df_cpiif40[df_cpiif40["28 day Compressive strength"].notna()]
df_cpvari = df_cpvari[df_cpvari["28 day Compressive strength"].notna()]

<IPython.core.display.Javascript object>

In [9]:
(df.isna().sum() / df.shape[0] * 100).sort_values()

Production                     0.000000
28 day Compressive strength    0.000000
7 day Compressive strength     0.000000
3 day Compressive strength     0.000000
Density                        0.000000
Final setting time             0.000000
Initial setting time           0.000000
Remarks                        0.000000
#200                           0.000000
#325                           0.000000
Blaine                         0.081037
CaO                            0.567261
MgO                            0.567261
Na2O                           0.567261
Al2O3                          0.567261
SiO2                           0.567261
SO3                            0.567261
K2O                            0.567261
Fe2O3                          0.567261
IR                             0.729335
LOI                            0.729335
Aphthalite                     1.944895
Periclase                      1.944895
Portlandite                    1.944895
Alite total                    1.944895


<IPython.core.display.Javascript object>

In [10]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CaO,1227.0,59.717909,1.190905,51.316422,59.344755,59.866741,60.37315,62.99749
MgO,1227.0,2.918399,0.462436,1.64276,2.559535,2.91401,3.24801,4.52074
Na2O,1227.0,0.199705,0.02713,0.05623,0.18695,0.20133,0.213825,0.32538
Al2O3,1227.0,4.243171,0.194438,3.50343,4.137985,4.26463,4.36873,5.51224
SiO2,1227.0,18.339567,0.864886,14.3812,17.927916,18.510441,18.943265,20.58816
SO3,1227.0,3.734795,0.453015,1.96703,3.43871,3.79772,4.06398,5.57408
K2O,1227.0,1.19764,0.091108,0.66896,1.17165,1.21109,1.244095,1.45258
Fe2O3,1227.0,2.906824,0.195844,2.08781,2.787895,2.90333,3.038645,3.38856
LOI,1225.0,6.341706,2.466395,0.88,4.67,5.46,8.51,12.5
IR,1225.0,2.826751,1.16516,0.63,1.96,2.59,3.46,7.36


<IPython.core.display.Javascript object>

In [11]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq
Production,1234,652,03/08/2020,3
Remarks,1234,3,CPIIF40 Expedido,594


<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [12]:
df.shape

(1234, 43)

<IPython.core.display.Javascript object>

In [13]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [14]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

chemical_vars = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "Fe2O3",
    "LOI",
    "IR",
]

df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_vars).index
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [15]:
df.shape

(1234, 43)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [16]:
df[df.drop(["Production"], axis=1).duplicated()].drop(
    "Production", axis=1
).select_dtypes(include="number").apply(lambda x: x < 0).sum().sort_values(
    ascending=False
)

CaO                            0.0
Arcanite                       0.0
Gypsum                         0.0
Bassanite                      0.0
Anhydrite                      0.0
Calcite                        0.0
Dolimite                       0.0
Quartz                         0.0
%Gypsum                        0.0
%Limestone                     0.0
%Clinker                       0.0
Blaine                         0.0
Initial setting time           0.0
Final setting time             0.0
Density                        0.0
3 day Compressive strength     0.0
7 day Compressive strength     0.0
28 day Compressive strength    0.0
#200                           0.0
Aphthalite                     0.0
Periclase                      0.0
MgO                            0.0
Portlandite                    0.0
Na2O                           0.0
Al2O3                          0.0
SiO2                           0.0
SO3                            0.0
K2O                            0.0
Fe2O3               

<IPython.core.display.Javascript object>

### Removing outliers

In [17]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,std,min,50%,99%,max
CaO,1227.0,59.717909,1.190905,51.316422,59.866741,61.500757,62.99749
MgO,1227.0,2.918399,0.462436,1.64276,2.91401,3.92957,4.52074
Na2O,1227.0,0.199705,0.02713,0.05623,0.20133,0.266761,0.32538
Al2O3,1227.0,4.243171,0.194438,3.50343,4.26463,4.596647,5.51224
SiO2,1227.0,18.339567,0.864886,14.3812,18.510441,19.750427,20.58816
SO3,1227.0,3.734795,0.453015,1.96703,3.79772,4.632827,5.57408
K2O,1227.0,1.19764,0.091108,0.66896,1.21109,1.382183,1.45258
Fe2O3,1227.0,2.906824,0.195844,2.08781,2.90333,3.292707,3.38856
LOI,1225.0,6.341706,2.466395,0.88,5.46,12.0252,12.5
IR,1225.0,2.826751,1.16516,0.63,2.59,6.0952,7.36


<IPython.core.display.Javascript object>

In [18]:
df = df.reset_index(drop=True)
df = df.drop(df["Belite alpha"].idxmax())
df = df.drop(df["Aluminate"].idxmax())
df = df.drop(df["Aluminate orto"].idxmax())
df = df.drop(df["Aphthalite"].idxmax())
df = df.drop(df["Gypsum"].idxmax())
df = df.drop(df["#325"].idxmax())
df = df.drop(df["#325"].idxmax())


<IPython.core.display.Javascript object>

### Sort the dataset by date

In [19]:
df["Production"] = pd.to_datetime(df["Production"], format="%d/%m/%Y")
df = df.sort_values(by="Production")

df_cpiif32["Production"] = pd.to_datetime(df_cpiif32["Production"], format="%d/%m/%Y")
df_cpiif32 = df_cpiif32.sort_values(by="Production")

df_cpiif40["Production"] = pd.to_datetime(df_cpiif40["Production"], format="%d/%m/%Y")
df_cpiif40 = df_cpiif40.sort_values(by="Production")

df_cpvari["Production"] = pd.to_datetime(df_cpvari["Production"], format="%d/%m/%Y")
df_cpvari = df_cpvari.sort_values(by="Production")

<IPython.core.display.Javascript object>

### Renaming some features

The key idea here is to create a pattern among different datasets in order to facilitate reusing the same code

In [20]:
features_name_dict = {
    "Production": "Date",
    "1 day Compressive strength": "CS1",
    "3 day Compressive strength": "CS3",
    "7 day Compressive strength": "CS7",
    "28 day Compressive strength": "CS28",
}

df = df.rename(features_name_dict, axis=1)
df_cpiif32 = df_cpiif32.rename(features_name_dict, axis=1)
df_cpiif40 = df_cpiif40.rename(features_name_dict, axis=1)
df_cpvari = df_cpvari.rename(features_name_dict, axis=1)

<IPython.core.display.Javascript object>

### Renaming more features features

In [21]:
COLUMNS_TO_RENAME = {
    "Production": "Date",
    "Remarks": "Cement_Type",
    "LOI": "Loss on Ignition",
    "IR": "Insoluble Residue",
    "Total alkali as Na2O": "Total Alkali as Na2O",
    "Alite total": "Alite Total",
    "Belite total": "Total C2S",
    "Belite alpha": "Belite Alpha",
    "Belite beta": "Belite Beta",
    "Belite gamma": "Belite Gamma",
    "Ferrite": "C4AF",
    "Aluminate cubic": "Aluminate Cubic",
    "Aluminate orto": "Aluminate Orto",
    "Free lime": "Free CaO",
    "Dolimite": "Dolomite",
    "Aphthalite": "Aphthitalite",
    "1 day Compressive strength": "CS1",
    "3 day Compressive strength": "CS3",
    "7 day Compressive strength": "CS7",
    "28 day Compressive strength": "CS28",
}

COLUMNS_TO_RENAME_2 = {
    "Alite Total": "Total C3S",
    "Total C2S": "Total C2S",
    "Belite Alpha": "Alpha C2S",
    "Belite Beta": "Beta C2S",
    "Belite Gamma": "Gamma C2S",
    "C4AF": "C4AF",
    "Aluminate": "C3A",
    "Aluminate Cubic": "Cubic C3A",
    "Aluminate Orto": "Orthorhombic C3A",
    "Free lime": "Free CaO",
}

<IPython.core.display.Javascript object>

In [22]:
df = df.rename(COLUMNS_TO_RENAME, axis=1).rename(COLUMNS_TO_RENAME_2, axis=1).copy()
df = df.drop([0]).reset_index(drop=True)
df = df.infer_objects()

<IPython.core.display.Javascript object>

In [23]:
df.columns

Index(['Date', 'CaO', 'MgO', 'Na2O', 'Al2O3', 'SiO2', 'SO3', 'K2O', 'Fe2O3',
       'Loss on Ignition', 'Insoluble Residue', 'Total C3S', 'Alpha C2S',
       'Beta C2S', 'Gamma C2S', 'C4AF', 'C3A', 'Cubic C3A', 'Orthorhombic C3A',
       'Free CaO', 'Portlandite', 'Periclase', 'Arcanite', 'Aphthitalite',
       'Gypsum', 'Bassanite', 'Anhydrite', 'Calcite', 'Dolomite', 'Quartz',
       '%Gypsum', '%Limestone', '%Clinker', 'Cement_Type', 'Blaine',
       'Initial setting time', 'Final setting time', 'Density', 'CS3', 'CS7',
       'CS28', '#200', '#325'],
      dtype='object')

<IPython.core.display.Javascript object>

## Dropping features that might might relate to materials used in cement production rather than mineralogical analysis.

In [24]:
df = df.drop(["%Gypsum", "%Limestone", "%Clinker"], axis=1)

<IPython.core.display.Javascript object>

## Standardizing cement type identification

In [25]:
df["Cement_Type"].unique()

array(['CPIIF32 Expedido', 'CPVARI Expedido', 'CPIIF40 Expedido'],
      dtype=object)

<IPython.core.display.Javascript object>

In [26]:
# Preprocessing
df["Cement_Type"] = (
    df["Cement_Type"]
    .str.replace("Expedido", "")
    .str.replace("Linha 9", "")
    .str.replace("Itajaí", "")
    .str.replace("D-G", "")
    .str.replace("D-E", "")
    .str.replace("DF2", "")
    .str.replace("CPIIF40", "CP II-F-40")
    .str.replace("CPIIF40", "CP II-F-40")
    .str.replace("CP VARI", "CP V-ARI")
    .str.replace("CPV ARI", "CP V-ARI")
    .str.replace("CPVARI", "CP V-ARI")
    .str.replace("CP V-RSARI", "CP V-ARI RS")
    .str.replace("CPV ARI RS", "CP V-ARI RS")
    .str.replace("CP III-RS40", "CP III-40 RS")
    .str.replace("CPIII 40 RS", "CP III-40 RS")
    .str.replace("CP III-RS32", "CP III-32 RS")
    .str.replace("CPIV32RS", "CP IV-32 RS")
    .str.replace("CP IV-RS32", "CP IV-32 RS")
    .str.replace("CP III40", "CP III-40")
    .str.replace("CPIII40", "CP III-40")
    .str.replace("CP III32", "CP III-32")
    .str.replace("CP I-S40", "CP I-S-40")
    .str.replace("CP I40", "CP I-40")
    .str.replace("CP II-E32", "CP II-E-32")
    .str.replace("CP II E 32", "CP II-E-32")
    .str.replace("CP II E 32", "CP II-E-32")
    .str.replace("CP II-E40", "CP II-E-40")
    .str.replace("CP II-F FIBRO40", "CP II-F-40")
    .str.replace("CP II-F32", "CP II-F-32")
    .str.replace("CPII F 32", "CP II-F-32")
    .str.replace("CPII F32", "CP II-F-32")
    .str.replace("CPIIF32", "CP II-F-32")
    .str.replace("CP II-F40", "CP II-F-40")
    .str.replace("CPII F40", "CP II-F-40")
    .str.replace("CP II-Z32", "CP II-Z-32")
    .str.replace("CP II-Z40", "CP II-Z-40")
    .str.replace("CP IV32", "CP IV-32")
    .str.replace("CPIV 32", "CP IV-32")
    .str.replace("CP IND", "CP I")
    .str.replace("CPINDCC", "CP I")
    .str.strip()
)

<IPython.core.display.Javascript object>

In [27]:
df["Cement_Type"].unique()

array(['CP II-F-32', 'CP V-ARI', 'CP II-F-40'], dtype=object)

<IPython.core.display.Javascript object>

In [28]:
print(df.shape)
print(df_cpiif32.shape)
print(df_cpiif40.shape)
print(df_cpvari.shape)

(1226, 40)
(410, 42)
(594, 42)
(230, 43)


<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [29]:
df.to_csv(
    "../../../../data/interim/partner_i-Oficial/cement-shipping.csv", index=False
)
df_cpiif32.to_csv(
    "../../../../data/interim/partner_i-Oficial/cpiif32.csv", index=False
)
df_cpiif40.to_csv(
    "../../../../data/interim/partner_i-Oficial/cpiif40.csv", index=False
)
df_cpvari.to_csv("../../../../data/interim/partner_i-Oficial/cpvari.csv", index=False)

<IPython.core.display.Javascript object>

In [30]:
df.isna().sum() / df.shape[0] * 100

Date                    0.000000
CaO                     0.570962
MgO                     0.570962
Na2O                    0.570962
Al2O3                   0.570962
SiO2                    0.570962
SO3                     0.570962
K2O                     0.570962
Fe2O3                   0.570962
Loss on Ignition        0.734095
Insoluble Residue       0.734095
Total C3S               1.957586
Alpha C2S               1.957586
Beta C2S                1.957586
Gamma C2S               1.957586
C4AF                    1.957586
C3A                     1.957586
Cubic C3A               1.957586
Orthorhombic C3A        1.957586
Free CaO                1.957586
Portlandite             1.957586
Periclase               1.957586
Arcanite                1.957586
Aphthitalite            1.957586
Gypsum                  2.039152
Bassanite               2.039152
Anhydrite               2.039152
Calcite                 2.039152
Dolomite                2.039152
Quartz                  2.039152
Cement_Typ

<IPython.core.display.Javascript object>

In [31]:
df_cpiif32.isna().sum() / df_cpiif32.shape[0] * 100

Date                    0.000000
CaO                     0.000000
MgO                     0.000000
Na2O                    0.000000
Al2O3                   0.000000
SiO2                    0.000000
SO3                     0.000000
K2O                     0.000000
Fe2O3                   0.000000
LOI                     0.731707
IR                      0.731707
Alite total             2.195122
Belite alpha            2.195122
Belite beta             2.195122
Belite gamma            2.195122
Ferrite                 2.195122
Aluminate               2.195122
Aluminate cubic         2.195122
Aluminate orto          2.195122
Free lime               2.195122
Portlandite             2.195122
Periclase               2.195122
Arcanite                2.195122
Aphthalite              2.195122
Gypsum                  2.439024
Bassanite               2.439024
Anhydrite               2.439024
Calcite                 2.439024
Dolimite                2.439024
Quartz                  2.439024
%Gypsum   

<IPython.core.display.Javascript object>

In [32]:
df_cpiif40.isna().sum() / df_cpiif40.shape[0] * 100

Date                    0.000000
CaO                     0.168350
MgO                     0.168350
Na2O                    0.168350
Al2O3                   0.168350
SiO2                    0.168350
SO3                     0.168350
K2O                     0.168350
Fe2O3                   0.168350
LOI                     1.010101
IR                      1.010101
Alite total             2.020202
Belite alpha            2.020202
Belite beta             2.020202
Belite gamma            2.020202
Ferrite                 2.020202
Aluminate               2.020202
Aluminate cubic         2.020202
Aluminate orto          2.020202
Free lime               2.020202
Portlandite             2.020202
Periclase               2.020202
Arcanite                2.020202
Aphthalite              2.020202
Gypsum                  2.020202
Bassanite               2.020202
Anhydrite               2.020202
Calcite                 2.020202
Dolimite                2.020202
Quartz                  2.020202
%Gypsum   

<IPython.core.display.Javascript object>

In [33]:
df_cpvari.isna().sum() / df_cpvari.shape[0] * 100

Date                    0.000000
CaO                     2.608696
MgO                     2.608696
Na2O                    2.608696
Al2O3                   2.608696
SiO2                    2.608696
SO3                     2.608696
K2O                     2.608696
Fe2O3                   2.608696
LOI                     0.000000
IR                      0.000000
Alite total             1.304348
Belite alpha            1.304348
Belite beta             1.304348
Belite gamma            1.304348
Ferrite                 1.304348
Aluminate               1.304348
Aluminate cubic         1.304348
Aluminate orto          1.304348
Free lime               1.304348
Portlandite             1.304348
Periclase               1.304348
Arcanite                1.304348
Aphthalite              1.304348
Gypsum                  1.304348
Bassanite               1.304348
Anhydrite               1.304348
Calcite                 1.304348
Dolimite                1.304348
Quartz                  1.304348
%Gypsum   

<IPython.core.display.Javascript object>