In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from scipy.stats import pearsonr
from collections import Counter

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    try:
        # If it's a string enclosed in single quotes, remove quotes and replace comma with dot
        if isinstance(value, str):
            return float(value.replace(",", "."))
        else:
            return float(value)
    except ValueError as e:
        # print(e)
        return float("nan")

<IPython.core.display.Javascript object>

In [4]:
def preprocess_numeric_cols(df, columns):
    for col in columns:
        df[col] = df[col].astype(str)  # Ensure string type for string operations
        df[col] = df[col].str.replace("#", "")  # Remove '#' characters
        df[col] = df[col].str.replace(",", ".")  # Replace ',' with '.' for decimals
    return df

<IPython.core.display.Javascript object>

# Data preparation - 209 plant AM data

### Reading the files and extracting relevant information:

In [5]:
plant = "AM"
xls_files = {}

xls = pd.ExcelFile(
    f"../../../../data/raw/209/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx",
    engine="openpyxl",
)
xls_files[plant] = xls

<IPython.core.display.Javascript object>

In [6]:
print("Plant: ", plant, xls.sheet_names)

Plant:  AM ['INSTRUÇÕES', 'Clínquer', '185', '96', '15', '22', '20', '57', '107', '64']


<IPython.core.display.Javascript object>

In [7]:
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[1, 2])

        if not df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all():
            cement_type = (
                df[CEMENT_TYPE_COLS]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
                .unique()
            )

        else:
            cement_type = (
                df[[("Obs.", "Unnamed: 8_level_1")]]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .unique()
            )

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [8]:
cement_types_per_plant

[('AM', '185', array(['CPIIF40'], dtype=object)),
 ('AM', '96', array(['CPIV32'], dtype=object)),
 ('AM', '15', array(['CPVARI'], dtype=object)),
 ('AM', '22', array(['CPIIF32'], dtype=object)),
 ('AM', '20', array(['CPIIFFIBRO40'], dtype=object)),
 ('AM', '57', array(['CPIIZ32'], dtype=object)),
 ('AM', '107', array(['CPIIZ40'], dtype=object)),
 ('AM', '64', array(['CPVARI'], dtype=object))]

<IPython.core.display.Javascript object>

In [9]:
cements = []
for tup in cement_types_per_plant:
    cements.append(tup[2][0])
Counter(cements)

Counter({'CPVARI': 2,
         'CPIIF40': 1,
         'CPIV32': 1,
         'CPIIF32': 1,
         'CPIIFFIBRO40': 1,
         'CPIIZ32': 1,
         'CPIIZ40': 1})

<IPython.core.display.Javascript object>

In [10]:
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[0, 1, 2])
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"] = plant
        dataframes.append(df.drop([0, 1], axis=0).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [11]:
df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [12]:
df = pd.concat(
    [
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"],
        df["Dados iniciais"],
        df["Cimento"]["Composição"].rename(
            {"Sulfato de cálcio": "Sulfato de cálcio (composição)"}, axis=1
        ),
        df["Cimento"]["Análise mineralógica"],
        df["Cimento"]["Análise química"],
        df["Característias físicas do cimento"],
        df["Propriedades do cimento"],
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

In [13]:
# df.info()

<IPython.core.display.Javascript object>

### 1. Dropping columns where either there is no data nor it contributes to the analysis
### 2. Renaming columns to a short identifiable name

In [14]:
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    # ("Unnamed: 139_level_1", "#400"),
]

COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Cidade", "Unnamed: 1_level_2"): "City",
    ("Data", "Produção"): "Date",  # Arrumar!
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
    "Clínquer": "Clinker",
    "Fíler calcário": "Calcareous filler",
    "Cinza volante": "Fly ash",
    "Escória": "Slag",
    "Argila calcinada": "Calcined clay",
    "Sulfato de cálcio": "Calcium sulfate",
    "C3S total": "Total C3S",
    "C2S total": "Total C2S",
    "C2S alpha": "Alpha C2S",
    "C2S beta": "Beta C2S",
    "C2S gama": "Gamma C2S",
    "C4AF": "C4AF",
    "C3A": "C3A",
    "C3A cubic": "Cubic C3A",
    "C3A orto": "Orthorhombic C3A",
    "CaO livre": "Free CaO",
    "Portlandita": "Portlandite",
    "Periclasio": "Periclase",
    "Arcanita": "Arcanite",
    "Aphthalita": "Aphthitalite",
    "Langbeinita": "Langbeinite",
    "Bassanita": "Bassanite",
    "Anidrita": "Anhydrite",
    "Calcita": "Calcite",
    "Dolomita": "Dolomite",
    "Quartzo": "Quartz",
    "Muscovita": "Muscovite",
    "Sulfato de cálcio (composição)": "Calcium sulfate (composition)",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [15]:
# Single variable for the Cement Type
df["Cement_Type"] = (
    df[
        [
            ("Tipo de cimento", "Unnamed: 6_level_2"),
            ("Classe de resistência", "Unnamed: 7_level_2"),
            ("Obs.", "Unnamed: 8_level_2"),
        ]
    ]
    .fillna("")
    .astype(str)
    .sum(axis=1)
)

# # Preprocessing
df["Cement_Type"] = df["Cement_Type"].replace("Fibro", "Fibrocimento")

# Preprocessing
df["Cement_Type"] = (
    df["Cement_Type"]
    .str.replace("Expedido", "")
    .str.replace("Linha 9", "")
    .str.replace("Itajaí", "")
    .str.replace("D-G", "")
    .str.replace("D-E", "")
    .str.replace("DF2", "")
    .str.replace("CPIIF40", "CP II-F-40")
    .str.replace("CPIIF40", "CP II-F40")
    .str.replace("CP VARI", "CP V-ARI")
    .str.replace("CPVARI", "CP V-ARI")
    .str.replace("CP V-RSARI", "CP V-ARI RS")
    .str.replace("CP III-RS40", "CP III-40 RS")
    .str.replace("CP III-RS32", "CP III-32 RS")
    .str.replace("CP IV-RS32", "CP IV-32 RS")
    .str.replace("CP III40", "CP III-40")
    .str.replace("CP III32", "CP III-32")
    .str.replace("CP I-S40", "CP I-S-40")
    .str.replace("CP I40", "CP I-40")
    .str.replace("CP II-E32", "CP II-E-32")
    .str.replace("CP II-E40", "CP II-E-40")
    .str.replace("CP II-F FIBRO40", "CP II-F-40")
    .str.replace("CP II-F32", "CP II-F-32")
    .str.replace("CP II-F40", "CP II-F-40")
    .str.replace("CP II-Z32", "CP II-Z-32")
    .str.replace("CP II-Z40", "CP II-Z-40")
    .str.replace("CP IV32", "CP IV-32")
    .str.replace("CP IND", "CP I")
    .str.replace("CPINDCC", "CP I")
    .str.strip()
)

df = df.drop(COLUMNS_TO_DROP, axis=1).rename(COLUMNS_TO_RENAME, axis=1).copy()
df = df.drop([0]).reset_index(drop=True)
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Dropping and rename the columns defined above

### Cleaning and converting numeric columns to float

In [16]:
NUMERIC_COLUMNS = [
    "Calcium sulfate (composition)",
    "Clinker",
    "Calcareous filler",
    "Fly ash",
    "Slag",
    "Calcined clay",
    "Calcium sulfate",
    "Total C3S",
    "Total C2S",
    "Alpha C2S",
    "Beta C2S",
    "Gamma C2S",
    "C4AF",
    "C3A",
    "Cubic C3A",
    "Orthorhombic C3A",
    "Free CaO",
    "Portlandite",
    "Periclase",
    "Arcanite",
    "Aphthitalite",
    "Langbeinite",
    "Bassanite",
    "Anhydrite",
    "Calcite",
    "Dolimita",
    "Quartz",
    "Muscovite",
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [17]:
index_to_keep = (
    df.drop(["Plant", "City", "Date", "Cement_Type"], axis=1)
    .dropna(axis=0, how="all")
    .index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features
 that will be fitted on this data.

1. Removing features with features having ~50% or more of missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

In [18]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
Calcite,100.0
Alpha C2S,100.0
Specific Gravity,100.0
TiO2,100.0
Muscovite,100.0
Arcanite,100.0
Gamma C2S,100.0
Calcium sulfate,100.0
Calcined clay,100.0
Slag,100.0


<IPython.core.display.Javascript object>

<h3>Percentage of missing values:</h3>
<table>
    <th>Plant AM - Full Dataset:</th>
    <th></th>
            <tr>
                <td>Calcite</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Alpha C2S</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Specific Gravity</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>TiO2</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Muscovite</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Arcanite</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Gamma C2S</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Calcium sulfate</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Calcined clay</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Slag</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Clinker</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Calcareous filler</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Fly ash</td>
                <td>100.0</td>
            </tr>
            <tr>
                <td>Beta C2S</td>
                <td>98.57</td>
            </tr>
            <tr>
                <td>Total C2S</td>
                <td>92.44</td>
            </tr>
            <tr>
                <td>Quartz</td>
                <td>92.44</td>
            </tr>
            <tr>
                <td>Anhydrite</td>
                <td>92.44</td>
            </tr>
            <tr>
                <td>C3A</td>
                <td>89.98</td>
            </tr>
            <tr>
                <td>Bassanite</td>
                <td>89.3</td>
            </tr>
            <tr>
                <td>Total C3S</td>
                <td>88.04</td>
            </tr>
            <tr>
                <td>Calcium sulfate (composition)</td>
                <td>87.87</td>
            </tr>
            <tr>
                <td>#400</td>
                <td>84.98</td>
            </tr>
            <tr>
                <td>Cubic C3A</td>
                <td>82.9</td>
            </tr>
            <tr>
                <td>Orthorhombic C3A</td>
                <td>82.9</td>
            </tr>
            <tr>
                <td>Dolimita</td>
                <td>81.45</td>
            </tr>
            <tr>
                <td>Aphthitalite</td>
                <td>81.33</td>
            </tr>
            <tr>
                <td>Periclase</td>
                <td>81.33</td>
            </tr>
            <tr>
                <td>Langbeinite</td>
                <td>81.33</td>
            </tr>
            <tr>
                <td>CS1</td>
                <td>76.09</td>
            </tr>
            <tr>
                <td>C4AF</td>
                <td>66.59</td>
            </tr>
            <tr>
                <td>Free CaO</td>
                <td>66.55</td>
            </tr>
            <tr>
                <td>Portlandite</td>
                <td>64.71</td>
            </tr>
</table>

In [19]:
FEATRUES_TO_DROP = [
    "Calcite",
    "Alpha C2S",
    "Specific Gravity",
    "TiO2",
    "Muscovite",
    "Arcanite",
    "Gamma C2S",
    "Calcium sulfate",
    "Calcined clay",
    "Slag",
    "Clinker",
    "Calcareous filler",
    "Fly ash",
    "Beta C2S",
    "Total C2S",
    "Quartz",
    "Anhydrite",
    "C3A",
    "Bassanite",
    "Total C3S",
    "Calcium sulfate (composition)",
    "#400",
    "Cubic C3A",
    "Orthorhombic C3A",
    "Dolimita",
    "Aphthitalite",
    "Periclase",
    "Langbeinite",
    "CS1",
    "C4AF",
    "Free CaO",
    "Portlandite",
]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Removing features with more than 23% of zeros

In [20]:
zero_values = {}
for col in df.select_dtypes(include="number").columns:
    zero_percentages = (df[df[col].eq(0)].shape[0] / df.shape[0]) * 100
    zero_values[col] = zero_percentages

zero_percentages = pd.Series(zero_values, name=f"Zero (%) for {plant}")
zero_percentages = zero_percentages.sort_values(ascending=False)
zero_percentages = zero_percentages.to_frame(name=f"Zero (%) for {plant}")
zero_percentages.style.background_gradient(cmap="Reds")

Unnamed: 0,Zero (%) for AM
#200,28.985507
Na2O,1.932367
#325,0.241546
Insoluble Residue,0.193237
K2O,0.024155
Loss on Ignition,0.024155
Blaine,0.024155
CS7,0.0
CS3,0.0
Final setting time,0.0


<IPython.core.display.Javascript object>

In [21]:
df.shape

(4140, 22)

<IPython.core.display.Javascript object>

<h3>Percentage of zero values:</h3>
<table>
    <th>Plant AM - Full Dataset:</th>
    <th></th>
    <tr>
        <td>#200</td>
        <td>28.99</td>
    </tr>
</table>

Here we drop th #200 with only 28.99% of zeros because deleting on the lnes the dataset with current shape of 4140 drops to 2773, which is about 66.98% of the full dataset, meaning that the zeros occurrence are not contiguous in the dataset.

In [22]:
FEATRUES_TO_DROP = ["#200"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping variable with City and Plant identification

In [23]:
FEATRUES_TO_DROP = ["City", "Plant"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is null

In [24]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is zero

In [25]:
df = df[~df["CS28"].eq(0)]

<IPython.core.display.Javascript object>

In [26]:
df.shape

(4025, 19)

<IPython.core.display.Javascript object>

In [27]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,4025.0,2022-04-21 07:42:56.795030784,2021-01-04 00:00:00,2021-09-01 00:00:00,2022-05-06 00:00:00,2022-12-07 00:00:00,2023-07-25 00:00:00,
CaO,4020.0,55.244476,32.0,52.369999,56.72681,59.19922,62.200001,4.803191
MgO,4021.0,5.938297,2.68,5.61,5.93,6.38888,8.57,0.598598
Na2O,4019.0,0.17528,-0.12633,0.06,0.16075,0.24,7.54,0.311532
Al2O3,4020.0,5.390938,3.88,4.35726,4.79214,6.07,11.41,1.39142
SiO2,4020.0,20.791825,16.53,17.85,18.93,22.966985,35.86,3.737064
SO3,4009.0,2.883638,1.35,2.57,2.87,3.22,4.32,0.455767
K2O,4019.0,1.069346,0.0,0.91689,1.05966,1.19,1.79,0.181096
Fe2O3,4020.0,2.969836,2.30147,2.694358,2.89,3.16,4.68,0.371775
Loss on Ignition,3995.0,5.608169,0.0,4.47,5.58,6.93,13.92,1.445409


<IPython.core.display.Javascript object>

In [28]:
# df_copy = df.copy()

<IPython.core.display.Javascript object>

### Dropping rows where any other variable has a zero value

In [29]:
for col in df.columns:
    df = df[~df[col].eq(0)]

<IPython.core.display.Javascript object>

In [30]:
df.shape

(3925, 19)

<IPython.core.display.Javascript object>

In [31]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
Loss on Ignition,0.764331
#325,0.764331
SO3,0.407643
Na2O,0.152866
K2O,0.152866
Al2O3,0.127389
SiO2,0.127389
Fe2O3,0.127389
CaO,0.127389
MgO,0.101911


<IPython.core.display.Javascript object>

### Dropping duplicated rows

<!-- <h3>Percentage of missing values:</h3>
<table>
    <th>Plant A - Full Dataset:</th>
    <th></th>
    <tr>
        <td>C3A</td>
        <td>38.39</td>
    </tr>
    <tr>
        <td>Aphthitalite</td>
        <td>37.52</td>
    </tr>
</table> -->

In [32]:
df.shape

(3925, 19)

<IPython.core.display.Javascript object>

In [33]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [34]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

chemical_vars = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
]

df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_vars).index
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [35]:
df.shape

(3857, 19)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [36]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

CaO                     0.0
Insoluble Residue       0.0
CS7                     0.0
CS3                     0.0
Final setting time      0.0
Initial setting time    0.0
#325                    0.0
Blaine                  0.0
Loss on Ignition        0.0
MgO                     0.0
Fe2O3                   0.0
K2O                     0.0
SO3                     0.0
SiO2                    0.0
Al2O3                   0.0
Na2O                    0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Removing outliers

In [37]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,3857.0,2022-04-20 06:06:15.213896960,2021-01-04 00:00:00,2022-05-10 00:00:00,2023-07-10 10:33:36,2023-07-25 00:00:00,
CaO,3854.0,55.199225,32.0,55.945,60.673501,62.200001,4.848797
MgO,3855.0,5.941118,2.68,5.93,7.2546,8.57,0.602642
Na2O,3853.0,0.17863,-0.12633,0.16973,0.45,7.54,0.316887
Al2O3,3854.0,5.405068,3.88,4.935,10.3647,11.41,1.405527
SiO2,3854.0,20.83122,16.53,19.38,33.2488,35.86,3.767593
SO3,3843.0,2.874147,1.35,2.86,3.8858,4.32,0.450475
K2O,3853.0,1.07149,0.04,1.06,1.58,1.79,0.180158
Fe2O3,3854.0,2.973302,2.30147,2.9,4.23,4.68,0.374168
Loss on Ignition,3830.0,5.619093,2.01,5.6,8.0371,12.86,1.433036


<IPython.core.display.Javascript object>

In [38]:
df = df.reset_index(drop=True)
df = df[df["Na2O"] < 0.5].reset_index(drop=True)
df = df[df["Insoluble Residue"] < 30].reset_index(drop=True)
df = df[df["#325"] < 20].reset_index(drop=True)
df = df.drop(df["Loss on Ignition"].idxmin()).reset_index(drop=True)

<IPython.core.display.Javascript object>

In [39]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,3776.0,2022-04-22 13:12:04.576271104,2021-01-04 00:00:00,2022-05-12 00:00:00,2023-07-10 06:00:00,2023-07-25 00:00:00,
CaO,3776.0,55.445056,32.0,57.159266,60.700001,62.200001,4.503489
MgO,3776.0,5.959343,2.68,5.94,7.2625,8.57,0.590602
Na2O,3776.0,0.16113,-0.12633,0.16,0.4,0.49898,0.105912
Al2O3,3776.0,5.327511,3.88,4.726715,10.09,10.97,1.279093
SiO2,3776.0,20.627858,16.53,18.790001,32.495,34.53,3.454666
SO3,3766.0,2.886735,1.58,2.87,3.89,4.32,0.439934
K2O,3776.0,1.064517,0.71,1.05,1.53,1.7,0.168933
Fe2O3,3776.0,2.953145,2.30147,2.88,4.15,4.68,0.343584
Loss on Ignition,3776.0,5.624797,2.18,5.61,8.0425,12.86,1.436693


<IPython.core.display.Javascript object>

In [40]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,3776.0,2022-04-22 13:12:04.576271104,2021-01-04 00:00:00,2021-09-08 00:00:00,2022-05-12 00:00:00,2022-12-05 00:00:00,2023-07-25 00:00:00,
CaO,3776.0,55.445056,32.0,52.470001,57.159266,59.227135,62.200001,4.503489
MgO,3776.0,5.959343,2.68,5.63,5.94,6.4,8.57,0.590602
Na2O,3776.0,0.16113,-0.12633,0.06,0.16,0.24,0.49898,0.105912
Al2O3,3776.0,5.327511,3.88,4.35,4.726715,6.03,10.97,1.279093
SiO2,3776.0,20.627858,16.53,17.848237,18.790001,22.879999,34.53,3.454666
SO3,3766.0,2.886735,1.58,2.57,2.87,3.21,4.32,0.439934
K2O,3776.0,1.064517,0.71,0.92,1.05,1.18,1.7,0.168933
Fe2O3,3776.0,2.953145,2.30147,2.693295,2.88,3.15,4.68,0.343584
Loss on Ignition,3776.0,5.624797,2.18,4.52,5.61,6.94,12.86,1.436693


<IPython.core.display.Javascript object>

In [41]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Date,3776,,,,2022-04-22 13:12:04.576271104,2021-01-04 00:00:00,2021-09-08 00:00:00,2022-05-12 00:00:00,2022-12-05 00:00:00,2023-07-25 00:00:00
Cement_Type,3776,6.0,CP II-Z-32,1179.0,,,,,,


<IPython.core.display.Javascript object>

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3776 entries, 0 to 3775
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  3776 non-null   datetime64[ns]
 1   CaO                   3776 non-null   float64       
 2   MgO                   3776 non-null   float64       
 3   Na2O                  3776 non-null   float64       
 4   Al2O3                 3776 non-null   float64       
 5   SiO2                  3776 non-null   float64       
 6   SO3                   3766 non-null   float64       
 7   K2O                   3776 non-null   float64       
 8   Fe2O3                 3776 non-null   float64       
 9   Loss on Ignition      3776 non-null   float64       
 10  Insoluble Residue     3776 non-null   float64       
 11  Blaine                3776 non-null   float64       
 12  #325                  3776 non-null   float64       
 13  Initial setting ti

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [43]:
# We first make sure Date column was infered correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [44]:
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [45]:
df.to_csv("../../../../data/interim/209/am.csv", index=False)

<IPython.core.display.Javascript object>