In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from scipy.stats import pearsonr
from collections import Counter

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    try:
        # If it's a string enclosed in single quotes, remove quotes and replace comma with dot
        if isinstance(value, str):
            return float(value.replace(",", "."))
        else:
            return float(value)
    except ValueError as e:
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

In [4]:
def preprocess_numeric_cols(df, columns):
    for col in columns:
        df[col] = df[col].astype(str)  # Ensure string type for string operations
        df[col] = df[col].str.replace("#", "")  # Remove '#' characters
        df[col] = df[col].str.replace(",", ".")  # Replace ',' with '.' for decimals
    return df

<IPython.core.display.Javascript object>

# Data preparation - 209 plant E data

### Reading the files and extracting relevant information:

In [5]:
plant = "E"
xls_files = {}

xls = pd.ExcelFile(
    f"../../../../data/raw/209/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx",
    engine="openpyxl",
)
xls_files[plant] = xls

<IPython.core.display.Javascript object>

In [6]:
print("Plant: ", plant, xls.sheet_names)

Plant:  E ['INSTRUÇÕES', 'Clínquer', '121', '178']


<IPython.core.display.Javascript object>

In [7]:
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[1, 2])

        if not df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all():
            cement_type = (
                df[CEMENT_TYPE_COLS]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
                .unique()
            )

        else:
            cement_type = (
                df[[("Obs.", "Unnamed: 8_level_1")]]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .unique()
            )

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [8]:
cement_types_per_plant

[('E', '121', array(['CPIIE40'], dtype=object)),
 ('E', '178', array(['CPIII32'], dtype=object))]

<IPython.core.display.Javascript object>

In [9]:
cements = []
for tup in cement_types_per_plant:
    cements.append(tup[2][0])
Counter(cements)

Counter({'CPIIE40': 1, 'CPIII32': 1})

<IPython.core.display.Javascript object>

In [10]:
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[0, 1, 2])
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"] = plant
        dataframes.append(df.drop([0, 1], axis=0).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [11]:
df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [12]:
df = pd.concat(
    [
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"],
        df["Dados iniciais"],
        df["Cimento"]["Composição"].rename(
            {"Sulfato de cálcio": "Sulfato de cálcio (composição)"}, axis=1
        ),
        df["Cimento"]["Análise mineralógica"],
        df["Cimento"]["Análise química"],
        df["Característias físicas do cimento"],
        df["Propriedades do cimento"],
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

In [13]:
# df.info()

<IPython.core.display.Javascript object>

### 1. Dropping columns where either there is no data nor it contributes to the analysis
### 2. Renaming columns to a short identifiable name

In [14]:
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    # ("Unnamed: 139_level_1", "#400"),
]

COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Cidade", "Unnamed: 1_level_2"): "City",
    ("Data", "Produção"): "Date",  # Arrumar!
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
    "Clínquer": "Clinker",
    "Fíler calcário": "Calcareous filler",
    "Cinza volante": "Fly ash",
    "Escória": "Slag",
    "Argila calcinada": "Calcined clay",
    "Sulfato de cálcio": "Calcium sulfate",
    "C3S total": "Total C3S",
    "C2S total": "Total C2S",
    "C2S alpha": "Alpha C2S",
    "C2S beta": "Beta C2S",
    "C2S gama": "Gamma C2S",
    "C4AF": "C4AF",
    "C3A": "C3A",
    "C3A cubic": "Cubic C3A",
    "C3A orto": "Orthorhombic C3A",
    "CaO livre": "Free CaO",
    "Portlandita": "Portlandite",
    "Periclasio": "Periclase",
    "Arcanita": "Arcanite",
    "Aphthalita": "Aphthitalite",
    "Langbeinita": "Langbeinite",
    "Bassanita": "Bassanite",
    "Anidrita": "Anhydrite",
    "Calcita": "Calcite",
    "Dolomita": "Dolomite",
    "Quartzo": "Quartz",
    "Muscovita": "Muscovite",
    "Sulfato de cálcio (composição)": "Calcium sulfate (composition)",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [15]:
# Single variable for the Cement Type
df["Cement_Type"] = (
    df[
        [
            ("Tipo de cimento", "Unnamed: 6_level_2"),
            ("Classe de resistência", "Unnamed: 7_level_2"),
            ("Obs.", "Unnamed: 8_level_2"),
        ]
    ]
    .fillna("")
    .astype(str)
    .sum(axis=1)
)

# # Preprocessing
df["Cement_Type"] = df["Cement_Type"].replace("Fibro", "Fibrocimento")

# Preprocessing
df["Cement_Type"] = (
    df["Cement_Type"]
    .str.replace("Expedido", "")
    .str.replace("Linha 9", "")
    .str.replace("Itajaí", "")
    .str.replace("D-G", "")
    .str.replace("D-E", "")
    .str.replace("DF2", "")
    .str.replace("CPIIF40", "CP II-F-40")
    .str.replace("CPIIF40", "CP II-F40")
    .str.replace("CP VARI", "CP V-ARI")
    .str.replace("CPVARI", "CP V-ARI")
    .str.replace("CP V-RSARI", "CP V-ARI RS")
    .str.replace("CP III-RS40", "CP III-40 RS")
    .str.replace("CP III-RS32", "CP III-32 RS")
    .str.replace("CP IV-RS32", "CP IV-32 RS")
    .str.replace("CP III40", "CP III-40")
    .str.replace("CP III32", "CP III-32")
    .str.replace("CP I-S40", "CP I-S-40")
    .str.replace("CP I40", "CP I-40")
    .str.replace("CP II-E32", "CP II-E-32")
    .str.replace("CP II-E40", "CP II-E-40")
    .str.replace("CP II-F FIBRO40", "CP II-F-40")
    .str.replace("CP II-F32", "CP II-F-32")
    .str.replace("CP II-F40", "CP II-F-40")
    .str.replace("CP II-Z32", "CP II-Z-32")
    .str.replace("CP II-Z40", "CP II-Z-40")
    .str.replace("CP IV32", "CP IV-32")
    .str.replace("CP IND", "CP I")
    .str.replace("CPINDCC", "CP I")
    .str.strip()
)

df = df.drop(COLUMNS_TO_DROP, axis=1).rename(COLUMNS_TO_RENAME, axis=1).copy()
df = df.drop([0]).reset_index(drop=True)
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Dropping and rename the columns defined above

### Cleaning and converting numeric columns to float

In [16]:
NUMERIC_COLUMNS = [
    "Calcium sulfate (composition)",
    "Clinker",
    "Calcareous filler",
    "Fly ash",
    "Slag",
    "Calcined clay",
    "Calcium sulfate",
    "Total C3S",
    "Total C2S",
    "Alpha C2S",
    "Beta C2S",
    "Gamma C2S",
    "C4AF",
    "C3A",
    "Cubic C3A",
    "Orthorhombic C3A",
    "Free CaO",
    "Portlandite",
    "Periclase",
    "Arcanite",
    "Aphthitalite",
    "Langbeinite",
    "Bassanite",
    "Anhydrite",
    "Calcite",
    "Dolimita",
    "Quartz",
    "Muscovite",
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string 

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [17]:
index_to_keep = (
    df.drop(["Plant", "City", "Date", "Cement_Type"], axis=1)
    .dropna(axis=0, how="all")
    .index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features
 that will be fitted on this data.

1. Removing features with features having ~50% or more of missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

In [18]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
CS1,100.0
Muscovite,100.0
Na2O,100.0
Calcium sulfate,100.0
Al2O3,100.0
SiO2,100.0
K2O,100.0
TiO2,100.0
Fe2O3,100.0
CaO,100.0


<IPython.core.display.Javascript object>

<h3>Percentage of missing values:</h3>
<table>
    <th>Plant E - Full Dataset:</th>
    <th></th>
	<tr>
            <td>CS1</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Muscovite</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Na2O</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Calcium sulfate</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Al2O3</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>SiO2</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>K2O</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>TiO2</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Fe2O3</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>CaO</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Calcined clay</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Specific Gravity</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Clinker</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Calcareous filler</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Fly ash</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Slag</td>
            <td>100.0</td>
	</tr>
	<tr>
            <td>Alpha C2S</td>
            <td>58.41</td>
	</tr>
	<tr>
            <td>Anhydrite</td>
            <td>58.41</td>
	</tr>
	<tr>
            <td>Total C2S</td>
            <td>58.41</td>
	</tr>
	<tr>
            <td>Calcium sulfate (composition)</td>
            <td>58.41</td>
	</tr>
	<tr>
            <td>Total C3S</td>
            <td>58.41</td>
	</tr>
	<tr>
            <td>Gamma C2S</td>
            <td>58.41</td>
	</tr>
	<tr>
            <td>Beta C2S</td>
            <td>58.41</td>
	</tr>
	<tr>
            <td>#400</td>
            <td>56.39</td>
	</tr>
	<tr>
            <td>MgO</td>
            <td>55.87</td>
	</tr>
	<tr>
            <td>#200</td>
            <td>45.71</td>
	</tr>
	<tr>
            <td>#325</td>
            <td>45.27</td>
	</tr>

</table>

In [19]:
FEATRUES_TO_DROP = [
    "CS1",
    "Muscovite",
    "Na2O",
    "Calcium sulfate",
    "Al2O3",
    "SiO2",
    "K2O",
    "TiO2",
    "Fe2O3",
    "CaO",
    "Calcined clay",
    "Specific Gravity",
    "Clinker",
    "Calcareous filler",
    "Fly ash",
    "Slag",
    "Alpha C2S",
    "Anhydrite",
    "Total C2S",
    "Calcium sulfate (composition)",
    "Total C3S",
    "Gamma C2S",
    "Beta C2S",
    "#400",
    "MgO",
    "#200",
    "#325",
]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Removing features with more than 23% of zeros

In [20]:
zero_values = {}
for col in df.select_dtypes(include="number").columns:
    zero_percentages = (df[df[col].eq(0)].shape[0] / df.shape[0]) * 100
    zero_values[col] = zero_percentages

zero_percentages = pd.Series(zero_values, name=f"Zero (%) for {plant}")
zero_percentages = zero_percentages.sort_values(ascending=False)
zero_percentages = zero_percentages.to_frame(name=f"Zero (%) for {plant}")
zero_percentages.style.background_gradient(cmap="Reds")

Unnamed: 0,Zero (%) for E
Orthorhombic C3A,22.591944
Arcanite,11.908932
Bassanite,9.894921
Dolimita,6.392294
SO3,3.327496
Langbeinite,2.101576
Aphthitalite,1.401051
Quartz,0.087566
C4AF,0.0
Insoluble Residue,0.0


<IPython.core.display.Javascript object>

<h3>Percentage of zero values:</h3>
<table>
    <th>Plant A - Full Dataset:</th>
    <th></th>
    <tr>
        <td>Orthorhombic C3A</td>
        <td>22.59</td>
    </tr>
    <tr>
        <td>Arcanite</td>
        <td>11.91</td>
    </tr>
    <tr>
        <td>Bassanite</td>
        <td>9.89</td>
    </tr>  
    <tr>
        <td></td>
        <td></td>
    </tr>
    <tr>
        <td></td>
        <td></td>
    </tr>
</table>

In [21]:
FEATRUES_TO_DROP = ["Orthorhombic C3A", "Arcanite", "Bassanite"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping variable with City and Plant identification

In [22]:
FEATRUES_TO_DROP = ["City", "Plant"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is null

In [23]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is zero

In [24]:
df = df[~df["CS28"].eq(0)]

<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [25]:
df.shape

(1101, 22)

<IPython.core.display.Javascript object>

In [26]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [27]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

chemical_vars = [
    "C4AF",
    "C3A",
    "Cubic C3A",
    "Free CaO",
    "Portlandite",
    "Periclase",
    "Aphthitalite",
    "Langbeinite",
    "Calcite",
    "Dolimita",
    "Quartz",
    "SO3",
    "Loss on Ignition",
    "Insoluble Residue",
]

df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_vars).index
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [28]:
df.shape

(1075, 22)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [29]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

C4AF                    0.0
C3A                     0.0
CS7                     0.0
CS3                     0.0
Final setting time      0.0
Initial setting time    0.0
Blaine                  0.0
Insoluble Residue       0.0
Loss on Ignition        0.0
SO3                     0.0
Quartz                  0.0
Dolimita                0.0
Calcite                 0.0
Langbeinite             0.0
Aphthitalite            0.0
Periclase               0.0
Portlandite             0.0
Free CaO                0.0
Cubic C3A               0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Removing outliers

In [30]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,1075.0,2022-05-10 14:13:17.023256064,2021-01-05 00:00:00,2022-05-27 00:00:00,2023-07-07 18:43:12,2023-07-24 00:00:00,
C4AF,900.0,5.777378,2.12,7.16,9.091,14.77,2.348809
C3A,900.0,1.724322,0.48,1.68,3.0404,3.31,0.772079
Cubic C3A,900.0,1.636289,0.4,1.62,2.9202,3.31,0.754876
Free CaO,900.0,0.884956,0.12,0.8,1.8101,2.34,0.410195
Portlandite,900.0,0.662267,0.05,0.59,1.5901,2.03,0.334826
Periclase,900.0,1.021356,0.39,1.175,1.7601,2.75,0.408798
Aphthitalite,900.0,0.465089,0.0,0.43,1.0601,1.38,0.336461
Langbeinite,900.0,0.245478,0.0,0.21,0.88,1.57,0.179059
Calcite,900.0,5.656556,0.39,5.635,9.0211,16.48,1.599148


<IPython.core.display.Javascript object>

In [31]:
df = df.reset_index(drop=True)
df = df[~df["Calcite"].eq(df["Calcite"].max())].reset_index(drop=True)

df = df[~df["Insoluble Residue"].eq(df["Insoluble Residue"].max())].reset_index(
    drop=True
)
df = df[~df["Final setting time"].eq(df["Final setting time"].min())].reset_index(
    drop=True
)

<IPython.core.display.Javascript object>

In [32]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,1072.0,2022-05-10 01:21:56.417910528,2021-01-05 00:00:00,2022-05-26 12:00:00,2023-07-07 20:52:48,2023-07-24 00:00:00,
C4AF,898.0,5.776258,2.12,7.16,9.093,14.77,2.34977
C3A,898.0,1.725757,0.48,1.685,3.0412,3.31,0.772324
Cubic C3A,898.0,1.637617,0.4,1.635,2.9206,3.31,0.755186
Free CaO,898.0,0.885546,0.12,0.8,1.8103,2.34,0.410428
Portlandite,898.0,0.662984,0.05,0.59,1.5903,2.03,0.334745
Periclase,898.0,1.020089,0.39,1.175,1.7503,2.75,0.405951
Aphthitalite,898.0,0.466091,0.0,0.43,1.0603,1.38,0.336162
Langbeinite,898.0,0.245679,0.0,0.21,0.88,1.57,0.179057
Calcite,898.0,5.646069,0.39,5.635,8.9036,11.28,1.558866


<IPython.core.display.Javascript object>

### Dropping rows where any other variable has a zero value

In [33]:
for col in df.columns:
    df = df[~df[col].eq(0)]

<IPython.core.display.Javascript object>

In [34]:
df.shape

(938, 22)

<IPython.core.display.Javascript object>

In [35]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
SO3,23.667377
Quartz,18.550107
Aphthitalite,18.550107
C4AF,18.550107
Calcite,18.550107
Langbeinite,18.550107
Dolimita,18.550107
Periclase,18.550107
Portlandite,18.550107
Free CaO,18.550107


<IPython.core.display.Javascript object>

## Dropping rows with increased missing values after preprocessing

<!-- <h3>Percentage of missing values:</h3>
<table>
    <th>Plant A - Full Dataset:</th>
    <th></th>
    <tr>
        <td>C3A</td>
        <td>38.39</td>
    </tr>
    <tr>
        <td>Aphthitalite</td>
        <td>37.52</td>
    </tr>
</table> -->

In [36]:
# FEATRUES_TO_DROP = ["C3A", "Aphthitalite"]
# df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

In [37]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
SO3,23.667377
Quartz,18.550107
Aphthitalite,18.550107
C4AF,18.550107
Calcite,18.550107
Langbeinite,18.550107
Dolimita,18.550107
Periclase,18.550107
Portlandite,18.550107
Free CaO,18.550107


<IPython.core.display.Javascript object>

In [38]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,938.0,2022-05-05 01:39:47.206822912,2021-01-05 00:00:00,2021-11-04 06:00:00,2022-05-20 12:00:00,2022-11-03 00:00:00,2023-07-24 00:00:00,
C4AF,764.0,6.030838,2.12,3.35,7.48,8.0025,10.18,2.34099
C3A,764.0,1.821034,0.5,1.02,1.97,2.5025,3.25,0.759454
Cubic C3A,764.0,1.728887,0.4,0.94,1.91,2.4,3.23,0.745505
Free CaO,764.0,0.925157,0.12,0.54,0.93,1.26,2.08,0.416037
Portlandite,764.0,0.67966,0.05,0.43,0.61,0.8625,2.03,0.341878
Periclase,764.0,1.065628,0.39,0.61,1.25,1.4,2.75,0.406334
Aphthitalite,764.0,0.506728,0.01,0.17,0.565,0.79,1.23,0.330905
Langbeinite,764.0,0.262788,0.01,0.14,0.23,0.34,1.57,0.178946
Calcite,764.0,5.708194,0.39,4.47,5.675,6.92,11.28,1.508423


<IPython.core.display.Javascript object>

In [39]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Date,938,,,,2022-05-05 01:39:47.206822912,2021-01-05 00:00:00,2021-11-04 06:00:00,2022-05-20 12:00:00,2022-11-03 00:00:00,2023-07-24 00:00:00
Cement_Type,938,2.0,CP II-E-40,596.0,,,,,,


<IPython.core.display.Javascript object>

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 938 entries, 0 to 1068
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  938 non-null    datetime64[ns]
 1   C4AF                  764 non-null    float64       
 2   C3A                   764 non-null    float64       
 3   Cubic C3A             764 non-null    float64       
 4   Free CaO              764 non-null    float64       
 5   Portlandite           764 non-null    float64       
 6   Periclase             764 non-null    float64       
 7   Aphthitalite          764 non-null    float64       
 8   Langbeinite           764 non-null    float64       
 9   Calcite               764 non-null    float64       
 10  Dolimita              764 non-null    float64       
 11  Quartz                764 non-null    float64       
 12  SO3                   716 non-null    float64       
 13  Loss on Ignition      92

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [41]:
# We first make sure Date column was infered correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [42]:
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [43]:
df.to_csv("../../../../data/interim/209/e.csv", index=False)

<IPython.core.display.Javascript object>