In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    try:
        # If it's a string enclosed in single quotes, remove quotes and replace comma with dot
        if isinstance(value, str):
            return float(value.replace(",", "."))
        else:
            return float(value)
    except ValueError as e:
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

## Reading the dataset

In [4]:
plant = "AV"

xls = pd.ExcelFile(
    f"../../../../data/raw/203/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx",
    engine="openpyxl",
)

<IPython.core.display.Javascript object>

In [5]:
dataframes = []

for sheet_name in xls.sheet_names[1:]:
    df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
    dataframes.append(df.drop([0, 1], axis=0).reset_index(drop=True).copy())

df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [6]:
df = pd.concat(
    [
        df["Dados iniciais"],
        df["Cimento"]["Análise química"],
        df["Característias físicas do cimento"],
        df["Propriedades do cimento"],
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

### 1. Dropping columns where either there is no data nor it contributes to the analysis
### 2. Renaming columns to a short identifiable name 

In [7]:
COLUMNS_TO_DROP = [
    ("Data", "Produção"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    ("Unnamed: 139_level_1", "#400"),
]

COLUMNS_TO_RENAME = {
    ("Cidade", "Unnamed: 1_level_2"): "Plant",
    ("Data", "Medida"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [8]:
df["Cement_Type"] = (
    df[
        [
            ("Tipo de cimento", "Unnamed: 6_level_2"),
            ("Classe de resistência", "Unnamed: 7_level_2"),
            ("Obs.", "Unnamed: 8_level_2"),
        ]
    ]
    .fillna("")
    .astype(str)
    .sum(axis=1)
)

<IPython.core.display.Javascript object>

### Replacing the current cement type categories for a short identifiable category

In [9]:
# Preprocessing
df["Cement_Type"] = df["Cement_Type"].replace("FIBROCMENTO", "Fibrocimento")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIEnsacado", "CP VARI")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIGranel", "CP VARI")
df["Cement_Type"] = df["Cement_Type"].replace("CP III40Ensacado", "CP III40")
df["Cement_Type"] = df["Cement_Type"].replace("CP III40Granel", "CP III40")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIRS", "CP VARI")

<IPython.core.display.Javascript object>

### Dropping and rename the columns defined above

In [10]:
df = df.drop(COLUMNS_TO_DROP, axis=1).rename(COLUMNS_TO_RENAME, axis=1).copy()
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Clean and converting numeric columns to float

In [11]:
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

<IPython.core.display.Javascript object>

### Converting Date columns to pandas datetime

In [12]:
df["Date"] = pd.to_datetime(df["Date"])

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [13]:
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features
 that will be fitted on this data.

1. Removing features with features having 60% or more of missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

<h3>Percentage of missing values:</h3>
    
    
<table>
    <th>Plant AV - Full Dataset:</th>
    <th></th>
    <tr>
        <td>Specific Gravity</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>TiO2</td>
        <td>100.00</td>
    </tr>        
</table>

In [14]:
FEATRUES_TO_DROP = ["Specific Gravity", "TiO2"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

In [15]:
zero_values = {}
for col in df.select_dtypes(include="number").columns:
    zero_percentages = (df[df[col].eq(0)].shape[0] / df.shape[0]) * 100
    zero_values[col] = zero_percentages

zero_percentages = pd.Series(zero_values, name=f"Zero (%) for Global Dataset")
zero_percentages = zero_percentages.sort_values(ascending=False)
zero_percentages = zero_percentages.to_frame(name=f"Zero (%) for Global Dataset")
zero_percentages.style.background_gradient(cmap="Reds")

Unnamed: 0,Zero (%) for Global Dataset
#200,0.129955
CaO,0.0
Blaine,0.0
CS7,0.0
CS3,0.0
CS1,0.0
Final setting time,0.0
Initial setting time,0.0
#325,0.0
Insoluble Residue,0.0


<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [16]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

In [17]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
Plant,0.0
Date,0.0
CS28,0.0
CS7,0.0
CS3,0.0
CS1,0.0
Final setting time,0.0
Initial setting time,0.0
#325,0.0
#200,0.0


<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [18]:
df.shape

(1510, 22)

<IPython.core.display.Javascript object>

In [19]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [20]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

chemical_vars = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
]

df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_vars).index
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [21]:
df.shape

(1509, 22)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [22]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

CaO                     0.0
Blaine                  0.0
CS7                     0.0
CS3                     0.0
CS1                     0.0
Final setting time      0.0
Initial setting time    0.0
#325                    0.0
#200                    0.0
Insoluble Residue       0.0
MgO                     0.0
Loss on Ignition        0.0
Fe2O3                   0.0
K2O                     0.0
SO3                     0.0
SiO2                    0.0
Al2O3                   0.0
Na2O                    0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Removing outliers

In [23]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,1509.0,2021-01-28 20:11:32.723658240,2018-02-07 14:00:00,2021-04-03 13:00:00,2023-03-13 17:00:00,2023-03-28 13:06:00,
CaO,1509.0,55.994831,46.25,55.36,61.7884,64.13,3.688712
MgO,1509.0,2.095262,0.64,2.11,4.3684,6.4,0.851172
Na2O,1509.0,0.116355,0.01,0.1,0.28,0.47,0.057133
Al2O3,1509.0,6.421133,4.87,6.4,8.7992,9.69,0.819284
SiO2,1509.0,22.929808,18.49,22.95,30.0036,30.6,2.691587
SO3,1509.0,3.065514,1.52,3.09,4.0792,4.48,0.484261
K2O,1509.0,0.595202,0.41,0.59,0.78,0.87,0.06616
Fe2O3,1509.0,2.206793,1.16,2.19,2.95,3.5,0.371016
Loss on Ignition,1509.0,4.975567,2.11,4.99,6.52,8.44,0.726746


<IPython.core.display.Javascript object>

In [24]:
df = df.drop([df["Final setting time"].idxmin()])

<IPython.core.display.Javascript object>

In [25]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,1508.0,2021-01-28 18:44:59.164456448,2018-02-07 14:00:00,2021-04-02 13:02:00,2023-03-13 17:00:00,2023-03-28 13:06:00,
CaO,1508.0,56.000902,46.25,55.365,61.7886,64.13,3.682387
MgO,1508.0,2.093674,0.64,2.11,4.3493,6.4,0.849215
Na2O,1508.0,0.116313,0.01,0.1,0.28,0.47,0.057128
Al2O3,1508.0,6.419947,4.87,6.4,8.7993,9.69,0.818258
SiO2,1508.0,22.926366,18.49,22.945,30.0044,30.6,2.689156
SO3,1508.0,3.066174,1.52,3.09,4.0793,4.48,0.483742
K2O,1508.0,0.595285,0.41,0.59,0.78,0.87,0.066103
Fe2O3,1508.0,2.207235,1.16,2.19,2.95,3.5,0.370741
Loss on Ignition,1508.0,4.974589,2.11,4.99,6.52,8.44,0.725993


<IPython.core.display.Javascript object>

In [26]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,1508.0,2021-01-28 18:44:59.164456448,2018-02-07 14:00:00,2020-02-20 01:41:45,2021-04-02 13:02:00,2022-04-13 11:54:00,2023-03-28 13:06:00,
CaO,1508.0,56.000902,46.25,53.84,55.365,59.3125,64.13,3.682387
MgO,1508.0,2.093674,0.64,1.35,2.11,2.5225,6.4,0.849215
Na2O,1508.0,0.116313,0.01,0.08,0.1,0.14,0.47,0.057128
Al2O3,1508.0,6.419947,4.87,5.73,6.4,6.9,9.69,0.818258
SiO2,1508.0,22.926366,18.49,20.4675,22.945,24.46,30.6,2.689156
SO3,1508.0,3.066174,1.52,2.82,3.09,3.34,4.48,0.483742
K2O,1508.0,0.595285,0.41,0.55,0.59,0.63,0.87,0.066103
Fe2O3,1508.0,2.207235,1.16,1.94,2.19,2.5,3.5,0.370741
Loss on Ignition,1508.0,4.974589,2.11,4.4375,4.99,5.48,8.44,0.725993


<IPython.core.display.Javascript object>

In [27]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Plant,1508,1.0,AV,1508.0,,,,,,
Date,1508,,,,2021-01-28 18:44:59.164456448,2018-02-07 14:00:00,2020-02-20 01:41:45,2021-04-02 13:02:00,2022-04-13 11:54:00,2023-03-28 13:06:00
Cement_Type,1508,4.0,CP II-E40,834.0,,,,,,


<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [28]:
df = df.drop("Plant", axis=1)

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [29]:
# We first make sure Date column was infered correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [30]:
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [31]:
df.to_csv("../../../../data/interim/203/av.csv", index=False)

<IPython.core.display.Javascript object>