In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    """Convert a cell value to ``float``, accepting a decimal comma in strings.

    Parameters
    ----------
    value : object
        Cell value. Strings have every "," replaced by "." before conversion;
        any other value is handed to ``float`` unchanged.

    Returns
    -------
    float
        The converted number, or ``nan`` when the value cannot be converted
        (the conversion error is printed in that case).
    """
    # Only strings can carry a decimal comma; normalise it to a dot.
    if isinstance(value, str):
        value = value.replace(",", ".")
    try:
        return float(value)
    except (TypeError, ValueError) as e:
        # float() raises TypeError (not ValueError) for objects such as None,
        # so both are treated as "not convertible" instead of aborting the run.
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

## Reading the dataset

In [4]:
plant = "Y"

# Location of the raw workbook for the selected plant.
raw_workbook_path = f"../../../../data/raw/203/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx"

# Open the workbook once; the sheets are parsed from this handle afterwards.
xls = pd.ExcelFile(raw_workbook_path, engine="openpyxl")

<IPython.core.display.Javascript object>

In [5]:
# Parse every sheet except the first one. Each sheet has three header rows,
# and the rows labelled 0 and 1 are discarded before re-numbering the index.
dataframes = [
    pd.read_excel(xls, sheet_name, header=[0, 1, 2])
    .drop([0, 1], axis=0)
    .reset_index(drop=True)
    .copy()
    for sheet_name in xls.sheet_names[1:]
]

# Stack all sheets into one frame with a fresh 0..n-1 index.
df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [6]:
# Column groups kept from the stacked sheets, selected by their header labels.
selected_groups = [
    df["Dados iniciais"],
    df["Cimento"]["Análise química"],
    df["Característias físicas do cimento"],
    df["Propriedades do cimento"],
]

# Place the selected groups side by side in a single frame.
df = pd.concat(selected_groups, axis=1)

<IPython.core.display.Javascript object>

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to short, identifiable names

In [7]:
# Column labels removed from the dataset. Labels are either tuples of header
# levels or plain strings, matching how the columns are labelled after the
# column groups are concatenated. Each label is listed exactly once.
COLUMNS_TO_DROP = [
    ("Data", "Produção"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Unnamed: 141_level_1", "Obs."),
    ("Unnamed: 139_level_1", "#400"),
]

# Mapping from original column labels to short names.
# The "#400" column is intentionally absent here: it is listed in
# COLUMNS_TO_DROP, so a rename entry for it would never take effect.
COLUMNS_TO_RENAME = {
    ("Cidade", "Unnamed: 1_level_2"): "Plant",
    ("Data", "Medida"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [8]:
# Columns whose text is joined to form the cement type label.
cement_type_parts = [
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
]

# Missing pieces become empty strings, so the row-wise sum concatenates only
# the text that is actually present.
cement_type_text = df[cement_type_parts].fillna("").astype(str)
df["Cement_Type"] = cement_type_text.sum(axis=1)

<IPython.core.display.Javascript object>

### Replacing the current cement type categories with short, identifiable categories

In [9]:
# Preprocessing: map each raw concatenated label to its short category name.
# Labels that are not listed are left untouched.
df["Cement_Type"] = df["Cement_Type"].replace(
    {
        "FIBROCMENTO": "Fibrocimento",
        "CP VARIEnsacado": "CP VARI",
        "CP VARIGranel": "CP VARI",
        "CP III40Ensacado": "CP III40",
        "CP III40Granel": "CP III40",
        "CP VARIRS": "CP VARI",
    }
)

<IPython.core.display.Javascript object>

### Dropping and renaming the columns defined above

In [10]:
# Discard the unused columns first, then apply the short names.
df = df.drop(columns=COLUMNS_TO_DROP).rename(columns=COLUMNS_TO_RENAME).copy()
# Let pandas choose more specific dtypes for object columns where it can.
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [11]:
# Columns that must hold floats after cleaning.
NUMERIC_COLUMNS = [
    # Chemical analysis
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    # Blaine, sieve residues and specific gravity
    "Blaine",
    "#200",
    "#325",
    "Specific Gravity",
    # Setting times
    "Final setting time",
    "Initial setting time",
    # Compressive strength at each age
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

# Convert cell by cell so decimal-comma strings are handled and unparseable
# entries become NaN.
numeric_values = df[NUMERIC_COLUMNS].map(convert_to_float)
df[NUMERIC_COLUMNS] = numeric_values

<IPython.core.display.Javascript object>

### Converting the Date column to pandas datetime

In [12]:
# Parse the Date column into pandas datetime values.
df = df.assign(Date=pd.to_datetime(df["Date"]))

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [13]:
# A row is kept when at least one column other than the identifiers
# (Plant, Date, Cement_Type) holds a value.
index_to_keep = (
    df.drop(columns=["Plant", "Date", "Cement_Type"]).dropna(axis=0, how="all").index
)
# index_to_keep contains index labels, so the label-based .loc is the correct
# accessor; positional .iloc only matches while the index is exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features

The following criteria are applied before any model is fitted on this data.

1. Removing features with 60% or more missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

<h3>Percentage of missing values:</h3>
    
    
<table>
    <tr>
        <th>Plant Y - Full Dataset:</th>
        <th></th>
    </tr>
    <tr>
        <td>TiO2</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>Specific Gravity</td>
        <td>98.60</td>
    </tr>
</table>

In [14]:
# Features removed from the dataset before modelling.
FEATURES_TO_DROP = ["Specific Gravity", "TiO2"]
df = df.drop(columns=FEATURES_TO_DROP)

<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [15]:
# Shape before the rows without a CS28 value are removed.
df.shape

(1724, 22)

<IPython.core.display.Javascript object>

In [16]:
# Keep only the rows that have a CS28 value, then re-number the index.
df = df.dropna(subset=["CS28"]).reset_index(drop=True)

<IPython.core.display.Javascript object>

In [17]:
# Share of missing values per column, in percent, highest first.
missing_pct = (df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False)
missing_pct.to_frame(name="Missing (%)").style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
#325,8.56102
Fe2O3,3.278689
CaO,1.730419
MgO,1.730419
Na2O,1.730419
Al2O3,1.730419
SiO2,1.730419
SO3,1.730419
K2O,1.730419
Loss on Ignition,0.546448


<IPython.core.display.Javascript object>

In [18]:
# Shape after the rows without a CS28 value were removed.
df.shape

(1098, 22)

<IPython.core.display.Javascript object>

In [19]:
# df[df.drop(["Plant", "Date", "Cement_Type"], axis=1).isna().all(axis=1)]

<IPython.core.display.Javascript object>

In [20]:
# df = df.dropna()

<IPython.core.display.Javascript object>

In [21]:
# df.shape

<IPython.core.display.Javascript object>

In [22]:
# (df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
#     name="Missing (%)"
# ).style.background_gradient(cmap="Reds")

<IPython.core.display.Javascript object>

In [23]:
# Summary statistics, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,1098.0,2022-01-06 00:43:58.251366144,2020-09-01 13:27:00,2021-06-07 00:00:00,2022-01-14 00:00:00,2022-08-17 00:00:00,2023-03-27 00:00:00,
CaO,1079.0,55.236942,44.4,50.095,52.87,61.41,65.03,5.770569
MgO,1079.0,3.147025,1.56,2.74,3.12,3.76,4.78,0.756582
Na2O,1079.0,0.19861,0.0,0.11,0.15,0.25,1.05,0.114688
Al2O3,1079.0,6.276599,3.8,4.83,6.3,7.67,9.43,1.483419
SiO2,1079.0,24.890139,19.03,20.375,25.87,28.685,33.68,4.202556
SO3,1079.0,2.583207,0.58,2.05,2.33,3.225,4.32,0.656147
K2O,1079.0,1.00354,0.09,0.4,0.44,0.55,4.46,1.148358
Fe2O3,1062.0,2.37774,1.27,1.94,2.215,2.85,4.76,0.559591
Loss on Ignition,1092.0,3.954332,1.36,3.06,3.7,4.61,7.5,1.073501


<IPython.core.display.Javascript object>

In [24]:
# Summary of the non-numeric columns, one row per column.
df.describe(exclude="number").transpose()

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Plant,1098,1.0,Y,1098.0,,,,,,
Date,1098,,,,2022-01-06 00:43:58.251366144,2020-09-01 13:27:00,2021-06-07 00:00:00,2022-01-14 00:00:00,2022-08-17 00:00:00,2023-03-27 00:00:00
Cement_Type,1098,5.0,CP III40,352.0,,,,,,


<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [25]:
# The Plant column is removed from the dataset.
df = df.drop(columns=["Plant"])

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [26]:
# First, confirm that the Date column holds a datetime dtype before sorting by it
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [27]:
# Order the rows chronologically (ascending Date).
df = df.sort_values("Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [28]:
# Derive the output name from `plant` so that changing the plant at the top of
# the notebook cannot overwrite another plant's file (yields "y.csv" for "Y").
df.to_csv(f"../../../../data/interim/203/{plant.lower()}.csv", index=False)

<IPython.core.display.Javascript object>