In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    """Convert a raw cell value to ``float``, returning NaN when it cannot be parsed.

    Strings are parsed with a comma treated as the decimal separator
    (``"3,14"`` -> ``3.14``). Values that cannot be converted -- placeholder
    text such as ``"-"``, blank strings, or non-numeric objects such as
    ``None`` -- are reported with ``print`` and mapped to NaN.

    Parameters
    ----------
    value : Any
        Cell content to convert.

    Returns
    -------
    float
        The parsed number, or ``float("nan")`` when conversion fails.
    """
    if isinstance(value, str):
        # Treat a comma as the decimal separator.
        value = value.replace(",", ".")
    try:
        return float(value)
    except (ValueError, TypeError) as e:
        # ValueError: unparsable text. TypeError: objects float() does not
        # accept (e.g. None); without it one such cell aborts the whole
        # element-wise conversion.
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

## Reading the dataset

In [4]:
# Plant identifier; it is interpolated into the raw workbook's file name.
plant = "M"

# Open the workbook once so its sheets can be read from the same handle.
raw_workbook_path = f"../../../../data/raw/203/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx"
xls = pd.ExcelFile(raw_workbook_path, engine="openpyxl")

<IPython.core.display.Javascript object>

In [5]:
# Read every sheet except the first, using the first three rows as a
# three-level column header, and discard the rows labelled 0 and 1 of each sheet.
dataframes = [
    pd.read_excel(xls, sheet_name, header=[0, 1, 2])
    .drop([0, 1], axis=0)
    .reset_index(drop=True)
    .copy()
    for sheet_name in xls.sheet_names[1:]
]

# Stack all sheets into one frame with a fresh 0..n-1 index.
df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [6]:
# Keep only these four sections of the header hierarchy, placed side by side.
selected_sections = [
    df["Dados iniciais"],
    df["Cimento"]["Análise química"],
    df["Característias físicas do cimento"],
    df["Propriedades do cimento"],
]
df = pd.concat(selected_sections, axis=1)

<IPython.core.display.Javascript object>

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name 

In [7]:
# Column labels removed from the dataset. Each label is listed exactly once.
COLUMNS_TO_DROP = [
    ("Data", "Produção"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Unnamed: 141_level_1", "Obs."),
    ("Unnamed: 139_level_1", "#400"),
]

# Mapping from original column labels to short names.
# ("Unnamed: 139_level_1", "#400") has no entry here because it is listed in
# COLUMNS_TO_DROP; the drop is applied before the rename, so a rename entry
# for it would never take effect.
COLUMNS_TO_RENAME = {
    ("Cidade", "Unnamed: 1_level_2"): "Plant",
    ("Data", "Medida"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [8]:
# Columns whose text is concatenated to form the cement type label.
cement_type_parts = [
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
]

# Missing parts become empty strings, so the row-wise sum is a plain string
# concatenation of the three columns in the order listed above.
cement_type_text = df[cement_type_parts].fillna("").astype(str)
df["Cement_Type"] = cement_type_text.sum(axis=1)

<IPython.core.display.Javascript object>

### Replacing the current cement type categories with short, identifiable categories

In [9]:
# Preprocessing: map raw concatenated labels to their short category names.
# No replacement value is itself a key, so one mapping gives the same result
# as applying the replacements one after another.
CEMENT_TYPE_ALIASES = {
    "FIBROCMENTO": "Fibrocimento",
    "CP VARIEnsacado": "CP VARI",
    "CP VARIGranel": "CP VARI",
    "CP III40Ensacado": "CP III40",
    "CP III40Granel": "CP III40",
    "CP VARIRS": "CP VARI",
}
df["Cement_Type"] = df["Cement_Type"].replace(CEMENT_TYPE_ALIASES)

<IPython.core.display.Javascript object>

### Dropping and renaming the columns defined above

In [10]:
# Remove the unwanted columns first, then apply the short names.
df = df.drop(columns=COLUMNS_TO_DROP)
df = df.rename(columns=COLUMNS_TO_RENAME).copy()
# Let pandas infer more specific dtypes for object columns.
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [11]:
# Numeric columns grouped by kind; the concatenation below keeps the original order.
CHEMICAL_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
]
PHYSICAL_COLUMNS = [
    "Blaine",
    "#200",
    "#325",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
]
STRENGTH_COLUMNS = ["CS1", "CS3", "CS7", "CS28"]

NUMERIC_COLUMNS = CHEMICAL_COLUMNS + PHYSICAL_COLUMNS + STRENGTH_COLUMNS

# Element-wise conversion; cells that cannot be parsed become NaN.
numeric_values = df[NUMERIC_COLUMNS].map(convert_to_float)
df[NUMERIC_COLUMNS] = numeric_values

could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '1..10'
could not convert string to float: '  '
could not convert string to float: ' '
could not convert string to float: '1.6.0'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: ' '
could not convert string to float: '145 *'
could not convert string to float: ' '


<IPython.core.display.Javascript object>

### Converting Date columns to pandas datetime

In [12]:
# Parse the "Date" column into pandas datetime values.
parsed_dates = pd.to_datetime(df["Date"])
df["Date"] = parsed_dates

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows where every measurement is missing

In [13]:
# Ignore the identifying columns and keep rows that have at least one
# non-missing measurement.
index_to_keep = (
    df.drop(columns=["Plant", "Date", "Cement_Type"]).dropna(axis=0, how="all").index
)
# `index_to_keep` holds index *labels*, so select with label-based `.loc`.
# Positional `.iloc` returns the same rows only while the index is a default
# 0..n-1 RangeIndex.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features

Features are filtered with the criteria below before any model is fitted on this data.

1. Removing features with 60% or more missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

In [14]:
# Percentage of missing values per column, highest first.
missing_pct = df.isna().sum() / df.shape[0] * 100
(
    missing_pct.sort_values(ascending=False)
    .to_frame(name="Missing (%)")
    .style.background_gradient(cmap="Reds")
)

Unnamed: 0,Missing (%)
Fe2O3,100.0
SiO2,100.0
CaO,100.0
TiO2,100.0
Al2O3,100.0
Specific Gravity,84.710412
Insoluble Residue,57.09931
CS3,28.573721
#200,23.215145
K2O,12.305471


<IPython.core.display.Javascript object>

<h3>Percentage of missing values:</h3>
    
    
<table>
    <tr>
        <th>Plant M - Full Dataset:</th>
        <th></th>
    </tr>
    <tr>
        <td>Fe2O3</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>SiO2</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>CaO</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>TiO2</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>Al2O3</td>
        <td>100.00</td>
    </tr>    
    <tr>
        <td>Specific Gravity</td>
        <td>84.71</td>
    </tr>    
    <tr>
        <td>Insoluble Residue</td>
        <td>57.10</td>
    </tr>
</table>


In [15]:
# Features removed because of their share of missing values.
# NOTE(review): "Insoluble Residue" is about 57% missing in the table above,
# which is below the stated 60% threshold -- confirm this is intended.
# The constant keeps its original (misspelled) name so existing references still work.
FEATRUES_TO_DROP = [
    "Specific Gravity",
    "TiO2",
    "Fe2O3",
    "SiO2",
    "CaO",
    "Al2O3",
    "Insoluble Residue",
]
df = df.drop(columns=FEATRUES_TO_DROP)

<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [16]:
# (rows, columns) before rows with a missing CS28 value are removed.
df.shape

(6233, 17)

<IPython.core.display.Javascript object>

In [17]:
# Keep only rows where the target CS28 is present.
df = df.dropna(subset=["CS28"]).reset_index(drop=True)

<IPython.core.display.Javascript object>

In [18]:
# Percentage of missing values per column, highest first.
missing_pct = df.isna().sum() / df.shape[0] * 100
(
    missing_pct.sort_values(ascending=False)
    .to_frame(name="Missing (%)")
    .style.background_gradient(cmap="Reds")
)

Unnamed: 0,Missing (%)
CS3,28.383214
#200,22.722837
K2O,12.166558
Na2O,2.293429
MgO,1.398829
SO3,1.398829
Loss on Ignition,1.382563
#325,1.284971
Initial setting time,0.471698
Final setting time,0.455433


<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [19]:
# (rows, columns) before duplicated rows are removed.
df.shape

(6148, 17)

<IPython.core.display.Javascript object>

In [20]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [21]:
# Keep the first occurrence of each distinct combination of numeric values.
numeric_part = df.select_dtypes("number")
df = df[~numeric_part.duplicated()].reset_index(drop=True)

chemical_vars = [
    "MgO",
    "Na2O",
    "SO3",
    "K2O",
    "Loss on Ignition",
]

# Then keep only the first row for each combination of the chemical variables.
numeric_part = df.select_dtypes("number")
df = df[~numeric_part.duplicated(subset=chemical_vars)].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [22]:
# (rows, columns) after duplicated rows were removed.
df.shape

(5840, 17)

<IPython.core.display.Javascript object>

### Checking for negative values

In [23]:
# Count negative entries per numeric column over every row.
# Restricting the check to rows flagged by `duplicated()` would inspect an
# empty frame once duplicates have been removed, and always report zero.
df.select_dtypes(include="number").lt(0).sum().sort_values(ascending=False)

MgO                     0.0
Na2O                    0.0
SO3                     0.0
K2O                     0.0
Loss on Ignition        0.0
Blaine                  0.0
#200                    0.0
#325                    0.0
Initial setting time    0.0
Final setting time      0.0
CS1                     0.0
CS3                     0.0
CS7                     0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Removing outliers

In [24]:
# Summary statistics including the 99th percentile, transposed so each variable is a row.
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,5840.0,2017-01-26 19:24:05.819178240,1900-01-24 01:26:24,2017-02-27 00:00:00,2023-02-06 00:00:00,2023-03-28 00:00:00,
MgO,5757.0,1.775574,0.87,1.8,2.7644,5.83,0.425555
Na2O,5702.0,0.08299,0.0,0.07,0.17,5.0,0.087219
SO3,5757.0,2.671925,0.09,2.62,4.2044,4.92,0.591707
K2O,5125.0,0.94803,0.07,0.89,1.63,2.62,0.244538
Loss on Ignition,5760.0,4.218813,0.46,4.05,7.4341,8.6,1.247882
Blaine,5825.0,4400.989528,3085.0,4460.0,4980.76,5197.0,337.910021
#200,4468.0,0.533855,0.0,0.16,4.52,18.6,1.023866
#325,5763.0,3.480244,0.0,2.04,19.1338,148.0,4.486047
Initial setting time,5813.0,172.087046,45.0,175.0,250.0,290.0,37.479712


<IPython.core.display.Javascript object>

In [25]:
df = df.reset_index(drop=True)

# Exclusive upper bounds applied one column at a time. A comparison with NaN
# is False, so each filter also discards rows where that column is missing.
upper_bounds = {"MgO": 3, "Na2O": 0.2, "#200": 5, "#325": 20}
for column, upper_bound in upper_bounds.items():
    df = df[df[column] < upper_bound].reset_index(drop=True)

# Drop the single row holding the smallest value of each setting time.
for column in ("Initial setting time", "Final setting time"):
    df = df.drop(df[column].idxmin()).reset_index(drop=True)

<IPython.core.display.Javascript object>

In [26]:
# Same summary as before, recomputed after the outlier filters.
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,4304.0,2016-05-16 23:21:31.449814016,2011-05-12 00:00:00,2016-04-04 12:00:00,2022-12-12 00:00:00,2023-03-14 00:00:00,
MgO,4304.0,1.80036,0.89,1.83,2.76,2.99,0.424613
Na2O,4304.0,0.07752,0.0,0.07,0.15,0.19,0.024421
SO3,4304.0,2.59984,0.09,2.57,4.1297,4.92,0.552095
K2O,3725.0,0.951054,0.07,0.9,1.61,2.62,0.209938
Loss on Ignition,4236.0,4.200014,0.46,4.04,7.16,8.03,1.198436
Blaine,4296.0,4386.411313,3085.0,4444.0,4968.05,5187.0,337.926886
#200,4304.0,0.479985,0.0,0.17,3.8894,4.88,0.810438
#325,4304.0,3.655007,0.0,2.12,18.5397,19.97,4.093456
Initial setting time,4285.0,171.256709,65.0,175.0,255.0,290.0,36.880132


<IPython.core.display.Javascript object>

In [27]:
# Summary of the non-numeric columns, one row per column.
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Plant,4304,1.0,M,4304.0,,,,,,
Date,4304,,,,2016-05-16 23:21:31.449814016,2011-05-12 00:00:00,2013-08-19 00:00:00,2016-04-04 12:00:00,2018-05-09 06:00:00,2023-03-14 00:00:00
Cement_Type,4304,8.0,CP VARI,1636.0,,,,,,


<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [28]:
# Remove the plant identifier column.
df = df.drop(columns="Plant")

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [29]:
# First make sure the Date column was inferred as a datetime dtype.
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [30]:
# Order rows chronologically. The index is not reset here; the frame is
# later written to CSV with index=False, so the index is not saved.
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [31]:
# Derive the output file name from `plant` so the file written always matches
# the workbook that was loaded (plant "M" -> ".../m.csv"); a hard-coded name
# would be overwritten with another plant's data if `plant` were changed.
df.to_csv(f"../../../../data/interim/203/{plant.lower()}.csv", index=False)

<IPython.core.display.Javascript object>