In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    try:
        # If it's a string enclosed in single quotes, remove quotes and replace comma with dot
        if isinstance(value, str):
            return float(value.replace(",", "."))
        else:
            return float(value)
    except ValueError as e:
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

## Reading the dataset

In [4]:
plant = "G"

xls = pd.ExcelFile(
    f"../../../../data/raw/203/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx",
    engine="openpyxl",
)

<IPython.core.display.Javascript object>

In [5]:
dataframes = []

for sheet_name in xls.sheet_names[1:]:
    df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
    dataframes.append(df.drop([0, 1], axis=0).reset_index(drop=True).copy())

df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [6]:
df = pd.concat(
    [
        df["Dados iniciais"],
        df["Cimento"]["Análise química"],
        df["Característias físicas do cimento"],
        df["Propriedades do cimento"],
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

### 1. Dropping columns where either there is no data nor it contributes to the analysis
### 2. Renaming columns to a short identifiable name 

In [7]:
COLUMNS_TO_DROP = [
    ("Data", "Produção"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    ("Unnamed: 139_level_1", "#400"),
]

COLUMNS_TO_RENAME = {
    ("Cidade", "Unnamed: 1_level_2"): "Plant",
    ("Data", "Medida"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [8]:
df["Cement_Type"] = (
    df[
        [
            ("Tipo de cimento", "Unnamed: 6_level_2"),
            ("Classe de resistência", "Unnamed: 7_level_2"),
            ("Obs.", "Unnamed: 8_level_2"),
        ]
    ]
    .fillna("")
    .astype(str)
    .sum(axis=1)
)

<IPython.core.display.Javascript object>

### Replacing the current cement type categories for a short identifiable category

In [9]:
# Preprocessing
df["Cement_Type"] = df["Cement_Type"].replace("FIBROCMENTO", "Fibrocimento")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIEnsacado", "CP VARI")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIGranel", "CP VARI")
df["Cement_Type"] = df["Cement_Type"].replace("CP III40Ensacado", "CP III40")
df["Cement_Type"] = df["Cement_Type"].replace("CP III40Granel", "CP III40")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIRS", "CP VARI")

<IPython.core.display.Javascript object>

### Dropping and rename the columns defined above

In [10]:
df = df.drop(COLUMNS_TO_DROP, axis=1).rename(COLUMNS_TO_RENAME, axis=1).copy()
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Clean and converting numeric columns to float

In [11]:
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

<IPython.core.display.Javascript object>

### Converting Date columns to pandas datetime

In [12]:
df["Date"] = pd.to_datetime(df["Date"])

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [13]:
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features
 that will be fitted on this data.

1. Removing features with features having 60% or more of missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

<h3>Percentage of missing values:</h3>
    
    
<table>
    <th>Plant AV - Full Dataset:</th>
    <th></th>
    <tr>
        <td>Specific Gravity</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>TiO2</td>
        <td>100.00</td>
    </tr>        
</table>

In [14]:
FEATRUES_TO_DROP = ["Specific Gravity", "TiO2"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping samples with Missing Values

In [15]:
df = df.dropna()

<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [16]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [17]:
df.shape

(1888, 22)

<IPython.core.display.Javascript object>

In [18]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [19]:
df.shape

(1888, 22)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [20]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

CaO                     0.0
Blaine                  0.0
CS7                     0.0
CS3                     0.0
CS1                     0.0
Final setting time      0.0
Initial setting time    0.0
#325                    0.0
#200                    0.0
Insoluble Residue       0.0
MgO                     0.0
Loss on Ignition        0.0
Fe2O3                   0.0
K2O                     0.0
SO3                     0.0
SiO2                    0.0
Al2O3                   0.0
Na2O                    0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Dropping rows with zero values

In [21]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x == 0).sum().sort_values(ascending=False)

CaO                     0.0
Blaine                  0.0
CS7                     0.0
CS3                     0.0
CS1                     0.0
Final setting time      0.0
Initial setting time    0.0
#325                    0.0
#200                    0.0
Insoluble Residue       0.0
MgO                     0.0
Loss on Ignition        0.0
Fe2O3                   0.0
K2O                     0.0
SO3                     0.0
SiO2                    0.0
Al2O3                   0.0
Na2O                    0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [22]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

In [23]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
Plant,0.0
Date,0.0
CS28,0.0
CS7,0.0
CS3,0.0
CS1,0.0
Final setting time,0.0
Initial setting time,0.0
#325,0.0
#200,0.0


<IPython.core.display.Javascript object>

### Removing outliers

In [24]:
# df.describe(percentiles=[0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99]).T
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,1888.0,2020-07-28 00:12:45.190678016,2017-12-06 15:00:00,2019-05-15 13:00:00,2020-08-02 12:43:00,2021-10-09 14:51:15,2023-03-29 13:06:00,
CaO,1888.0,55.781414,45.74,51.2675,52.675,62.1125,65.27,5.410552
MgO,1888.0,1.985371,0.42,0.79,2.29,2.88,4.37,1.050936
Na2O,1888.0,0.109783,0.0,0.07,0.1,0.14,0.49,0.058383
Al2O3,1888.0,6.265625,4.38,5.23,6.32,7.15,9.43,1.016592
SiO2,1888.0,23.388596,17.18,19.3975,24.995,26.33,30.5,3.676792
SO3,1888.0,2.474417,1.35,1.94,2.19,3.14,4.16,0.632725
K2O,1888.0,0.67312,0.47,0.62,0.68,0.73,0.95,0.078252
Fe2O3,1888.0,2.580837,1.22,2.16,2.63,2.89,5.74,0.519324
Loss on Ignition,1888.0,5.323972,1.92,4.7075,5.24,6.2,8.48,1.221323


<IPython.core.display.Javascript object>

In [25]:
df = df[~df["Na2O"].eq(df["Na2O"].max())]
df = df[~df["Final setting time"].eq(df["Final setting time"].min())]

<IPython.core.display.Javascript object>

In [26]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,1886.0,2020-07-28 12:01:55.259809280,2017-12-06 15:00:00,2019-05-15 18:45:00,2020-08-02 12:43:00,2021-10-09 14:51:45,2023-03-29 13:06:00,
CaO,1886.0,55.779242,45.74,51.2625,52.665,62.1175,65.27,5.411256
MgO,1886.0,1.985896,0.42,0.79,2.295,2.88,4.37,1.051063
Na2O,1886.0,0.109597,0.0,0.07,0.1,0.14,0.47,0.05775
Al2O3,1886.0,6.266288,4.38,5.23,6.32,7.15,9.43,1.016896
SiO2,1886.0,23.390742,17.18,19.4,25.005,26.33,30.5,3.67743
SO3,1886.0,2.474178,1.35,1.94,2.19,3.14,4.16,0.632909
K2O,1886.0,0.673086,0.47,0.62,0.68,0.73,0.95,0.078283
Fe2O3,1886.0,2.580764,1.22,2.16,2.63,2.89,5.74,0.519447
Loss on Ignition,1886.0,5.32351,1.92,4.71,5.24,6.2,8.48,1.219861


<IPython.core.display.Javascript object>

In [27]:
# import matplotlib.pyplot as plt

# # Number of columns in the dataframe
# num_columns = df.drop(["Plant", "Date", "Cement_Type"], axis=1).shape[1]

# # Creating subplots
# fig, axes = plt.subplots(num_columns, 1, figsize=(8, num_columns * 4))

# # Plotting each variable in its own subplot
# for i, column in enumerate(df.drop(["Plant", "Date", "Cement_Type"], axis=1).columns):
#     df.boxplot(column=column, ax=axes[i])
#     axes[i].set_title(column)
#     axes[i].set_xlabel('')

# plt.tight_layout()
# plt.show()


<IPython.core.display.Javascript object>

In [28]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,1886.0,2020-07-28 12:01:55.259809280,2017-12-06 15:00:00,2019-05-15 18:45:00,2020-08-02 12:43:00,2021-10-09 14:51:45,2023-03-29 13:06:00,
CaO,1886.0,55.779242,45.74,51.2625,52.665,62.1175,65.27,5.411256
MgO,1886.0,1.985896,0.42,0.79,2.295,2.88,4.37,1.051063
Na2O,1886.0,0.109597,0.0,0.07,0.1,0.14,0.47,0.05775
Al2O3,1886.0,6.266288,4.38,5.23,6.32,7.15,9.43,1.016896
SiO2,1886.0,23.390742,17.18,19.4,25.005,26.33,30.5,3.67743
SO3,1886.0,2.474178,1.35,1.94,2.19,3.14,4.16,0.632909
K2O,1886.0,0.673086,0.47,0.62,0.68,0.73,0.95,0.078283
Fe2O3,1886.0,2.580764,1.22,2.16,2.63,2.89,5.74,0.519447
Loss on Ignition,1886.0,5.32351,1.92,4.71,5.24,6.2,8.48,1.219861


<IPython.core.display.Javascript object>

In [29]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Plant,1886,1.0,G,1886.0,,,,,,
Date,1886,,,,2020-07-28 12:01:55.259809280,2017-12-06 15:00:00,2019-05-15 18:45:00,2020-08-02 12:43:00,2021-10-09 14:51:45,2023-03-29 13:06:00
Cement_Type,1886,7.0,CP VARI,543.0,,,,,,


<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [30]:
df = df.drop("Plant", axis=1)

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [31]:
# We first make sure Date column was infered correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [32]:
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [33]:
df.to_csv("../../../../data/interim/203/g.csv", index=False)

<IPython.core.display.Javascript object>