In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from scipy.stats import pearsonr
from collections import Counter

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    try:
        # If it's a string enclosed in single quotes, remove quotes and replace comma with dot
        if isinstance(value, str):
            return float(value.replace(",", "."))
        else:
            return float(value)
    except ValueError as e:
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

In [4]:
def preprocess_numeric_cols(df, columns):
    for col in columns:
        df[col] = df[col].astype(str)  # Ensure string type for string operations
        df[col] = df[col].str.replace("#", "")  # Remove '#' characters
        df[col] = df[col].str.replace(",", ".")  # Replace ',' with '.' for decimals
    return df

<IPython.core.display.Javascript object>

# Data preparation - 209 plant AG data

### Reading the files and extracting relevant information:

In [5]:
plant = "AG"
xls_files = {}

xls = pd.ExcelFile(
    f"../../../../data/raw/209/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx",
    engine="openpyxl",
)
xls_files[plant] = xls

<IPython.core.display.Javascript object>

In [6]:
print("Plant: ", plant, xls.sheet_names)

Plant:  AG ['INSTRUÇÕES', 'Clínquer', '146', '115', '166']


<IPython.core.display.Javascript object>

In [7]:
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[1, 2])

        if not df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all():
            cement_type = (
                df[CEMENT_TYPE_COLS]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
                .unique()
            )

        else:
            cement_type = (
                df[[("Obs.", "Unnamed: 8_level_1")]]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .unique()
            )

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [8]:
cement_types_per_plant

[('AG', '146', array(['CPIIZ32'], dtype=object)),
 ('AG', '115', array(['CPIVRS32'], dtype=object)),
 ('AG', '166', array(['CPVRSARI'], dtype=object))]

<IPython.core.display.Javascript object>

In [9]:
cements = []
for tup in cement_types_per_plant:
    cements.append(tup[2][0])
Counter(cements)

Counter({'CPIIZ32': 1, 'CPIVRS32': 1, 'CPVRSARI': 1})

<IPython.core.display.Javascript object>

In [10]:
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[0, 1, 2])
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"] = plant
        dataframes.append(df.drop([0, 1], axis=0).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [11]:
df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [12]:
df = pd.concat(
    [
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"],
        df["Dados iniciais"],
        df["Cimento"]["Análise química"],
        df["Característias físicas do cimento"],
        df["Propriedades do cimento"],
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

### 1. Dropping columns where either there is no data nor it contributes to the analysis
### 2. Renaming columns to a short identifiable name

In [13]:
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    # ("Unnamed: 139_level_1", "#400"),
]

COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Cidade", "Unnamed: 1_level_2"): "City",
    ("Data", "Produção"): "Date",  # Arrumar!
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [14]:
# Single variable for the Cement Type
df["Cement_Type"] = (
    df[
        [
            ("Tipo de cimento", "Unnamed: 6_level_2"),
            ("Classe de resistência", "Unnamed: 7_level_2"),
            ("Obs.", "Unnamed: 8_level_2"),
        ]
    ]
    .fillna("")
    .astype(str)
    .sum(axis=1)
)

# Preprocessing
df["Cement_Type"] = (
    df["Cement_Type"]
    .str.replace("Expedido", "")
    .str.replace("Linha 9", "")
    .str.replace("Itajaí", "")
    .str.replace("D-G", "")
    .str.replace("D-E", "")
    .str.replace("DF2", "")
    .str.replace("CPIIF40", "CP II-F-40")
    .str.replace("CPIIF40", "CP II-F40")
    .str.replace("CP VARI", "CP V-ARI")
    .str.replace("CPVARI", "CP V-ARI")
    .str.replace("CP V-RSARI", "CP V-ARI RS")
    .str.replace("CP III-RS40", "CP III-40 RS")
    .str.replace("CP III-RS32", "CP III-32 RS")
    .str.replace("CP IV-RS32", "CP IV-32 RS")
    .str.replace("CP III40", "CP III-40")
    .str.replace("CP III32", "CP III-32")
    .str.replace("CP I-S40", "CP I-S-40")
    .str.replace("CP I40", "CP I-40")
    .str.replace("CP II-E32", "CP II-E-32")
    .str.replace("CP II-E40", "CP II-E-40")
    .str.replace("CP II-F FIBRO40", "CP II-F-40")
    .str.replace("CP II-F32", "CP II-F-32")
    .str.replace("CP II-F40", "CP II-F-40")
    .str.replace("CP II-Z32", "CP II-Z-32")
    .str.replace("CP II-Z40", "CP II-Z-40")
    .str.replace("CP IV32", "CP IV-32")
    .str.replace("CP IND", "CP I")
    .str.replace("CPINDCC", "CP I")
    .str.strip()
)

df = df.drop(COLUMNS_TO_DROP, axis=1).rename(COLUMNS_TO_RENAME, axis=1).copy()
df = df.drop([0]).reset_index(drop=True)
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Dropping and rename the columns defined above

### Cleaning and converting numeric columns to float

In [15]:
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string 

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [16]:
index_to_keep = (
    df.drop(["Plant", "City", "Date", "Cement_Type"], axis=1)
    .dropna(axis=0, how="all")
    .index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features
 that will be fitted on this data.

1. Removing features with features having ~50% or more of missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

In [17]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
CS1,100.0
Specific Gravity,100.0
#400,100.0
TiO2,100.0
SiO2,6.469003
K2O,6.469003
SO3,6.469003
Fe2O3,6.469003
Al2O3,6.469003
Na2O,6.469003


<IPython.core.display.Javascript object>

<h3>Percentage of missing values:</h3>
<table>
    <th>Plant A - Full Dataset:</th>
    <th></th>
    <tr>
        <td>Specific Gravity</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>#400</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>TiO2</td>
        <td>100.00</td>
    </tr>  
    <tr>
        <td>CS1</td>
        <td>100.00</td>
    </tr>
</table>

In [18]:
FEATRUES_TO_DROP = ["Specific Gravity", "#400", "TiO2", "CS1"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Removing features with many zeros when applicable

In [19]:
zero_values = {}
for col in df.select_dtypes(include="number").columns:
    zero_percentages = (df[df[col].eq(0)].shape[0] / df.shape[0]) * 100
    zero_values[col] = zero_percentages

zero_percentages = pd.Series(zero_values, name=f"Zero (%) for {plant}")
zero_percentages = zero_percentages.sort_values(ascending=False)
zero_percentages = zero_percentages.to_frame(name=f"Zero (%) for {plant}")
zero_percentages.style.background_gradient(cmap="Reds")

Unnamed: 0,Zero (%) for AG
#200,7.54717
CaO,0.0
MgO,0.0
CS7,0.0
CS3,0.0
Final setting time,0.0
Initial setting time,0.0
#325,0.0
Blaine,0.0
Insoluble Residue,0.0


<IPython.core.display.Javascript object>

### Dropping variable with City and Plant identification

In [20]:
FEATRUES_TO_DROP = ["City", "Plant"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is null

In [21]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is zero

In [22]:
df = df[~df["CS28"].eq(0)]

<IPython.core.display.Javascript object>

In [23]:
df.shape

(354, 20)

<IPython.core.display.Javascript object>

In [24]:
# df.describe().T

<IPython.core.display.Javascript object>

In [25]:
# df_copy = df.copy()

<IPython.core.display.Javascript object>

### Dropping rows where any other variable has a zero value

In [26]:
for col in df.columns:
    df = df[~df[col].eq(0)]

<IPython.core.display.Javascript object>

In [27]:
df.shape

(326, 20)

<IPython.core.display.Javascript object>

In [28]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
CaO,4.294479
MgO,4.294479
Na2O,4.294479
Al2O3,4.294479
SiO2,4.294479
SO3,4.294479
K2O,4.294479
Fe2O3,4.294479
Insoluble Residue,2.760736
#200,2.760736


<IPython.core.display.Javascript object>

In [29]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,326.0,2022-04-29 04:20:36.809816064,2021-01-06 00:00:00,2021-08-28 00:00:00,2022-04-29 12:00:00,2023-01-18 12:00:00,2023-07-18 00:00:00,
CaO,312.0,49.404564,39.583,49.6575,50.900002,51.582501,53.5,3.633734
MgO,312.0,5.562872,4.066,5.01,5.41,6.015,7.72,0.742762
Na2O,312.0,0.259628,0.15,0.23,0.25,0.28,0.373,0.041608
Al2O3,312.0,6.350776,5.22,5.68,5.78,5.95,9.785,1.29627
SiO2,312.0,23.950615,19.809999,22.2675,22.66,23.190001,32.257999,3.121494
SO3,312.0,2.17591,1.678,2.0975,2.21,2.3,2.83,0.204379
K2O,312.0,1.476962,1.11,1.3975,1.44,1.5,2.222,0.152611
Fe2O3,312.0,3.392147,2.99,3.15,3.24,3.36,4.663,0.405582
Loss on Ignition,317.0,7.22817,4.7,7.1,7.6,7.8,8.8,0.922007


<IPython.core.display.Javascript object>

In [30]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Date,326,,,,2022-04-29 04:20:36.809816064,2021-01-06 00:00:00,2021-08-28 00:00:00,2022-04-29 12:00:00,2023-01-18 12:00:00,2023-07-18 00:00:00
Cement_Type,326,3.0,CP II-Z-32,256.0,,,,,,


<IPython.core.display.Javascript object>

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 326 entries, 0 to 362
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  326 non-null    datetime64[ns]
 1   CaO                   312 non-null    float64       
 2   MgO                   312 non-null    float64       
 3   Na2O                  312 non-null    float64       
 4   Al2O3                 312 non-null    float64       
 5   SiO2                  312 non-null    float64       
 6   SO3                   312 non-null    float64       
 7   K2O                   312 non-null    float64       
 8   Fe2O3                 312 non-null    float64       
 9   Loss on Ignition      317 non-null    float64       
 10  Insoluble Residue     317 non-null    float64       
 11  Blaine                318 non-null    float64       
 12  #200                  317 non-null    float64       
 13  #325                  318

<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [32]:
df.shape

(326, 20)

<IPython.core.display.Javascript object>

In [33]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [34]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

chemical_vars = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
]

df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_vars).index
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [35]:
df.shape

(287, 20)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [36]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

CaO                     0.0
MgO                     0.0
CS7                     0.0
CS3                     0.0
Final setting time      0.0
Initial setting time    0.0
#325                    0.0
#200                    0.0
Blaine                  0.0
Insoluble Residue       0.0
Loss on Ignition        0.0
Fe2O3                   0.0
K2O                     0.0
SO3                     0.0
SiO2                    0.0
Al2O3                   0.0
Na2O                    0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Removing outliers

In [37]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,287.0,2022-03-10 02:45:34.494773504,2021-01-06 00:00:00,2022-02-25 00:00:00,2023-06-30 13:26:24,2023-07-18 00:00:00,
CaO,282.0,50.132489,39.583,50.985001,53.0209,53.5,2.900079
MgO,282.0,5.632369,4.066,5.56,7.2733,7.72,0.726566
Na2O,282.0,0.253234,0.15,0.25,0.35757,0.373,0.03676
Al2O3,282.0,6.084755,5.22,5.77,9.52151,9.785,1.014549
SiO2,282.0,23.313957,19.809999,22.575,31.46822,32.257999,2.452781
SO3,282.0,2.209245,1.678,2.22,2.61,2.83,0.178216
K2O,282.0,1.450234,1.11,1.43,1.82954,2.222,0.127954
Fe2O3,282.0,3.31344,2.99,3.22,4.50738,4.663,0.3227
Loss on Ignition,284.0,7.411303,4.7,7.6,8.4419,8.8,0.762451


<IPython.core.display.Javascript object>

In [38]:
df = df.reset_index(drop=True)
df = df[~df["#200"].eq(df["#200"].max())].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [39]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,286.0,2022-03-09 15:11:19.720279808,2021-01-06 00:00:00,2022-02-24 00:00:00,2023-06-30 14:24:00,2023-07-18 00:00:00,
CaO,281.0,50.134989,39.583,51.0,53.022,53.5,2.904949
MgO,281.0,5.629139,4.066,5.56,7.274,7.72,0.725831
Na2O,281.0,0.253281,0.15,0.25,0.3576,0.373,0.036817
Al2O3,281.0,6.085875,5.22,5.77,9.5218,9.785,1.016185
SiO2,281.0,23.315288,19.809999,22.57,31.4696,32.257999,2.457055
SO3,281.0,2.209669,1.678,2.22,2.61,2.83,0.178392
K2O,281.0,1.450342,1.11,1.43,1.8302,2.222,0.128169
Fe2O3,281.0,3.31427,2.99,3.22,4.5074,4.663,0.322974
Loss on Ignition,283.0,7.411343,4.7,7.6,8.4426,8.8,0.763801


<IPython.core.display.Javascript object>

In [40]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
CaO,1.748252
MgO,1.748252
Na2O,1.748252
Al2O3,1.748252
SiO2,1.748252
SO3,1.748252
K2O,1.748252
Fe2O3,1.748252
Insoluble Residue,1.048951
#200,1.048951


<IPython.core.display.Javascript object>

### Sort the dataset by date

In [41]:
# We first make sure Date column was infered correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [42]:
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [43]:
df.to_csv("../../../../data/interim/209/ag.csv", index=False)

<IPython.core.display.Javascript object>