In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from scipy.stats import pearsonr
from collections import Counter

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    try:
        # If it's a string enclosed in single quotes, remove quotes and replace comma with dot
        if isinstance(value, str):
            return float(value.replace(",", "."))
        else:
            return float(value)
    except ValueError as e:
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

In [4]:
def preprocess_numeric_cols(df, columns):
    for col in columns:
        df[col] = df[col].astype(str)  # Ensure string type for string operations
        df[col] = df[col].str.replace("#", "")  # Remove '#' characters
        df[col] = df[col].str.replace(",", ".")  # Replace ',' with '.' for decimals
    return df

<IPython.core.display.Javascript object>

# Data preparation - 209 plant Q data

### Reading the files and extracting relevant information:

In [5]:
plant = "Q"
xls_files = {}

xls = pd.ExcelFile(
    f"../../../../data/raw/209/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx",
    engine="openpyxl",
)
xls_files[plant] = xls

<IPython.core.display.Javascript object>

In [6]:
print("Plant: ", plant, xls.sheet_names)

Plant:  Q ['INSTRUÇÕES', 'Clínquer', '56', '32', '101', '143']


<IPython.core.display.Javascript object>

In [7]:
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[1, 2])

        if not df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all():
            cement_type = (
                df[CEMENT_TYPE_COLS]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
                .unique()
            )

        else:
            cement_type = (
                df[[("Obs.", "Unnamed: 8_level_1")]]
                .loc[2:]
                .astype(str)
                .sum(axis=1)
                .str.replace(" ", "")
                .str.replace("-", "")
                .unique()
            )

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [8]:
cement_types_per_plant

[('Q', '56', array(['CPIIF40'], dtype=object)),
 ('Q', '32', array(['CPIIF32'], dtype=object)),
 ('Q', '101', array(['CPIIFFIBRO40'], dtype=object)),
 ('Q', '143', array(['CPIS40'], dtype=object))]

<IPython.core.display.Javascript object>

In [9]:
cements = []
for tup in cement_types_per_plant:
    cements.append(tup[2][0])
Counter(cements)

Counter({'CPIIF40': 1, 'CPIIF32': 1, 'CPIIFFIBRO40': 1, 'CPIS40': 1})

<IPython.core.display.Javascript object>

In [10]:
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls_files[plant], sheet_name, header=[0, 1, 2])
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"] = plant
        dataframes.append(df.drop([0, 1], axis=0).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [11]:
df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [12]:
df = pd.concat(
    [
        df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"],
        df["Dados iniciais"],
        df["Cimento"]["Análise química"],
        df["Característias físicas do cimento"],
        df["Propriedades do cimento"],
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

### 1. Dropping columns where either there is no data nor it contributes to the analysis
### 2. Renaming columns to a short identifiable name

In [13]:
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    # ("Unnamed: 139_level_1", "#400"),
]

COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Cidade", "Unnamed: 1_level_2"): "City",
    ("Data", "Produção"): "Date",  # Arrumar!
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [14]:
# Single variable for the Cement Type
df["Cement_Type"] = (
    df[
        [
            ("Tipo de cimento", "Unnamed: 6_level_2"),
            ("Classe de resistência", "Unnamed: 7_level_2"),
            ("Obs.", "Unnamed: 8_level_2"),
        ]
    ]
    .fillna("")
    .astype(str)
    .sum(axis=1)
)

# Preprocessing
df["Cement_Type"] = (
    df["Cement_Type"]
    .str.replace("Expedido", "")
    .str.replace("Linha 9", "")
    .str.replace("Itajaí", "")
    .str.replace("D-G", "")
    .str.replace("D-E", "")
    .str.replace("DF2", "")
    .str.replace("CPIIF40", "CP II-F-40")
    .str.replace("CPIIF40", "CP II-F40")
    .str.replace("CP VARI", "CP V-ARI")
    .str.replace("CPVARI", "CP V-ARI")
    .str.replace("CP V-RSARI", "CP V-ARI RS")
    .str.replace("CP III-RS40", "CP III-40 RS")
    .str.replace("CP III-RS32", "CP III-32 RS")
    .str.replace("CP IV-RS32", "CP IV-32 RS")
    .str.replace("CP III40", "CP III-40")
    .str.replace("CP III32", "CP III-32")
    .str.replace("CP I-S40", "CP I-S-40")
    .str.replace("CP I40", "CP I-40")
    .str.replace("CP II-E32", "CP II-E-32")
    .str.replace("CP II-E40", "CP II-E-40")
    .str.replace("CP II-F FIBRO40", "CP II-F-40")
    .str.replace("CP II-F32", "CP II-F-32")
    .str.replace("CP II-F40", "CP II-F-40")
    .str.replace("CP II-Z32", "CP II-Z-32")
    .str.replace("CP II-Z40", "CP II-Z-40")
    .str.replace("CP IV32", "CP IV-32")
    .str.replace("CP IND", "CP I")
    .str.replace("CPINDCC", "CP I")
    .str.strip()
)

df = df.drop(COLUMNS_TO_DROP, axis=1).rename(COLUMNS_TO_RENAME, axis=1).copy()
df = df.drop([0]).reset_index(drop=True)
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Dropping and rename the columns defined above

### Cleaning and converting numeric columns to float

In [15]:
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string 

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [16]:
index_to_keep = (
    df.drop(["Plant", "City", "Date", "Cement_Type"], axis=1)
    .dropna(axis=0, how="all")
    .index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features
 that will be fitted on this data.

1. Removing features with features having ~50% or more of missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

In [17]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
CS1,100.0
Specific Gravity,100.0
#400,53.508772
#325,46.616541
TiO2,34.33584
CS28,2.756892
CS7,0.626566
K2O,0.56391
Fe2O3,0.56391
SO3,0.56391


<IPython.core.display.Javascript object>

<h3>Percentage of missing values:</h3>
<table>
    <th>Plant A - Full Dataset:</th>
    <th></th>
    <tr>
        <td>CS1</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>Specific Gravity</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>#400</td>
        <td>53.51</td>
    </tr>      
    <tr>
        <td>#325</td>
        <td>46.62</td>
    </tr>
    <tr>
        <td>TiO2</td>
        <td>34.34</td>
    </tr>        
</table>

In [18]:
FEATRUES_TO_DROP = ["CS1", "Specific Gravity", "#400", "#325", "TiO2"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Removing features with many zeros when applicable

In [19]:
zero_values = {}
for col in df.select_dtypes(include="number").columns:
    zero_percentages = (df[df[col].eq(0)].shape[0] / df.shape[0]) * 100
    zero_values[col] = zero_percentages

zero_percentages = pd.Series(zero_values, name=f"Zero (%) for {plant}")
zero_percentages = zero_percentages.sort_values(ascending=False)
zero_percentages = zero_percentages.to_frame(name=f"Zero (%) for {plant}")
zero_percentages.style.background_gradient(cmap="Reds")

Unnamed: 0,Zero (%) for Q
#200,54.385965
CaO,0.0
Insoluble Residue,0.0
CS7,0.0
CS3,0.0
Final setting time,0.0
Initial setting time,0.0
Blaine,0.0
Loss on Ignition,0.0
MgO,0.0


<IPython.core.display.Javascript object>

### Dropping variable with many zeros

In [20]:
FEATRUES_TO_DROP = ["#200"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping variable with City and Plant identification

In [21]:
FEATRUES_TO_DROP = ["City", "Plant"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is null

In [22]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is zero

In [23]:
df = df[~df["CS28"].eq(0)]

<IPython.core.display.Javascript object>

In [24]:
df.shape

(1552, 18)

<IPython.core.display.Javascript object>

In [25]:
# df.describe().T

<IPython.core.display.Javascript object>

In [26]:
# df_copy = df.copy()

<IPython.core.display.Javascript object>

### Dropping rows where any other variable has a zero value

In [27]:
for col in df.columns:
    df = df[~df[col].eq(0)]

<IPython.core.display.Javascript object>

In [28]:
df.shape

(1552, 18)

<IPython.core.display.Javascript object>

In [29]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
Initial setting time,0.193299
Final setting time,0.193299
MgO,0.128866
Na2O,0.128866
Al2O3,0.128866
SiO2,0.128866
SO3,0.128866
K2O,0.128866
Fe2O3,0.128866
CaO,0.128866


<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [30]:
df.shape

(1552, 18)

<IPython.core.display.Javascript object>

In [31]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [32]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

chemical_vars = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
]

df = df.loc[
    df.select_dtypes("number").drop_duplicates(subset=chemical_vars).index
].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [33]:
df.shape

(1552, 18)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [34]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

CaO                     0.0
MgO                     0.0
Na2O                    0.0
Al2O3                   0.0
SiO2                    0.0
SO3                     0.0
K2O                     0.0
Fe2O3                   0.0
Loss on Ignition        0.0
Insoluble Residue       0.0
Blaine                  0.0
Initial setting time    0.0
Final setting time      0.0
CS3                     0.0
CS7                     0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Removing outliers

In [35]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,1552.0,2022-04-13 08:58:08.659793920,2021-01-02 00:00:00,2022-03-15 00:00:00,2023-07-11 11:45:36,2023-07-25 00:00:00,
CaO,1550.0,61.264914,46.69669,61.37756,64.124419,64.93145,1.654753
MgO,1550.0,2.676268,0.94305,2.617855,4.910585,6.3279,0.787079
Na2O,1550.0,0.147677,0.00037,0.15434,0.241573,0.2662,0.051118
Al2O3,1550.0,4.284923,1.13205,4.329305,4.973948,8.02546,0.37422
SiO2,1550.0,19.293546,13.07616,19.397485,20.509436,21.33109,0.691944
SO3,1550.0,2.677374,0.07531,2.69683,3.12791,3.60785,0.220119
K2O,1550.0,0.454573,0.17397,0.44411,0.617942,0.67241,0.07282
Fe2O3,1550.0,2.729897,0.20984,2.73693,3.256706,5.16973,0.268907
Loss on Ignition,1551.0,8.485003,2.82,5.55,11.195,3430.0,86.966467


<IPython.core.display.Javascript object>

In [36]:
df = df.reset_index(drop=True)
df = df[~df["Loss on Ignition"].eq(df["Loss on Ignition"].max())].reset_index(drop=True)

df = df[~df["Insoluble Residue"].eq(df["Insoluble Residue"].max())].reset_index(
    drop=True
)
df = df[~df["Blaine"].eq(df["Blaine"].max())].reset_index(
    drop=True
)
df = df[~df["Initial setting time"].eq(df["Initial setting time"].max())].reset_index(
    drop=True
)
df = df[~df["Final setting time"].eq(df["Final setting time"].max())].reset_index(
    drop=True
)


<IPython.core.display.Javascript object>

In [37]:
df.describe(percentiles=[0.99]).T

Unnamed: 0,count,mean,min,50%,99%,max,std
Date,1547.0,2022-04-13 14:37:46.580478208,2021-01-02 00:00:00,2022-03-15 00:00:00,2023-07-11 12:57:36,2023-07-25 00:00:00,
CaO,1545.0,61.268651,46.69669,61.38071,64.124711,64.93145,1.65452
MgO,1545.0,2.674563,0.94305,2.61766,4.911533,6.3279,0.78659
Na2O,1545.0,0.1477,0.00037,0.15434,0.241639,0.2662,0.051149
Al2O3,1545.0,4.284422,1.13205,4.32909,4.976187,8.02546,0.374222
SiO2,1545.0,19.292387,13.07616,19.39674,20.503,21.33109,0.690692
SO3,1545.0,2.676948,0.07531,2.69615,3.128059,3.60785,0.220211
K2O,1545.0,0.454652,0.17397,0.44412,0.617978,0.67241,0.072793
Fe2O3,1545.0,2.729545,0.20984,2.73625,3.257414,5.16973,0.269114
Loss on Ignition,1546.0,6.278467,2.82,5.55,11.1855,12.8,2.355323


<IPython.core.display.Javascript object>

In [38]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Date,1547,,,,2022-04-13 14:37:46.580478208,2021-01-02 00:00:00,2021-08-09 00:00:00,2022-03-15 00:00:00,2023-01-16 12:00:00,2023-07-25 00:00:00
Cement_Type,1547,3.0,CP II-F-40,808.0,,,,,,


<IPython.core.display.Javascript object>

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1547 entries, 0 to 1546
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Date                  1547 non-null   datetime64[ns]
 1   CaO                   1545 non-null   float64       
 2   MgO                   1545 non-null   float64       
 3   Na2O                  1545 non-null   float64       
 4   Al2O3                 1545 non-null   float64       
 5   SiO2                  1545 non-null   float64       
 6   SO3                   1545 non-null   float64       
 7   K2O                   1545 non-null   float64       
 8   Fe2O3                 1545 non-null   float64       
 9   Loss on Ignition      1546 non-null   float64       
 10  Insoluble Residue     1546 non-null   float64       
 11  Blaine                1546 non-null   float64       
 12  Initial setting time  1544 non-null   float64       
 13  Final setting time

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [40]:
# We first make sure Date column was infered correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [41]:
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [42]:
df.to_csv("../../../../data/interim/209/q.csv", index=False)

<IPython.core.display.Javascript object>