In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    try:
        # If it's a string enclosed in single quotes, remove quotes and replace comma with dot
        if isinstance(value, str):
            return float(value.replace(",", "."))
        else:
            return float(value)
    except ValueError as e:
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

In [4]:
def remove_outliers(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    return df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

<IPython.core.display.Javascript object>

## Reading the dataset

In [5]:
plant = "D"

xls = pd.ExcelFile(
    f"../../../../data/raw/203/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx",
    engine="openpyxl",
)

<IPython.core.display.Javascript object>

In [6]:
dataframes = []

for sheet_name in xls.sheet_names[1:]:
    df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
    dataframes.append(df.drop([0, 1], axis=0).reset_index(drop=True).copy())

df = pd.concat(dataframes, axis=0).reset_index(drop=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [7]:
df = pd.concat(
    [
        df["Dados iniciais"],
        df["Cimento"]["Análise química"],
        df["Característias físicas do cimento"],
        df["Propriedades do cimento"],
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

### 1. Dropping columns where either there is no data nor it contributes to the analysis
### 2. Renaming columns to a short identifiable name 

In [8]:
COLUMNS_TO_DROP = [
    ("Data", "Produção"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    ("Unnamed: 139_level_1", "#400"),
]

COLUMNS_TO_RENAME = {
    ("Cidade", "Unnamed: 1_level_2"): "Plant",
    ("Data", "Medida"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [9]:
df["Cement_Type"] = (
    df[
        [
            ("Tipo de cimento", "Unnamed: 6_level_2"),
            ("Classe de resistência", "Unnamed: 7_level_2"),
            ("Obs.", "Unnamed: 8_level_2"),
        ]
    ]
    .fillna("")
    .astype(str)
    .sum(axis=1)
)

<IPython.core.display.Javascript object>

### Replacing the current cement type categories for a short identifiable category

In [10]:
# Preprocessing
df["Cement_Type"] = df["Cement_Type"].replace("FIBROCMENTO", "Fibrocimento")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIEnsacado", "CP VARI")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIGranel", "CP VARI")
df["Cement_Type"] = df["Cement_Type"].replace("CP III40Ensacado", "CP III40")
df["Cement_Type"] = df["Cement_Type"].replace("CP III40Granel", "CP III40")
df["Cement_Type"] = df["Cement_Type"].replace("CP VARIRS", "CP VARI")

<IPython.core.display.Javascript object>

### Dropping and rename the columns defined above

In [11]:
df = df.drop(COLUMNS_TO_DROP, axis=1).rename(COLUMNS_TO_RENAME, axis=1).copy()
df = df.infer_objects()

<IPython.core.display.Javascript object>

### Clean and converting numeric columns to float

In [12]:
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: '             '
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '48.6.'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'


<IPython.core.display.Javascript object>

### Converting Date columns to pandas datetime

In [13]:
df["Date"] = pd.to_datetime(df["Date"])

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [14]:
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
df = df.iloc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Removing features
 that will be fitted on this data.

1. Removing features with features having 60% or more of missing values
2. Removing features with zero variance
3. Removing features with more than 70% of zeros

<h3>Percentage of missing values:</h3>
    
    
<table>
    <th>Plant AV - Full Dataset:</th>
    <th></th>
    <tr>
        <td>TiO2</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>Al2O3</td>
        <td>100.00</td>
    </tr>        
    <tr>
        <td>CaO</td>
        <td>100.00</td>
    </tr>
    <tr>
        <td>Fe2O3</td>
        <td>100.00</td>
    </tr>    
    <tr>
        <td>SiO2</td>
        <td>100.00</td>
    </tr>    
    <tr>
        <td>Insoluble Residue</td>
        <td>73.55</td>
    </tr>  
    <tr>
        <td>CS3</td>
        <td>48.80</td>
    </tr>
</table>

In [15]:
FEATRUES_TO_DROP = ["TiO2", "Al2O3", "CaO", "Fe2O3", "SiO2", "Insoluble Residue", "CS3"]
df = df.drop(labels=FEATRUES_TO_DROP, axis=1)

<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [16]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [17]:
df.shape

(4908, 17)

<IPython.core.display.Javascript object>

In [18]:
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [19]:
df.shape

(4765, 17)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [20]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

MgO                     0.0
Na2O                    0.0
SO3                     0.0
K2O                     0.0
Loss on Ignition        0.0
Blaine                  0.0
#200                    0.0
#325                    0.0
Specific Gravity        0.0
Initial setting time    0.0
Final setting time      0.0
CS1                     0.0
CS7                     0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Dropping rows with zero values

In [21]:
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x == 0).sum().sort_values(ascending=False)

MgO                     0.0
Na2O                    0.0
SO3                     0.0
K2O                     0.0
Loss on Ignition        0.0
Blaine                  0.0
#200                    0.0
#325                    0.0
Specific Gravity        0.0
Initial setting time    0.0
Final setting time      0.0
CS1                     0.0
CS7                     0.0
CS28                    0.0
dtype: float64

<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [22]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

### Removing outliers

In [23]:
# import matplotlib.pyplot as plt

# # Number of columns in the dataframe
# num_columns = df.drop(["Plant", "Date", "Cement_Type"], axis=1).shape[1]

# # Creating subplots
# fig, axes = plt.subplots(num_columns, 1, figsize=(8, num_columns * 4))

# # Plotting each variable in its own subplot
# for i, column in enumerate(df.drop(["Plant", "Date", "Cement_Type"], axis=1).columns):
#     df.boxplot(column=column, ax=axes[i])
#     axes[i].set_title(column)
#     axes[i].set_xlabel('')

# plt.tight_layout()
# plt.show()


<IPython.core.display.Javascript object>

In [24]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,4687.0,2019-06-05 00:17:16.910603776,2011-01-19 00:00:00,2017-08-08 12:00:00,2019-05-23 00:00:00,2021-04-19 00:00:00,2023-03-28 00:00:00,
MgO,4660.0,2.116204,1.038,1.69,2.11,2.51,3.92,0.541846
Na2O,4660.0,0.105331,0.0001,0.039,0.051,0.077,2.81,0.166328
SO3,4653.0,3.087341,1.09,2.7,2.84,3.35,5.87,0.590016
K2O,4660.0,1.075985,0.016,1.03,1.08,1.14,1.8,0.103748
Loss on Ignition,4659.0,5.140656,0.49,2.82,4.52,6.35,12.19,2.833233
Blaine,4661.0,4307.433169,1358.0,4063.0,4312.0,4502.0,5926.0,376.214826
#200,4654.0,0.591682,0.0,0.04,0.14,1.09,10.63,0.850133
#325,4659.0,4.589523,0.03,0.73,1.79,9.7,21.34,5.088268
Specific Gravity,4572.0,3.122549,2.83,3.0,3.07,3.1,311.0,4.554702


<IPython.core.display.Javascript object>

In [25]:
idx = df[df["Specific Gravity"] == 311.0].index
idx2 = df[df["Initial setting time"] == 0.0].index
idx3 = df[df["Final setting time"] == 21.0].index
idx4 = df[df["Na2O"] == 2.81].index


df = df.drop(idx)
df = df.drop(idx2)
df = df.drop(idx3)
df = df.drop(idx4)

<IPython.core.display.Javascript object>

In [26]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,4683.0,2019-06-04 11:48:51.197949952,2011-01-19 00:00:00,2017-08-08 00:00:00,2019-05-23 00:00:00,2021-04-17 00:00:00,2023-03-28 00:00:00,
MgO,4656.0,2.116042,1.038,1.6875,2.11,2.51,3.92,0.541843
Na2O,4656.0,0.104782,0.0001,0.039,0.051,0.077,0.94,0.161602
SO3,4649.0,3.087388,1.09,2.69,2.84,3.35,5.87,0.590255
K2O,4656.0,1.075928,0.016,1.03,1.08,1.14,1.8,0.103763
Loss on Ignition,4655.0,5.140701,0.49,2.82,4.53,6.35,12.19,2.83327
Blaine,4657.0,4307.490659,1358.0,4063.0,4312.0,4502.0,5926.0,376.339271
#200,4650.0,0.591978,0.0,0.04,0.14,1.09,10.63,0.850377
#325,4655.0,4.590893,0.03,0.73,1.8,9.7,21.34,5.089274
Specific Gravity,4568.0,3.055209,2.83,3.0,3.07,3.1,3.4,0.06195


<IPython.core.display.Javascript object>

In [27]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")

Unnamed: 0,Missing (%)
Specific Gravity,2.455691
SO3,0.72603
#200,0.704676
Loss on Ignition,0.597907
#325,0.597907
MgO,0.576553
Na2O,0.576553
K2O,0.576553
Blaine,0.5552
CS7,0.192184


<IPython.core.display.Javascript object>

In [28]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,4683.0,2019-06-04 11:48:51.197949952,2011-01-19 00:00:00,2017-08-08 00:00:00,2019-05-23 00:00:00,2021-04-17 00:00:00,2023-03-28 00:00:00,
MgO,4656.0,2.116042,1.038,1.6875,2.11,2.51,3.92,0.541843
Na2O,4656.0,0.104782,0.0001,0.039,0.051,0.077,0.94,0.161602
SO3,4649.0,3.087388,1.09,2.69,2.84,3.35,5.87,0.590255
K2O,4656.0,1.075928,0.016,1.03,1.08,1.14,1.8,0.103763
Loss on Ignition,4655.0,5.140701,0.49,2.82,4.53,6.35,12.19,2.83327
Blaine,4657.0,4307.490659,1358.0,4063.0,4312.0,4502.0,5926.0,376.339271
#200,4650.0,0.591978,0.0,0.04,0.14,1.09,10.63,0.850377
#325,4655.0,4.590893,0.03,0.73,1.8,9.7,21.34,5.089274
Specific Gravity,4568.0,3.055209,2.83,3.0,3.07,3.1,3.4,0.06195


<IPython.core.display.Javascript object>

In [29]:
df.describe(exclude="number").T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max
Plant,4683,1.0,D,4683.0,,,,,,
Date,4683,,,,2019-06-04 11:48:51.197949952,2011-01-19 00:00:00,2017-08-08 00:00:00,2019-05-23 00:00:00,2021-04-17 00:00:00,2023-03-28 00:00:00
Cement_Type,4683,5.0,CP VARI,1580.0,,,,,,


<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [30]:
df = df.drop("Plant", axis=1)

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [31]:
# We first make sure Date column was infered correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [32]:
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [33]:
df.to_csv("../../../../data/interim/203/d.csv", index=False)

<IPython.core.display.Javascript object>