In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

<IPython.core.display.Javascript object>

## Reading the dataset

In [3]:
# Load the "DB2 Clínquer" sheet of the partner workbook.
# The workbook is opened inside a `with` block so the underlying file handle is
# released as soon as the sheet has been parsed instead of staying open for the
# lifetime of the kernel.
# header=[1, 2] builds two-level column labels from the sheet rows at
# positions 1 and 2 (zero-based).
with pd.ExcelFile(
    "../../../data/raw/partner_i-Oficial/DB_Master_CP1_latest v03.06.xlsx",
    engine="openpyxl",
) as xls:
    df = pd.read_excel(xls, "DB2 Clínquer", header=[1, 2])

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [4]:
# Keep only the "Date", "Time" and "Composition" column groups. Selecting a
# top-level label drops that level, so the result has single-level columns.
selected_groups = [df[group] for group in ("Date", "Time", "Composition")]
df = pd.concat(selected_groups, axis=1)

# Discard the rows labelled 0, 1 and 2 and renumber the remaining rows.
# NOTE(review): presumably these are non-data rows (units / sub-headers) —
# confirm against the sheet.
df = df.drop(index=[0, 1, 2]).reset_index(drop=True)

# "Production" and "Measurement" each occur twice, so selecting them by name
# yields a two-column frame. The first occurrence keeps its name and the second
# gets a ".1" suffix so every column can be addressed unambiguously.
# NOTE(review): the first occurrence looks like the date and the second the
# time of day — confirm against the sheet layout.
production = df["Production"]
measurement = df["Measurement"]
remaining = df.drop(columns=["Production", "Measurement"])

df = pd.concat(
    [
        production.iloc[:, 0],
        measurement.iloc[:, 0],
        production.iloc[:, 1].rename("Production.1"),
        measurement.iloc[:, 1].rename("Measurement.1"),
        remaining,
    ],
    axis=1,
).infer_objects()  # let pandas pick better dtypes for object columns

<IPython.core.display.Javascript object>

# Removing Useless Features

Here we will drop some features that may not contribute to the performance of the models that will be fitted on this data.

<h3>Percentage of missing values:</h3>
    
    
<table>
    <tr>
        <th>Full Dataset:</th>
        <th>Missing (%)</th>
    </tr>
    <tr>
        <td>Remarks</td>
        <td>100</td>
    </tr>
    <tr>
        <td>Bassanite</td>
        <td>100</td>
    </tr>
    <tr>
        <td>Thenardite</td>
        <td>100</td>
    </tr>
    <tr>
        <td>Dolomite</td>
        <td>100</td>
    </tr>
    <tr>
        <td>Ca(OH)2</td>
        <td>100</td>
    </tr>
    <tr>
        <td>CaCO3</td>
        <td>100</td>
    </tr>
    <tr>
        <td>Anhidrite</td>
        <td>100</td>
    </tr>
    <tr>
        <td>Gypsum</td>
        <td>100</td>
    </tr>
    <tr>
        <td>Syngenite</td>
        <td>100</td>
    </tr>
</table>

In [5]:
# Columns excluded from the processed dataset.
# Every entry is listed as 100% missing in the table above, so none of them
# carries usable information. Names are spelled exactly as in the sheet.
FEATURES_TO_DROP = [
    "Thenardite",
    "CaCO3",
    "Ca(OH)2",
    "Gypsum",
    "Bassanite",
    "Anhidrite",
    "Dolomite",
    "Syngenite",
    "Remarks",
]

<IPython.core.display.Javascript object>

### Removing features with 60% or more missing values

### Removing features related to the properties of Cement

### Removing features with zero variance

### Removing features with more than 70% of zeros

In [6]:
# Remove every column named in FEATURES_TO_DROP; with pandas' default
# errors="raise", a name that is not present raises KeyError.
df = df.drop(columns=FEATURES_TO_DROP)

<IPython.core.display.Javascript object>

### Filling missing values with interpolation

In [7]:
# Fill gaps by linear interpolation, column by column; limit_direction="both"
# also fills gaps at the very start and end of a column.
# Only numeric columns are interpolated: the date/time columns have not been
# converted yet (that happens in a later cell), and calling interpolate on
# object-dtype data is deprecated in recent pandas releases.
# NOTE(review): rows are interpolated in their current order, and the sort by
# date only happens further down — confirm the sheet is already chronological.
numeric_columns = df.select_dtypes(include="number").columns
df[numeric_columns] = df[numeric_columns].interpolate(
    method="linear", limit_direction="both"
)

<IPython.core.display.Javascript object>

### Convert Dates to appropriate Type

In [8]:
# Build one timestamp per row from the production date and time columns.
production_date = df["Production"].astype(str)
production_time = df["Production.1"].astype(str)

# Time strings of at most five characters (e.g. "HH:MM") have no seconds
# field, so ":00" is appended to match the parsing format below.
production_time = production_time.where(
    production_time.str.len() > 5, production_time + ":00"
)

dates = pd.to_datetime(
    (production_date + " " + production_time).str.strip(),
    format="%d/%m/%Y %H:%M:%S",
)

# Discard the measurement columns and the separate time column. The parsed
# timestamps overwrite "Production", which is then renamed to "Date".
df = (
    df.drop(columns=["Measurement", "Production.1", "Measurement.1"])
    .assign(Production=dates)
    .rename(columns={"Production": "Date"})
)

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [9]:
# Order the rows chronologically (oldest first) using the "Date" column.
df = df.sort_values("Date", ascending=True)

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for modelling</h3>

In [10]:
# Write the processed frame to CSV without the row index.
# The target directory is created first (no-op when it already exists) so the
# export does not fail on a fresh checkout where data/processed is missing.
OUTPUT_PATH = Path("../../../data/processed/partner_i-Oficial/clinker.csv")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

<IPython.core.display.Javascript object>