In [1]:
import pandas as pd
import numpy as np

## Reading the dataset

In [2]:
# Read the "dados" sheet of the macro-enabled workbook (the 2008-2017 data).
# Opening the workbook through a context manager guarantees the underlying
# file handle is closed as soon as the sheet has been loaded.
with pd.ExcelFile(
    "../../../data/raw/Inter/estudo.xlsm", engine="openpyxl"
) as xls:
    df = pd.read_excel(xls, sheet_name="dados")

# Read the 2018-2019 workbook. The "Data" column is not parsed here; it is
# converted to datetime in the preprocessing cell with an explicit format.
df_2018_2019 = pd.read_excel(
    "../../../data/raw/Inter/Dados_CAJ_2018_19.xlsx",
    engine="openpyxl",
)

## Initial Preprocessing

In [3]:
# 2008-2017: discard the "Unnamed" columns, then the rows in which every
# remaining value is missing, and normalise the two fineness column names.
# NOTE(review): the trailing character in the "G75"/"G44" keys looks
# mis-encoded here — confirm it matches the real spreadsheet headers.
df = (
    df.loc[:, ~df.columns.str.contains('^Unnamed')]
    .dropna(how="all")
    .rename(columns={"G75�": "G75", "G44�": "G44"})
)

# 2018-2019: same column-name normalisation; dates are stored day-first.
df_2018_2019 = df_2018_2019.rename(columns={"G75�": "G75", "G44�": "G44"})
df_2018_2019["Data"] = pd.to_datetime(df_2018_2019["Data"], format="%d/%m/%Y")

# Parse the 2008-2017 dates, then stack both periods under a fresh 0..n-1 index.
df["Data"] = pd.to_datetime(df["Data"], format="%Y/%m/%d")
df = pd.concat([df, df_2018_2019], ignore_index=True)

In [4]:
# One frame per product code, each re-indexed from zero.
df_cpiif32 = df.loc[df["COD_PROD"].eq("CPIIF32")].reset_index(drop=True)
df_cpiif40 = df.loc[df["COD_PROD"].eq("CPIIF40")].reset_index(drop=True)

# Removing Useless Features

Here we will drop some features that may not contribute to the performance of the models that will be fitted on this data.

<h3>Percentage of missing values:</h3>

<table>
    <tr>
        <th>Full Dataset:</th>
        <th> </th>
    </tr>
    <tr>
        <td>RIC_OBS</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>RIE_OBS</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>Rept</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>RC91</td>
        <td>99.695013</td>
    </tr>
    <tr>
        <td>Nat1</td>
        <td>99.605310</td>
    </tr>
    <tr>
        <td>CO2</td>
        <td>99.551489</td>
    </tr>
    <tr>
        <td>NA2O</td>
        <td>82.579835</td>
    </tr>
    <tr>
        <td>EXP</td>
        <td>78.238249</td>
    </tr>
    <tr>
        <td>RC1</td>
        <td>42.752063</td>
    </tr>
    <tr>
        <td>MVOL</td>
        <td>28.040904</td>
    </tr>
    <tr>
        <td>RICARB</td>
        <td>18.066021</td>
    </tr>
</table>

<table>
    <th>CPIIF40</th>
    <th></th>
    <tr>
        <td>TiO2</td>
        <td>99.836334</td>
    </tr>
    <tr>
        <td>Ca(OH)2</td>
        <td>99.836334</td>
    </tr>
    <tr>        
        <td>CaCO3</td>
        <td>99.836334</td>
    </tr>
    <tr>
    <td>Langbeinite</td>
    <td>69.558101</td>
    </tr>
    <tr>
    <td>1 day Compressive strength</td>
    <td>100.000000</td>
    </tr>
</table>

<table>
    <th>CPIIF32</th>
    <th></th>
    <tr>
        <td>TiO2</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>Ca(OH)2</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>CaCO3</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>Langbeinite</td>
        <td>68.646081</td>
    </tr>
    <tr>
        <td>1 day Compressive strength</td>
        <td>100.000000</td>
    </tr>
    <tr>
        <td>#400</td>
        <td>100.000000</td>
    </tr>
</table>

In [5]:
# Columns removed from every dataset before saving.
# Each label appears exactly once; "Nat1" is listed only under "Missing Values".
FEATURES_TO_DROP = [
    # Missing Values
    "RIC_OBS",
    "RIE_OBS",
    "Rept",
    "RC91",
    "Nat1",
    "CO2",
    "NA2O",
    "EXP",

    # Features with zero variance
    # or more than 70% of zeros

    # Properties
    "IP",     # Initial setting time
    "FP",     # Final setting time
    "SBL",    # Blaine
    "G75",    # Fineness
    "G44",    # Fineness
    "MVOL",   # Density
    "RICARB", # Insoluble residue
    "AGP",    # Paste consistency water
    "PF",     # Fire Loss

    # Useless Features
    "RIC",
    "Nat",
    "Lp",
    "Ec",
    "Lc",
    "RIE",
    "COD_MN",
    "COD_ENT",
    "COD_VAR_PROD",
]

### Removing features with 60% or more missing values

### Removing features related to the properties of Cement

### Removing features with zero variance

### Removing features with more than 70% of zeros

In [6]:
# Remove the listed columns from every frame. The per-product frames also
# lose "COD_PROD"; the combined frame keeps it.
df = df.drop(columns=FEATURES_TO_DROP)
df_cpiif32 = df_cpiif32.drop(columns=[*FEATURES_TO_DROP, "COD_PROD"])
df_cpiif40 = df_cpiif40.drop(columns=[*FEATURES_TO_DROP, "COD_PROD"])

### Drop rows where the target variable is null

In [7]:
# Keep only the rows that have a value for the target column "RC28".
df = df.dropna(subset=["RC28"])
df_cpiif32 = df_cpiif32.dropna(subset=["RC28"])
df_cpiif40 = df_cpiif40.dropna(subset=["RC28"])

In [8]:
# Share of missing values per column (in percent), largest first,
# rendered with a red gradient.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .div(df.shape[0])
    .mul(100)
    .to_frame(name="Missing (%)")
    .style.background_gradient(cmap="Reds")
)

Unnamed: 0,Missing (%)
RC1,42.726128
P2O5,2.319727
K2O,2.032009
FE2O3,1.996044
SIO2,1.996044
CAOT,1.996044
AL2O3,1.996044
MGO,1.870167
SO3,1.744291
RC7,0.017982


In [9]:
# Share of missing values per column (in percent) for the CPIIF32 frame,
# largest first, rendered with a red gradient.
(
    df_cpiif32.isna()
    .sum()
    .sort_values(ascending=False)
    .div(df_cpiif32.shape[0])
    .mul(100)
    .to_frame(name="Missing (%)")
    .style.background_gradient(cmap="Reds")
)

Unnamed: 0,Missing (%)
RC1,62.957075
P2O5,2.596714
K2O,2.411235
FE2O3,2.358241
SIO2,2.358241
CAOT,2.358241
AL2O3,2.358241
MGO,2.252252
SO3,2.066773
RC7,0.026497


In [10]:
# Share of missing values per column (in percent) for the CPIIF40 frame,
# largest first, rendered with a red gradient.
(
    df_cpiif40.isna()
    .sum()
    .sort_values(ascending=False)
    .div(df_cpiif40.shape[0])
    .mul(100)
    .to_frame(name="Missing (%)")
    .style.background_gradient(cmap="Reds")
)

Unnamed: 0,Missing (%)
P2O5,1.734751
FE2O3,1.231114
SIO2,1.231114
K2O,1.231114
CAOT,1.231114
AL2O3,1.231114
SO3,1.063234
MGO,1.063234
RC1,0.0
RC28,0.0


In [11]:
# Summary statistics of the combined frame, one row per numeric column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RC3,5561.0,23.013181,5.483241,5.6,18.1,23.5,27.5,39.5
RC7,5560.0,31.45596,6.443364,8.9,25.7,32.2,36.7,51.4
RC28,5561.0,47.585197,6.948911,28.1,41.6,48.8,52.8,64.0
AL2O3,5450.0,3.779171,0.41262,1.39,3.47,3.78,4.05,5.92
CAOT,5450.0,56.277451,1.743092,5.89,55.37,56.05,57.14,62.82
K2O,5448.0,0.279939,0.774718,0.02,0.22,0.26,0.31,57.06
MGO,5457.0,6.037064,0.91629,0.29,5.78,6.07,6.35,61.1
SIO2,5450.0,16.060031,1.260572,0.0,15.27,16.39,16.97,20.37
FE2O3,5450.0,3.722193,0.534989,1.7,3.32,3.67,4.07,16.7
SO3,5464.0,2.682041,0.237515,0.6,2.54,2.67,2.81,4.41


In [12]:
# Summary statistics of the CPIIF32 frame, one row per numeric column.
df_cpiif32.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RC3,3774.0,20.359028,4.374606,5.6,17.0,19.5,23.9,34.5
RC7,3773.0,28.305484,5.11572,8.9,24.4,27.4,32.6,43.2
RC28,3774.0,44.230742,5.643229,28.1,39.1,44.9,49.0,59.3
AL2O3,3685.0,3.627775,0.362145,2.68,3.36,3.61,3.86,5.92
CAOT,3685.0,55.892499,1.621989,5.89,55.13,55.84,56.59,61.97
K2O,3683.0,0.281852,0.939208,0.1,0.23,0.26,0.3,57.06
MGO,3689.0,6.064888,0.568579,0.29,5.8,6.12,6.41,8.11
SIO2,3685.0,15.732404,1.301391,2.66,14.53,16.02,16.77,20.37
FE2O3,3685.0,3.751788,0.611057,1.7,3.27,3.66,4.24,16.7
SO3,3696.0,2.668872,0.226915,0.6,2.54,2.65,2.77,4.41


In [13]:
# Summary statistics of the CPIIF40 frame, one row per numeric column.
df_cpiif40.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RC3,1787.0,28.618539,2.61663,19.6,26.9,28.7,30.3,39.5
RC7,1787.0,38.10775,2.951655,28.2,36.1,38.1,40.1,51.4
RC28,1787.0,54.669536,3.008993,43.5,52.5,54.3,56.7,64.0
AL2O3,1765.0,4.095258,0.322759,1.39,3.87,4.09,4.33,5.24
CAOT,1765.0,57.081161,1.71327,28.23,55.83,56.64,58.18,62.82
K2O,1765.0,0.275949,0.109753,0.02,0.16,0.29,0.35,0.64
MGO,1768.0,5.97901,1.382999,2.85,5.71,6.01,6.21,61.1
SIO2,1765.0,16.744057,0.824106,0.0,16.35,16.81,17.2,19.95
FE2O3,1765.0,3.660402,0.314106,2.61,3.44,3.67,3.88,6.24
SO3,1768.0,2.70957,0.25616,0.72,2.5575,2.71,2.87,3.72


### Sort the dataset by date

In [14]:
# Order every frame chronologically by its "Data" column.
df, df_cpiif32, df_cpiif40 = (
    frame.sort_values("Data") for frame in (df, df_cpiif32, df_cpiif40)
)

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [15]:
# Write the combined frame and the two per-product frames to the interim
# data folder, without the index column.
df.to_csv(
    path_or_buf="../../../data/interim/Inter/cement-shipping.csv",
    index=False,
)
df_cpiif32.to_csv(
    path_or_buf="../../../data/interim/Inter/cpiif32.csv",
    index=False,
)
df_cpiif40.to_csv(
    path_or_buf="../../../data/interim/Inter/cpiif40.csv",
    index=False,
)