In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Open the raw partner III workbook once so its sheets can be listed and parsed
# from the same handle. No close() call is visible in this notebook, so the
# handle stays open for the lifetime of the kernel.
raw_workbook_path = "../../../data/raw/partner_iii/Dati CADD 2020-2022.xlsx"
xls = pd.ExcelFile(raw_workbook_path, engine="openpyxl")

In [3]:
# Show the worksheet names available in the workbook held by `xls`.
xls.sheet_names

['2020-2022']

In [4]:
# Parse the single "2020-2022" worksheet from the already-open workbook.
df = pd.read_excel(xls, sheet_name="2020-2022")

# Removing Useless Features

Here we will drop some features that may not contribute to the performance of the models that will be fitted on this data.

## Removing features with 65% or more missing values

In [5]:
# Percentage of missing values per column, sorted from most to least missing.
miss_perc = (
    df.isna()
    .sum()
    .div(len(df))
    .mul(100)
    .sort_values(ascending=False)
)

In [6]:
# Display the per-column missing-value percentages (highest first).
miss_perc

Flow at 60 min (%)                         99.372057
Flow at 30 min (%)                         99.058085
3d compressive strength (MPa)              96.703297
Pozzolan (%)                               96.075353
Fly ash (%)                                94.819466
Slag (%)                                   94.191523
Flow (%)                                   92.935636
Mixing water for normal consistency (%)    92.621664
Alpine residual 200µm (%)                  91.993721
Other (%)                                  89.952904
Final setting time (min)                   88.383046
Initial setting time (min)                 88.383046
Alpine residual 90µm (%)                   85.086342
Blaine specific surface (cm2/g)            79.591837
7d compressive strength (MPa)              73.783359
Alpine residual 63µm (%)                   72.213501
Alpine residual 32µm (%)                   67.032967
Alpine residual 40µm (%)                   67.032967
Limestone (%)                              61.

In [7]:
# Columns whose missing-value percentage is strictly below the 65% cut-off.
miss_perc.loc[miss_perc.lt(65.0)]

Limestone (%)                     61.381476
24h compressive strength (MPa)    46.467818
2d compressive strength (MPa)     40.816327
28d compressive strength (MPa)    23.547881
Gypsum (%)                         0.000000
Clinker (%)                        0.000000
Description                        0.000000
ID sample                          0.000000
dtype: float64

In [8]:
# Columns retained for modelling: the three composition percentages plus the
# 28-day compressive strength. This is a hand-picked subset of the columns
# below the 65% missing-value cut-off; the 24h/2d strengths, "Description"
# and "ID sample" are not kept.
columns_to_keep = [
    "Limestone (%)",
    "Gypsum (%)",
    "Clinker (%)",
    "28d compressive strength (MPa)",
]

# Restrict the frame to those columns, preserving the order listed above.
df = df.loc[:, columns_to_keep]

### Drop rows where the target variable is null

In [9]:
# Keep only the rows that have a measured 28-day compressive strength.
df = df.dropna(subset=["28d compressive strength (MPa)"])

In [10]:
# Fraction (0-1) of missing values per remaining column.
df.isna().sum().div(len(df))

Limestone (%)                     0.587269
Gypsum (%)                        0.000000
Clinker (%)                       0.000000
28d compressive strength (MPa)    0.000000
dtype: float64

In [11]:
# Summary statistics, transposed so that each feature is shown as a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Limestone (%),201.0,12.314925,7.552779,3.0,6.0,11.7,15.0,37.5
Gypsum (%),487.0,4.616591,0.816562,1.75,4.2,5.0,5.0,6.5
Clinker (%),487.0,86.931745,12.651087,27.0,80.0,95.0,95.0,96.0
28d compressive strength (MPa),487.0,52.717923,7.021033,25.41,48.955,53.67,57.09,67.32


In [12]:
# Replace every remaining NaN with the sentinel value -1; `df` itself is left
# untouched so the NaN-preserving version can still be saved separately.
# NOTE(review): a -1 sentinel in a percentage column is read as a real value by
# linear models and neural nets -- confirm this is intended rather than, e.g.,
# 0 or a separate missing-indicator column.
df_without_missing = df.fillna(value=-1)

In [13]:
# Display the sentinel-filled frame to check the result of the fill.
df_without_missing

Unnamed: 0,Limestone (%),Gypsum (%),Clinker (%),28d compressive strength (MPa)
0,-1.0,5.0,95.0,55.60000
1,-1.0,5.0,95.0,53.50000
2,-1.0,5.0,95.0,50.20000
3,-1.0,5.0,95.0,53.20000
4,-1.0,5.0,95.0,52.30000
...,...,...,...,...
615,-1.0,5.0,95.0,53.56167
616,-1.0,5.0,95.0,59.82333
617,-1.0,5.0,95.0,55.58500
618,-1.0,5.0,95.0,53.79667


## Saving the dataset with the transforms

<h3>Saving for linear regression and Neural Nets</h3>

In [14]:
# Save the sentinel-filled frame (NaN replaced by -1) without the index column.
with_fillna_csv = Path(
    "../../../data/interim/partner_iii/Dati CADD 2020-2022_with_fillna.csv"
)
# to_csv does not create missing folders; make sure the interim directory
# exists so the notebook also runs when that folder has not been created yet.
with_fillna_csv.parent.mkdir(parents=True, exist_ok=True)
df_without_missing.to_csv(with_fillna_csv, index=False)

<h3>Saving for XGBoost</h3>

In [16]:
# Save the frame with its NaN values preserved, without the index column.
without_fillna_csv = Path(
    "../../../data/interim/partner_iii/Dati CADD 2020-2022_without_fillna.csv"
)
# to_csv does not create missing folders; make sure the interim directory
# exists so the notebook also runs when that folder has not been created yet.
without_fillna_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(without_fillna_csv, index=False)