In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

<IPython.core.display.Javascript object>

In [3]:
def convert_to_float(value):
    """Convert a raw spreadsheet cell to ``float``, returning NaN on failure.

    Strings may use a decimal comma (``"1,5"``); the comma is replaced by a
    dot before parsing. When the value cannot be converted, the error message
    is printed and ``float("nan")`` is returned.

    Parameters
    ----------
    value : object
        Cell content: a number, a numeric string, or anything else.

    Returns
    -------
    float
        The parsed number, or NaN when ``value`` is not convertible.
    """
    try:
        if isinstance(value, str):
            # Decimal comma -> decimal dot ("3,14" -> "3.14").
            return float(value.replace(",", "."))
        return float(value)
    # float(None) or float(<datetime>) raises TypeError rather than ValueError;
    # catching both keeps one stray cell from aborting the whole conversion.
    except (ValueError, TypeError) as e:
        print(e)
        return float("nan")

<IPython.core.display.Javascript object>

In [4]:
def preprocess_numeric_cols(df, columns):
    """Normalise the text of numeric columns so it can be parsed as floats.

    For every column in ``columns`` the values are cast to ``str``, ``#``
    characters are removed and commas are replaced by dots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable
        Labels of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col in columns:
        df[col] = (
            df[col]
            .astype(str)  # string type is required for the .str accessor
            # regex=False: literal replacement whatever the pandas default is.
            .str.replace("#", "", regex=False)
            .str.replace(",", ".", regex=False)
        )
    return df

<IPython.core.display.Javascript object>

In [5]:
def check_columns_match(df1, df2):
    """
    Check if two dataframes have the same columns in the same order.

    Both results are printed and returned.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose column labels are compared.

    Returns
    -------
    tuple of (bool, bool)
        ``(same_cols, same_order)``: whether both frames hold the same set of
        column labels, and whether the label sequences are identical.
    """
    same_cols = set(df1.columns) == set(df2.columns)
    print(f"Dataframes have the same columns: {same_cols}")

    # Compare the full label sequences. Indexing df2 with df1's positions
    # raised IndexError when df2 was narrower and reported True when df2 only
    # had extra trailing columns.
    same_order = list(df1.columns) == list(df2.columns)
    print(f"Dataframes have the same column order: {same_order}")

    return same_cols, same_order

<IPython.core.display.Javascript object>

In [6]:
# Mineral phase names: Innovandi naming -> EMBRAPII/CICS naming.
# NOTE(review): "Dolimite"/"Dolimita" look like misspellings of
# Dolomite/Dolomita; kept verbatim - confirm against the source column names.
mineral_map = {
    "Alite total": "C3S total",
    "Belite total": "C2S total",
    "Belite alpha": "C2S alpha",
    "Belite beta": "C2S beta",
    "Belite gamma": "C2S gama",
    "Ferrite": "C4AF",
    "Aluminate": "C3A",
    "Aluminate cubic": "C3A cubic",
    "Aluminate orto": "C3A orto",
    "Free lime": "CaO livre",
    "Portlandite": "Portlandita",
    "Periclase": "Periclasio",
    "Arcanite": "Arcanita",
    "Aphthalite": "Aphthalita",
    "Langbeinite": "Langbeinita",
    "Gypsum": "Sulfato de cálcio",
    "Bassanite": "Bassanita",
    "Anhydrite": "Anidrita",
    "Calcite": "Calcita",
    "Dolimite": "Dolimita",
    "Quartz": "Quartzo",
    "Muscovite": "Muscovita",
}

# Parallel name lists, in the same order as the mapping above.
innovandi = list(mineral_map.keys())
embrapii_cics = list(mineral_map.values())

<IPython.core.display.Javascript object>

# 203

## Reading the dataset

In [7]:
# Open one workbook per plant of the 203 raw-data folder; the sheets are
# parsed in the following cells.
plants = ["AV", "C", "D", "G", "M", "Y"]
workbook_template = "../../../../data/raw/203/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx"
xls_files = {}

for plant in plants:
    xls = pd.ExcelFile(workbook_template.format(plant=plant), engine="openpyxl")
    xls_files[plant] = xls

<IPython.core.display.Javascript object>

In [8]:
# Parse every sheet except the first one of each workbook ('INSTRUÇÕES' in
# the listing below), using the first three rows as a column MultiIndex.
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[1:]:
        df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
        # Rows labelled 0 and 1 are discarded before stacking.
        # NOTE(review): presumably non-data rows under the header - confirm.
        dataframes.append(df.drop(index=[0, 1]).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [9]:
for plant, xls in xls_files.items():
    print("Plant: ", plant, xls.sheet_names)

Plant:  AV ['INSTRUÇÕES', '122', '24', '134', '50']
Plant:  C ['INSTRUÇÕES', '114', '36', '144', '74', '54']
Plant:  D ['INSTRUÇÕES', '10', '131', '117', '197', '8']
Plant:  G ['INSTRUÇÕES', '3', '160', '49', '137', '2', '17', '142']
Plant:  M ['INSTRUÇÕES', '147', '141', '159', '73', '170', '53', '135', '67', '161']
Plant:  Y ['INSTRUÇÕES', '77', '151', '179', '136', '71', '113']


<IPython.core.display.Javascript object>

In [10]:
# Stack all sheets vertically under a fresh 0..n-1 index.
df = pd.concat(dataframes, axis=0, ignore_index=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [11]:
# Keep only the column groups used below, selected by their upper header
# level(s); the selected levels are dropped from the resulting labels.
selected_sections = [
    df["Dados iniciais"],
    df["Cimento"]["Análise química"],
    df["Característias físicas do cimento"],
    df["Propriedades do cimento"],
]
df = pd.concat(selected_sections, axis=1)

<IPython.core.display.Javascript object>

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name 

In [12]:
# Columns removed from the 203 data. Each label is listed once
# (("Obs.", "Unnamed: 148_level_2") used to appear twice).
COLUMNS_TO_DROP = [
    ("Data", "Produção"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Unnamed: 141_level_1", "Obs."),
]

# Original (Portuguese / positional) column labels -> short English names.
COLUMNS_TO_RENAME = {
    ("Cidade", "Unnamed: 1_level_2"): "Plant",
    ("Data", "Medida"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [13]:
# Build one label per row by concatenating cement type, strength class and
# observation; missing parts contribute an empty string.
cement_type_parts = [
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
]
cement_type_text = df[cement_type_parts].fillna("").astype(str)
# Summing strings row-wise concatenates them.
df["Cement_Type"] = cement_type_text.sum(axis=1)

<IPython.core.display.Javascript object>

### Replacing the current cement type categories with short, identifiable categories

In [14]:
# Preprocessing: map raw concatenated labels (exact matches only) to their
# canonical cement type. No target is itself a key, so a single mapping is
# equivalent to applying the replacements one after another.
cement_type_aliases = {
    "FIBROCMENTO": "Fibrocimento",
    "CP VARIEnsacado": "CP VARI",
    "CP VARIGranel": "CP VARI",
    "CP III40Ensacado": "CP III40",
    "CP III40Granel": "CP III40",
    "CP VARIRS": "CP VARI",
}
df["Cement_Type"] = df["Cement_Type"].replace(cement_type_aliases)

<IPython.core.display.Javascript object>

### Dropping and renaming the columns defined above

In [15]:
# Remove unused columns, apply the short names, then let pandas infer
# better dtypes for object columns.
df = (
    df.drop(columns=COLUMNS_TO_DROP)
    .rename(columns=COLUMNS_TO_RENAME)
    .copy()
    .infer_objects()
)

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [16]:
# Columns converted to float for the 203 data.
# NOTE(review): "#400" is absent here although it appears in the
# NUMERIC_COLUMNS lists of the 204/206/207 sections - confirm this is intended.
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

# Element-wise conversion; convert_to_float prints a message and yields NaN
# for every cell it cannot parse (see the output below).
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '1..10'
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: '             '
could not convert string to float: '  '
could not convert string to float: ' '
could not convert string to float: '1.6.0'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
coul

<IPython.core.display.Javascript object>

### Converting Date columns to pandas datetime

In [17]:
# Parse the "Date" column into pandas datetime values.
df["Date"] = pd.to_datetime(df["Date"])

<IPython.core.display.Javascript object>

In [18]:
df.shape

(20203, 25)

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [19]:
# Keep only rows with at least one non-missing value outside the identifier
# columns (Plant, Date, Cement_Type).
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
# index_to_keep holds index *labels*, so select with .loc; .iloc would read
# them as positions, which is only correct while the index is 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [20]:
df.shape

(19506, 25)

<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [21]:
# Add the dataset-qualified plant id ("203_" + Plant) as the last column and
# drop the raw "Plant" column.
df = df.assign(Factory_Plant="203_" + df["Plant"]).drop(columns="Plant")

<IPython.core.display.Javascript object>

In [22]:
# Snapshot the processed 203 data; `df` is reused for the next dataset.
df_203 = df.copy()

<IPython.core.display.Javascript object>

# 204

In [23]:
# Open one workbook per plant of the 204 raw-data folder.
plants = ["A", "AB", "AE", "AQ", "AY", "F", "I", "O", "R", "T"]
workbook_template = "../../../../data/raw/204/EMBRAPII hubIC IACC CIMENTO_{plant}.xlsx"
xls_files = {}

for plant in plants:
    xls = pd.ExcelFile(workbook_template.format(plant=plant), engine="openpyxl")
    xls_files[plant] = xls

<IPython.core.display.Javascript object>

In [24]:
# Exploratory pass: collect the distinct cement-type labels of every sheet.
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[1:]:
        df = pd.read_excel(xls, sheet_name, header=[1, 2])
        type_cols_blank = df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all()

        if type_cols_blank:
            # Type/class columns are empty: take the label from "Obs.".
            raw_labels = (
                df[[("Obs.", "Unnamed: 8_level_1")]].loc[2:].astype(str).sum(axis=1)
            )
            cleaned = raw_labels.str.replace(" ", "").str.replace("-", "")
        else:
            # Concatenate type and class, then strip spaces, dashes and the
            # "nan" text produced by astype(str) for missing cells.
            raw_labels = df[CEMENT_TYPE_COLS].loc[2:].astype(str).sum(axis=1)
            cleaned = (
                raw_labels.str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
            )
        cement_type = cleaned.unique()

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [25]:
# List the sheet names of every opened workbook ("Palnt" typo fixed).
for plant, xls in xls_files.items():
    print("Plant: ", plant, xls.sheet_names)

Palnt:  A ['INSTRUÇÕES', '176', '102']
Palnt:  AB ['INSTRUÇÕES', '13', '51', '173']
Palnt:  AE ['INSTRUÇÕES', '181', '109', '23']
Palnt:  AQ ['INSTRUÇÕES', 'Fibro', '7', '153', '89']
Palnt:  AY ['INSTRUÇÕES', '12']
Palnt:  F ['INSTRUÇÕES', '55', '75', '14', '91', '65']
Palnt:  I ['INSTRUÇÕES', '47', '11', '1']
Palnt:  O ['INSTRUÇÕES', '132', '81', 'FIBRO']
Palnt:  R ['INSTRUÇÕES', '21', '61']
Palnt:  T ['INSTRUÇÕES', '72', '104']


<IPython.core.display.Javascript object>

In [26]:
# Re-read every data sheet with all three header rows and tag each frame
# with its plant code under a placeholder three-level column label.
plant_key = ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2")
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[1:]:
        df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
        df[plant_key] = plant
        dataframes.append(df.drop(index=[0, 1]).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [27]:
# Stack all sheets vertically under a fresh 0..n-1 index.
df = pd.concat(dataframes, axis=0, ignore_index=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [28]:
# Keep the plant tag plus the column groups used below.
plant_column = df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"]
selected_sections = [
    plant_column,
    df["Dados iniciais"],
    df["Cimento"]["Análise química"],
    df["Característias físicas do cimento"],
    df["Propriedades do cimento"],
]
df = pd.concat(selected_sections, axis=1)

<IPython.core.display.Javascript object>

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name

In [29]:
# Columns removed from the 204 data. Each label is listed once
# (("Obs.", "Unnamed: 148_level_2") used to appear twice).
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Unnamed: 141_level_1", "Obs."),
]


# Original (Portuguese / positional) column labels -> short English names.
COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Cidade", "Unnamed: 1_level_2"): "City",
    # TODO: fix! NOTE(review): the 203 section takes "Date" from
    # ("Data", "Medida") instead - confirm which date is intended.
    ("Data", "Produção"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [30]:
# Single variable for the Cement Type: concatenate type, strength class and
# observation, with missing parts contributing an empty string.
cement_type_parts = [
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
]
df["Cement_Type"] = df[cement_type_parts].fillna("").astype(str).sum(axis=1)

# Preprocessing: expand the short "Fibro" label (exact matches only).
df["Cement_Type"] = df["Cement_Type"].replace({"Fibro": "Fibrocimento"})

<IPython.core.display.Javascript object>

### Dropping and renaming the columns defined above

In [31]:
# Remove unused columns and apply the short names.
df = df.drop(columns=COLUMNS_TO_DROP).rename(columns=COLUMNS_TO_RENAME).copy()
# Discard the row labelled 0, renumber, and infer better dtypes.
# NOTE(review): the reason for dropping this row is not visible here - confirm.
df = df.drop(index=[0]).reset_index(drop=True).infer_objects()

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [32]:
# Columns converted to float for the 204 data.
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

# Strip '#' and decimal commas from the raw text first.
df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing. "City" is an identifier like "Plant", so it
# is excluded from the check too (as the 203 and 207 sections do); otherwise
# a row holding nothing but a city name would never be dropped.
index_to_keep = (
    df.drop(["Plant", "City", "Date", "Cement_Type"], axis=1)
    .dropna(axis=0, how="all")
    .index
)
# index_to_keep holds index labels, hence .loc rather than .iloc.
df = df.loc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

<IPython.core.display.Javascript object>

### Dropping variable with City and Plant identification

In [33]:
# NOTE(review): the constant name is misspelt ("FEATRUES"); kept unchanged in
# case other cells reference it.
FEATRUES_TO_DROP = ["City", "Plant"]
# Add the dataset-qualified plant id as the last column, then drop the raw
# identifier columns.
df = df.assign(Factory_Plant="204_" + df["Plant"]).drop(columns=FEATRUES_TO_DROP)

<IPython.core.display.Javascript object>

In [34]:
df.shape

(16000, 25)

<IPython.core.display.Javascript object>

In [35]:
# Snapshot the processed 204 data; `df` is reused for the next dataset.
df_204 = df.copy()

<IPython.core.display.Javascript object>

In [36]:
check_columns_match(df_203, df_204)

Dataframes have the same columns: True
Dataframes have the same column order: True


(True, True)

<IPython.core.display.Javascript object>

# 206

In [37]:
# Dataset 206 has a single plant workbook.
plant = "B"
workbook_path = f"../../../../data/raw/206/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx"

xls = pd.ExcelFile(workbook_path, engine="openpyxl")
xls_files = {plant: xls}

<IPython.core.display.Javascript object>

In [38]:
print("Plant: ", plant, xls.sheet_names)

Plant:  B ['127', '90', 'RX']


<IPython.core.display.Javascript object>

In [39]:
# Exploratory pass: collect the distinct cement-type labels of every sheet
# except "RX".
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    data_sheets = [name for name in xls.sheet_names if name != "RX"]
    for sheet_name in data_sheets:
        df = pd.read_excel(xls, sheet_name, header=[1, 2])
        # NOTE(review): the emptiness check starts at position 2 while the
        # labels below are read from row label 3 onwards - confirm the offset.
        type_cols_blank = df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all()

        if type_cols_blank:
            # Type/class columns are empty: take the label from "Obs.".
            raw_labels = (
                df[[("Obs.", "Unnamed: 8_level_1")]].loc[3:].astype(str).sum(axis=1)
            )
            cleaned = raw_labels.str.replace(" ", "").str.replace("-", "")
        else:
            # Concatenate type and class, then strip spaces, dashes and the
            # "nan" text produced by astype(str) for missing cells.
            raw_labels = df[CEMENT_TYPE_COLS].loc[3:].astype(str).sum(axis=1)
            cleaned = (
                raw_labels.str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
            )
        cement_type = cleaned.unique()

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [40]:
# Re-read every sheet except "RX" with all three header rows and tag each
# frame with its plant code under a placeholder three-level column label.
plant_key = ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2")
dataframes = []

for plant, xls in xls_files.items():
    data_sheets = [name for name in xls.sheet_names if name != "RX"]
    for sheet_name in data_sheets:
        df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
        df[plant_key] = plant
        dataframes.append(df.drop(index=[0, 1]).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [41]:
# Stack all sheets vertically under a fresh 0..n-1 index.
df = pd.concat(dataframes, axis=0, ignore_index=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [42]:
# Keep the plant tag plus the column groups used below.
plant_column = df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"]
selected_sections = [
    plant_column,
    df["Dados iniciais"],
    df["Cimento"]["Análise química"],
    df["Característias físicas do cimento"],
    df["Propriedades do cimento"],
]
df = pd.concat(selected_sections, axis=1)

<IPython.core.display.Javascript object>

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name

In [43]:
# Columns removed from the 206 data (the city column is dropped here).
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Cidade", "Unnamed: 1_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Unnamed: 83_level_1", "Obs."),
    ("Obs.", "Unnamed: 8_level_2"),
    ("Obs.", "Unnamed: 90_level_2"),
    "Obs.",
]

# Original (Portuguese / positional) column labels -> short English names.
# The former ("Cidade", ...) -> "City" entry was removed: that column is in
# COLUMNS_TO_DROP and the drop runs before the rename, so it never applied.
COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Data", "Produção"): "Date",
    ("Unnamed: 78_level_1", "Blaine"): "Blaine",
    ("Unnamed: 79_level_1", "#200"): "#200",
    ("Unnamed: 80_level_1", "#325"): "#325",
    ("Unnamed: 81_level_1", "#400"): "#400",
    ("Unnamed: 82_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [44]:
# Single variable for the Cement Type: concatenate type, strength class and
# observation, with missing parts contributing an empty string.
cement_type_parts = [
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
]
cement_type_text = df[cement_type_parts].fillna("").astype(str)
df["Cement_Type"] = cement_type_text.sum(axis=1)

<IPython.core.display.Javascript object>

### Dropping and renaming the columns defined above

In [45]:
# Discard rows whose combined label is exactly "ManualManual".
is_manual_row = df["Cement_Type"] == "ManualManual"
df = df[~is_manual_row]
# Remove unused columns, apply the short names and infer better dtypes.
df = (
    df.drop(columns=COLUMNS_TO_DROP)
    .rename(columns=COLUMNS_TO_RENAME)
    .copy()
    .infer_objects()
)

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [46]:
# Columns converted to float for the 206 data.
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

# Strip '#' and decimal commas from the raw text first.
df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

df = df.reset_index(drop=True)
# Drop rows completely missing
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
# index_to_keep holds index labels, so select with .loc; .iloc is only
# equivalent while the index is exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

<IPython.core.display.Javascript object>

## Dropping missing features

In [47]:
# Keep only rows with at least one non-missing value outside the identifier
# columns.
# NOTE(review): the preceding cell already applies this same filter, so this
# cell looks redundant - confirm before removing it.
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
# index_to_keep holds index labels, hence .loc rather than .iloc.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [48]:
# Add the dataset-qualified plant id ("206_" + Plant) as the last column and
# drop the raw "Plant" column.
df = df.assign(Factory_Plant="206_" + df["Plant"]).drop(columns="Plant")

<IPython.core.display.Javascript object>

In [49]:
# Snapshot the processed 206 data; `df` is reused for the next dataset.
df_206 = df.copy()

<IPython.core.display.Javascript object>

In [50]:
# Compare 204 against the new 206 frame. The earlier call here repeated the
# 203-vs-204 check already done in the 204 section, leaving this pair untested.
check_columns_match(df_204, df_206)

Dataframes have the same columns: True
Dataframes have the same column order: True


(True, True)

<IPython.core.display.Javascript object>

In [51]:
check_columns_match(df_203, df_206)

Dataframes have the same columns: True
Dataframes have the same column order: True


(True, True)

<IPython.core.display.Javascript object>

# 207

In [52]:
# Dataset 207 has a single plant workbook.
plant = "AT"
workbook_path = f"../../../../data/raw/207/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx"

xls = pd.ExcelFile(workbook_path, engine="openpyxl")
xls_files = {plant: xls}

<IPython.core.display.Javascript object>

In [53]:
print("Plant: ", plant, xls.sheet_names)

Plant:  AT ['INSTRUÇÕES', '98', '195', '5']


<IPython.core.display.Javascript object>

In [54]:
# Exploratory pass: collect the distinct cement-type labels of every sheet.
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[1:]:
        df = pd.read_excel(xls, sheet_name, header=[1, 2])
        type_cols_blank = df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all()

        if type_cols_blank:
            # Type/class columns are empty: take the label from "Obs.".
            raw_labels = (
                df[[("Obs.", "Unnamed: 8_level_1")]].loc[2:].astype(str).sum(axis=1)
            )
            cleaned = raw_labels.str.replace(" ", "").str.replace("-", "")
        else:
            # Concatenate type and class, then strip spaces, dashes and the
            # "nan" text produced by astype(str) for missing cells.
            raw_labels = df[CEMENT_TYPE_COLS].loc[2:].astype(str).sum(axis=1)
            cleaned = (
                raw_labels.str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
            )
        cement_type = cleaned.unique()

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [55]:
# Re-read every data sheet with all three header rows and tag each frame
# with its plant code under a placeholder three-level column label.
plant_key = ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2")
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[1:]:
        df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
        df[plant_key] = plant
        dataframes.append(df.drop(index=[0, 1]).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [56]:
# Stack all sheets vertically under a fresh 0..n-1 index.
df = pd.concat(dataframes, axis=0, ignore_index=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [57]:
# Keep the plant tag plus the column groups used below.
plant_column = df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"]
selected_sections = [
    plant_column,
    df["Dados iniciais"],
    df["Cimento"]["Análise química"],
    df["Característias físicas do cimento"],
    df["Propriedades do cimento"],
]
df = pd.concat(selected_sections, axis=1)

<IPython.core.display.Javascript object>

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name

In [58]:
# Columns removed from the 207 data. Each label is listed once
# (("Obs.", "Unnamed: 148_level_2") used to appear twice).
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Unnamed: 141_level_1", "Obs."),
]

# Original (Portuguese / positional) column labels -> short English names.
COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Cidade", "Unnamed: 1_level_2"): "City",
    # TODO: fix! NOTE(review): the 203 section takes "Date" from
    # ("Data", "Medida") instead - confirm which date is intended.
    ("Data", "Produção"): "Date",
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [59]:
# Single variable for the Cement Type: concatenate type, strength class and
# observation, with missing parts contributing an empty string.
cement_type_parts = [
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
]
df["Cement_Type"] = df[cement_type_parts].fillna("").astype(str).sum(axis=1)

# Preprocessing: expand the short "Fibro" label (exact matches only).
df["Cement_Type"] = df["Cement_Type"].replace({"Fibro": "Fibrocimento"})

<IPython.core.display.Javascript object>

### Dropping and renaming the columns defined above

In [60]:
# Remove unused columns and apply the short names.
df = df.drop(columns=COLUMNS_TO_DROP).rename(columns=COLUMNS_TO_RENAME).copy()
# Discard the row labelled 0, renumber, and infer better dtypes.
# NOTE(review): the reason for dropping this row is not visible here - confirm.
df = df.drop(index=[0]).reset_index(drop=True).infer_objects()

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [61]:
# Columns converted to float for the 207 data.
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

# Strip '#' and decimal commas from the raw text first.
df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing
# NOTE(review): "City" is not excluded here, so any row with a city value is
# kept; the next cell repeats the filter with "City" excluded.
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)

# Filling with cement type due problems with given excel file
index = df[df["Cement_Type"] == ""].index
df.loc[index, "Cement_Type"] = "CPVARI"

# index_to_keep holds index labels, so select with .loc; .iloc is only
# equivalent while the index is exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [62]:
# Keep only rows with at least one non-missing value outside the identifier
# columns (Plant, City, Date, Cement_Type).
index_to_keep = (
    df.drop(["Plant", "City", "Date", "Cement_Type"], axis=1)
    .dropna(axis=0, how="all")
    .index
)
# index_to_keep holds index labels, hence .loc rather than .iloc.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [63]:
df.shape

(3002, 26)

<IPython.core.display.Javascript object>

### Dropping variable with Plant identification

In [64]:
# Add the dataset-qualified plant id ("207_" + Plant) as the last column and
# drop the raw identifier columns.
df = df.assign(Factory_Plant="207_" + df["Plant"]).drop(columns=["Plant", "City"])

<IPython.core.display.Javascript object>

In [65]:
# Snapshot the processed 207 data; `df` is reused for the next dataset.
df_207 = df.copy()

<IPython.core.display.Javascript object>

In [66]:
check_columns_match(df_203, df_207)

Dataframes have the same columns: True
Dataframes have the same column order: True


(True, True)

<IPython.core.display.Javascript object>

In [67]:
check_columns_match(df_204, df_207)

Dataframes have the same columns: True
Dataframes have the same column order: True


(True, True)

<IPython.core.display.Javascript object>

In [68]:
check_columns_match(df_206, df_207)

Dataframes have the same columns: True
Dataframes have the same column order: True


(True, True)

<IPython.core.display.Javascript object>

# 209

In [69]:
# Open one workbook per plant of the 209 raw-data folder.
plants = [
    "X",
    "H",
    "AS",
    "AG",
    "AW",
    "AZ",
    "W",
    "V",
    "AX",
    "L",
    "N",
    "AI",
    "AA",
    "P",
    "Q",
    "AP",
    "E",
    "AN",
    "AC",
    "K",
    "S",
    "U",
    "J",
    "AM",
    "Z",
]
workbook_template = "../../../../data/raw/209/EMBRAPII hubIC IACC_CIMENTO_{plant}.xlsx"
xls_files = {}

for plant in plants:
    xls = pd.ExcelFile(workbook_template.format(plant=plant), engine="openpyxl")
    xls_files[plant] = xls

<IPython.core.display.Javascript object>

In [70]:
# List the sheet names of every opened workbook ("Palnt" typo fixed).
for plant, xls in xls_files.items():
    print("Plant: ", plant, xls.sheet_names)

Palnt:  X ['INSTRUÇÕES', 'Clínquer', '78', '83', '19']
Palnt:  H ['INSTRUÇÕES', 'Clínquer', '119', '76', '87']
Palnt:  AS ['INSTRUÇÕES', 'Clínquer', '31', '63']
Palnt:  AG ['INSTRUÇÕES', 'Clínquer', '146', '115', '166']
Palnt:  AW ['INSTRUÇÕES', 'Clínquer', '88']
Palnt:  AZ ['INSTRUÇÕES', 'Clínquer', '4', '92', '70', '111', '156']
Palnt:  W ['INSTRUÇÕES', 'Clínquer', '140', '196', '163']
Palnt:  V ['INSTRUÇÕES', 'Clínquer', '93']
Palnt:  AX ['INSTRUÇÕES', 'Clínquer', '152', '108', '118']
Palnt:  L ['INSTRUÇÕES', 'Clínquer', '6', '39', '16']
Palnt:  N ['INSTRUÇÕES', 'Clínquer', '80', '66', '94']
Palnt:  AI ['INSTRUÇÕES', 'Clínquer', '79', '27', '120']
Palnt:  AA ['INSTRUÇÕES', 'Clínquer', '95', '110', '133', '145']
Palnt:  P ['INSTRUÇÕES', 'Clínquer', '175']
Palnt:  Q ['INSTRUÇÕES', 'Clínquer', '56', '32', '101', '143']
Palnt:  AP ['INSTRUÇÕES', 'Clínquer', '191', '189', '199', '41']
Palnt:  E ['INSTRUÇÕES', 'Clínquer', '121', '178']
Palnt:  AN ['INSTRUÇÕES', 'Clínquer', '112', '172']
P

<IPython.core.display.Javascript object>

In [71]:
# Exploratory pass: collect the distinct cement-type labels of every sheet
# after the first two ('INSTRUÇÕES' and 'Clínquer' in the listing above).
cement_types_per_plant = []
CEMENT_TYPE_COLS = ["Tipo de cimento", "Classe de resistência"]
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls, sheet_name, header=[1, 2])
        type_cols_blank = df[CEMENT_TYPE_COLS].iloc[2:].isna().all().all()

        if type_cols_blank:
            # Type/class columns are empty: take the label from "Obs.".
            raw_labels = (
                df[[("Obs.", "Unnamed: 8_level_1")]].loc[2:].astype(str).sum(axis=1)
            )
            cleaned = raw_labels.str.replace(" ", "").str.replace("-", "")
        else:
            # Concatenate type and class, then strip spaces, dashes and the
            # "nan" text produced by astype(str) for missing cells.
            raw_labels = df[CEMENT_TYPE_COLS].loc[2:].astype(str).sum(axis=1)
            cleaned = (
                raw_labels.str.replace(" ", "")
                .str.replace("-", "")
                .str.replace("nan", "")
            )
        cement_type = cleaned.unique()

        cement_types_per_plant.append((plant, sheet_name, cement_type))
        dataframes.append(df)

<IPython.core.display.Javascript object>

In [72]:
# Re-read every sheet after the first two with all three header rows and tag
# each frame with its plant code under a placeholder three-level column label.
plant_key = ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2")
dataframes = []

for plant, xls in xls_files.items():
    for sheet_name in xls.sheet_names[2:]:
        df = pd.read_excel(xls, sheet_name, header=[0, 1, 2])
        df[plant_key] = plant
        dataframes.append(df.drop(index=[0, 1]).reset_index(drop=True).copy())

<IPython.core.display.Javascript object>

In [73]:
# Stack all sheets vertically under a fresh 0..n-1 index.
df = pd.concat(dataframes, axis=0, ignore_index=True)

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [74]:
# Keep the plant tag plus the column groups used below. The composition
# block's "Sulfato de cálcio" column is renamed before the blocks are joined.
plant_column = df["Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"]
composition = df["Cimento"]["Composição"].rename(
    {"Sulfato de cálcio": "Sulfato de cálcio (composição)"}, axis=1
)
selected_sections = [
    plant_column,
    df["Dados iniciais"],
    composition,
    df["Cimento"]["Análise mineralógica"],
    df["Cimento"]["Análise química"],
    df["Característias físicas do cimento"],
    df["Propriedades do cimento"],
]
df = pd.concat(selected_sections, axis=1)

<IPython.core.display.Javascript object>

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name

In [75]:
# Column labels removed from the frame before renaming.
# NOTE(review): ("Obs.", "Unnamed: 148_level_2") appears twice in this list; the
# repetition looks accidental -- confirm and remove one occurrence.
COLUMNS_TO_DROP = [
    ("Data", "Medida"),
    ("Horário", "Produção"),
    ("Horário", "Medida"),
    ("Obs.", "Unnamed: 148_level_2"),
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
    "Obs.",
    ("Obs.", "Unnamed: 148_level_2"),
    ("Unnamed: 141_level_1", "Obs."),
    # ("Unnamed: 139_level_1", "#400"),
]

# Mapping from the original (mostly Portuguese) column labels to the short English
# names used in the rest of the notebook. Keys are either tuples (multi-level
# headers) or plain strings (single-level headers).
COLUMNS_TO_RENAME = {
    ("Unnamed: 0_level_0", "Unnamed: 0_level_1", "Unnamed: 0_level_2"): "Plant",
    ("Cidade", "Unnamed: 1_level_2"): "City",
    ("Data", "Produção"): "Date",  # TODO: fix this!
    ("Unnamed: 136_level_1", "Blaine"): "Blaine",
    ("Unnamed: 137_level_1", "#200"): "#200",
    ("Unnamed: 138_level_1", "#325"): "#325",
    ("Unnamed: 139_level_1", "#400"): "#400",
    ("Unnamed: 140_level_1", "Massa específica"): "Specific Gravity",
    ("Tempo de pega", "Inicio"): "Initial setting time",
    ("Tempo de pega", "Fim"): "Final setting time",
    ("Resistência à compressão", "1 d"): "CS1",
    ("Resistência à compressão", "3 d"): "CS3",
    ("Resistência à compressão", "7 d"): "CS7",
    ("Resistência à compressão", "28 d"): "CS28",
    "Perda ao fogo": "Loss on Ignition",
    "Resíduo insolúvel": "Insoluble Residue",
    "Clínquer": "Clinker",
    "Fíler calcário": "Calcareous filler",
    "Cinza volante": "Fly ash",
    "Escória": "Slag",
    "Argila calcinada": "Calcined clay",
    "Sulfato de cálcio": "Calcium sulfate",
    "C3S total": "Total C3S",
    "C2S total": "Total C2S",
    "C2S alpha": "Alpha C2S",
    "C2S beta": "Beta C2S",
    "C2S gama": "Gamma C2S",
    "C4AF": "C4AF",
    "C3A": "C3A",
    "C3A cubic": "Cubic C3A",
    "C3A orto": "Orthorhombic C3A",
    "CaO livre": "Free CaO",
    "Portlandita": "Portlandite",
    "Periclasio": "Periclase",
    "Arcanita": "Arcanite",
    "Aphthalita": "Aphthitalite",
    "Langbeinita": "Langbeinite",
    "Bassanita": "Bassanite",
    "Anidrita": "Anhydrite",
    "Calcita": "Calcite",
    # "Dolomita": "Dolomite",
    # "Dolomite": "Dolomite",  # TODO - change on dataprep of local models!
    # The key below keeps the source spelling "Dolimita" on purpose: it must match
    # the header text as it appears in the data.
    "Dolimita": "Dolomite",  # TODO - change on dataprep of local models!
    "Quartzo": "Quartz",
    "Muscovita": "Muscovite",
    "Sulfato de cálcio (composição)": "Calcium sulfate (composition)",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

In [76]:
# Single variable for the Cement Type
# Single variable for the Cement Type: concatenate the cement type, strength class
# and remarks columns row by row (missing parts contribute an empty string).
_type_parts = [
    ("Tipo de cimento", "Unnamed: 6_level_2"),
    ("Classe de resistência", "Unnamed: 7_level_2"),
    ("Obs.", "Unnamed: 8_level_2"),
]
df["Cement_Type"] = df[_type_parts].fillna("").astype(str).sum(axis=1)

# Whole-value replacement: only entries that are exactly "Fibro" are expanded.
df["Cement_Type"] = df["Cement_Type"].replace("Fibro", "Fibrocimento")

# Ordered substring substitutions. Each rule is applied to the result of the
# previous ones, so the sequence below must not be reordered.
_type_fixes = [
    ("Expedido", ""),
    ("Linha 9", ""),
    ("Itajaí", ""),
    ("D-G", ""),
    ("D-E", ""),
    ("DF2", ""),
    ("CPIIF40", "CP II-F-40"),
    # NOTE(review): no effect -- the rule above has already consumed every "CPIIF40".
    ("CPIIF40", "CP II-F40"),
    ("CP VARI", "CP V-ARI"),
    ("CPVARI", "CP V-ARI"),
    ("CP V-RSARI", "CP V-ARI RS"),
    ("CP III-RS40", "CP III-40 RS"),
    ("CP III-RS32", "CP III-32 RS"),
    ("CP IV-RS32", "CP IV-32 RS"),
    ("CP III40", "CP III-40"),
    ("CP III32", "CP III-32"),
    ("CP I-S40", "CP I-S-40"),
    ("CP I40", "CP I-40"),
    ("CP II-E32", "CP II-E-32"),
    ("CP II-E40", "CP II-E-40"),
    ("CP II-F FIBRO40", "CP II-F-40"),
    ("CP II-F32", "CP II-F-32"),
    ("CP II-F40", "CP II-F-40"),
    ("CP II-Z32", "CP II-Z-32"),
    ("CP II-Z40", "CP II-Z-40"),
    ("CP IV32", "CP IV-32"),
    ("CP IND", "CP I"),
    ("CPINDCC", "CP I"),
]
_cement_type = df["Cement_Type"]
for _old, _new in _type_fixes:
    _cement_type = _cement_type.str.replace(_old, _new)
df["Cement_Type"] = _cement_type.str.strip()

# Now that Cement_Type exists, drop the source columns and apply the short names.
df = df.drop(columns=COLUMNS_TO_DROP).rename(columns=COLUMNS_TO_RENAME).copy()
# Discard the first remaining row and renumber, then let pandas re-infer dtypes.
df = df.drop(index=[0]).reset_index(drop=True)
df = df.infer_objects()
df = df.infer_objects()

<IPython.core.display.Javascript object>

In [77]:
df.shape

(31109, 54)

<IPython.core.display.Javascript object>

### Dropping and renaming the columns defined above

### Cleaning and converting numeric columns to float

In [78]:
# Columns that must end up as floats.
NUMERIC_COLUMNS = [
    "Calcium sulfate (composition)",
    "Clinker",
    "Calcareous filler",
    "Fly ash",
    "Slag",
    "Calcined clay",
    "Calcium sulfate",
    "Total C3S",
    "Total C2S",
    "Alpha C2S",
    "Beta C2S",
    "Gamma C2S",
    "C4AF",
    "C3A",
    "Cubic C3A",
    "Orthorhombic C3A",
    "Free CaO",
    "Portlandite",
    "Periclase",
    "Arcanite",
    "Aphthitalite",
    "Langbeinite",
    "Bassanite",
    "Anhydrite",
    "Calcite",
    "Dolomite",
    "Quartz",
    "Muscovite",
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Loss on Ignition",
    "Insoluble Residue",
    "Blaine",
    "#200",
    "#325",
    "#400",
    "Specific Gravity",
    "Final setting time",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
]

# Strip '#' and turn decimal commas into dots (works on string representations).
df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns: anything unparsable becomes NaN
# (convert_to_float prints one message per failed value).
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing (ignoring the identifier columns).
index_to_keep = (
    df.drop(["Plant", "Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
)
# `index_to_keep` holds index *labels*, so select with the label-based `.loc`.
# The positional `.iloc` only gave the same rows while the index was exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: '-'
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string 

could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string 

could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string to float: ' '
could not convert string 

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [79]:
# Keep only rows that have at least one value outside the identifier columns.
index_to_keep = (
    df.drop(["Plant", "City", "Date", "Cement_Type"], axis=1)
    .dropna(axis=0, how="all")
    .index
)
# `index_to_keep` holds index *labels*, so select with the label-based `.loc`.
# The positional `.iloc` only gave the same rows while the index was exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [80]:
df.shape

(31035, 54)

<IPython.core.display.Javascript object>

### Replacing the Plant and City columns with a single Factory_Plant identifier

In [81]:
# Build the plant identifier (prefix "209_" + plant code) as a new last column,
# then discard the raw Plant and City columns.
df = df.assign(Factory_Plant="209_" + df["Plant"]).drop(columns=["Plant", "City"])

<IPython.core.display.Javascript object>

In [82]:
df_209 = df.copy()

<IPython.core.display.Javascript object>

### Fixing Column ordering:

In [83]:
# Columns of df_209 that are moved to the end of the frame in the cells below, so
# that the remaining leading columns can be compared against df_203.
EXTRA_COLS_209 = [
    "Clinker",
    "Calcareous filler",
    "Fly ash",
    "Slag",
    "Calcined clay",
    "Calcium sulfate (composition)",
    "Total C3S",
    "Total C2S",
    "Alpha C2S",
    "Beta C2S",
    "Gamma C2S",
    "C4AF",
    "C3A",
    "Cubic C3A",
    "Orthorhombic C3A",
    "Free CaO",
    "Portlandite",
    "Periclase",
    "Arcanite",
    "Aphthitalite",
    "Langbeinite",
    "Calcium sulfate",
    "Bassanite",
    "Anhydrite",
    "Calcite",
    "Dolomite",
    "Quartz",
    "Muscovite",
]

<IPython.core.display.Javascript object>

In [84]:
# Every df_209 column that is not listed in EXTRA_COLS_209, in its current order.
USUAL_COLS_209 = [col for col in df_209.columns if col not in EXTRA_COLS_209]

<IPython.core.display.Javascript object>

In [85]:
# Reorder the columns: the usual ones first, the 209-specific extras last.
df_209 = df_209[USUAL_COLS_209 + EXTRA_COLS_209]

<IPython.core.display.Javascript object>

In [86]:
check_columns_match(df_203, df_209[USUAL_COLS_209])

Dataframes have the same columns: True
Dataframes have the same column order: True


(True, True)

<IPython.core.display.Javascript object>

# partner_ii

In [87]:
df = pd.read_csv("../../../../data/raw/partner_ii/db3_cement.csv")

<IPython.core.display.Javascript object>

## Initial Preprocessing

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name

In [88]:
# Columns removed from the partner_ii frame before renaming.
COLUMNS_TO_DROP = [
    "sample_id_plant",
    "raw_material_code",
    "water_demand",
]

# First renaming pass: raw lowercase column names -> the names used across datasets.
COLUMNS_TO_RENAME = {
    "sample_date": "Date",
    "unified_material_name": "Cement_Type",
    "cao": "CaO",
    "sio2": "SiO2",
    "al2o3": "Al2O3",
    "fe2o3": "Fe2O3",
    "mgo": "MgO",
    "so3": "SO3",
    "k2o": "K2O",
    "na2o": "Na2O",
    # "cl": "Cl-",
    "cl": "Cl",
    "tio2": "TiO2",
    "alite": "Alite",
    "belite": "Belite",
    "aluminate": "Aluminate",
    "ferrite": "Ferrite",
    "free_lime": "Free CaO",
    "loi": "Loss on Ignition",
    # NOTE(review): "water_demand" is also listed in COLUMNS_TO_DROP; when the drop
    # is applied before the rename this entry has nothing left to rename.
    "water_demand": "Water Demand",
    "setting_initial": "Initial setting time",
    "strength_1d": "CS1",
    "strength_3d": "CS3",
    "strength_7d": "CS7",
    "strength_28d": "CS28",
    "blaine": "Blaine",
    "sieve_32um": "#400",
    "sieve_45um": "#325",
}

<IPython.core.display.Javascript object>

In [89]:
# Second renaming pass: clinker phase names produced by COLUMNS_TO_RENAME ->
# the C3S / C2S / C3A / C4AF column names used for the other datasets.
COLUMNS_TO_RENAME_2 = {
    "Alite": "Total C3S",
    "Belite": "Total C2S",
    "Aluminate": "C3A",
    "Ferrite": "C4AF",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

### Dropping and renaming the columns defined above

In [90]:
# Drop the unused columns, apply both renaming passes, discard the first row,
# renumber the rows and let pandas re-infer the column dtypes.
# NOTE(review): row 0 is dropped unconditionally. This frame comes from a CSV, so
# confirm that its first row is not a real sample.
df = (
    df.drop(columns=COLUMNS_TO_DROP)
    .rename(columns=COLUMNS_TO_RENAME)
    .rename(columns=COLUMNS_TO_RENAME_2)
    .drop(index=[0])
    .reset_index(drop=True)
    .infer_objects()
)

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [91]:
# Columns that must end up as floats.
NUMERIC_COLUMNS = [
    "CaO",
    "SiO2",
    "Al2O3",
    "Fe2O3",
    "MgO",
    "SO3",
    "K2O",
    "Na2O",
    # "Cl-",
    "Cl",
    "TiO2",
    # "Alite",
    # "Belite",
    # "Aluminate",
    # "Ferrite",
    "Total C3S",
    "Total C2S",
    "C3A",
    "C4AF",
    "Free CaO",
    "Loss on Ignition",
    "Initial setting time",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
    "Blaine",
    "#400",
    "#325",
]

# Strip '#' and turn decimal commas into dots (works on string representations).
df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns: anything unparsable becomes NaN.
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime
df["Date"] = pd.to_datetime(df["Date"])

# Drop rows completely missing (ignoring the identifier columns).
index_to_keep = df.drop(["Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
# `index_to_keep` holds index *labels*, so select with the label-based `.loc`.
# The positional `.iloc` only gave the same rows while the index was exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [92]:
# Keep only rows that have at least one value outside the identifier columns.
index_to_keep = df.drop(["Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
# `index_to_keep` holds index *labels*, so select with the label-based `.loc`.
# The positional `.iloc` only gave the same rows while the index was exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [93]:
df.shape

(36818, 26)

<IPython.core.display.Javascript object>

### Adding the Factory_Plant identification variable

In [94]:
df["Factory_Plant"] = "partner_ii"

<IPython.core.display.Javascript object>

#### Removing the timezone

In [95]:
df["Date"].dtype

datetime64[ns, UTC]

<IPython.core.display.Javascript object>

In [96]:
df["Date"] = df["Date"].dt.tz_localize(None)

<IPython.core.display.Javascript object>

In [97]:
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [98]:
df_heildelberg = df.copy()

<IPython.core.display.Javascript object>

# partner_iv (skipped)

In [99]:
import datetime as dt

<IPython.core.display.Javascript object>

In [100]:
xlsx = pd.ExcelFile(
    "../../../../data/raw/partner_iv/DB_Master_CP1_latest.xlsx", engine="openpyxl"
)

<IPython.core.display.Javascript object>

In [101]:
def _read_control_sheet(workbook, sheet_name):
    """Read one expedition-control sheet and keep only its named columns.

    Rows 1 and 2 of the sheet (0-based, i.e. the two rows right after the header)
    are skipped on read. Columns whose header is missing or contains "Unnamed"
    are discarded.

    Parameters
    ----------
    workbook : anything accepted by ``pd.read_excel`` (here an open ``pd.ExcelFile``)
    sheet_name : str
        Name of the sheet to read.

    Returns
    -------
    pd.DataFrame
        An independent copy, so that later column assignments do not write into
        a slice of the raw sheet.
    """
    sheet = pd.read_excel(workbook, sheet_name=sheet_name, skiprows=[1, 2])
    named_columns = [
        col for col in sheet.columns.fillna("Unnamed") if "Unnamed" not in col
    ]
    return sheet[named_columns].copy()


df_cem_a = _read_control_sheet(xlsx, "Cem A exp control")
df_cem_b = _read_control_sheet(xlsx, "Cem B exp control")
df_cem_c = _read_control_sheet(xlsx, "Cem c exp control")

# The "data" column of sheet C holds day counts rather than dates -- assumed to be
# Excel serial dates, as the original day-count arithmetic implies (TODO confirm).
# Excel's 1900 date system counts from 1899-12-30 for every date after
# 1900-02-28 (serial 1 is 1900-01-01 and Excel includes a non-existent
# 1900-02-29), so adding the day count to 1900-01-01 lands two days too late.
df_cem_c["data"] = pd.to_datetime(df_cem_c["data"], unit="D", origin="1899-12-30")

<IPython.core.display.Javascript object>

In [102]:
df = pd.concat([df_cem_a, df_cem_b, df_cem_c], ignore_index=True, axis=0)

<IPython.core.display.Javascript object>

In [103]:
df.columns

Index(['CEM A expedition control', 'sigla', 'data', 'LOI', 'SO3', 'Cl-',
       'Blaine', 'ph2oimm', 'Initial Setting Time', 'Soundness', 'flow',
       'rihcl', '2 days strenght', '7 days strenght', '28 days strenght',
       'residuo 24 micron', 'tratt. 32 µm', 'tratt. 40 µm', 'tratt. 63 µm',
       'SampleName', 'R_wp', 'Alite_M3 C3S M3', 'Alite_M1 C3S M1',
       'Alite_Sum C3S tot', 'Ratio_M1 (rapporto M1/M3)',
       'C3S_CS (taglia dei cristalliti C3S)', 'Belite_beta', 'C3A cub',
       'C3A_ortho', 'C3A tot', 'C4AF', 'CaO', 'Ca(OH)2', 'Calce libera',
       'Periclasio (MgO)', 'Quartz', 'K2SO4', 'Langbeinite – MgK2(SO4)2',
       'Aphthitalite – (K,Na)3(SO4)2', 'Gesso', 'Emiidrato', 'Anidrite',
       'Calcite – CaCO3', 'SO3_XRD', 'CO2_XRD', 'CEM B  expedition control',
       'silo', 'CEM C  expedition control'],
      dtype='object')

<IPython.core.display.Javascript object>

# partner_i

In [104]:
xls = pd.ExcelFile(
    "../../../../data/raw/partner_i-Oficial/DB_Master_CP1_latest v03.06.xlsx",
    engine="openpyxl",
)
df = pd.read_excel(xls, "DB3 Cimento Shipping", header=[1, 2])

<IPython.core.display.Javascript object>

## Initial Preprocessing

In [105]:
# Keep only the four top-level column groups of interest, side by side in this order.
df = pd.concat([df["Date"], df["Time"], df["Composition"], df["Properties"]], axis=1)
# Discard the first three rows (presumably units/description rows -- TODO confirm)
# and renumber.
df = df.drop([0, 1, 2], axis=0).reset_index(drop=True)

# "Production" and "Measurement" each label two columns at this point (presumably
# one from the Date group and one from the Time group -- TODO confirm). Pull them
# out by position, give the second of each a ".1" suffix so the names become
# unique, and append every other column unchanged.
df = pd.concat(
    [
        df["Production"].iloc[:, 0],
        df["Measurement"].iloc[:, 0],
        df["Production"].iloc[:, 1].rename("Production.1"),
        df["Measurement"].iloc[:, 1].rename("Measurement.1"),
        df.drop(["Production", "Measurement"], axis=1),
    ],
    axis=1,
)

# Let pandas re-infer column dtypes after the reshuffle.
df = df.infer_objects()

<IPython.core.display.Javascript object>

## Initial Preprocessing

### 1. Dropping columns that either contain no data or do not contribute to the analysis
### 2. Renaming columns to a short identifiable name

In [106]:
# Columns removed from the partner_i frame: of the four date/time columns only
# "Production" is kept (it is renamed to "Date" below).
COLUMNS_TO_DROP = [
    "Measurement",
    "Production.1",
    "Measurement.1",
]

# First renaming pass: source headers -> intermediate names. Several of these
# intermediate names are renamed again by COLUMNS_TO_RENAME_2.
COLUMNS_TO_RENAME = {
    "Production": "Date",
    "Remarks": "Cement_Type",
    "LOI": "Loss on Ignition",
    "IR": "Insoluble Residue",
    "Total alkali as Na2O": "Total Alkali as Na2O",
    "Alite total": "Alite Total",
    "Belite total": "Total C2S",
    "Belite alpha": "Belite Alpha",
    "Belite beta": "Belite Beta",
    "Belite gamma": "Belite Gamma",
    "Ferrite": "C4AF",
    "Aluminate cubic": "Aluminate Cubic",
    "Aluminate orto": "Aluminate Orto",
    "Free lime": "Free CaO",
    # The keys "Dolimite" and "Aphthalite" keep the source spelling on purpose:
    # they must match the header text as it appears in the data.
    "Dolimite": "Dolomite",
    "Aphthalite": "Aphthitalite",
    "1 day Compressive strength": "CS1",
    "3 day Compressive strength": "CS3",
    "7 day Compressive strength": "CS7",
    "28 day Compressive strength": "CS28",
}

<IPython.core.display.Javascript object>

In [107]:
# Second renaming pass: intermediate names from COLUMNS_TO_RENAME -> the
# C3S / C2S / C3A column names used for the other datasets.
# NOTE(review): "Total C2S" and "C4AF" map to themselves, and "Free lime" has
# already been renamed to "Free CaO" by COLUMNS_TO_RENAME, so when this mapping is
# applied after it those three entries change nothing.
COLUMNS_TO_RENAME_2 = {
    "Alite Total": "Total C3S",
    "Total C2S": "Total C2S",
    "Belite Alpha": "Alpha C2S",
    "Belite Beta": "Beta C2S",
    "Belite Gamma": "Gamma C2S",
    "C4AF": "C4AF",
    "Aluminate": "C3A",
    "Aluminate Cubic": "Cubic C3A",
    "Aluminate Orto": "Orthorhombic C3A",
    "Free lime": "Free CaO",
}

<IPython.core.display.Javascript object>

### Defining a single variable to identify Cement Type

### Dropping and renaming the columns defined above

In [108]:
# Drop the unused date/time columns, apply both renaming passes, discard the
# first remaining row, renumber the rows and let pandas re-infer the dtypes.
# NOTE(review): row 0 is dropped unconditionally, in addition to the rows already
# removed right after loading -- confirm it is not a real sample.
df = (
    df.drop(columns=COLUMNS_TO_DROP)
    .rename(columns=COLUMNS_TO_RENAME)
    .rename(columns=COLUMNS_TO_RENAME_2)
    .drop(index=[0])
    .reset_index(drop=True)
    .infer_objects()
)

<IPython.core.display.Javascript object>

### Cleaning and converting numeric columns to float

In [109]:
# Columns that must end up as floats.
NUMERIC_COLUMNS = [
    "CaO",
    "MgO",
    "Na2O",
    "Al2O3",
    "SiO2",
    "SO3",
    "K2O",
    "TiO2",
    "Fe2O3",
    "Ca(OH)2",
    "CaCO3",
    "Total Alkali as Na2O",
    "Soundness",
    "Loss on Ignition",
    "Insoluble Residue",
    # "Alite Total",
    # "Total C2S",
    # "Belite Alpha",
    # "Belite Beta",
    # "Belite Gamma",
    # "C4AF",
    # "Aluminate",
    # "Aluminate Cubic",
    # "Aluminate Orto",
    "Total C3S",
    "Total C2S",
    "Alpha C2S",
    "Beta C2S",
    "Gamma C2S",
    "C4AF",
    "C3A",
    "Cubic C3A",
    "Orthorhombic C3A",
    "Free CaO",
    "Portlandite",
    "Periclase",
    "Arcanite",
    "Aphthitalite",
    "Langbeinite",
    "Gypsum",
    "Bassanite",
    "Anhydrite",
    "Calcite",
    "Dolomite",
    "Quartz",
    "Muscovite",
    "%Gypsum",
    "%Limestone",
    "%Clinker",
    "Dehydration",
    "Blaine",
    "Initial setting time",
    "Final setting time",
    "Density",
    "CS1",
    "CS3",
    "CS7",
    "CS28",
    "#200",
    "#325",
    "#400",
]

# Strip '#' and turn decimal commas into dots (works on string representations).
df = preprocess_numeric_cols(df, NUMERIC_COLUMNS)

# Preprocessing to fix numeric columns: anything unparsable becomes NaN.
df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].map(convert_to_float)

# Converting Date to pandas datetime (source dates are day/month/year).
df["Date"] = pd.to_datetime(df["Date"], format="%d/%m/%Y")

# Drop rows completely missing (ignoring the identifier columns).
index_to_keep = df.drop(["Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
# `index_to_keep` holds index *labels*, so select with the label-based `.loc`.
# The positional `.iloc` only gave the same rows while the index was exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

df = df.infer_objects()

<IPython.core.display.Javascript object>

## Dropping missing features

### Drop rows completely missing

In [110]:
# Keep only rows that have at least one value outside the identifier columns.
index_to_keep = df.drop(["Date", "Cement_Type"], axis=1).dropna(axis=0, how="all").index
# `index_to_keep` holds index *labels*, so select with the label-based `.loc`.
# The positional `.iloc` only gave the same rows while the index was exactly 0..n-1.
df = df.loc[index_to_keep].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [111]:
df.shape

(1265, 54)

<IPython.core.display.Javascript object>

### Adding the Factory_Plant identification variable

In [112]:
df["Factory_Plant"] = "partner_i"

<IPython.core.display.Javascript object>

In [113]:
df_partner_i = df.copy()

<IPython.core.display.Javascript object>

In [114]:
df_partner_i.columns

Index(['Date', 'CaO', 'MgO', 'Na2O', 'Al2O3', 'SiO2', 'SO3', 'K2O', 'TiO2',
       'Fe2O3', 'Ca(OH)2', 'CaCO3', 'Total Alkali as Na2O', 'Soundness',
       'Loss on Ignition', 'Insoluble Residue', 'Total C3S', 'Total C2S',
       'Alpha C2S', 'Beta C2S', 'Gamma C2S', 'C4AF', 'C3A', 'Cubic C3A',
       'Orthorhombic C3A', 'Free CaO', 'Portlandite', 'Periclase', 'Arcanite',
       'Aphthitalite', 'Langbeinite', 'Gypsum', 'Bassanite', 'Anhydrite',
       'Calcite', 'Dolomite', 'Quartz', 'Muscovite', '%Gypsum', '%Limestone',
       '%Clinker', 'Dehydration', 'Cement_Type', 'Blaine',
       'Initial setting time', 'Final setting time', 'Density', 'CS1', 'CS3',
       'CS7', 'CS28', '#200', '#325', '#400', 'Factory_Plant'],
      dtype='object')

<IPython.core.display.Javascript object>

# Global Dataset

In [115]:
df = pd.concat([df_203, df_204, df_206, df_207, df_209, df_heildelberg, df_partner_i])

<IPython.core.display.Javascript object>

In [116]:
df.shape

(108823, 64)

<IPython.core.display.Javascript object>

## Preprocessing

### Missing Values (%)

In [117]:
(df.isna().sum() / df.shape[0] * 100).sort_values(ascending=False).to_frame(
    name="Missing (%)"
).style.background_gradient(cmap="Reds")


Unnamed: 0,Missing (%)
Slag,100.0
Fly ash,100.0
CaCO3,100.0
Ca(OH)2,100.0
Calcareous filler,100.0
Calcined clay,100.0
Clinker,100.0
Cl,99.993568
Calcium sulfate,99.441295
Dehydration,99.243726


<IPython.core.display.Javascript object>

### Zeroed Values (%)

In [118]:
# Share of rows (in %) whose value is exactly zero, for every numeric column of
# the combined dataset.
# The table title no longer interpolates `plant`: that name was a leftover loop
# variable from an earlier cell and mislabelled this global table with the last
# plant that loop happened to visit.
zero_values = {
    col: (df[col].eq(0).sum() / df.shape[0]) * 100
    for col in df.select_dtypes(include="number").columns
}

zero_percentages = (
    pd.Series(zero_values, name="Zero (%)")
    .sort_values(ascending=False)
    .to_frame(name="Zero (%)")
)
zero_percentages.style.background_gradient(cmap="Reds")

Unnamed: 0,Zero (%) for Z
#200,9.382208
Dolomite,4.372237
Arcanite,2.513255
Muscovite,2.103416
Gamma C2S,1.523575
Anhydrite,1.262601
Langbeinite,1.171627
Soundness,1.057681
Na2O,1.046654
Free CaO,0.96487


<IPython.core.display.Javascript object>

### Drop rows where the target variable is null

In [119]:
df = df[df["CS28"].notna()]

<IPython.core.display.Javascript object>

In [120]:
df.shape

(70970, 64)

<IPython.core.display.Javascript object>

### Dropping rows where the target variable is zero

In [121]:
df = df[~df["CS28"].eq(0)]

<IPython.core.display.Javascript object>

In [122]:
df.shape

(70950, 64)

<IPython.core.display.Javascript object>

### Plant and cement type info

In [123]:
df["Factory_Plant"].unique()

array(['203_AV', '203_C', '203_D', '203_G', '203_M', '203_Y', '204_A',
       '204_AB', '204_AE', '204_AQ', '204_AY', '204_F', '204_I', '204_O',
       '204_R', '204_T', '206_B', '207_AT', '209_X', '209_H', '209_AS',
       '209_AG', '209_AW', '209_AZ', '209_W', '209_V', '209_AX', '209_L',
       '209_N', '209_AI', '209_AA', '209_P', '209_Q', '209_AP', '209_E',
       '209_AN', '209_AC', '209_K', '209_S', '209_U', '209_J', '209_AM',
       '209_Z', 'partner_ii', 'partner_i'], dtype=object)

<IPython.core.display.Javascript object>

In [124]:
df["Cement_Type"].unique()

array(['CP II-E40', 'CP III32', 'CP VARI', 'Fibrocimento', 'CP II-E32',
       'CP II-F40', 'CP II-F32', 'CP II-Z32', 'CP IV32', 'CP III40',
       'CPV ARI RS', 'CPIV 32', 'CPII F32', 'CPV ARI', 'CPII F40',
       'CPIV 32 RS', 'CPIII 40 RS', 'CP II E 32', 'CPII F 32', 'CPIV32RS',
       'CPVARI', 'CPIII40', 'CPIIF32', 'CP II-F-32', 'CP II-F-40',
       'CP IV-32', 'CP I-40', 'CP II-Z-32', 'CP II-Z-40', 'CP IV-32 RS',
       'CP V-ARI RS', 'CP II-E-40', 'CP III-32 RS', 'CP I', 'CP V-ARI',
       'CP III-40 RS', 'CP II-E-32', 'CP III-40', 'CP I-S-40',
       'CP III-32', 'Type III', 'Type I-II', 'Type IL', 'CPVARI Expedido',
       'CPIIF40 Expedido', 'CPIIF32 Expedido'], dtype=object)

<IPython.core.display.Javascript object>

## Standardize Cement Type naming

In [125]:
# Preprocessing
# Standardise the cement type labels coming from the different datasets.
# The substitutions are plain substring replacements applied in the order listed;
# each rule sees the output of the previous ones, so the order must be preserved.
_type_fixes = [
    ("Expedido", ""),
    ("Linha 9", ""),
    ("Itajaí", ""),
    ("D-G", ""),
    ("D-E", ""),
    ("DF2", ""),
    ("CPIIF40", "CP II-F-40"),
    ("CPIIF40", "CP II-F-40"),  # repeated rule: no effect the second time
    ("CP VARI", "CP V-ARI"),
    ("CPV ARI", "CP V-ARI"),
    ("CPVARI", "CP V-ARI"),
    ("CP V-RSARI", "CP V-ARI RS"),
    ("CPV ARI RS", "CP V-ARI RS"),
    ("CP III-RS40", "CP III-40 RS"),
    ("CPIII 40 RS", "CP III-40 RS"),
    ("CP III-RS32", "CP III-32 RS"),
    ("CPIV32RS", "CP IV-32 RS"),
    ("CP IV-RS32", "CP IV-32 RS"),
    ("CP III40", "CP III-40"),
    ("CPIII40", "CP III-40"),
    ("CP III32", "CP III-32"),
    ("CP I-S40", "CP I-S-40"),
    ("CP I40", "CP I-40"),
    ("CP II-E32", "CP II-E-32"),
    ("CP II E 32", "CP II-E-32"),
    ("CP II E 32", "CP II-E-32"),  # repeated rule: no effect the second time
    ("CP II-E40", "CP II-E-40"),
    ("CP II-F FIBRO40", "CP II-F-40"),
    ("CP II-F32", "CP II-F-32"),
    ("CPII F 32", "CP II-F-32"),
    ("CPII F32", "CP II-F-32"),
    ("CPIIF32", "CP II-F-32"),
    ("CP II-F40", "CP II-F-40"),
    ("CPII F40", "CP II-F-40"),
    ("CP II-Z32", "CP II-Z-32"),
    ("CP II-Z40", "CP II-Z-40"),
    ("CP IV32", "CP IV-32"),
    ("CPIV 32", "CP IV-32"),
    ("CP IND", "CP I"),
    ("CPINDCC", "CP I"),
]
_cement_type = df["Cement_Type"]
for _old, _new in _type_fixes:
    _cement_type = _cement_type.str.replace(_old, _new)
# Trim the whitespace left behind by the removed fragments.
df["Cement_Type"] = _cement_type.str.strip()

<IPython.core.display.Javascript object>

In [126]:
for cement_type in df["Cement_Type"].sort_values().unique():
    print(cement_type)

CP I
CP I-40
CP I-S-40
CP II-E-32
CP II-E-40
CP II-F-32
CP II-F-40
CP II-Z-32
CP II-Z-40
CP III-32
CP III-32 RS
CP III-40
CP III-40 RS
CP IV-32
CP IV-32 RS
CP V-ARI
CP V-ARI RS
Fibrocimento
Type I-II
Type III
Type IL


<IPython.core.display.Javascript object>

In [127]:
df["Cement_Type"].nunique()

21

<IPython.core.display.Javascript object>

In [128]:
df["Cement_Type"].value_counts()

Cement_Type
CP V-ARI        14526
CP II-F-40      12191
CP II-F-32       9931
CP IV-32         6322
CP II-E-40       4588
CP II-E-32       4293
CP II-Z-32       4021
CP IV-32 RS      2646
CP III-40        1994
CP III-40 RS     1811
CP V-ARI RS      1626
CP I             1261
CP III-32        1173
CP II-Z-40       1033
Fibrocimento     1030
Type I-II         735
CP III-32 RS      581
CP I-40           508
CP I-S-40         358
Type III          244
Type IL            78
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [129]:
# Renumber the rows 0..n-1 *before* filtering. The hard-coded row labels in
# TO_REMOVE further down appear to refer to this numbering (TODO confirm), so the
# order of these two statements matters.
df = df.reset_index(drop=True)
# Exclude the two partner datasets; the index is intentionally not renumbered here.
df = df[~df["Factory_Plant"].isin(["partner_ii", "partner_i"])]

<IPython.core.display.Javascript object>

## Removing Some Outliers

In [130]:
# Row labels whose values are noisy enough to make the model predict badly.
# They are index *labels* of `df` as numbered by the preceding reset_index.
# NOTE(review): 44474 is listed twice, so the list has 85 entries but only 84
# distinct rows.
# df = df.reset_index(drop=True)
TO_REMOVE = [
    13303,
    56303,
    66828,
    65411,
    66829,
    29628,
    64109,
    56301,
    29678,
    64110,
    61588,
    31968,
    29679,
    64121,
    65412,
    63183,
    29676,
    64125,
    65413,
    31812,
    25139,
    4740,
    34498,
    64071,
    31700,
    63436,
    61936,
    62777,
    34500,
    34766,
    64124,
    31110,
    65414,
    61634,
    48489,
    11510,
    13773,
    64119,
    25046,
    56304,
    31318,
    3312,
    11511,
    41037,
    65445,
    64123,
    49078,
    62353,
    62523,
    61960,
    62857,
    49218,
    41036,
    8058,
    65457,
    64120,
    4266,
    32553,
    63144,
    27188,
    62155,
    61690,
    48979,
    41495,
    40852,
    44474,
    44474,
    65451,
    63762,
    32744,
    32750,
    32751,
    32752,
    34504,
    41155,
    41158,
    43267,
    53188,
    54496,
    60989,
    61785,
    62085,
    65532,
    66323,
    68643,
]

# Plant of every row about to be removed (one entry per TO_REMOVE item).
# The lookup is by label (`.loc`) because `drop` below also interprets TO_REMOVE
# as labels. The previous positional lookup (`.iloc`) only named the same rows
# while labels and positions happened to coincide.
plants = df.loc[TO_REMOVE, "Factory_Plant"].tolist()

df = df.drop(TO_REMOVE)

<IPython.core.display.Javascript object>

In [131]:
cols = [
    "Na2O",
]
for col in cols:
    idxmax = df[col].idxmax()
    print(str(idxmax) + ",")

63435,


<IPython.core.display.Javascript object>

In [132]:
pd.set_option("display.max_rows", 500)
df[cols].describe(percentiles=[0.1, 0.99]).T

Unnamed: 0,count,mean,std,min,10%,50%,99%,max
Na2O,61021.0,0.144746,0.113429,-0.28,0.04,0.13,0.44,4.68505


<IPython.core.display.Javascript object>

In [133]:
len(TO_REMOVE)

85

<IPython.core.display.Javascript object>

In [134]:
df[(df["Na2O"] > 1) & (df["Na2O"] <= 4)][cols]

Unnamed: 0,Na2O
2090,1.52
2946,1.66
5384,2.81
5402,2.81
12367,1.17
12368,1.16
12369,1.25
12370,1.21
12371,1.35
12372,1.39


<IPython.core.display.Javascript object>

In [135]:
print(set(plants))

{'204_I', '209_AG', '209_Q', '204_F', '203_C', '204_O', '203_M', '209_AS', '209_U', '209_AZ', '209_AM', '209_J', '209_W', '204_AQ', '206_B', '209_AI', '209_E', '209_AC', '209_Z', '204_R', '203_D', '209_S'}


<IPython.core.display.Javascript object>

In [136]:
print(set(plants))

{'204_I', '209_AG', '209_Q', '204_F', '203_C', '204_O', '203_M', '209_AS', '209_U', '209_AZ', '209_AM', '209_J', '209_W', '204_AQ', '206_B', '209_AI', '209_E', '209_AC', '209_Z', '204_R', '203_D', '209_S'}


<IPython.core.display.Javascript object>

In [137]:
df.shape

(68576, 64)

<IPython.core.display.Javascript object>

### Dropping duplicated rows

In [138]:
# df.iloc[4266]

<IPython.core.display.Javascript object>

In [139]:
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [140]:
df.shape

(68576, 64)

<IPython.core.display.Javascript object>

In [141]:
# Number of fully duplicated rows (ignoring Date) within each plant, largest first.
(
    df.drop(columns=["Date"])
    .groupby("Factory_Plant")
    .apply(lambda group: group.duplicated().sum())
    .sort_values(ascending=False)
)

Factory_Plant
207_AT    838
203_D      86
203_M      17
209_AM      8
204_AQ      8
209_AN      6
204_T       5
209_AG      2
206_B       2
203_Y       1
209_J       1
203_AV      0
209_E       0
209_H       0
209_K       0
209_N       0
209_L       0
209_AX      0
209_P       0
209_Q       0
209_S       0
209_U       0
209_V       0
209_W       0
209_X       0
209_AZ      0
209_AI      0
209_AW      0
209_AS      0
209_AP      0
203_C       0
209_AC      0
209_AA      0
204_R       0
204_O       0
204_I       0
204_F       0
204_AY      0
204_AE      0
204_AB      0
204_A       0
203_G       0
209_Z       0
dtype: int64

<IPython.core.display.Javascript object>

In [142]:
# Factory code = the part of Factory_Plant before the first underscore
# (e.g. "209_X" -> "209").
df["Factory"] = df["Factory_Plant"].map(lambda plant_id: plant_id.partition("_")[0])

# Number of duplicated rows within each factory, ignoring date, cement type and
# plant, largest first.
(
    df.drop(columns=["Date", "Cement_Type", "Factory_Plant"])
    .groupby("Factory")
    .apply(lambda group: group.duplicated().sum())
    .sort_values(ascending=False)
)

Factory
207    838
203    578
209    504
204     13
206      2
dtype: int64

<IPython.core.display.Javascript object>

In [143]:
df = df.drop(["Factory"], axis=1)

<IPython.core.display.Javascript object>

In [144]:
# Keep the first occurrence of each distinct combination of numeric values.
# Duplicates are detected on the numeric columns only and across the whole frame,
# so non-numeric columns (such as Cement_Type and Factory_Plant) are not compared.
df = df.loc[df.select_dtypes("number").drop_duplicates().index].reset_index(drop=True)

<IPython.core.display.Javascript object>

In [145]:
df.shape

(66641, 64)

<IPython.core.display.Javascript object>

### Dropping rows with negative values

In [146]:
# Negative-value count per numeric column, restricted to rows that still
# duplicate an earlier row on every column except Date.
# NOTE(review): the numeric de-duplication performed just before this section
# should leave no such rows, so this selection is presumably empty (the all-zero
# output agrees) — confirm whether this check is still needed.
df[df.drop(["Date"], axis=1).duplicated()].drop("Date", axis=1).select_dtypes(
    include="number"
).apply(lambda x: x < 0).sum().sort_values(ascending=False)

CaO                              0.0
Beta C2S                         0.0
C4AF                             0.0
C3A                              0.0
Cubic C3A                        0.0
Orthorhombic C3A                 0.0
Free CaO                         0.0
Portlandite                      0.0
Periclase                        0.0
Arcanite                         0.0
Aphthitalite                     0.0
Langbeinite                      0.0
Calcium sulfate                  0.0
Bassanite                        0.0
Anhydrite                        0.0
Calcite                          0.0
Dolomite                         0.0
Quartz                           0.0
Muscovite                        0.0
Cl                               0.0
Ca(OH)2                          0.0
CaCO3                            0.0
Total Alkali as Na2O             0.0
Soundness                        0.0
Gypsum                           0.0
%Gypsum                          0.0
%Limestone                       0.0
%

<IPython.core.display.Javascript object>

In [147]:
# Dimensions after de-duplication, before rows with negative values are removed.
df.shape

(66641, 64)

<IPython.core.display.Javascript object>

In [148]:
# How many negative entries does each numeric column contain (largest first)?
(
    df.drop(columns="Date")
    .select_dtypes(include="number")
    .apply(lambda col: col < 0)
    .sum()
    .sort_values(ascending=False)
)

Na2O                             529
CaO                                0
Calcite                            0
C4AF                               0
C3A                                0
Cubic C3A                          0
Orthorhombic C3A                   0
Free CaO                           0
Portlandite                        0
Periclase                          0
Arcanite                           0
Aphthitalite                       0
Langbeinite                        0
Calcium sulfate                    0
Bassanite                          0
Anhydrite                          0
Dolomite                           0
Beta C2S                           0
Quartz                             0
Muscovite                          0
Cl                                 0
Ca(OH)2                            0
CaCO3                              0
Total Alkali as Na2O               0
Soundness                          0
Gypsum                             0
%Gypsum                            0
%

<IPython.core.display.Javascript object>

In [149]:
# Which plants contribute the rows holding at least one negative numeric value?
has_negative = (
    df.drop(columns="Date")
    .select_dtypes(include="number")
    .apply(lambda col: col < 0)
    .any(axis=1)
)
df.loc[has_negative, "Factory_Plant"].value_counts()

Factory_Plant
207_AT    431
203_C      56
209_AP     29
209_AM      9
209_L       3
209_V       1
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [150]:
# Flag every row in which any numeric column is below zero ...
negative_rows = (
    df.drop(columns="Date")
    .select_dtypes(include="number")
    .apply(lambda col: col < 0)
    .any(axis=1)
)
index = df.loc[negative_rows].index

# ... and remove those rows, renumbering the remaining ones from 0.
df = df.drop(index=index).reset_index(drop=True)

<IPython.core.display.Javascript object>

### Dropping remaining columns that are entirely NaN

In [151]:
# Discard every column that holds nothing but NaN; rows are left untouched.
df = df.dropna(axis="columns", how="all")

<IPython.core.display.Javascript object>

In [152]:
# Print the remaining columns with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66112 entries, 0 to 66111
Data columns (total 48 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   Date                           66112 non-null  datetime64[ns]
 1   CaO                            50142 non-null  float64       
 2   MgO                            64102 non-null  float64       
 3   Na2O                           58601 non-null  float64       
 4   Al2O3                          50037 non-null  float64       
 5   SiO2                           50053 non-null  float64       
 6   SO3                            65463 non-null  float64       
 7   K2O                            60133 non-null  float64       
 8   TiO2                           9304 non-null   float64       
 9   Fe2O3                          50033 non-null  float64       
 10  Loss on Ignition               65609 non-null  float64       
 11  Insoluble Resid

<IPython.core.display.Javascript object>

### Resetting the index

In [153]:
# NOTE(review): the index was already reset when the negative-value rows were
# dropped, and dropping columns does not alter row labels, so this reset looks
# redundant — confirm before removing.
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

### Sort the dataset by date

In [154]:
# First make sure the Date column was inferred correctly
df["Date"].dtype

dtype('<M8[ns]')

<IPython.core.display.Javascript object>

In [155]:
# Keep only the calendar date of each timestamp (as `datetime.date` objects).
# Going through pd.to_datetime makes the cell safe to re-run: after the first
# run the column no longer has a datetime dtype, and calling `.dt` on it
# directly would raise AttributeError.
df["Date"] = pd.to_datetime(df["Date"]).dt.date

<IPython.core.display.Javascript object>

In [156]:
# Order the rows chronologically.
# NOTE(review): sort_values uses its default sort algorithm here, which pandas
# does not document as stable, so rows sharing the same Date may not keep their
# previous relative order — pass kind="stable" if that order matters.
df = df.sort_values(by="Date")

<IPython.core.display.Javascript object>

In [157]:
# Sorting keeps the old row labels; rebuild a 0..n-1 index that follows the date order.
df = df.reset_index(drop=True)

<IPython.core.display.Javascript object>

In [158]:
# Drop the first row (the earliest-dated one after the sort) and renumber.
# NOTE(review): the reason for discarding this row is not stated anywhere in the
# notebook — document it. The cell is also not idempotent: every re-run removes
# one more row.
df = df.iloc[1:].reset_index(drop=True)

<IPython.core.display.Javascript object>

## Saving the dataset

<h3>Saving for feature engineering</h3>

In [159]:
# Destination of the interim dataset, resolved relative to the notebook's working directory.
GLOBAL_DATASET_CSV = "../../../../data/interim/Global-Dataset/global_dataset.csv"

# index=False keeps the positional index out of the written file.
df.to_csv(GLOBAL_DATASET_CSV, index=False)

<IPython.core.display.Javascript object>

In [160]:
# Final dimensions of the frame that was written to disk.
df.shape

(66111, 48)

<IPython.core.display.Javascript object>