In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import tqdm
from sklearn.model_selection import train_test_split
# data dumb store
import joblib
# to locate yaml file
import yaml
# to locate directore
import os

In [8]:
params_dir = "config/config.yaml"

In [9]:
def load_params(param_dir):
    "Function to read params config"
    with open(param_dir, 'r') as file:
        params = yaml.safe_load(file)
        
    return params

In [10]:
params = load_params(params_dir)

In [11]:
params

{'dataset_path': 'data/raw/machining_maintenance.csv',
 'dataset_cleaned_path': 'data/processed/machining_maintenance.pkl',
 'train_set_path': ['data/processed/x_train.pkl',
  'data/processed/y_train.pkl'],
 'valid_set_path': ['data/processed/x_valid.pkl',
  'data/processed/y_valid.pkl'],
 'test_set_path': ['data/processed/x_test.pkl', 'data/processed/y_test.pkl'],
 'train_feng_set_path': ['data/processed/x_train_feng.pkl',
  'data/processed/y_train_feng.pkl'],
 'valid_feng_set_path': ['data/processed/x_valid_feng.pkl',
  'data/processed/y_valid_feng.pkl'],
 'test_feng_set_path': ['data/processed/x_test_feng.pkl',
  'data/processed/y_test_feng.pkl'],
 'production_model_path': 'model/production_model.pkl',
 'ohe_stasiun_path': 'model/encoder.pkl',
 'le_encoder_path': 'model/le_failure_type.pkl',
 'training_log_path': 'log/training_log.json',
 'print_debug': True,
 'drop_list': ['Product ID'],
 'int64_columns': ['Rotational speed [rpm]', 'Tool wear [min]'],
 'float64_columns': ['Air temp

# 1. Read Data

In [12]:
params['dataset_path']

'data/raw/machining_maintenance.csv'

In [13]:
params['mapping_target_feature']

{'Heat Dissipation Failure': 'Failure',
 'Power Failure': 'Failure',
 'Overstrain Failure': 'Failure',
 'Tool Wear Failure': 'Failure',
 'Random Failures': 'Failure'}

In [14]:
# fungsi read data csv
def read_data(path):
    "Function to read data with csv formar"
    data = pd.read_csv(path)
    return data

In [15]:
# read data
df = read_data(path = params['dataset_path'])

In [16]:
# show data
df.head()

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Failure Type
0,M14860,M,298.1,308.6,1551,42.8,0,No Failure
1,L47181,L,298.2,308.7,1408,46.3,3,No Failure
2,L47182,L,298.1,308.5,1498,49.4,5,No Failure
3,L47183,L,298.2,308.6,1433,39.5,7,No Failure
4,L47184,L,298.2,308.7,1408,40.0,9,No Failure


In [17]:
# check data dimension
df.shape

(10000, 8)

# 2. Data Type

In [18]:
# Create Data Description

def data_desc(df):
    "Function to Show Sample Information / Description about the data"
    
    list_item = []
    for col in df.columns:
        list_item.append([col, df[col].dtype, 
                          df[col].isna().sum(),
                          100*df[col].isna().sum()/len(df[col]),
                          df[col].nunique(),
                          df[col].unique()[:4]])
        
    desc_df = pd.DataFrame(data=list_item, columns='feature data_type null_num null_pct unique_num unique_sample'.split())
    
    return desc_df

In [19]:
# show information data
desc = data_desc(df)

In [20]:
# show info data
desc

Unnamed: 0,feature,data_type,null_num,null_pct,unique_num,unique_sample
0,Product ID,object,0,0.0,10000,"[M14860, L47181, L47182, L47183]"
1,Type,object,0,0.0,3,"[M, L, H]"
2,Air temperature [K],float64,0,0.0,93,"[298.1, 298.2, 298.3, 298.5]"
3,Process temperature [K],float64,0,0.0,82,"[308.6, 308.7, 308.5, 309.0]"
4,Rotational speed [rpm],int64,0,0.0,941,"[1551, 1408, 1498, 1433]"
5,Torque [Nm],float64,0,0.0,577,"[42.8, 46.3, 49.4, 39.5]"
6,Tool wear [min],int64,0,0.0,246,"[0, 3, 5, 7]"
7,Failure Type,object,0,0.0,6,"[No Failure, Power Failure, Tool Wear Failure,..."


# 3. Statistic Descriptive

In [21]:
# describe
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Air temperature [K],10000.0,300.00493,2.000259,295.3,298.3,300.1,301.5,304.5
Process temperature [K],10000.0,310.00556,1.483734,305.7,308.8,310.1,311.1,313.8
Rotational speed [rpm],10000.0,1538.7761,179.284096,1168.0,1423.0,1503.0,1612.0,2886.0
Torque [Nm],10000.0,39.98691,9.968934,3.8,33.2,40.1,46.8,76.6
Tool wear [min],10000.0,107.951,63.654147,0.0,53.0,108.0,162.0,253.0


# 4. Handling Failure Type

In [22]:
# check value counts
df['Failure Type'].value_counts()

No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: Failure Type, dtype: int64

In [23]:
# map target feature
mapping_target_feature = {
    'Heat Dissipation Failure' : 'Failure',
    'Power Failure' : 'Failure',
    'Overstrain Failure' : 'Failure',
    'Tool Wear Failure' : 'Failure',
    'Random Failures' : 'Failure'
}

# replace
df['Failure Type'] = df['Failure Type'].replace(mapping_target_feature)

In [24]:
# cek value counts
df['Failure Type'].value_counts()

No Failure    9652
Failure        348
Name: Failure Type, dtype: int64

# 5. Data Defense

In [25]:
def check_data(input_data, params):
    "Function to Defense the Data"
    # check data types
    assert input_data.select_dtypes("object").columns.to_list() == params["object_columns"], "an error occurs in object column(s)."
    assert input_data.select_dtypes("int").columns.to_list() == params["int64_columns"], "an error occurs in int32 column(s)."
    assert input_data.select_dtypes("float").columns.to_list() == params["float64_columns"], "an error occurs in int32 column(s)."

    # check range of data
    assert set(input_data['Type']).issubset(set(params["range_Type"])), "an error occurs in Type range."
    assert input_data['Rotational speed [rpm]'].between(params["range_Rotational speed [rpm]"][0], params["range_Rotational speed [rpm]"][1]).sum() == len(input_data), "an error occurs in Rotational speed [rpm] range."
    assert input_data['Tool wear [min]'].between(params["range_Tool wear [min]"][0], params["range_Tool wear [min]"][1]).sum() == len(input_data), "an error occurs in Tool wear [min] range."
    assert input_data['Air temperature [K]'].between(params["range_Air temperature [K]"][0], params["range_Air temperature [K]"][1]).sum() == len(input_data), "an error occurs in Air temperature [K] range."
    assert input_data['Process temperature [K]'].between(params["range_Process temperature [K]"][0], params["range_Process temperature [K]"][1]).sum() == len(input_data), "an error occurs in Process temperature [K] range."
    assert input_data['Torque [Nm]'].between(params["range_Torque [Nm]"][0], params["range_Torque [Nm]"][1]).sum() == len(input_data), "an error occurs in Torque [Nm] range."

In [26]:
# data defense
check_data(input_data = df, 
           params = params)

# 6. Data Spliting

In [27]:
# 5. split data
def split_data(input_data, config):
    "Function to split data into train, valid, and test"
    
    # Split predictor and label
    x = input_data[config["predictors"]].copy()
    y = input_data[config["label"]].copy()

    # 1st split train and test
    x_train, x_test, \
    y_train, y_test = train_test_split(
        x, y,
        test_size = config["test_size"],
        random_state = 42,
        stratify = y
    )

    # 2nd split test and valid
    x_valid, x_test, \
    y_valid, y_test = train_test_split(
        x_test, y_test,
        test_size = config["valid_size"],
        random_state = 42,
        stratify = y_test
    )

    return x_train, x_valid, x_test, y_train, y_valid, y_test

In [28]:
# Splitting train, valid, and test set
x_train, x_valid, x_test, \
    y_train, y_valid, y_test = split_data(input_data = df, 
                                          config = params)

In [29]:
# check dimension
print(f"Data dimension x_train :  {x_train.shape}")
print(f"Data dimension x_valid :  {x_valid.shape}")
print(f"Data dimension x_test  :  {x_test.shape}")
print(f"Data dimension y_train :  {y_train.shape}")
print(f"Data dimension y_valid :  {y_valid.shape}")
print(f"Data dimension y_test  :  {y_test.shape}")

Data dimension x_train :  (8000, 6)
Data dimension x_valid :  (1000, 6)
Data dimension x_test  :  (1000, 6)
Data dimension y_train :  (8000,)
Data dimension y_valid :  (1000,)
Data dimension y_test  :  (1000,)


 # 6. Save train, valid and test set

In [30]:
joblib.dump(x_train, "data/raw/x_train.pkl")
joblib.dump(y_train, "data/raw/y_train.pkl")
joblib.dump(x_valid, "data/raw/x_valid.pkl")
joblib.dump(y_valid, "data/raw/y_valid.pkl")
joblib.dump(x_test, "data/raw/x_test.pkl")
joblib.dump(y_test, "data/raw/y_test.pkl")

['data/raw/y_test.pkl']