# Session 1: All is Function

---

Training PT. Astra Honda Motor with Pacmann AI

Buat semua menjadi fungsi
- `read_data`
- `split_input_output`
- `split_train_test`

## Fungsi `read_data`
---

In [1]:
# Import library
import pandas as pd

In [2]:
# Read the raw machining-maintenance dataset from CSV
data = pd.read_csv('data/raw/machining_maintenance.csv')

# Sanity check: print the shape and preview the first rows
print('Data shape:', data.shape)
data.head()

Data shape: (10000, 8)


Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Failure Type
0,M14860,M,298.1,308.6,1551,42.8,0,No Failure
1,L47181,L,298.2,308.7,1408,46.3,3,No Failure
2,L47182,L,298.1,308.5,1498,49.4,5,No Failure
3,L47183,L,298.2,308.6,1433,39.5,7,No Failure
4,L47184,L,298.2,308.7,1408,40.0,9,No Failure


In [3]:
# Check whether there are any duplicated rows
print('Duplikat data:', data.duplicated().sum())

# Drop the duplicated rows
data = data.drop_duplicates()

# Sanity check: print the shape after de-duplication
print('Data shape:', data.shape)

Duplikat data: 0
Data shape: (10000, 8)


In [4]:
# Buat fungsi
def read_data(data_path):
    """Load a CSV file into a DataFrame and drop fully duplicated rows.

    Prints the shape before and after de-duplication together with the
    number of duplicated rows that were found.

    Parameters
    ----------
    data_path : str or file-like
        Source handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with duplicated rows removed.
    """
    raw = pd.read_csv(data_path)
    print('Data shape       :', raw.shape)

    # Count rows that are exact copies of an earlier row
    n_duplicates = raw.duplicated().sum()
    print('Duplikat data    :', n_duplicates)

    # Keep only the first occurrence of every row
    deduplicated = raw.drop_duplicates()
    print('Data shape final :', deduplicated.shape)

    return deduplicated

In [5]:
# Call the function with an explicit path
data = read_data(data_path='data/raw/machining_maintenance.csv')

data.head()

Data shape       : (10000, 8)
Duplikat data    : 0
Data shape final : (10000, 8)


Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Failure Type
0,M14860,M,298.1,308.6,1551,42.8,0,No Failure
1,L47181,L,298.2,308.7,1408,46.3,3,No Failure
2,L47182,L,298.1,308.5,1498,49.4,5,No Failure
3,L47183,L,298.2,308.6,1433,39.5,7,No Failure
4,L47184,L,298.2,308.7,1408,40.0,9,No Failure


- Beberapa modifikasi
  - Buat fungsi tanpa input, agar mudah dikonfigurasi di luar file.
  - Buat fungsi menyimpan output data secara otomatis, agar proses yang memakan waktu lama tidak perlu diulang.

- Fungsi tanpa input, solusinya pakai `config` file
- Fungsi untuk menyimpan output, solusinya pakai dumping dengan `pickle`

In [6]:
# Load configuration
import src.utils as utils

In [7]:
# Load the configuration through utils.config_load() and display it
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/machining_maintenance.csv',
 'dataset_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_cols_path': 'data/output/input_cols.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_cols': 'Failure Type',
 'drop_cols': ['Product ID', 'Failure Type'],
 'seed': 123,
 'test_size': 0.2,
 'num_cols': ['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]'],
 'cat_cols': ['Type'],
 'num_imputer_path': 'data/output/num_imputer.pkl',
 'cat_imputer_path': 'data/output/cat_imputer.pkl',
 'scaler_path': 'data/output/scaler.pkl',
 'train_clean_path': 'data/output/X_train_clean.pkl',
 'valid_clean_path': 'data/output/X_valid_clean.pkl',
 'test_clean_path

In [8]:
# Buat fungsi
def read_data():
    """Load the raw dataset named in the config, de-duplicate it and pickle it.

    The CSV location is read from ``CONFIG_DATA['raw_dataset_path']``. The
    de-duplicated frame is written with ``utils.pickle_dump`` to
    ``CONFIG_DATA['dataset_path']`` so that later steps can reload it.

    Returns
    -------
    pandas.DataFrame
        The loaded data with duplicated rows removed.
    """
    raw = pd.read_csv(CONFIG_DATA['raw_dataset_path'])
    print('Data shape       :', raw.shape)

    # Report the duplicated rows, then keep only first occurrences
    print('Duplikat data    :', raw.duplicated().sum())
    cleaned = raw.drop_duplicates()
    print('Data shape final :', cleaned.shape)

    # Persist the result as a pickle
    utils.pickle_dump(cleaned, CONFIG_DATA['dataset_path'])

    return cleaned

In [9]:
# Call the config-driven function (no arguments needed)
data = read_data()

data.head()

Data shape       : (10000, 8)
Duplikat data    : 0
Data shape final : (10000, 8)


Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Failure Type
0,M14860,M,298.1,308.6,1551,42.8,0,No Failure
1,L47181,L,298.2,308.7,1408,46.3,3,No Failure
2,L47182,L,298.1,308.5,1498,49.4,5,No Failure
3,L47183,L,298.2,308.6,1433,39.5,7,No Failure
4,L47184,L,298.2,308.7,1408,40.0,9,No Failure


## Fungsi `split_input_output`
---

In [10]:
# Build the output data, y
y = data['Failure Type']

# Sanity check: print the shape and preview the first values
print('Data shape:', y.shape)
y.head()

Data shape: (10000,)


0    No Failure
1    No Failure
2    No Failure
3    No Failure
4    No Failure
Name: Failure Type, dtype: object

In [11]:
# Encode the target as binary: 0 for "No Failure", 1 for anything else
y = y.apply(lambda types: 0 if types=="No Failure" else 1)

# Sanity check: print the shape and preview the first values
print('Data shape:', y.shape)
y.head()

Data shape: (10000,)


0    0
1    0
2    0
3    0
4    0
Name: Failure Type, dtype: int64

In [12]:
# Build the input data, X
X = data.drop(columns=['Product ID',        # this column is not important
                       'Failure Type'],
              axis=1)

# Sanity check: print the shape and preview the first rows
print('Data shape:', X.shape)
X.head()

Data shape: (10000, 6)


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,M,298.1,308.6,1551,42.8,0
1,L,298.2,308.7,1408,46.3,3
2,L,298.1,308.5,1498,49.4,5
3,L,298.2,308.6,1433,39.5,7
4,L,298.2,308.7,1408,40.0,9


In [13]:
# Buat fungsi
def split_input_output(data):
    """Split the dataset into model inputs ``X`` and a binary target ``y``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``'Product ID'`` and
        ``'Failure Type'`` columns.

    Returns
    -------
    X : pandas.DataFrame
        ``data`` without the ``'Product ID'`` and ``'Failure Type'`` columns.
    y : pandas.Series
        ``int64`` series that is 0 where ``'Failure Type'`` equals
        ``"No Failure"`` and 1 everywhere else.
    """
    # Vectorised label encoding instead of a per-row Python lambda; the
    # explicit dtype keeps the result int64 even when ``data`` has no rows.
    y = (data['Failure Type'] != "No Failure").astype('int64')

    # ``columns=`` already selects the axis, so no ``axis`` argument is needed
    X = data.drop(columns=['Product ID', 'Failure Type'])

    # Sanity check
    print('Input shape   :', X.shape)
    print('Output shape  :', y.shape)

    return X, y

In [14]:
# Call the function on the loaded data
X, y = split_input_output(data)

X.head()

Input shape   : (10000, 6)
Output shape  : (10000,)


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,M,298.1,308.6,1551,42.8,0
1,L,298.2,308.7,1408,46.3,3
2,L,298.1,308.5,1498,49.4,5
3,L,298.2,308.6,1433,39.5,7
4,L,298.2,308.7,1408,40.0,9


- Beberapa modifikasi
  - Buat fungsi tanpa input, agar mudah dikonfigurasi di luar file.
  - Buat fungsi menyimpan output data secara otomatis, agar proses yang memakan waktu lama tidak perlu diulang.
  - Buat fungsi memuat data secara otomatis dari file pickle.

In [15]:
# Update the config file, then load it again
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/machining_maintenance.csv',
 'dataset_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_cols_path': 'data/output/input_cols.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_cols': 'Failure Type',
 'drop_cols': ['Product ID', 'Failure Type'],
 'seed': 123,
 'test_size': 0.2,
 'num_cols': ['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]'],
 'cat_cols': ['Type'],
 'num_imputer_path': 'data/output/num_imputer.pkl',
 'cat_imputer_path': 'data/output/cat_imputer.pkl',
 'scaler_path': 'data/output/scaler.pkl',
 'train_clean_path': 'data/output/X_train_clean.pkl',
 'valid_clean_path': 'data/output/X_valid_clean.pkl',
 'test_clean_path

In [16]:
# Buat fungsi
def split_input_output():
    """Load the pickled dataset and split it into inputs ``X`` and target ``y``.

    Everything is driven by ``CONFIG_DATA``:

    - ``dataset_path``: pickle to load the dataset from.
    - ``output_cols``: name of the target column.
    - ``drop_cols``: columns removed from the dataset to obtain ``X``.
    - ``input_set_path``, ``output_set_path``, ``input_cols_path``: pickle
      destinations for ``X``, ``y`` and ``X.columns`` respectively.

    Returns
    -------
    X : pandas.DataFrame
        The dataset without the ``drop_cols`` columns.
    y : pandas.Series
        ``int64`` series that is 0 where the target equals ``"No Failure"``
        and 1 everywhere else.
    """
    # Load the dataset automatically
    dataset_path = CONFIG_DATA['dataset_path']
    data = utils.pickle_load(dataset_path)

    # Vectorised label encoding instead of a per-row Python lambda; the
    # explicit dtype keeps the result int64 even when the dataset is empty.
    output_cols = CONFIG_DATA['output_cols']
    y = (data[output_cols] != "No Failure").astype('int64')

    # ``columns=`` already selects the axis, so no ``axis`` argument is needed
    drop_cols = CONFIG_DATA['drop_cols']
    X = data.drop(columns=drop_cols)

    # Sanity check
    print('Input shape   :', X.shape)
    print('Output shape  :', y.shape)

    # Resolve every destination first, then dump X, y and the input columns
    dump_path_input = CONFIG_DATA['input_set_path']
    dump_path_output = CONFIG_DATA['output_set_path']
    dump_path_input_cols = CONFIG_DATA['input_cols_path']
    utils.pickle_dump(X, dump_path_input)
    utils.pickle_dump(y, dump_path_output)
    utils.pickle_dump(X.columns, dump_path_input_cols)

    return X, y

In [17]:
# Call the config-driven function (no arguments needed)
X, y = split_input_output()

X.head()

Input shape   : (10000, 6)
Output shape  : (10000,)


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,M,298.1,308.6,1551,42.8,0
1,L,298.2,308.7,1408,46.3,3
2,L,298.1,308.5,1498,49.4,5
3,L,298.2,308.6,1433,39.5,7
4,L,298.2,308.7,1408,40.0,9


## Fungsi `split_train_test`
---

In [18]:
# Import library untuk train-test-split
from sklearn.model_selection import train_test_split

In [19]:
# Train/test split: hold out 20% of the data as the test set, stratified on y
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    test_size = 0.2,
                                                    random_state = 123)

# Train/valid split: hold out 20% of the remaining training data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                      stratify=y_train,
                                                      test_size = 0.2,
                                                      random_state = 123)

In [20]:
# Sanity check: print the shape of every split
print('X_train shape :', X_train.shape)
print('y_train shape :', y_train.shape)
print('X_valid shape  :', X_valid.shape)
print('y_valid shape  :', y_valid.shape)
print('X_test shape  :', X_test.shape)
print('y_test shape  :', y_test.shape)

X_train shape : (6400, 6)
y_train shape : (6400,)
X_valid shape  : (1600, 6)
y_valid shape  : (1600,)
X_test shape  : (2000, 6)
y_test shape  : (2000,)


In [21]:
# Buat jadi fungsi
def split_train_test(X, y, test_size=0.2, random_state=123):
    """Split ``X`` and ``y`` into stratified train, validation and test sets.

    The data is split twice with the same settings: first into train/test,
    then the training part is split again into train/valid. Because
    ``test_size`` is applied to the *remaining* training data in the second
    split, the defaults yield a 64% / 16% / 20% train/valid/test partition.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features.
    y : pandas.Series
        Target labels; also used for stratification.
    test_size : float, default 0.2
        Hold-out fraction passed to both ``train_test_split`` calls.
    random_state : int, default 123
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        stratify=y,
        test_size=test_size,
        random_state=random_state,
    )

    # Train/valid split on what is left of the training data
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train,
        stratify=y_train,
        test_size=test_size,
        random_state=random_state,
    )

    # Sanity check
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_valid shape  :', X_valid.shape)
    print('y_valid shape  :', y_valid.shape)
    print('X_test shape  :', X_test.shape)
    print('y_test shape  :', y_test.shape)

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [22]:
# Call the function on the in-memory X and y
X_train, X_valid, X_test, y_train, y_valid, y_test = split_train_test(X, y)

X_train shape : (6400, 6)
y_train shape : (6400,)
X_valid shape  : (1600, 6)
y_valid shape  : (1600,)
X_test shape  : (2000, 6)
y_test shape  : (2000,)


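- Lakukan modifikasi yang serupa dengan sebelumnya: fungsi tanpa input yang memuat `X` dan `y` dari file pickle, membaca `test_size` dan `seed` dari `config`, lalu menyimpan hasil split secara otomatis.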

In [23]:
# Update the config file, then load it again
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/machining_maintenance.csv',
 'dataset_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_cols_path': 'data/output/input_cols.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'output_cols': 'Failure Type',
 'drop_cols': ['Product ID', 'Failure Type'],
 'seed': 123,
 'test_size': 0.2,
 'num_cols': ['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]'],
 'cat_cols': ['Type'],
 'num_imputer_path': 'data/output/num_imputer.pkl',
 'cat_imputer_path': 'data/output/cat_imputer.pkl',
 'scaler_path': 'data/output/scaler.pkl',
 'train_clean_path': 'data/output/X_train_clean.pkl',
 'valid_clean_path': 'data/output/X_valid_clean.pkl',
 'test_clean_path

In [24]:
# Buat jadi fungsi
def split_train_test():
    """Load the pickled ``X``/``y`` and write stratified train/valid/test sets.

    Everything is driven by ``CONFIG_DATA``:

    - ``input_set_path`` / ``output_set_path``: pickles holding ``X`` and ``y``.
    - ``test_size`` / ``seed``: passed to both ``train_test_split`` calls.
    - ``train_set_path`` / ``valid_set_path`` / ``test_set_path``: two-item
      lists giving the pickle destinations for the X part (index 0) and the
      y part (index 1) of each split.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    # Load X and y from their pickles
    features_path = CONFIG_DATA['input_set_path']
    target_path = CONFIG_DATA['output_set_path']
    X = utils.pickle_load(features_path)
    y = utils.pickle_load(target_path)

    # Both splits share the same hold-out fraction and seed
    split_kwargs = {
        'test_size': CONFIG_DATA['test_size'],
        'random_state': CONFIG_DATA['seed'],
    }

    # Train/test split, then train/valid split on the remaining training data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, **split_kwargs
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, stratify=y_train, **split_kwargs
    )

    # Sanity check: report the shape of every split
    labelled_parts = (
        ('X_train shape :', X_train),
        ('y_train shape :', y_train),
        ('X_valid shape  :', X_valid),
        ('y_valid shape  :', y_valid),
        ('X_test shape  :', X_test),
        ('y_test shape  :', y_test),
    )
    for label, part in labelled_parts:
        print(label, part.shape)

    # Resolve every destination before writing anything, in the same order
    # as ``labelled_parts`` (X then y for train, valid, test)
    destinations = [
        CONFIG_DATA[key][position]
        for key in ('train_set_path', 'valid_set_path', 'test_set_path')
        for position in (0, 1)
    ]
    for (_, part), destination in zip(labelled_parts, destinations):
        utils.pickle_dump(part, destination)

    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [25]:
# Call the config-driven function (no arguments needed)
X_train, X_valid, X_test, y_train, y_valid, y_test = split_train_test()

X_train shape : (6400, 6)
y_train shape : (6400,)
X_valid shape  : (1600, 6)
y_valid shape  : (1600,)
X_test shape  : (2000, 6)
y_test shape  : (2000,)


In [28]:
# Export the training inputs to CSV (index=False leaves out the row index),
# then preview the first rows
X_train.to_csv('data/output/data.csv', index=False)
X_train.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
8588,L,297.3,307.8,1385,44.4,169
7189,L,300.4,310.4,1486,46.3,48
7205,L,299.8,309.7,1483,39.6,87
9492,L,299.1,309.9,1586,36.6,215
9241,L,298.1,308.7,1683,29.1,161


Great! Untuk finalisasi, buat file Python bernama data pipeline (misalnya `data_pipeline.py`) yang berisi ketiga fungsi di atas.