In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import optuna
import torch.nn as nn
import torch.nn.functional as F
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from pytorch_tabnet.tab_model import TabNetClassifier
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Data Loading

In [2]:
# Load the raw train / test CSV files.
# Forward slashes are used instead of Windows-only '\\' separators so the
# relative paths resolve on Windows as well as on Linux/macOS.
df_train = pd.read_csv('../Dataset/raw/train.csv')
df_test = pd.read_csv('../Dataset/raw/test.csv')

In [3]:
# Preview the first ten rows of the training frame.
df_train.head(n=10)

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0
5,5,A,75.35,material_7,material_8,9,5,11,4,0,...,10.622,14.904,19.107,13.327,15.354,19.251,,17.625,832.902,0
6,6,A,161.71,material_7,material_8,9,5,12,2,4,...,11.37,17.714,19.924,11.56,16.653,17.734,,16.637,684.438,1
7,7,A,177.92,material_7,material_8,9,5,4,8,8,...,10.254,16.449,20.478,12.207,15.624,16.968,15.176,17.231,684.0,1
8,8,A,109.5,material_7,material_8,9,5,9,6,5,...,11.557,15.965,19.604,14.091,15.674,13.327,13.535,15.408,,0
9,9,A,98.72,material_7,material_8,9,5,10,4,7,...,10.384,15.237,18.427,12.635,14.318,14.327,12.867,,,0


In [5]:
# (n_rows, n_columns) of the training frame.
df_train.shape

(26570, 26)

In [4]:
# Numeric inputs: the 'loading' column followed by measurement_0 .. measurement_17.
num_cols = ['loading'] + [f'measurement_{idx}' for idx in range(18)]
# The four attribute_* columns, which are routed through the 'cat' transformer.
cat_cols = [f'attribute_{idx}' for idx in range(4)]

# Preprocessing

In [5]:
def prepo(df, target='failure', test_size=0.2, random_state=42):
    """Clean a raw frame and return scaled train / hold-out arrays.

    Steps:
      1. Strip the ``material_`` prefix from ``attribute_0`` / ``attribute_1``
         and cast them to ``int``.
      2. Drop the ``id`` and ``product_code`` columns.
      3. Separate the target column, then split into train / hold-out parts.
      4. Fit imputation and scaling on the training part only and apply the
         fitted transformers to the hold-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns listed in the module-level
        ``num_cols`` and ``cat_cols``, plus ``id``, ``product_code`` and
        the ``target`` column.
    target : str, default 'failure'
        Name of the label column.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Imputed and standardised features (``num_cols`` followed by
        ``cat_cols``).
    y_train, y_test : numpy.ndarray
        Target values for the corresponding rows.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df`` (e.g. an unlabeled test frame).
    """
    if target not in df.columns:
        raise KeyError(
            f"target column {target!r} not found; prepo() expects a labeled frame"
        )

    df_processed = df.copy()

    # Drop the "material_" word (literal match, not a regular expression).
    for col in ('attribute_0', 'attribute_1'):
        df_processed[col] = (
            df_processed[col].str.replace('material_', '', regex=False).astype(int)
        )

    # Dropping the Non-informative Feature
    df_processed = df_processed.drop(columns=['id', 'product_code'])

    # Split data into feature & target BEFORE transforming: the
    # ColumnTransformer returns a NumPy array holding only num_cols + cat_cols,
    # so the target could not be recovered afterwards.
    X = df_processed.drop(columns=target)
    y = df_processed[target]

    # Split data into train & test before fitting anything, so the hold-out
    # rows do not influence the imputation means or the scaling statistics.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Define Pipeline & Column Transformer
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # best method so far on this dataset
        ('scaler', StandardScaler()),  # for speed up the runtime
    ])

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', num_pipeline, num_cols),
            ('cat', SimpleImputer(strategy='mean'), cat_cols),
        ]
    )

    # Fit on the training part only, then reuse the fitted transformer.
    X_train = column_transformer.fit_transform(X_train)
    X_test = column_transformer.transform(X_test)

    # Change the scale of the full feature matrix. The numeric columns are
    # already standardised above, so this mainly standardises the attribute
    # columns, which the 'cat' branch only imputes.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Change targets to arrays (features are already NumPy arrays here).
    y_train = y_train.to_numpy() if isinstance(y_train, pd.Series) else y_train
    y_test = y_test.to_numpy() if isinstance(y_test, pd.Series) else y_test

    return X_train, X_test, y_train, y_test
