In [1]:
from warnings import filterwarnings

filterwarnings("ignore")

In [2]:
import os
import sys

cwd = os.getcwd()
print(cwd)

c:\ML\Algo_evaluation_proj


In [3]:
root_dir = os.path.abspath(os.path.join(cwd, ".."))
print(root_dir)

c:\ML


In [4]:
sys.path.append(root_dir)

In [5]:
from utils import algo_evaluation

# Step 1 - Data Ingestion

In [6]:
import pandas as pd

df = pd.read_csv("train_machine.csv")
df.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


# Target feature is y

In [7]:
df["Machine failure"].value_counts()

Machine failure
0    134281
1      2148
Name: count, dtype: int64

# Step 3 - Perform basic data quality checks


In [8]:
df.shape

(136429, 14)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       136429 non-null  int64  
 1   Product ID               136429 non-null  object 
 2   Type                     136429 non-null  object 
 3   Air temperature [K]      136429 non-null  float64
 4   Process temperature [K]  136429 non-null  float64
 5   Rotational speed [rpm]   136429 non-null  int64  
 6   Torque [Nm]              136429 non-null  float64
 7   Tool wear [min]          136429 non-null  int64  
 8   Machine failure          136429 non-null  int64  
 9   TWF                      136429 non-null  int64  
 10  HDF                      136429 non-null  int64  
 11  PWF                      136429 non-null  int64  
 12  OSF                      136429 non-null  int64  
 13  RNF                      136429 non-null  int64  
dtypes: f

In [10]:
df.isna().sum()

id                         0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [11]:
df.duplicated().sum()

np.int64(0)

In [12]:
df.select_dtypes(include="object").nunique()

Product ID    9976
Type             3
dtype: int64

# Step 2 - Seperate X and Y


In [13]:
X = df.drop(columns=["id","Product ID","Machine failure"])
Y = df["Machine failure"]

In [14]:
X.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,L,300.6,309.6,1596,36.1,140,0,0,0,0,0
1,M,302.6,312.1,1759,29.1,200,0,0,0,0,0
2,L,299.3,308.5,1805,26.5,25,0,0,0,0,0
3,L,301.0,310.9,1524,44.3,197,0,0,0,0,0
4,M,298.0,309.0,1641,35.4,34,0,0,0,0,0


In [15]:
Y.head()


0    0
1    0
2    0
3    0
4    0
Name: Machine failure, dtype: int64

# Step 3 - Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [17]:

xtrain.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
12407,L,301.5,310.7,1481,43.7,77,0,0,0,0,0
85011,L,300.7,309.5,1452,43.4,85,0,0,0,0,0
65292,M,298.5,309.1,1365,57.0,27,0,0,0,0,0
18478,L,301.6,310.7,1543,31.9,16,0,0,0,0,0
100267,L,298.9,308.6,1633,32.9,124,0,0,0,0,0


In [18]:
ytrain.head()

12407     0
85011     0
65292     0
18478     0
100267    0
Name: Machine failure, dtype: int64

In [19]:

xtest.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
123128,L,300.9,310.7,1368,62.0,12,0,0,0,1,0
54705,L,300.5,311.3,1543,36.0,190,0,0,0,0,0
40016,L,300.6,310.4,1499,38.5,114,0,0,0,0,0
75025,L,297.6,308.4,1502,38.5,130,0,0,0,0,0
120421,L,300.8,310.7,1539,38.5,203,0,0,0,0,0


In [20]:

ytest.head()

123128    1
54705     0
40016     0
75025     0
120421    0
Name: Machine failure, dtype: int64

In [21]:
xtrain.shape

(109143, 11)

In [22]:
xtest.shape

(27286, 11)

# Step 4 - Apply preprocessing on X

In [23]:
cat_cols = X.select_dtypes(include="object").columns.tolist()
cat_cols

['Type']

In [24]:
num_cols = X.select_dtypes(include="number").columns.tolist()
num_cols

['Air temperature [K]',
 'Process temperature [K]',
 'Rotational speed [rpm]',
 'Torque [Nm]',
 'Tool wear [min]',
 'TWF',
 'HDF',
 'PWF',
 'OSF',
 'RNF']

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [26]:
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

In [27]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
)

In [28]:
pre = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
).set_output(transform="pandas")

In [29]:
pre.fit(xtrain)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [30]:

xtrain_pre = pre.transform(xtrain)
xtrain_pre.head()

Unnamed: 0,num__Air temperature [K],num__Process temperature [K],num__Rotational speed [rpm],num__Torque [Nm],num__Tool wear [min],num__TWF,num__HDF,num__PWF,num__OSF,num__RNF,cat__Type_L,cat__Type_M
12407,0.879407,0.546161,-0.282669,0.394662,-0.426183,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0
85011,0.449463,-0.320041,-0.491809,0.359361,-0.301258,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0
65292,-0.732883,-0.608775,-1.11923,1.959702,-1.206971,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,0.0,1.0
18478,0.93315,0.546161,0.164459,-0.993869,-1.378744,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0
100267,-0.517911,-0.969693,0.813515,-0.876197,0.307756,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0


In [31]:

xtest_pre = pre.transform(xtest)
xtest_pre.head()

Unnamed: 0,num__Air temperature [K],num__Process temperature [K],num__Rotational speed [rpm],num__Torque [Nm],num__Tool wear [min],num__TWF,num__HDF,num__PWF,num__OSF,num__RNF,cat__Type_L,cat__Type_M
123128,0.556949,0.546161,-1.097595,2.548063,-1.441207,-0.038436,-0.071621,-0.048677,15.699994,-0.04714,1.0,0.0
54705,0.341977,0.979262,0.164459,-0.511413,1.338396,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0
40016,0.39572,0.32961,-0.152858,-0.217233,0.151599,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0
75025,-1.21657,-1.11406,-0.131223,-0.217233,0.401451,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0
120421,0.503206,0.546161,0.135612,-0.217233,1.5414,-0.038436,-0.071621,-0.048677,-0.063694,-0.04714,1.0,0.0


# Apply Algo evaluation


In [32]:
!uv add xgboost

[2mResolved [1m123 packages[0m [2min 4ms[0m[0m
[2mAudited [1m117 packages[0m [2min 0.08ms[0m[0m


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier

In [34]:
models = [
    LogisticRegression(random_state=42),
    DecisionTreeClassifier(max_depth=3, random_state=42),
    RandomForestClassifier(max_depth=3, random_state=42),
    HistGradientBoostingClassifier(max_depth=3, random_state=42),
    XGBClassifier(max_depth=3, random_state=42),
]

In [35]:
best_res , best_model = algo_evaluation(models, xtrain_pre, ytrain, xtest_pre, ytest)

{'name': 'LogisticRegression', 'model': LogisticRegression(random_state=42), 'cv_mean': np.float64(0.9301), 'cv_std': np.float64(0.0092), 'f1_train': 0.9303, 'f1_test': 0.9315, 'gen_err': 0.0012}
{'name': 'DecisionTreeClassifier', 'model': DecisionTreeClassifier(max_depth=3, random_state=42), 'cv_mean': np.float64(0.901), 'cv_std': np.float64(0.0081), 'f1_train': 0.9021, 'f1_test': 0.8961, 'gen_err': 0.006}
{'name': 'RandomForestClassifier', 'model': RandomForestClassifier(max_depth=3, random_state=42), 'cv_mean': np.float64(0.8896), 'cv_std': np.float64(0.0059), 'f1_train': 0.8843, 'f1_test': 0.8778, 'gen_err': 0.0065}
{'name': 'HistGradientBoostingClassifier', 'model': HistGradientBoostingClassifier(max_depth=3, random_state=42), 'cv_mean': np.float64(0.9299), 'cv_std': np.float64(0.0089), 'f1_train': 0.9303, 'f1_test': 0.9315, 'gen_err': 0.0012}


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ML\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\ML\.venv\Lib\site-packages\xgboost\sklearn.py", line 1664, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ~~~~~~~~~~~~~~~~~~~~~~~~~^
        missing=self.missing,
        ^^^^^^^^^^^^^^^^^^^^^
    ...<14 lines>...
        feature_types=self.feature_types,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\ML\.venv\Lib\site-packages\xgboost\sklearn.py", line 628, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
        data=X,
    ...<9 lines>...
        ref=None,
    )
  File "c:\ML\.venv\Lib\site-packages\xgboost\sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
        **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
    )
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 1614, in __init__
    self._init(
    ~~~~~~~~~~^
        data,
        ^^^^^
    ...<12 lines>...
        max_quantile_blocks=max_quantile_batches,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 1678, in _init
    it.reraise()
    ~~~~~~~~~~^^
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 572, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 553, in _handle_exception
    return fn()
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 640, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
                                              ~~~~~~~~~^^^^^^^^^^^^
  File "c:\ML\.venv\Lib\site-packages\xgboost\data.py", line 1654, in next
    input_data(**self.kwargs)
    ~~~~~~~~~~^^^^^^^^^^^^^^^
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 629, in input_data
    self.proxy.set_info(
    ~~~~~~~~~~~~~~~~~~~^
        feature_names=feature_names,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        feature_types=feature_types,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        **kwargs,
        ^^^^^^^^^
    )
    ^
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 975, in set_info
    self.feature_names = feature_names
    ^^^^^^^^^^^^^^^^^^
  File "c:\ML\.venv\Lib\site-packages\xgboost\core.py", line 1364, in feature_names
    raise ValueError(
        "feature_names must be string, and may not contain [, ] or <"
    )
ValueError: feature_names must be string, and may not contain [, ] or <
