## Machine Failure Project

## Importing all the necessary libraries

In [28]:
# ignore non-harmful warnings
from warnings import filterwarnings
filterwarnings('ignore')

# To read the dataset
import pandas as pd

# To clean the data and preprocessing it
# from sklearn.pipeline import makepipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler,OneHotEncoder
# from sklearn.compose import ColumnTransformer
from Data_clean_preprocess import clean_process_data

# To Visualize and ploting
import seaborn as sns
import matplotlib.pyplot as plt

# Divide the data into training and testing 
from sklearn.model_selection import train_test_split,cross_val_score

# Model building=> Decision Tree, Logistic Regression, Random Forest, Gradient Boost,XGBoost
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

# Evaluation metrics
from sklearn.metrics import f1_score,ConfusionMatrixDisplay,classification_report

## Read the dataset

In [2]:
path = r"G:\Machine Learning\Projects\Classification Project\datasets-train.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [3]:
print(f'Shape of Data {df.shape}')
print(f'Columns Present In Dataset: {df.columns}')
print(f'Shape of Data: {df.info()}')

Shape of Data (136429, 14)
Columns Present In Dataset: Index(['id', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       136429 non-null  int64  
 1   Product ID               136429 non-null  object 
 2   Type                     136429 non-null  object 
 3   Air temperature [K]      136429 non-null  float64
 4   Process temperature [K]  136429 non-null  float64
 5   Rotational speed [rpm]   136429 non-null  int64  
 6   Torque [Nm]              136429 non-null  float64
 7   Tool wear [min]          136429 non-null  int64  
 8   Machine failure          136429 non-

In [27]:
df['Machine failure'].value_counts()

Machine failure
0    134281
1      2148
Name: count, dtype: int64

## Perform basic data quality checks

In [4]:
# Missing Values
print(f'Missing Values ')
df.isna().sum()

Missing Values 


id                         0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [5]:
## Duplicated rows
print(f'Duplicated rows')
df.duplicated().sum()

Duplicated rows


np.int64(0)

In [6]:
df = df.drop_duplicates()
print(f'Duplicated rows are dropped')

Duplicated rows are dropped


## Perform EDA

# Separate X and Y features
    Y: Machine Failure
    X: Remaining all the features

In [7]:
X = df.drop(columns=['id','Machine failure'])
Y = df[['Machine failure']]
print(X.head())
print('-----------')
print(Y.head())

  Product ID Type  Air temperature [K]  Process temperature [K]  \
0     L50096    L                300.6                    309.6   
1     M20343    M                302.6                    312.1   
2     L49454    L                299.3                    308.5   
3     L53355    L                301.0                    310.9   
4     M24050    M                298.0                    309.0   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  TWF  HDF  PWF  OSF  \
0                    1596         36.1              140    0    0    0    0   
1                    1759         29.1              200    0    0    0    0   
2                    1805         26.5               25    0    0    0    0   
3                    1524         44.3              197    0    0    0    0   
4                    1641         35.4               34    0    0    0    0   

   RNF  
0    0  
1    0  
2    0  
3    0  
4    0  
-----------
   Machine failure
0                0
1                0

## Data Preprocessing and Data Cleaning 

In [8]:
cat = list(X.columns[X.dtypes == 'object'])
con = list(X.columns[X.dtypes != 'object'])

In [9]:
from Data_clean_preprocess import clean_process_data

In [10]:
pre,X_pre = clean_process_data(cat,con,X)

In [15]:
pre

0,1,2
,transformers,"[('cat', ...), ('con', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [17]:
X_pre.head()

Unnamed: 0,cat__Product ID_H29424,cat__Product ID_H29425,cat__Product ID_H29432,cat__Product ID_H29434,cat__Product ID_H29441,cat__Product ID_H29452,cat__Product ID_H29457,cat__Product ID_H29462,cat__Product ID_H29466,cat__Product ID_H29481,...,con__Air temperature [K],con__Process temperature [K],con__Rotational speed [rpm],con__Torque [Nm],con__Tool wear [min],con__TWF,con__HDF,con__PWF,con__OSF,con__RNF
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.39588,-0.24623,0.545416,-0.499711,0.556417,-0.03945,-0.072021,-0.049016,-0.063038,-0.047568
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.469856,1.558605,1.720308,-1.323028,1.494433,-0.03945,-0.072021,-0.049016,-0.063038,-0.047568
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.302204,-1.040358,2.051873,-1.628831,-1.241447,-0.03945,-0.072021,-0.049016,-0.063038,-0.047568
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.610675,0.692284,0.026445,0.464745,1.447532,-0.03945,-0.072021,-0.049016,-0.063038,-0.047568
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.000288,-0.679391,0.869773,-0.582043,-1.100744,-0.03945,-0.072021,-0.049016,-0.063038,-0.047568


## Splitting of data into training and testing

In [11]:
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,train_size=0.75,random_state=21)
print(f'xtrain shape: {xtrain.shape}')
print(f'xtest shape: {xtest.shape}')
print(f'ytrain shape: {ytrain.shape}')
print(f'ytest shape: {ytest.shape}')

xtrain shape: (102321, 9989)
xtest shape: (34108, 9989)
ytrain shape: (102321, 1)
ytest shape: (34108, 1)


## Algorithm Evaluation

In [12]:
models = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier()
]

In [13]:
models[0]

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [14]:
type(models[0])

sklearn.linear_model._logistic.LogisticRegression

In [18]:
type(models[0]).__name__

'LogisticRegression'

In [19]:
def evaluate_model_performance(model,xtrain,ytrain,xtest,ytest):
    # Fit the model
    model.fit(xtrain,ytrain)

    # Predict the outputs-Machine_failure
    ypred_train = model.predict(xtrain)
    ypred_test = model.predict(ytrain)

    # Evaluation on basis of metrics: f1-score
    f1_train = f1_score(ytrain,ypred_train)
    f1_test = f1_score(ytest,ypred_test)

    # Evaluate through cross validation also
    scores = cross_val_score(model,xtrain,ytrain,cv=4,scoring='f1_macro')
    f1_cv = scores.mean()

    # Returns all the above results for the model
    results = {
        'Name':type(model).__name__,
        'f1_score:training':f1_train,
        'f1_score:testing':f1_test,
        'f1_cv':f1_cv
    }
    return results

In [20]:
models

[LogisticRegression(),
 DecisionTreeClassifier(),
 RandomForestClassifier(),
 GradientBoostingClassifier(),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               feature_weights=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, ...)]

In [21]:
def evaluate_model(models,xtrain,ytrain,xtest,ytest):
    res = []
    for i in models:
        r = evaluate_model_performance(i,xtrain,ytrain,xtest,ytest)

        # combine all the results of each model provided as input to above method
        res.append(r)
    # Convert these results into a dataframe and display
    results_df = pd.DataFrame(res)
    return results_df

In [22]:
score_results_df = evaluate_model(models,xtrain,ytrain,xtest,ytest)

MemoryError: Unable to allocate 7.62 GiB for an array with shape (9989, 102321) and data type float64