### Import Libraries

In [1]:
# Import Libraries
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

### Data Exploration

In [2]:
# Data Exploration
# Read the labelled training data from train.csv and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [3]:
# Print the column names, non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136429 entries, 0 to 136428
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       136429 non-null  int64  
 1   Product ID               136429 non-null  object 
 2   Type                     136429 non-null  object 
 3   Air temperature [K]      136429 non-null  float64
 4   Process temperature [K]  136429 non-null  float64
 5   Rotational speed [rpm]   136429 non-null  int64  
 6   Torque [Nm]              136429 non-null  float64
 7   Tool wear [min]          136429 non-null  int64  
 8   Machine failure          136429 non-null  int64  
 9   TWF                      136429 non-null  int64  
 10  HDF                      136429 non-null  int64  
 11  PWF                      136429 non-null  int64  
 12  OSF                      136429 non-null  int64  
 13  RNF                      136429 non-null  int64  
dtypes: float64(3), int64(9), object(2)

In [4]:
# Build a separate working frame without the two identifier columns;
# the raw `df` is left untouched.
df_clean = df.drop(labels=['id', 'Product ID'], axis='columns')
df_clean.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [5]:
# List the distinct product-type labels before encoding them.
df_clean['Type'].unique()

array(['L', 'M', 'H'], dtype=object)

In [6]:
# Encode the product type as an integer in a single pass: L -> 0, M -> 1, H -> 2.
# Any value that is not one of these three labels is passed through unchanged,
# so re-running the cell on already-encoded data is harmless.
type_codes = {"L": 0, "M": 1, "H": 2}
df_clean['Type'] = df_clean['Type'].apply(lambda label: type_codes.get(label, label))

In [11]:
# Display the working frame to confirm that Type now holds the integer codes.
df_clean

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,0,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,0,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,1,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
136424,1,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,2,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,0,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,0,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


In [14]:
def pipeline(df):
    """Split and standardise a labelled frame for model training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label column 'Machine failure'; every other column
        is treated as a feature and passed to StandardScaler.

    Returns
    -------
    X_data, y_data : numpy.ndarray
        All rows, with features standardised using statistics of the full frame.
    X_train, y_train : numpy.ndarray
        The 80% stratified training split, standardised with its own statistics.
    X_test, y_test : numpy.ndarray
        The 20% stratified held-out split, standardised with the statistics
        learned from the training split.
    """
    target = 'Machine failure'
    X = df.drop(columns=[target])
    y = df[target]

    # One stratified 80/20 split with a fixed seed so the partition is reproducible.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1234)
    train_idx, test_idx = next(splitter.split(X, y))

    # Full data set, scaled with its own scaler so it cannot disturb the
    # statistics used for the train/test split below.
    X_data = StandardScaler().fit_transform(X)
    y_data = y.to_numpy()

    # The splitter yields positional indices, so select with .iloc; .loc would
    # only be correct when the frame carries a default RangeIndex.
    split_scaler = StandardScaler()
    X_train = split_scaler.fit_transform(X.iloc[train_idx])
    y_train = y.iloc[train_idx].to_numpy()

    # Held-out rows are transformed only: refitting here would standardise them
    # with their own mean/std and make the evaluation inconsistent with training.
    X_test = split_scaler.transform(X.iloc[test_idx])
    y_test = y.iloc[test_idx].to_numpy()

    return X_data, y_data, X_train, y_train, X_test, y_test
    

In [15]:
# Run the preprocessing on the cleaned frame: full scaled data plus the
# scaled train / held-out splits and their labels.
X_data,y_data,X_train,y_train,X_test,y_test = pipeline(df_clean)

### Apply Machine Learning

In [16]:
# Fit a logistic regression with default settings on the training split,
# then report its score on the held-out split.
lg = LogisticRegression().fit(X_train, y_train)
lg.score(X_test, y_test)

0.9967749028805981

In [17]:
# Read the unlabelled data and start the output frame from its id column.
test_data = pd.read_csv('test.csv')
final_df = test_data['id'].to_frame()

In [20]:
# Prepare the unlabelled rows the same way the training features were prepared.
test_features = test_data.drop(columns=['id', 'Product ID'])

# Same integer codes as the training frame (L -> 0, M -> 1, H -> 2), in one pass.
type_codes = {"L": 0, "M": 1, "H": 2}
test_features['Type'] = test_features['Type'].map(type_codes)
if test_features['Type'].isna().any():
    raise ValueError("test.csv contains a Type value other than 'L', 'M' or 'H'")

# Use exactly the training feature columns, in training order; a missing
# column raises KeyError here instead of silently misaligning features.
feature_columns = df_clean.columns.drop('Machine failure')
test_features = test_features[feature_columns]

# Standardise with statistics learned from the labelled training data rather
# than from the unlabelled rows themselves, so the model sees inputs on the
# scale it was trained on. pipeline() does not expose its fitted scaler, so one
# is fitted here on the full labelled frame (pipeline() fits on its 80%
# training split, so the statistics are close but not identical).
scaler = StandardScaler().fit(df_clean[feature_columns])
test_data = scaler.transform(test_features)

In [22]:
# Predict with the logistic regression, attach the labels next to the ids
# and write the result to predictions.csv without the index column.
y_predicted = lg.predict(test_data)
final_df = final_df.assign(**{'Machine failure': y_predicted})
final_df.to_csv('predictions.csv', index=False)

### Hyperparameter Tuning

In [23]:
### Decision Tree
# DecisionTreeClassifier is imported with the other libraries in the first cell.
# The tree is seeded so the grid search selects the same model on every run.
dtc = DecisionTreeClassifier(random_state=1234)

# Candidate hyperparameters: split criterion and maximum tree depth.
param = [
    {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, 12],
    }
]

# 3-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(dtc, param, cv=3, scoring="accuracy", return_train_score=True)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the whole training split.
final_dtc = grid_search.best_estimator_

# Note: this writes to the same predictions.csv as the logistic-regression
# cell, so the earlier predictions are overwritten.
y_predicted = final_dtc.predict(test_data)
final_df['Machine failure'] = y_predicted
final_df.to_csv('predictions.csv', index=False)
