In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Read both CSV files, using each file's own identifier column as the row index.
train = pd.read_csv("./train.csv", index_col="id")
orig = pd.read_csv("./machine failure.csv", index_col="UDI")

In [3]:
# Label the index "id" so it carries the same name as the index of `train`.
orig.index.name = "id"

In [4]:
# Preview the first five rows of the original dataset.
orig.head(n=5)

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [5]:
# Stack the rows of `train` followed by the rows of `orig`. Index labels are
# kept as they are, so an id that occurs in both frames appears twice in `df`.
df = pd.concat(objs=(train, orig), axis="index")

In [6]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [7]:
# (rows, columns) of the combined frame.
df.shape

(146429, 13)

In [8]:
# Map every current column label to a short camelCase name.
# The pairing is purely positional: the i-th name below is assigned to the
# i-th column of `df`, so the order of this tuple must follow the column order.
name_dic = {
    old_label: new_label
    for old_label, new_label in zip(
        df.columns,
        (
            'productId',
            'type',
            'airTemp',
            'processTemp',
            'rotationalSpeed',
            'torque',
            'toolWear',
            'target',
            'twf',
            'hdf',
            'pwf',
            'osf',
            'rnf',
        ),
    )
}

In [9]:
# Apply the short column names to `df` in place.
df.rename(mapper=name_dic, axis="columns", inplace=True)

In [10]:
# Take the label column out of `df`: `y` receives the 'target' Series and the
# column is removed from `df` in the same step.
y = df.pop('target')

In [11]:
# Remove the 'productId' column from `df` in place.
df.drop(columns=['productId'], inplace=True)

In [12]:
# Append one indicator column per distinct value of 'type' (named type_<value>).
# The original 'type' column is still present after this step.
df = pd.concat(objs=[df, pd.get_dummies(df["type"], prefix="type")], axis="columns")

In [13]:
# Product and ratio of torque and rotational speed.
df['power'] = df['torque'].mul(df['rotationalSpeed'])
df['rotationalSpeed_TorqueRatio'] = df['rotationalSpeed'].div(df['torque'])

# Temperature features built from the air and process readings:
# their difference, their row-wise standard deviation, and their ratio.
df['temperatureDifference'] = df['processTemp'].sub(df['airTemp'])
df['temperatureVariability'] = df[['airTemp', 'processTemp']].std(axis="columns")
df['temperatureRatio'] = df['processTemp'].div(df['airTemp'])

In [14]:
# Tool wear expressed as a fraction of the largest tool wear value in `df`.
max_tool_wear = df['toolWear'].max()
df['toolWearRate'] = df['toolWear'].div(max_tool_wear)

# Temperature difference per unit of tool wear. Rows whose tool wear is 0
# are divided by 2 instead, which avoids a division by zero.
df['temperatureChangeRate'] = df['temperatureDifference'].div(
    np.where(df['toolWear'] == 0, 2, df['toolWear'])
)

In [15]:
# Row-wise sum of the five failure-flag columns.
df['totalFailures'] = df.loc[:, ['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis="columns")

In [16]:
# The raw 'type' column is dropped now that its type_* indicator columns exist.
df.drop(columns=['type'], inplace=True)

In [17]:
# Preview the first five rows of the engineered feature frame.
df.head(n=5)

Unnamed: 0_level_0,airTemp,processTemp,rotationalSpeed,torque,toolWear,twf,hdf,pwf,osf,rnf,...,type_L,type_M,power,rotationalSpeed_TorqueRatio,temperatureDifference,temperatureVariability,temperatureRatio,toolWearRate,temperatureChangeRate,totalFailures
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,300.6,309.6,1596,36.1,140,0,0,0,0,0,...,True,False,57615.6,44.210526,9.0,6.363961,1.02994,0.55336,0.064286,0
1,302.6,312.1,1759,29.1,200,0,0,0,0,0,...,False,True,51186.9,60.446735,9.5,6.717514,1.031395,0.790514,0.0475,0
2,299.3,308.5,1805,26.5,25,0,0,0,0,0,...,True,False,47832.5,68.113208,9.2,6.505382,1.030738,0.098814,0.368,0
3,301.0,310.9,1524,44.3,197,0,0,0,0,0,...,True,False,67513.2,34.401806,9.9,7.000357,1.03289,0.778656,0.050254,0
4,298.0,309.0,1641,35.4,34,0,0,0,0,0,...,False,True,58091.4,46.355932,11.0,7.778175,1.036913,0.134387,0.323529,0


In [18]:
# Gradient-boosted tree classifier; `eta` is XGBoost's name for the learning
# rate and n_jobs=-1 asks for all available cores.
xgb = XGBClassifier(
    n_estimators=1000,
    max_depth=4,
    eta=0.2,
    colsample_bytree=0.67,
    n_jobs=-1,
)

In [19]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    df,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)


In [20]:
# Train the classifier on the training split.
# NOTE(review): `pred` holds whatever `fit` returns -- by scikit-learn
# convention that is the fitted estimator itself, not predictions.
pred = xgb.fit(X=X_train, y=y_train)

In [21]:
# Class probabilities for the validation split (one column per class).
y_pred = xgb.predict_proba(X=X_valid)

In [33]:
# ROC AUC on the validation split, scored with the second probability column.
roc_auc_score(y_true=y_valid, y_score=y_pred[:, 1])

0.9719283277173258

# Test data: repeat the preprocessing and predict

In [34]:
# Load the test set, indexed by its 'id' column.
# From here on `df` refers to the test frame, not the training features.
df = pd.read_csv("./test.csv", index_col="id")

In [35]:
# Map every current column label to a short camelCase name.
# The pairing is purely positional: the i-th name below is assigned to the
# i-th column of `df`. No 'target' entry appears in this list.
name = {
    old_label: new_label
    for old_label, new_label in zip(
        df.columns,
        (
            'productId',
            'type',
            'airTemp',
            'processTemp',
            'rotationalSpeed',
            'torque',
            'toolWear',
            'twf',
            'hdf',
            'pwf',
            'osf',
            'rnf',
        ),
    )
}

In [36]:
# Apply the short column names to `df` in place.
df.rename(mapper=name, axis="columns", inplace=True)

In [37]:
# Remove the 'productId' column from `df` in place.
df.drop(columns=['productId'], inplace=True)

In [38]:
# Append one indicator column per distinct value of 'type' (named type_<value>).
# The original 'type' column is still present after this step.
df = pd.concat(objs=[df, pd.get_dummies(df["type"], prefix="type")], axis="columns")

In [39]:
# Build the same engineered columns, in the same order, as were built for the
# training frame, so the model sees features with identical meaning.

# Product and ratio of torque and rotational speed
df['power'] = df['torque'] * df['rotationalSpeed']
df['rotationalSpeed_TorqueRatio'] = df['rotationalSpeed'] / df['torque']
# Calculate temperature difference
df['temperatureDifference'] = df['processTemp'] - df['airTemp']
# Calculate temperature variability (row-wise standard deviation of the two readings)
df['temperatureVariability'] = df[['airTemp', 'processTemp']].std(axis=1)
# Calculate temperature ratio
df['temperatureRatio'] = df['processTemp'] / df['airTemp']
# Normalise tool wear with `max_tool_wear` as computed earlier from the
# training frame. It is deliberately NOT recomputed from the test data: a
# different divisor would put 'toolWearRate' on a different scale from the
# one the model was fitted on, so the same physical tool wear would map to a
# different feature value at prediction time.
df['toolWearRate'] = df['toolWear'] / max_tool_wear
# Calculate temperature change rate; rows with zero tool wear are divided by 2
# instead, which avoids a division by zero.
df['temperatureChangeRate'] = df['temperatureDifference'] / (np.where(df['toolWear']==0, 2,df['toolWear']))
# Row-wise sum of the five failure-flag columns
df['totalFailures'] = df[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1)

In [40]:
# The raw 'type' column is dropped now that its type_* indicator columns exist.
df.drop(columns=['type'], inplace=True)

In [41]:
# Class probabilities for every test row (one column per class).
predict = xgb.predict_proba(X=df)

In [42]:
# Submission table: the test ids alongside the second probability column.
output = pd.DataFrame(
    data={
        'id': df.index,
        'Machine failure': predict[:, 1],
    }
)
# Write without the DataFrame's own row index; 'id' is already a column.
output.to_csv(path_or_buf='submit.csv', index=False)

In [44]:
# Preview the first five rows of the submission.
output.head(n=5)

Unnamed: 0,id,Machine failure
0,136429,0.001231
1,136430,0.001811
2,136431,0.000818
3,136432,0.001287
4,136433,0.001329
