In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action="ignore")
import category_encoders as ce
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 170)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
errors = pd.read_csv("PdM_errors.csv")
failures = pd.read_csv("PdM_failures.csv")
machines = pd.read_csv("PdM_machines.csv")
maint = pd.read_csv("PdM_maint.csv")
telemetry = pd.read_csv("PdM_telemetry.csv")

### EDA

In [3]:
telemetry.head()

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.218,418.504,113.078,45.088
1,2015-01-01 07:00:00,1,162.879,402.747,95.461,43.414
2,2015-01-01 08:00:00,1,170.99,527.35,75.238,34.179
3,2015-01-01 09:00:00,1,162.463,346.149,109.249,41.122
4,2015-01-01 10:00:00,1,157.61,435.377,111.887,25.991


In [4]:
telemetry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876100 entries, 0 to 876099
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   datetime   876100 non-null  object 
 1   machineID  876100 non-null  int64  
 2   volt       876100 non-null  float64
 3   rotate     876100 non-null  float64
 4   pressure   876100 non-null  float64
 5   vibration  876100 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 40.1+ MB


In [5]:
telemetry['datetime'] = pd.to_datetime(telemetry['datetime'], format="%Y-%m-%d %H:%M:%S")

In [6]:
telemetry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 876100 entries, 0 to 876099
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   datetime   876100 non-null  datetime64[ns]
 1   machineID  876100 non-null  int64         
 2   volt       876100 non-null  float64       
 3   rotate     876100 non-null  float64       
 4   pressure   876100 non-null  float64       
 5   vibration  876100 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(1)
memory usage: 40.1 MB


In [7]:
telemetry.describe([0.5,0.75,0.95,0.99]).T

Unnamed: 0,count,mean,std,min,50%,75%,95%,99%,max
machineID,876100.0,50.5,28.866,1.0,50.5,75.25,95.05,99.01,100.0
volt,876100.0,170.778,15.509,97.334,170.607,181.004,196.549,208.132,255.125
rotate,876100.0,446.605,52.674,138.432,447.558,482.177,531.334,565.513,695.021
pressure,876100.0,100.859,11.049,51.237,100.426,107.555,119.446,131.701,185.952
vibration,876100.0,40.385,5.37,14.877,40.237,43.785,49.383,54.241,76.791


In [8]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.99):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [9]:
for col in telemetry.columns:    
    print(col, check_outlier(telemetry,col))

datetime False
machineID False
volt False
rotate False
pressure False
vibration False


In [10]:
errors.head()

Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4


In [11]:
errors['datetime'] = pd.to_datetime(errors['datetime'], format="%Y-%m-%d %H:%M:%S")

In [12]:
errors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3919 entries, 0 to 3918
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   3919 non-null   datetime64[ns]
 1   machineID  3919 non-null   int64         
 2   errorID    3919 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 92.0+ KB


In [13]:
errors['errorID'].value_counts()

error1    1010
error2     988
error3     838
error4     727
error5     356
Name: errorID, dtype: int64

In [14]:
failures.head()

Unnamed: 0,datetime,machineID,failure
0,2015-01-05 06:00:00,1,comp4
1,2015-03-06 06:00:00,1,comp1
2,2015-04-20 06:00:00,1,comp2
3,2015-06-19 06:00:00,1,comp4
4,2015-09-02 06:00:00,1,comp4


In [15]:
failures['datetime'] = pd.to_datetime(failures['datetime'], format="%Y-%m-%d %H:%M:%S")

In [16]:
failures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761 entries, 0 to 760
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   761 non-null    datetime64[ns]
 1   machineID  761 non-null    int64         
 2   failure    761 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 18.0+ KB


In [17]:
machines.head()

Unnamed: 0,machineID,model,age
0,1,model3,18
1,2,model4,7
2,3,model3,8
3,4,model3,7
4,5,model3,2


In [18]:
machines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   machineID  100 non-null    int64 
 1   model      100 non-null    object
 2   age        100 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


In [19]:
maint.head()

Unnamed: 0,datetime,machineID,comp
0,2014-06-01 06:00:00,1,comp2
1,2014-07-16 06:00:00,1,comp4
2,2014-07-31 06:00:00,1,comp3
3,2014-12-13 06:00:00,1,comp1
4,2015-01-05 06:00:00,1,comp4


In [20]:
maint['datetime'] = pd.to_datetime(maint['datetime'], format="%Y-%m-%d %H:%M:%S")

In [21]:
maint.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3286 entries, 0 to 3285
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   3286 non-null   datetime64[ns]
 1   machineID  3286 non-null   int64         
 2   comp       3286 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 77.1+ KB


In [22]:
maint['comp'].value_counts()

comp2    863
comp4    811
comp3    808
comp1    804
Name: comp, dtype: int64

### Feature Engineering
Telemetry 

In [23]:
# ortalama değerler için telemetry
t = []
cols = ['volt', 'rotate', 'pressure', 'vibration']
for col in cols:
    t.append(pd.pivot_table(telemetry,
                               index='datetime',
                               columns='machineID',
                               values=col).rolling(window=12).mean().resample('12H',
                                                                              closed='left',
                                                                              label='right').first().unstack())
telemetry_mean_12h = pd.concat(t, axis=1)
telemetry_mean_12h.columns = [i + 'mean_12h' for i in cols]
telemetry_mean_12h.reset_index(inplace=True)
telemetry_mean_12h = telemetry_mean_12h.loc[-telemetry_mean_12h['voltmean_12h'].isnull()]

In [24]:
# standart sapma aynısı
t = []
cols = ['volt', 'rotate', 'pressure', 'vibration']
for col in cols:
    t.append(pd.pivot_table(telemetry,
                               index='datetime',
                               columns='machineID',
                               values=col).rolling(window=12).std().resample('12H',
                                                                             closed='left',
                                                                             label='right').first().unstack())
telemetry_sd_12h = pd.concat(t, axis=1)
telemetry_sd_12h.columns = [i + 'sd_12h' for i in cols]
telemetry_sd_12h.reset_index(inplace=True)
telemetry_sd_12h = telemetry_sd_12h.loc[-telemetry_sd_12h['voltsd_12h'].isnull()]

In [25]:
# 24 saatlik için de yapalım
# ortalama değerler için telemetry
t = []
cols = ['volt', 'rotate', 'pressure', 'vibration']
for col in cols:
    t.append(pd.pivot_table(telemetry,
                               index='datetime',
                               columns='machineID',
                               values=col).rolling(window=24).mean().resample('12H',
                                                                              closed='left',
                                                                              label='right').first().unstack())
telemetry_mean_24h = pd.concat(t, axis=1)
telemetry_mean_24h.columns = [i + 'mean_24h' for i in cols]
telemetry_mean_24h.reset_index(inplace=True)
telemetry_mean_24h = telemetry_mean_24h.loc[-telemetry_mean_24h['voltmean_24h'].isnull()]

In [26]:
# standart sapma aynısı
t = []
cols = ['volt', 'rotate', 'pressure', 'vibration']
for col in cols:
    t.append(pd.pivot_table(telemetry,
                               index='datetime',
                               columns='machineID',
                               values=col).rolling(window=24).std().resample('12H',
                                                                             closed='left',
                                                                             label='right').first().unstack())
telemetry_sd_24h = pd.concat(t, axis=1)
telemetry_sd_24h.columns = [i + 'sd_24h' for i in cols]
telemetry_sd_24h.reset_index(inplace=True)
telemetry_sd_24h = telemetry_sd_24h.loc[-telemetry_sd_24h['voltsd_24h'].isnull()]

In [27]:
telemetry_merged = pd.concat([telemetry_mean_12h,
                            telemetry_sd_12h.iloc[:, 2:6],
                            telemetry_mean_24h.iloc[:, 2:6],
                            telemetry_sd_24h.iloc[:, 2:6]], axis=1).dropna()

In [28]:
telemetry_merged.head()

Unnamed: 0,machineID,datetime,voltmean_12h,rotatemean_12h,pressuremean_12h,vibrationmean_12h,voltsd_12h,rotatesd_12h,pressuresd_12h,vibrationsd_12h,voltmean_24h,rotatemean_24h,pressuremean_24h,vibrationmean_24h,voltsd_24h,rotatesd_24h,pressuresd_24h,vibrationsd_24h
2,1,2015-01-02 12:00:00,169.771,446.372,96.462,40.966,10.4,44.327,7.504,4.811,169.734,445.18,96.797,40.385,11.233,48.717,10.08,5.853
3,1,2015-01-03 00:00:00,172.716,442.095,96.87,39.493,16.029,39.994,10.149,6.581,171.243,444.234,96.666,40.229,13.299,41.346,8.731,5.688
4,1,2015-01-03 12:00:00,166.83,452.91,101.142,39.286,15.535,40.293,13.541,4.718,169.773,447.502,99.006,39.39,15.727,39.648,11.905,5.601
5,1,2015-01-04 00:00:00,175.253,474.493,96.79,45.507,10.893,46.078,7.667,8.224,171.042,463.701,98.966,42.397,13.808,43.742,10.989,7.286
6,1,2015-01-04 12:00:00,169.773,438.366,98.695,51.326,13.021,41.462,9.707,7.122,172.513,456.429,97.743,48.416,12.069,46.67,8.61,8.089


Errors

Kategorik olduğu için one hot encoder kullanıyoruz

In [29]:
e = pd.get_dummies(errors["errorID"])
error_count = pd.concat([errors["datetime"], 
                         errors["machineID"], 
                         e], axis=1).dropna()
error_count = error_count.groupby(['machineID', 'datetime']).sum().reset_index()

error_count = telemetry[['datetime', 'machineID']].merge(error_count, on=['machineID', 'datetime'], how='left').fillna(0.0)

In [30]:
error_count.head()

Unnamed: 0,datetime,machineID,error1,error2,error3,error4,error5
0,2015-01-01 06:00:00,1,0.0,0.0,0.0,0.0,0.0
1,2015-01-01 07:00:00,1,0.0,0.0,0.0,0.0,0.0
2,2015-01-01 08:00:00,1,0.0,0.0,0.0,0.0,0.0
3,2015-01-01 09:00:00,1,0.0,0.0,0.0,0.0,0.0
4,2015-01-01 10:00:00,1,0.0,0.0,0.0,0.0,0.0


In [31]:
t = []
cols = ['error%d' % i for i in range(1,6)]
for col in cols:
    t.append(pd.pivot_table(error_count,
                               index='datetime',
                               columns='machineID',
                               values=col).rolling(window=24).sum().resample('12H',
                                                                             closed='left',
                                                                             label='right').first().unstack())
error_count = pd.concat(t, axis=1)
error_count.columns = [i + 'count' for i in cols]
error_count.reset_index(inplace=True)
error_count = error_count.dropna()

In [32]:
error_count.columns = ['datetime', 'machineID', 'error1', 'error2', 'error3', 'error4', 'error5']

In [33]:
error_count.head()

Unnamed: 0,datetime,machineID,error1,error2,error3,error4,error5
2,1,2015-01-02 12:00:00,0.0,0.0,0.0,0.0,0.0
3,1,2015-01-03 00:00:00,0.0,0.0,0.0,0.0,0.0
4,1,2015-01-03 12:00:00,0.0,0.0,0.0,0.0,0.0
5,1,2015-01-04 00:00:00,1.0,0.0,0.0,0.0,0.0
6,1,2015-01-04 12:00:00,1.0,0.0,1.0,0.0,0.0


Maintenance

In [34]:
# hata tipleri için sütun oluşturalım
comp_ = pd.get_dummies(maint["comp"])
comp_rep = pd.concat([maint["machineID"], 
                         maint["datetime"], 
                         comp_ ], axis=1).dropna()

In [35]:
# groupby ile makinelerin saat bazında tamir edilme gösterilmesi
comp_rep = comp_rep.groupby(['machineID', 'datetime']).sum().reset_index()

Bu alanda bir bileşenin en son değiştirilmesinden bu yana ne kadar zaman geçtiğini hesaplamak daha iyi olacaktır.Bunun adına özellikler oluşturulacaktır.

In [36]:
# hiçbir bileşenin değişmediği zaman noktaları
comp_rep = telemetry[['datetime', 'machineID']].merge(comp_rep,
                                                      on=['datetime', 'machineID'],
                                                      how='outer').fillna(0).sort_values(by=['machineID', 'datetime'])
components = ['comp1', 'comp2', 'comp3', 'comp4']
for comp in components:
# en son comp değişikliği tarihine dönüştürelim
    comp_rep.loc[comp_rep[comp] < 1, comp] = None
    comp_rep.loc[-comp_rep[comp].isnull(), comp] = comp_rep.loc[-comp_rep[comp].isnull(), 'datetime']
    
# comp değişme tarihini ileriye doğru dolduralım
    comp_rep[comp] = comp_rep[comp].fillna(method='ffill')

In [37]:
# tarih aralığının dışındakileri çıkaralım   
comp_rep = comp_rep.loc[comp_rep['datetime'] > pd.to_datetime('2015-01-01')]
# gün cinsinden değiştirilen parçanın üstünden kaç gün geçmiş.
for comp in components:
    comp_rep[comp] = (comp_rep['datetime'] - pd.to_datetime(comp_rep[comp])).apply(lambda x: x / pd.Timedelta(days=1))

In [38]:
comp_rep.head()

Unnamed: 0,datetime,machineID,comp1,comp2,comp3,comp4
0,2015-01-01 06:00:00,1,19.0,214.0,154.0,169.0
1,2015-01-01 07:00:00,1,19.042,214.042,154.042,169.042
2,2015-01-01 08:00:00,1,19.083,214.083,154.083,169.083
3,2015-01-01 09:00:00,1,19.125,214.125,154.125,169.125
4,2015-01-01 10:00:00,1,19.167,214.167,154.167,169.167


Machine

In [39]:
machines.head()

Unnamed: 0,machineID,model,age
0,1,model3,18
1,2,model4,7
2,3,model3,8
3,4,model3,7
4,5,model3,2


### Birleştirme

In [40]:
error_count["machineID"] = error_count["machineID"].values.astype(int)

In [41]:
error_count["datetime"] = error_count["datetime"].values.astype("datetime64[ns]")

In [42]:
error_count.dtypes

datetime     datetime64[ns]
machineID             int32
error1              float64
error2              float64
error3              float64
error4              float64
error5              float64
dtype: object

In [43]:
error_count.datetime.astype("datetime64[ns]")
error_count.machineID.astype("int64")
df = telemetry_merged.merge(error_count, on=['datetime', 'machineID'], how='left')
df = df.merge(comp_rep, on=['datetime', 'machineID'], how='left')
df = df.merge(machines, on=['machineID'], how='left')

In [44]:
df.head()

Unnamed: 0,machineID,datetime,voltmean_12h,rotatemean_12h,pressuremean_12h,vibrationmean_12h,voltsd_12h,rotatesd_12h,pressuresd_12h,vibrationsd_12h,voltmean_24h,rotatemean_24h,pressuremean_24h,vibrationmean_24h,voltsd_24h,rotatesd_24h,pressuresd_24h,vibrationsd_24h,error1,error2,error3,error4,error5,comp1,comp2,comp3,comp4,model,age
0,1,2015-01-02 12:00:00,169.771,446.372,96.462,40.966,10.4,44.327,7.504,4.811,169.734,445.18,96.797,40.385,11.233,48.717,10.08,5.853,,,,,,20.25,215.25,155.25,170.25,model3,18
1,1,2015-01-03 00:00:00,172.716,442.095,96.87,39.493,16.029,39.994,10.149,6.581,171.243,444.234,96.666,40.229,13.299,41.346,8.731,5.688,,,,,,20.75,215.75,155.75,170.75,model3,18
2,1,2015-01-03 12:00:00,166.83,452.91,101.142,39.286,15.535,40.293,13.541,4.718,169.773,447.502,99.006,39.39,15.727,39.648,11.905,5.601,,,,,,21.25,216.25,156.25,171.25,model3,18
3,1,2015-01-04 00:00:00,175.253,474.493,96.79,45.507,10.893,46.078,7.667,8.224,171.042,463.701,98.966,42.397,13.808,43.742,10.989,7.286,,,,,,21.75,216.75,156.75,171.75,model3,18
4,1,2015-01-04 12:00:00,169.773,438.366,98.695,51.326,13.021,41.462,9.707,7.122,172.513,456.429,97.743,48.416,12.069,46.67,8.61,8.089,,,,,,22.25,217.25,157.25,172.25,model3,18


### Sınıf verisiyle yani failure ile birleştirelim.

In [45]:
#bir makinenin bir bileşen arızası nedeniyle önümüzdeki 7 gün içinde arızalanma olasılığını hesaplamak
failures['datetime'] = failures['datetime'].apply(lambda x: x.replace(hour=12))

df = df.merge(failures, on=['datetime', 'machineID'], how='left')
df = df.fillna(method='bfill', limit=7)
df['failure'] = df['failure'].fillna('none')
df = df.fillna(0.0)
df.head()

Unnamed: 0,machineID,datetime,voltmean_12h,rotatemean_12h,pressuremean_12h,vibrationmean_12h,voltsd_12h,rotatesd_12h,pressuresd_12h,vibrationsd_12h,voltmean_24h,rotatemean_24h,pressuremean_24h,vibrationmean_24h,voltsd_24h,rotatesd_24h,pressuresd_24h,vibrationsd_24h,error1,error2,error3,error4,error5,comp1,comp2,comp3,comp4,model,age,failure
0,1,2015-01-02 12:00:00,169.771,446.372,96.462,40.966,10.4,44.327,7.504,4.811,169.734,445.18,96.797,40.385,11.233,48.717,10.08,5.853,0.0,0.0,0.0,0.0,0.0,20.25,215.25,155.25,170.25,model3,18,comp4
1,1,2015-01-03 00:00:00,172.716,442.095,96.87,39.493,16.029,39.994,10.149,6.581,171.243,444.234,96.666,40.229,13.299,41.346,8.731,5.688,0.0,0.0,0.0,0.0,0.0,20.75,215.75,155.75,170.75,model3,18,comp4
2,1,2015-01-03 12:00:00,166.83,452.91,101.142,39.286,15.535,40.293,13.541,4.718,169.773,447.502,99.006,39.39,15.727,39.648,11.905,5.601,0.0,0.0,0.0,0.0,0.0,21.25,216.25,156.25,171.25,model3,18,comp4
3,1,2015-01-04 00:00:00,175.253,474.493,96.79,45.507,10.893,46.078,7.667,8.224,171.042,463.701,98.966,42.397,13.808,43.742,10.989,7.286,0.0,0.0,0.0,0.0,0.0,21.75,216.75,156.75,171.75,model3,18,comp4
4,1,2015-01-04 12:00:00,169.773,438.366,98.695,51.326,13.021,41.462,9.707,7.122,172.513,456.429,97.743,48.416,12.069,46.67,8.61,8.089,0.0,0.0,0.0,0.0,0.0,22.25,217.25,157.25,172.25,model3,18,comp4


In [46]:
df['failure'].value_counts()

none     67162
comp2     2009
comp1     1531
comp4     1263
comp3      978
Name: failure, dtype: int64

### Adding new features

In [47]:
# creating date features
def create_date_features(df):
    df["month"] = df.datetime.dt.month
    df["day_of_month"] = df.datetime.dt.day
    df["day_of_week"] = df.datetime.dt.dayofweek
    df["is_weekend"] = df.datetime.dt.weekday//4
    df["is_month_start"] = df.datetime.dt.is_month_start.astype(int)
    df["is_month_end"] = df.datetime.dt.is_month_end.astype(int)
    
    return df

In [48]:
df = create_date_features(df)
df.head()

Unnamed: 0,machineID,datetime,voltmean_12h,rotatemean_12h,pressuremean_12h,vibrationmean_12h,voltsd_12h,rotatesd_12h,pressuresd_12h,vibrationsd_12h,voltmean_24h,rotatemean_24h,pressuremean_24h,vibrationmean_24h,voltsd_24h,rotatesd_24h,pressuresd_24h,vibrationsd_24h,error1,error2,error3,error4,error5,comp1,comp2,comp3,comp4,model,age,failure,month,day_of_month,day_of_week,is_weekend,is_month_start,is_month_end
0,1,2015-01-02 12:00:00,169.771,446.372,96.462,40.966,10.4,44.327,7.504,4.811,169.734,445.18,96.797,40.385,11.233,48.717,10.08,5.853,0.0,0.0,0.0,0.0,0.0,20.25,215.25,155.25,170.25,model3,18,comp4,1,2,4,1,0,0
1,1,2015-01-03 00:00:00,172.716,442.095,96.87,39.493,16.029,39.994,10.149,6.581,171.243,444.234,96.666,40.229,13.299,41.346,8.731,5.688,0.0,0.0,0.0,0.0,0.0,20.75,215.75,155.75,170.75,model3,18,comp4,1,3,5,1,0,0
2,1,2015-01-03 12:00:00,166.83,452.91,101.142,39.286,15.535,40.293,13.541,4.718,169.773,447.502,99.006,39.39,15.727,39.648,11.905,5.601,0.0,0.0,0.0,0.0,0.0,21.25,216.25,156.25,171.25,model3,18,comp4,1,3,5,1,0,0
3,1,2015-01-04 00:00:00,175.253,474.493,96.79,45.507,10.893,46.078,7.667,8.224,171.042,463.701,98.966,42.397,13.808,43.742,10.989,7.286,0.0,0.0,0.0,0.0,0.0,21.75,216.75,156.75,171.75,model3,18,comp4,1,4,6,1,0,0
4,1,2015-01-04 12:00:00,169.773,438.366,98.695,51.326,13.021,41.462,9.707,7.122,172.513,456.429,97.743,48.416,12.069,46.67,8.61,8.089,0.0,0.0,0.0,0.0,0.0,22.25,217.25,157.25,172.25,model3,18,comp4,1,4,6,1,0,0


### Model

In [49]:
df.isnull().values.any()

False

In [50]:
y_train = df.loc[df['datetime'] <= pd.to_datetime('2015-09-30 12:00:00'), 'failure']
X_train = df.loc[df['datetime'] <= pd.to_datetime('2015-09-30 12:00:00')].drop(['datetime','machineID','model','failure'], 1)
y_test = df.loc[df['datetime'] >= pd.to_datetime('2015-10-01 00:00:00'), 'failure']
X_test = df.loc[df['datetime'] >= pd.to_datetime('2015-10-01 00:00:00')].drop(['datetime','machineID','model','failure'], 1)

In [51]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=17).fit(X_train,y_train)

In [52]:
y_pred = rf_model.predict(X_test)

In [53]:
from sklearn.metrics import accuracy_score
print("Accuracy Score: ",accuracy_score(y_test, y_pred))

Accuracy Score:  0.9438981138158955


In [54]:
rf_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 17,
 'verbose': 0,
 'warm_start': False}

In [55]:
rf_params = {"max_depth":[None,5,8,10],
            "max_features":["auto",3,5,7],
            "n_estimators":[100,200,300],
            "min_samples_split":[2,5,7,10,15,20]}

In [56]:
rf_best_grid = GridSearchCV(rf_model,rf_params,cv=5,n_jobs=-1,verbose=False).fit(X_train,y_train)

In [57]:
rf_best_grid.best_params_

{'max_depth': None,
 'max_features': 7,
 'min_samples_split': 10,
 'n_estimators': 300}

In [58]:
rf_final = rf_model.set_params(**rf_best_grid.best_params_,random_state=17).fit(X_train,y_train)

In [59]:
y_pred = rf_final.predict(X_test)

In [60]:
print("Accuracy Score: ",accuracy_score(y_test, y_pred))

Accuracy Score:  0.9454027621043581


CatBoost

In [61]:
from catboost import CatBoostClassifier
from sklearn import preprocessing
y_train = LabelEncoder().fit_transform(y_train)
y_test = LabelEncoder().fit_transform(y_test)
cat_model = CatBoostClassifier(random_state=17,verbose=False)#.fit(X_train,y_train)

In [62]:
cat_model.get_params()

{'verbose': False, 'random_state': 17}

In [63]:
cat_params = {"learning_rate":[0.1,0.01],
              "depth":[3,6],
            "iterations":[200,500]}

In [64]:
cat_best_grid = GridSearchCV(cat_model,cat_params,cv=3,n_jobs=-1,verbose=False).fit(X_train,y_train)

In [65]:
cat_best_grid.best_params_

{'depth': 3, 'iterations': 500, 'learning_rate': 0.1}

In [66]:
cat_final = cat_model.set_params(**cat_best_grid.best_params_,random_state=17).fit(X_train,y_train)

In [67]:
y_pred = cat_final.predict(X_test)

In [68]:
print("Accuracy Score: ",accuracy_score(y_test, y_pred))

Accuracy Score:  0.9434144768660325


In [69]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.38      0.50       370
           1       0.78      0.41      0.54       579
           2       0.65      0.39      0.49       226
           3       0.75      0.44      0.55       303
           4       0.95      0.99      0.97     17131

    accuracy                           0.94     18609
   macro avg       0.77      0.52      0.61     18609
weighted avg       0.94      0.94      0.94     18609

