### PHM Data challenge Overview
The data description can be found in [this link](https://www.phmsociety.org/events/conference/phm/18/data-challenge).

In [1]:
import os
import sys
import numpy as np
# Read csv file from URL directly
import pandas as pd
import itertools
%matplotlib inline
import matplotlib.pyplot as plt
import math

In [2]:

#get the current working directory
print(os.getcwd()) 

#list files in current working directory
# os.listdir(os.curdir)

/home/mylogin/notebooks/phmdatachallenge


In [3]:
local_path = os.getcwd()
data_path = os.path.join(local_path,'data','phm_data_challenge_2018')
data_train_path = os.path.join(data_path,'train')
data_test_path = os.path.join(data_path,'test')
data_faults_path = os.path.join(data_train_path, 'train_faults')
data_ttf_path = os.path.join(data_train_path, 'train_ttf')

## Data
In the `train` folder, there are three types of files:
- sensor data files in the format of xx_Mxx_DC_train.csv. xx_Mxx represent tool id, e.g. 03_M01. Each file represents the sensor data for a single tool. 
- train_faults folder contains files about the failure information. There are three fault modes: `FlowCool Pressure Dropped Below Limit`, `Flowcool Pressure Too High Check Flowcool Pump`, and `Flowcool leak`.
- train_ttf folder contains files illustrating how the prediction results should look like. It reports the TTF (Time to Failure) for three fault modes.
    
In the `test` folder, threre are 5 files in it. These files have the same format as the sensor data files in the `train` folder. We should generate prediction results for these data as data challenge submissions.

In [4]:
print(data_train_path)
os.listdir(data_train_path)

/home/mylogin/notebooks/phmdatachallenge/data/phm_data_challenge_2018/train


['train_faults',
 '08_M02_DC_train.csv',
 '02_M01_DC_train.csv',
 '04_M02_DC_train.csv',
 '06_M01_DC_train.csv',
 '03_M02_DC_train.csv',
 '06_M02_DC_train.csv',
 '03_M01_DC_train.csv',
 '08_M01_DC_train.csv',
 '09_M01_DC_train.csv',
 '10_M01_DC_train.csv',
 '05_M02_DC_train.csv',
 '09_M02_DC_train.csv',
 '07_M02_DC_train.csv',
 'train_ttf',
 '04_M01_DC_train.csv',
 '02_M02_DC_train.csv',
 '01_M01_DC_train.csv',
 '07_M01_DC_train.csv',
 '05_M01_DC_train.csv',
 '01_M02_DC_train.csv',
 '10_M02_DC_train.csv']

In [5]:
print(data_test_path)
os.listdir(data_test_path)

/home/mylogin/notebooks/phmdatachallenge/data/phm_data_challenge_2018/test


['02_M02_DC_test.csv',
 '06_M01_DC_test.csv',
 '04_M01_DC_test.csv',
 '03_M01_DC_test.csv',
 '01_M02_DC_test.csv']

In [6]:
def assure_path_exists(path):
    """Make sure the input path already exists.
    If it does not exists, a new directory will 
    be created 
    """
    mydir = os.path.join(os.getcwd(), path)
    if not os.path.exists(mydir):
        os.makedirs(mydir)
                
def check_obj_type(obj):
    print('type(obj) is {}'.format(type(obj)))
    if(hasattr(obj, 'shape')):
        print('obj.shape is {}'.format(obj.shape))
    if(hasattr(obj, 'size')):
        print('obj.size is {}'.format(obj.size))
    if(hasattr(obj, 'dtypes')):
        print('obj.dtypes are: \n{}'.format(obj.dtypes))
    print('\n')

def read_lines(datafile, top_N):
    '''read the top N lines of a file as a list'''
    with open(datafile) as myfile:
        head = [next(myfile) for x in range(N)]
    return(head)

def sum_list_of_tuples(a_list):
    '''sum the 2nd value in each tuple in a list of tuples in Python. Change from j to i if 
    the objective is to sum the first value in each tuple in a list of tuples in Python.'''
    return sum(j for i,j in a_list)

def check_seq_gap(a_list, gap):
    '''Assume a_list is an asscendingly sorted int list. This function returns an int list with [a_list[i-1], a_list[i]] 
    being its element, satisfying (a_list[i]-a_list[i-1]) >= gap, where gap is an integer.'''
    new_list = []
    for i in range(1, len(a_list)):
        if((a_list[i] - a_list[i-1]) >= gap):
            new_list.append([a_list[i-1],a_list[i]])
    return new_list
    
def data_around_fault(sensor_data, fault_time, w):
    '''Return a subset of sensor data set where the data is fault_time +- window w. The purpose is to check if this subset
    of data is avaible in sensor_data. sensor_data is a pandas dataframe. w is a positive integer.'''
    return sensor_data[(sensor_data['time']>= (fault_time - w)) & (sensor_data['time']< (fault_time +w))]

def prepare_sensor_data(df):
    '''standardize column names and column types. df is a pandas dataframe'''
    col_names = list(df.columns.values)
    col_names = [x.lower() for x in col_names]
    df.columns = col_names
    df['tool'] = df['tool'].astype('category')
    df['stage'] = df['stage'].astype('category')
    df['lot'] = df['lot'].astype('category')
    df['recipe'] = df['recipe'].astype('category')
    df['recipe_step'] = df['recipe_step'].astype('category')
    return df

def prepare_fault_data(df):
    '''standardize column names. df is a pandas dataframe'''
    col_names = list(df.columns.values)
    col_names = [x.lower() for x in col_names]
    df.columns = col_names
    return df


### Sensor Data

In [7]:
# load raw data from the GitHub URL
#datafile =  os.path.join(data_train_path,'03_M01_DC_train.csv') 
datafile =  os.path.join(data_train_path,'04_M02_DC_train.csv') 

# Read into pandas
df = pd.read_csv(datafile, encoding='utf-8')

In [8]:
# convert column names to lower cae
col_names = list(df.columns.values)
col_names = [x.lower() for x in col_names]
df.columns = col_names

In [9]:
# make sure the data dypes are correct for each column
df['tool'] = df['tool'].astype('category')
df['stage'] = df['stage'].astype('category')
df['lot'] = df['lot'].astype('category')
df['recipe'] = df['recipe'].astype('category')
df['recipe_step'] = df['recipe_step'].astype('category')

In [10]:
df.shape

(4450348, 24)

In [11]:
df.head()

Unnamed: 0,time,tool,stage,lot,runnum,recipe,recipe_step,iongaugepressure,etchbeamvoltage,etchbeamcurrent,...,etchgaschannel1readback,etchpbngasreadback,fixturetiltangle,rotationspeed,actualrotationangle,fixtureshutterposition,etchsourceusage,etchauxsourcetimer,etchaux2sourcetimer,actualstepduration
0,3997500,04M02,1,9199,10554217,67,1,-1.335401,-1.037886,-1.031469,...,-1.793606,-3.367822,1.86631,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985
1,3997504,04M02,1,9199,10554217,67,1,-1.333846,-1.038108,-1.032264,...,-1.793606,-3.367822,1.86631,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985
2,3997508,04M02,1,9199,10554217,67,1,-1.333008,-1.037719,-1.031541,...,-1.793606,-3.367822,1.86631,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985
3,3997512,04M02,1,9199,10554217,67,1,-1.334116,-1.038497,-1.03212,...,-1.793606,-3.367822,1.86631,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985
4,3997516,04M02,1,9199,10554217,67,1,-1.335806,-1.037663,-1.031903,...,-1.793606,-3.367822,1.86631,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985


Data understanding
- What is the sample frequeqcy? Does 'time' always have the same time interval?
- 'runnum' indicates 'number of times tool has been run'. check this colun to see how it increases with 'time'

check the gap in sequence data

The above analysis shows that the sample frequency is not consistent. Time interval being 4 is the most frequnt condition, accounting for 98% of cases. There is a long tail for much larger time interval as well.

### Train Fault Data

In [19]:
print(data_faults_path)
os.listdir(data_faults_path)

/home/mylogin/notebooks/phmdatachallenge/data/phm_data_challenge_2018/train/train_faults


['01_M02_train_fault_data.csv',
 '03_M01_train_fault_data.csv',
 '08_M02_train_fault_data.csv',
 '01_M01_train_fault_data.csv',
 '07_M02_train_fault_data.csv',
 '06_M02_train_fault_data.csv',
 '05_M01_train_fault_data.csv',
 '02_M02_train_fault_data.csv',
 '05_M02_train_fault_data.csv',
 '09_M02_train_fault_data.csv',
 '10_M02_train_fault_data.csv',
 '04_M01_train_fault_data.csv',
 '09_M01_train_fault_data.csv',
 '03_M02_train_fault_data.csv',
 '02_M01_train_fault_data.csv',
 '04_M02_train_fault_data.csv',
 '08_M01_train_fault_data.csv',
 '07_M01_train_fault_data.csv',
 '06_M01_train_fault_data.csv',
 '10_M01_train_fault_data.csv']

In [13]:
# load a single fault data file
#datafile_fault =  os.path.join(data_faults_path,'03_M01_train_fault_data.csv') 
datafile_fault =  os.path.join(data_faults_path,'04_M02_train_fault_data.csv')   
# Read into pandas
df_fault= pd.read_csv(datafile_fault, encoding='utf-8')
print(df_fault.shape[0])
print(set(df_fault['fault_name']))
df_fault.head()

37
{'Flowcool Pressure Too High Check Flowcool Pump', 'Flowcool leak', 'FlowCool Pressure Dropped Below Limit'}


Unnamed: 0,time,fault_name,Tool
0,3926292,Flowcool Pressure Too High Check Flowcool Pump,04M02
1,23846800,Flowcool leak,04M02
2,24337110,Flowcool leak,04M02
3,28324130,FlowCool Pressure Dropped Below Limit,04M02
4,28325090,Flowcool leak,04M02


From the above data, we can see that, the time when the fault (operator performs the mainenance) occurs is shown. We want to find out if the sensor data before and after this time point is available in the sensor data.

### Batch Generate Training Data

In [60]:
# Get all fault data files 
toolid_part1 = []       
toolid_part2 = ['M01','M02']

for cur_data_file in (os.listdir(data_faults_path)):
    file_name_split = ["".join(x) for _, x in itertools.groupby(cur_data_file, key=str.isdigit)]
    cur_toolid_p1 = file_name_split[0]
    toolid_part1.append(cur_toolid_p1)
  
toolid_part1 = list(set(toolid_part1))
toolid_part1.sort()
print("Existing tool ids:")
print(toolid_part1)
print(toolid_part2)

Existing tool ids:
['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']
['M01', 'M02']


In [None]:
# The objective of below scripts is to generate training data for each tool

tag = True

aggdata_path = os.path.join(data_path,'./aggdata')
assure_path_exists(aggdata_path)
if tag:
    for cur_tool_p2 in toolid_part2:
        for cur_tool_p1 in toolid_part1:
            cur_ttf_file = data_ttf_path+"/{}_{}_DC_train.csv".format(cur_tool_p1,cur_tool_p2)
            cur_sensor_file = data_train_path+"/{}_{}_DC_train.csv".format(cur_tool_p1,cur_tool_p2)
            df_sensor = pd.read_csv(cur_sensor_file, encoding='utf-8')
            df_sensor = prepare_sensor_data(df_sensor) # column name to lowercase, adjust columns data type
            df_ttf = pd.read_csv(cur_ttf_file, encoding='utf-8')
            
            df = pd.merge(df_sensor, df_ttf, on='time')
            df_grouped = get_agg(df)
            df_grouped.columns = ["_".join(x) for x in df_grouped.columns.ravel()]
            csv_file_name = os.path.join(aggdata_path, "./{}_{}_data_agg.csv".format(cur_tool_p1,cur_tool_p2))
            df_grouped.to_csv(csv_file_name, header=True, index= False)  

### TTF Data
TTF Data shows the format for data challenge submission.

In [15]:
# load raw data from the GitHub URL
datafile_ttf =  os.path.join(data_ttf_path,'04_M02_DC_train.csv') 
 
# Read into pandas
df_ttf = pd.read_csv(datafile_ttf, encoding='utf-8')

print(df_ttf.shape)
df_ttf.head(10)

(4450348, 4)


Unnamed: 0,time,TTF_FlowCool Pressure Dropped Below Limit,TTF_Flowcool Pressure Too High Check Flowcool Pump,TTF_Flowcool leak
0,3997500,24326630.0,,19849300.0
1,3997504,24326626.0,,19849296.0
2,3997508,24326622.0,,19849292.0
3,3997512,24326618.0,,19849288.0
4,3997516,24326614.0,,19849284.0
5,3997520,24326610.0,,19849280.0
6,3997524,24326606.0,,19849276.0
7,3997528,24326602.0,,19849272.0
8,3997532,24326598.0,,19849268.0
9,3997536,24326594.0,,19849264.0


### Marge Sensor Data and TTF Data






In [17]:
df = pd.merge(df, df_ttf, on='time')

In [18]:
print(df.shape)
df.head()

(4452862, 27)


Unnamed: 0,time,tool,stage,lot,runnum,recipe,recipe_step,iongaugepressure,etchbeamvoltage,etchbeamcurrent,...,rotationspeed,actualrotationangle,fixtureshutterposition,etchsourceusage,etchauxsourcetimer,etchaux2sourcetimer,actualstepduration,TTF_FlowCool Pressure Dropped Below Limit,TTF_Flowcool Pressure Too High Check Flowcool Pump,TTF_Flowcool leak
0,3997500,04M02,1,9199,10554217,67,1,-1.335401,-1.037886,-1.031469,...,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985,24326630.0,,19849300.0
1,3997504,04M02,1,9199,10554217,67,1,-1.333846,-1.038108,-1.032264,...,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985,24326626.0,,19849296.0
2,3997508,04M02,1,9199,10554217,67,1,-1.333008,-1.037719,-1.031541,...,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985,24326622.0,,19849292.0
3,3997512,04M02,1,9199,10554217,67,1,-1.334116,-1.038497,-1.03212,...,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985,24326618.0,,19849288.0
4,3997516,04M02,1,9199,10554217,67,1,-1.335806,-1.037663,-1.031903,...,-0.003224,-0.070721,0,-1.629322,-1.621974,-1.349286,-0.252985,24326614.0,,19849284.0


In [49]:

def get_agg(df):
    '''get aggregated data. df is the merged data (sensor data + ttf data)'''
    group = df.groupby(['tool', 'runnum'], as_index=False)
    aggregations = {
        'time':[np.min,np.max],
        'lot':['nunique'],
        'iongaugepressure': [np.min, np.max, np.mean, np.std],
        'etchbeamvoltage': [np.min, np.max, np.mean, np.std],
        'etchbeamcurrent': [np.min, np.max, np.mean, np.std],
        'etchsuppressorvoltage': [np.min, np.max, np.mean, np.std],
        'etchsuppressorcurrent': [np.min, np.max, np.mean, np.std],
        'flowcoolflowrate': [np.min, np.max, np.mean, np.std],
        'flowcoolpressure': [np.min, np.max, np.mean, np.std],
        'etchgaschannel1readback': [np.min, np.max, np.mean, np.std],
        'etchpbngasreadback': [np.min, np.max, np.mean, np.std],
        'fixturetiltangle': [np.min, np.max, np.mean, np.std],
        'rotationspeed': [np.min, np.max, np.mean, np.std],
        'actualrotationangle': [np.min, np.max, np.mean, np.std],
        'fixtureshutterposition': [np.min, np.max, np.mean, np.std],
        'etchsourceusage': [np.min, np.max, np.mean, np.std],
        'etchauxsourcetimer': [np.min, np.max, np.mean, np.std],
        'etchaux2sourcetimer': [np.min, np.max, np.mean, np.std],
        'actualstepduration': [np.min, np.max, np.mean, np.std],
        'TTF_FlowCool Pressure Dropped Below Limit':[np.mean],
        'TTF_Flowcool Pressure Too High Check Flowcool Pump':[np.mean],
        'TTF_Flowcool leak':[np.mean]
    }
    df_grouped = group.agg(aggregations)
    return df_grouped
    

In [50]:
df_grouped = get_agg(df)
# Using ravel, and a string join, we can create better names for the columns:
df_grouped.columns = ["_".join(x) for x in df_grouped.columns.ravel()]

In [52]:
df_grouped.columns

Index(['tool_', 'runnum_', 'time_amin', 'time_amax', 'lot_nunique',
       'iongaugepressure_amin', 'iongaugepressure_amax',
       'iongaugepressure_mean', 'iongaugepressure_std', 'etchbeamvoltage_amin',
       'etchbeamvoltage_amax', 'etchbeamvoltage_mean', 'etchbeamvoltage_std',
       'etchbeamcurrent_amin', 'etchbeamcurrent_amax', 'etchbeamcurrent_mean',
       'etchbeamcurrent_std', 'etchsuppressorvoltage_amin',
       'etchsuppressorvoltage_amax', 'etchsuppressorvoltage_mean',
       'etchsuppressorvoltage_std', 'etchsuppressorcurrent_amin',
       'etchsuppressorcurrent_amax', 'etchsuppressorcurrent_mean',
       'etchsuppressorcurrent_std', 'flowcoolflowrate_amin',
       'flowcoolflowrate_amax', 'flowcoolflowrate_mean',
       'flowcoolflowrate_std', 'flowcoolpressure_amin',
       'flowcoolpressure_amax', 'flowcoolpressure_mean',
       'flowcoolpressure_std', 'etchgaschannel1readback_amin',
       'etchgaschannel1readback_amax', 'etchgaschannel1readback_mean',
       'etchg

In [48]:
df_grouped.head()

Unnamed: 0,tool_,runnum_,time_amin,time_amax,lot_nunique,iongaugepressure_amin,iongaugepressure_amax,iongaugepressure_mean,iongaugepressure_std,etchbeamvoltage_amin,...,etchaux2sourcetimer_amax,etchaux2sourcetimer_mean,etchaux2sourcetimer_std,actualstepduration_amin,actualstepduration_amax,actualstepduration_mean,actualstepduration_std,TTF_FlowCool Pressure Dropped Below Limit_mean,TTF_Flowcool Pressure Too High Check Flowcool Pump_mean,TTF_Flowcool leak_mean
0,04M02,10554217,3997500,4008808,4,-1.339284,1.015015,0.216933,0.443725,-1.04014,...,-1.34272,-1.345945,0.001932,-0.913557,4.779943,0.045047,1.239795,24321020.0,,19843690.0
1,04M02,10555140,4010816,4022524,4,-1.335483,1.041756,0.14959,0.510979,-1.04014,...,-1.336154,-1.339361,0.001897,-0.913557,4.779943,0.012084,1.240497,24307510.0,,19830180.0
2,04M02,10556349,4028696,4035220,2,-1.338916,0.538764,0.220148,0.446652,-1.04014,...,-1.332308,-1.334176,0.001175,-0.913557,0.879424,-0.11472,0.59923,24292210.0,,19814880.0
3,04M02,10557015,4037956,4044556,2,-1.336732,1.028346,0.195255,0.459987,-1.04014,...,-1.328432,-1.330299,0.001183,-0.913557,0.879424,-0.122184,0.600267,24282910.0,,19805580.0
4,04M02,10558223,4055744,4062324,2,-1.341442,0.962426,0.213917,0.445071,-1.039163,...,-1.324544,-1.326452,0.001187,-0.913557,0.879424,-0.110414,0.594148,24265130.0,,19787800.0
