### Metal Etch Data for Fault Detection Evaluation
https://www.eigenvector.com/data/Etch/

In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io

### Matlab file structure

The data consists of 108 normal wafers (calibration) and 21 faulty wafers (test) from three experiments, and 19 sensor reading variables.

- data[0][0][0]  **INFORMATION**: [ 29x63 char]  This field                       
- data[0][0][1]  **calibration**: {108x1  cell}  The normal or calibration wafers 
- data[0][0][2]  **calib_names**: [108x9  char]  Names of the calibration wafers  
- data[0][0][3]  **test**: { 21x1  cell}  The test or faulty wafers        
- data[0][0][4]  **test_names**: [ 21x9  char]  Names of the test wafers         
- data[0][0][5]  **fault_names**: [ 21x9  char]  Names of the specific faults     
- data[0][0][6]  **variables**: [ 21x14 char]  Names of the variables           

In [2]:
# Read the MATLAB file into a dict keyed by MATLAB variable name.
mat = scipy.io.loadmat('metal_etch/MACHINE_Data.mat')
# 'LAMDATA' is the struct whose fields are listed above; .get() yields None if the key is absent.
data = mat.get('LAMDATA')
# Display the container type as the cell output.
type(data)

numpy.ndarray

### Export data to csv files
(The arrays loaded from the MATLAB file are stored as big-endian bytes, which pandas DataFrames do not support. Exporting the data to CSV and reloading it resolves this issue.)

In [3]:
# Normal data
normal = data[0][0][1]
n_normal = normal.shape[0]
print(f"# of normal wafers: {n_normal}")

# Extract normal wafer data: one CSV per wafer, numbered from 1.
# The output folder is created first so the cell also runs on a fresh checkout.
os.makedirs("metal_etch/raw_normal", exist_ok=True)
for i in range(n_normal):
    # Each wafer is flattened into rows of 21 values (one per listed variable).
    wafer = pd.DataFrame(normal[i][0].reshape(-1, 21))
    wafer.to_csv(f"metal_etch/raw_normal/wafer_{i+1}.csv", index = False)


# Abnormal data
abnormal = data[0][0][3]
n_abnormal = abnormal.shape[0]
print(f"# of abnormal wafers: {n_abnormal}")

# Extract abnormal wafer data: same layout as the normal wafers.
os.makedirs("metal_etch/raw_abnormal", exist_ok=True)
for i in range(n_abnormal):
    wafer = pd.DataFrame(abnormal[i][0].reshape(-1, 21))
    wafer.to_csv(f"metal_etch/raw_abnormal/wafer_{i+1}.csv", index = False)

# of normal wafers: 108
# of abnormal wafers: 21


### Load csv files
- Load wafer data
- Consolidate all normal wafer data and abnormal wafer data
- Modify the column names 

In [4]:
# normal
normal_id_list = data[0][0][2]

# Collect the per-wafer frames and concatenate once; growing a DataFrame with
# pd.concat inside the loop re-copies all previously loaded rows on every pass.
normal_frames = []
for i in range(n_normal):
    wafer = pd.read_csv(f"metal_etch/raw_normal/wafer_{i+1}.csv")
    wafer_id = normal_id_list[i].replace(".txm", "")
    # A scalar assignment is broadcast to every row of this wafer.
    wafer["Wafer ID"] = wafer_id
    normal_frames.append(wafer)
normal_raw = pd.concat(normal_frames)

# abnormal
abnormal_id_list = data[0][0][4]

abnormal_frames = []
for i in range(n_abnormal):
    wafer = pd.read_csv(f"metal_etch/raw_abnormal/wafer_{i+1}.csv")
    wafer_id = abnormal_id_list[i].replace(".txm", "")
    wafer["Wafer ID"] = wafer_id
    abnormal_frames.append(wafer)
abnormal_raw = pd.concat(abnormal_frames)

# Column names: the variable names from the .mat file plus the wafer ID column
# added above, with surrounding whitespace removed and inner spaces turned into "_".
col_names = data[0][0][6]
col_names = np.append(col_names, "Wafer ID")
col_names = [col.strip().replace(" ", "_") for col in col_names]
normal_raw.columns = col_names
abnormal_raw.columns = col_names
# Step numbers become integers so they read as "S4"/"S5" rather than "S4.0" in feature names.
normal_raw["Step_Number"] = normal_raw["Step_Number"].astype(int)
abnormal_raw["Step_Number"] = abnormal_raw["Step_Number"].astype(int)

print(f"normal: {normal_raw.shape}")
print(f"abnormal: {abnormal_raw.shape}")

normal_raw.head()

normal: (10770, 22)
abnormal: (2059, 22)


Unnamed: 0,Time,Step_Number,BCl3_Flow,Cl2_Flow,RF_Btm_Pwr,RF_Btm_Rfl_Pwr,Endpt_A,He_Press,Pressure,RF_Tuner,...,RF_Pwr,RF_Impedance,TCP_Tuner,TCP_Phase_Err,TCP_Impedance,TCP_Top_Pwr,TCP_Rfl_Pwr,TCP_Load,Vat_Valve,Wafer_ID
0,11.946,4,751.0,753.0,132.0,0.0,626.0,100.0,1227.0,9408.0,...,26.0,16599.0,20028.0,-296.0,16848.0,360.0,0.0,27594.0,49.0,l2901
1,13.028,4,751.0,753.0,134.0,0.0,620.0,99.0,1229.0,9431.0,...,26.0,16568.0,20042.0,-676.0,16796.0,350.0,0.0,27440.0,49.0,l2901
2,14.049,4,751.0,755.0,134.0,0.0,599.0,102.0,1221.0,9389.0,...,25.0,16442.0,20146.0,-291.0,16512.0,344.0,0.0,27276.0,49.0,l2901
3,15.1329,4,751.0,753.0,133.0,0.0,586.0,100.0,1201.0,9445.0,...,25.0,16960.0,20148.0,-262.0,17020.0,352.0,0.0,27330.0,50.0,l2901
4,16.139,4,751.0,754.0,132.0,0.0,587.0,102.0,1182.0,9456.0,...,25.0,16564.0,20226.0,-547.0,16440.0,346.0,0.0,27262.0,50.0,l2901


### Get summarized features
- Extract summarized features from temporal data 

In [5]:
def get_summary_data(data, SVID_list, col_ID, col_step, aggfunc = "mean"):
    """Collapse per-wafer time series into one row of step-wise summary features.

    Rows of `data` are grouped by (`col_ID`, `col_step`), each column in
    `SVID_list` is aggregated with `aggfunc`, and the result is pivoted so that
    every (variable, step) pair becomes one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format readings containing `col_ID`, `col_step` and the `SVID_list` columns.
    SVID_list : list of str
        Names of the columns to summarize.
    col_ID : str
        Column identifying the wafer; becomes the index of the result.
    col_step : str
        Column identifying the process step.
    aggfunc : str or callable, default "mean"
        Aggregation passed to ``GroupBy.agg``. Its name (or the string itself)
        is used as the suffix of every output column.

    Returns
    -------
    pandas.DataFrame
        One row per `col_ID` value, with columns named ``"<variable>_S<step>_<aggfunc>"``.
    """
    # Use the function name for callables so column labels stay readable.
    agg_label = aggfunc if isinstance(aggfunc, str) else getattr(aggfunc, "__name__", str(aggfunc))

    # Apply the requested aggregation (previously this was always .mean(),
    # whatever `aggfunc` said, so the column suffix could misreport the statistic).
    df_feature = data.groupby([col_ID, col_step])[SVID_list].agg(aggfunc)
    df_feature = df_feature.reset_index()
    df_feature = df_feature.pivot(index = col_ID, columns = [col_step], values = SVID_list)
    # The pivot yields (variable, step) column tuples; flatten them to plain strings.
    df_feature.columns = df_feature.columns.map(lambda x: f"{x[0]}_S{x[1]}_{agg_label}")

    return df_feature

In [6]:
# Settings shared by both summaries. The slice drops the first two column
# names and the last one, leaving only the sensor variables.
summary_kwargs = dict(SVID_list = col_names[2:-1],
                      col_ID = "Wafer_ID",
                      col_step = "Step_Number",
                      aggfunc = "mean")

# One row of step-wise features per wafer, for each population.
normal_data = get_summary_data(data = normal_raw, **summary_kwargs)
abnormal_data = get_summary_data(data = abnormal_raw, **summary_kwargs)

print(f"normal: {normal_data.shape}")
print(f"abnormal: {abnormal_data.shape}")

normal_data.head(3)

normal: (108, 38)
abnormal: (21, 38)


Unnamed: 0_level_0,BCl3_Flow_S4_mean,BCl3_Flow_S5_mean,Cl2_Flow_S4_mean,Cl2_Flow_S5_mean,RF_Btm_Pwr_S4_mean,RF_Btm_Pwr_S5_mean,RF_Btm_Rfl_Pwr_S4_mean,RF_Btm_Rfl_Pwr_S5_mean,Endpt_A_S4_mean,Endpt_A_S5_mean,...,TCP_Impedance_S4_mean,TCP_Impedance_S5_mean,TCP_Top_Pwr_S4_mean,TCP_Top_Pwr_S5_mean,TCP_Rfl_Pwr_S4_mean,TCP_Rfl_Pwr_S5_mean,TCP_Load_S4_mean,TCP_Load_S5_mean,Vat_Valve_S4_mean,Vat_Valve_S5_mean
Wafer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
l2901,751.528302,751.745763,753.188679,753.186441,133.679245,133.271186,0.018868,0.016949,1431.09434,882.050847,...,16610.377358,16503.898305,349.981132,350.016949,0.188679,0.033898,27817.509434,27720.101695,48.981132,49.915254
l2902,751.686275,751.75,753.117647,753.160714,133.156863,133.375,0.019608,0.0,1465.882353,881.964286,...,16554.470588,16528.821429,348.470588,349.982143,0.215686,0.017857,27839.647059,27720.928571,48.980392,49.892857
l2903,751.46,751.732143,753.32,753.125,133.36,133.071429,0.0,0.0,1508.94,888.875,...,16553.92,16555.428571,348.54,349.214286,0.28,0.017857,27852.8,27722.571429,49.0,49.875


### Split data based on experiment
- Three experiments {29, 31, 33} were performed, and the faults were intentionally induced by changing the settings of different controllable variables.
- The three experiments were run in different time periods (February, March, and April, respectively), so process drift and changes in the covariance among variables can be observed. 

In [7]:
def _wafers_of(frame, tag):
    """Return the rows of `frame` whose index label contains `tag`."""
    return frame[frame.index.str.contains(tag)]


# Substrings of the wafer IDs that identify experiments 29, 31 and 33.
_experiment_tags = ("l29", "l31", "l33")

exp29_noc, exp31_noc, exp33_noc = (_wafers_of(normal_data, tag) for tag in _experiment_tags)
exp29_abn, exp31_abn, exp33_abn = (_wafers_of(abnormal_data, tag) for tag in _experiment_tags)

print(f"exp29: {len(exp29_noc)} normal wafers; {len(exp29_abn)} abnormal wafers.")
print(f"exp31: {len(exp31_noc)} normal wafers; {len(exp31_abn)} abnormal wafers.")
print(f"exp33: {len(exp33_noc)} normal wafers; {len(exp33_abn)} abnormal wafers.")

exp29: 34 normal wafers; 9 abnormal wafers.
exp31: 37 normal wafers; 6 abnormal wafers.
exp33: 37 normal wafers; 6 abnormal wafers.


### Prepare training data and testing data
- To evaluate monitoring schemes by their sensitivity (True Positive Rate) and specificity (True Negative Rate), five normal wafers of each experiment are excluded from the learning process and are used for testing.

In [8]:
# Seed the RNG so the same hold-out wafers are drawn on every run.
random.seed(42)
# Five normal wafers per experiment are held out for testing.
exp29_noc_test = random.sample(exp29_noc.index.to_list(), 5)
exp31_noc_test = random.sample(exp31_noc.index.to_list(), 5)
exp33_noc_test = random.sample(exp33_noc.index.to_list(), 5)

# The remaining normal wafers form the training set, kept in their original
# index order. (Subtracting sets and converting back to a list orders the IDs
# by string hash, which differs between interpreter sessions, so the saved
# training files would not be reproducible row-for-row.)
exp29_noc_train = exp29_noc.index[~exp29_noc.index.isin(exp29_noc_test)].to_list()
exp31_noc_train = exp31_noc.index[~exp31_noc.index.isin(exp31_noc_test)].to_list()
exp33_noc_train = exp33_noc.index[~exp33_noc.index.isin(exp33_noc_test)].to_list()

# Training data: normal wafers only.
train = {"exp29":exp29_noc.loc[exp29_noc_train],
         "exp31":exp31_noc.loc[exp31_noc_train],
         "exp33":exp33_noc.loc[exp33_noc_train]}
# Test data: the held-out normal wafers followed by all faulty wafers.
test = {"exp29": pd.concat([exp29_noc.loc[exp29_noc_test], exp29_abn]),
        "exp31": pd.concat([exp31_noc.loc[exp31_noc_test], exp31_abn]),
        "exp33": pd.concat([exp33_noc.loc[exp33_noc_test], exp33_abn])}

In [9]:
# Wafer counts per experiment, read from the split objects themselves so the
# table cannot drift from the actual hold-out size.
experiments = ["exp29", "exp31", "exp33"]
noc_test_ids = {"exp29": exp29_noc_test,
                "exp31": exp31_noc_test,
                "exp33": exp33_noc_test}

summary = pd.DataFrame(index = experiments)
summary["# training wafer (NOC)"]  = [len(train[k]) for k in experiments]
summary["# testing wafer (NOC)"]  = [len(noc_test_ids[k]) for k in experiments]
# Each test frame holds the held-out normal wafers plus the faulty ones.
summary["# testing wafer (Faulty)"] = [len(test[k]) - len(noc_test_ids[k]) for k in experiments]
summary

Unnamed: 0,# training wafer (NOC),# testing wafer (NOC),# testing wafer (Faulty)
exp29,29,5,9
exp31,32,5,6
exp33,32,5,6


- Consolidation: (1) Three experiments in one set (2) Separate datasets



In [10]:
#(1) Three experiments in one set
def _with_experiment(frame, experiment):
    """Return a copy of `frame` with an 'Experiment' column inserted first."""
    tagged = frame.copy()
    tagged.insert(0, 'Experiment', experiment)
    return tagged


# Stack the per-experiment frames, each labelled with its experiment key.
train_all = pd.concat([_with_experiment(frame, key) for key, frame in train.items()])
test_all = pd.concat([_with_experiment(frame, key) for key, frame in test.items()])

train_all.to_csv("metal_etch/metaletch_train_all.csv")
test_all.to_csv("metal_etch/metaletch_test_all.csv")

In [11]:
# (2) Separate datasets
# One training file per experiment, named after the experiment key.
for k in train:
    train[k].to_csv(f"metal_etch/metaletch_train_{k}.csv")

# One test file per experiment, using the same naming scheme.
for k in test:
    test[k].to_csv(f"metal_etch/metaletch_test_{k}.csv")



