### Importing Required Packages

In [1]:
import pandas as pd
import numpy as np

### Necessary functions
- These functions are explained in the 'data cleaning' file
- They are necessary to format the data for modelling

In [2]:
#Converting systematic sampling (FOR FLOATS) to a function
def float_sample_2(population,sample_size):
    """Draw ``sample_size`` values from ``population`` by systematic sampling.

    The positions of the minimum and maximum values are always selected. The
    remaining positions are stepped through the population from a random start
    (``np.random``), and any shortfall caused by colliding positions is topped
    up with values drawn at random, without replacement, from the unselected
    part of the population.

    Parameters
    ----------
    population : array-like, 1-D
        Values to sample from; converted with ``np.asarray``.
    sample_size : int
        Number of values to return; must satisfy 1 <= sample_size <= len(population).

    Returns
    -------
    numpy.ndarray
        Exactly ``sample_size`` values taken from ``population``.

    Raises
    ------
    ValueError
        If ``sample_size`` is not positive or exceeds the population size.
    """
    population = np.asarray(population)
    pop_size = len(population)
    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer, got {}".format(sample_size))
    if sample_size > pop_size:
        raise ValueError(
            "sample_size ({}) cannot exceed the population size ({})".format(sample_size,pop_size)
        )
    base_interval = pop_size // sample_size
    remainder = pop_size % sample_size
    #Always keep the extreme points; a set removes colliding positions
    picked = {int(population.argmin()),int(population.argmax())}
    start_index = np.random.randint(0,base_interval)
    for i in range(2,sample_size):
        interval = base_interval + 1 if i < remainder else base_interval
        picked.add((start_index + i * interval) % pop_size)
    unique_indices = sorted(picked)
    systematic_sample = population[unique_indices]
    if len(systematic_sample) > sample_size:
        #Only reachable for sample_size == 1, where min and max cannot both be kept
        return systematic_sample[:sample_size]
    if len(systematic_sample) < sample_size:
        #Top up from the values that have not been selected yet
        leftover = np.delete(population,unique_indices)
        remaining_samples = sample_size - len(systematic_sample)
        remaining_indices = np.random.choice(len(leftover),remaining_samples,replace=False)
        systematic_sample = np.concatenate((systematic_sample,leftover[remaining_indices]))
    return systematic_sample
#Function to apply the systematic sampling to every column of a dataframe
def apply_sampling(df,sample_size=50000):
    """Sample every column of ``df`` down to ``sample_size`` values.

    Each column is handled on its own: its missing values are dropped, the
    remaining values are passed to ``float_sample_2`` and the sampled values
    become a column of the result (fresh 0..n-1 index, original column name).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data, one test per column.
    sample_size : int, default 50000
        Number of values drawn from each column.

    Returns
    -------
    pandas.DataFrame
        One sampled column per input column; empty if ``df`` has no columns.
    """
    sampled_cols = []
    for colName,colData in df.items():
        #Only this column's own missing values are dropped before sampling
        values = np.array(colData.dropna().tolist())
        sampled = pd.Series(float_sample_2(values,sample_size))
        sampled_cols.append(sampled.rename(colName))
    if not sampled_cols:
        return pd.DataFrame()
    #Concatenate once at the end instead of growing the frame column by column
    return pd.concat(sampled_cols,axis=1)
#Scaler used by normalise() below; a single module-level MinMaxScaler instance that is re-fitted on every column it scales
from sklearn import preprocessing
min_max = preprocessing.MinMaxScaler()
#Function to normalise all the columns in a dataframe
def normalise(df):
    """Scale every column of ``df`` independently with the shared ``min_max`` scaler.

    The scaler is re-fitted on each column in turn; the scaled values are
    returned as a new DataFrame with the same column names and a fresh
    0..n-1 index.
    """
    scaled_columns = []
    for name in df.columns:
        scaled = min_max.fit_transform(df[[name]])
        scaled_columns.append(pd.Series(scaled.ravel()).rename(name))
    return pd.concat([pd.DataFrame(),*scaled_columns],axis=1)
#Function to convert all the data (individual tuple) into arrays/list for modelling
def convert_data_rows(df):
    """Return a list holding one NumPy array per item yielded by ``df.items()``.

    For a DataFrame that is one array per column; for a Series it is one array
    per element.
    """
    return [np.asarray(values) for _label,values in df.items()]
#Function to drop columns (tests) in the raw data file which are NOT tests (i.e metadata like start time, lot number etc)
def test_only(datafile,labelfile):
    keys = list(labelfile['Name'])
    df = datafile[keys]
    return df
def identify_col(df,min_count=50000):
    """Return the names of the columns holding fewer than ``min_count`` non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    min_count : int, default 50000
        Minimum number of non-null values a column needs to be kept.

    Returns
    -------
    list
        Column labels, in column order, with too few non-null values.
    """
    #Count by label rather than by integer position, so any kind of column label works
    non_null_perCol = df.notna().sum()
    return list(non_null_perCol[non_null_perCol < min_count].index)
#Function to find out non applicable rows (no distribution, not part of training data)
def non_applicable (df_label):
    """Return the 'Name' values of the rows whose 'Distribution Type' is null."""
    missing_dist = df_label['Distribution Type'].isnull()
    return list(df_label.loc[missing_dist,'Name'])
def drop_col(df_dropped,less_50000):
    """Return a copy of ``df_dropped`` without the columns listed in ``less_50000``.

    Side effect: switches off pandas' chained-assignment warning for the whole
    session by setting ``pd.options.mode.chained_assignment`` to None.
    """
    pd.options.mode.chained_assignment = None
    return df_dropped.drop(columns=less_50000)
#Check if everything is replaced correctly (no columns left with fewer than the required number of data points)
def check_nonull(df,min_count=50000):
    """Print the columns that still hold fewer than ``min_count`` non-null values.

    Two lines are printed: a dict mapping each such column to its number of
    null values, then the number of such columns. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    min_count : int, default 50000
        Minimum number of non-null values a column needs.
    """
    null_perCol = df.isnull().sum()
    total_rows = df.shape[0]
    #Iterate by label so that integer column labels are not mistaken for positions
    insuff_col = {
        col: int(nulls)
        for col,nulls in null_perCol.items()
        if total_rows - nulls < min_count
    }
    print(insuff_col)
    print(len(insuff_col))

### Importing our data 
- HK data is the data containing array instances, and they are already cleaned
- HK label is the relabelled labels of the corresponding array instances

In [3]:
#Load the cleaned HK data from CSV; low_memory=False makes pandas read the file in one pass instead of in chunks
HK_data = pd.read_csv('HK_cleaned_EY.csv',low_memory=False)

In [4]:
#NOTE: this only binds a second name to the same DataFrame object; no copy is made and normalise() is not applied here
HK_normalised = HK_data

In [5]:
#Detect the text encoding of the label file so that it can be passed to read_csv in the next cell
import chardet
filename = 'HK_labels_31_10_23.csv'
#The whole file is read as raw bytes and handed to chardet in one go
with open(filename, 'rb') as file:
    print(chardet.detect(file.read()))

{'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}


In [6]:
#Load the relabelled labels using the encoding reported by chardet in the previous cell
HK_label = pd.read_csv(r'HK_labels_31_10_23.csv',encoding = 'utf-16')

### Data cleaning and pre-preparation for modelling

In [7]:
#Keep only the label rows whose test name is also a column of the data file
good_cols = list(HK_normalised.columns)
keys = HK_label['Name']
#Test names present in both the label file and the data columns
extract = [i for i in keys if i in good_cols]
print(len(extract))
#Label rows restricted to those shared test names
HK_corr_label = HK_label[HK_label['Name'].isin(extract)]

1689


In [8]:
#Function to concat the dataframes (raw data and the name of the tests) -> row format. i.e 2 columns: data, name
def concat_xy(df):
    """Turn a column-per-test DataFrame into a row-per-test DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        One test per column.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Data'`` (a NumPy array copy of the column's values) and
        ``'Name'`` (the column label), one row per input column, indexed
        0..n_columns-1 in column order.
    """
    #Pre-sized object array so each cell can hold a whole array; this avoids
    #writing through concat_df.loc[i].Data, which only works when .loc returns a view
    data = np.empty(df.shape[1],dtype=object)
    names = []
    for position,(colName,colData) in enumerate(df.items()):
        data[position] = np.array(colData.values)
        names.append(colName)
    return pd.DataFrame({'Data':data,'Name':names},columns=['Data','Name'])
#Function attaching the Y (distribution) to the raw data (X) -> label_df is the final label file, concat_df is the testname/data frame
def model_xy(label_df,concat_df):
    """Inner-join ``concat_df`` with the 'Distribution Type' of ``label_df`` on 'Name'.

    Only the 'Distribution Type' and 'Name' columns of ``label_df`` take part
    in the merge; every column of ``concat_df`` is kept.
    """
    label_cols = label_df.loc[:,['Distribution Type','Name']]
    return concat_df.merge(label_cols,on='Name')

In [9]:
#Sorting the data beforehand before modelling
num_classes = 6
def sort_df(df):
    """Return a copy of ``df`` in which every entry of the 'Data' column is sorted ascending.

    Each entry is copied before it is sorted, so neither ``df`` nor the
    arrays/lists it holds are modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Data' column whose entries offer ``copy()`` and
        ``sort()`` (e.g. NumPy arrays or lists).

    Returns
    -------
    pandas.DataFrame
        Same rows, index and columns as ``df``, with sorted 'Data' entries.
    """
    #Object array so that each cell keeps holding one whole sequence
    sorted_data = np.empty(df.shape[0],dtype=object)
    for position,values in enumerate(df['Data']):
        ordered = values.copy()
        ordered.sort()
        sorted_data[position] = ordered
    sorted_df = df.copy()
    sorted_df['Data'] = sorted_data
    return sorted_df
#Function to convert all the data (individual tuple) into arrays/list for modelling
#NOTE: same name and same logic as the definition in the 'Necessary functions' cell; this later definition is the one in effect from here on
def convert_data_rows(df):
    """Return a list with ``np.asarray`` applied to every value yielded by ``df.items()``."""
    values_only = (pair[1] for pair in df.items())
    return list(map(np.asarray,values_only))

In [10]:
#Reshape the data to one row per test (columns: Data, Name)
HK_testname_data = concat_xy(HK_normalised)
print(HK_testname_data.head())
#Attach the distribution label of each test by merging on the test name
HK_model = model_xy(HK_corr_label,HK_testname_data)
print(HK_model.head())
#Number of tests per distribution type, followed by the label column itself
print(HK_model['Distribution Type'].value_counts())
print(HK_model['Distribution Type'])

                                                Data  \
0  [0.7087378640776696, 0.737864077669903, 0.7864...   
1  [0.4456761723727496, 0.6582697960780024, 0.557...   
2  [0.8347090082415121, 0.6804384607235257, 0.677...   
3  [0.0, 0.1538461538461542, 0.4615384615384617, ...   
4  [0.6612903225806455, 0.7258064516129035, 0.709...   

                                               Name  
0  a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.13_x_x_x__  
1   p_open_PADS.DIG.OPENVDDIO.V_x_1mA_x__SE_SPI_CLK  
2       a_anlXphase_CLIF.TX.BETA.40_x_VDDPA.3V3_x__  
3           a_vihXvtx_CLIF.LPDET.DIFF.SEL10_x_x_x__  
4   a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.6_x_x_x__  
                                                Data  \
0  [0.7087378640776696, 0.737864077669903, 0.7864...   
1  [0.4456761723727496, 0.6582697960780024, 0.557...   
2  [0.8347090082415121, 0.6804384607235257, 0.677...   
3  [0.0, 0.1538461538461542, 0.4615384615384617, ...   
4  [0.6612903225806455, 0.7258064516129035, 0.709...   

  

## Modelling - Support Vector Machine

### Separate them based on distribution type for separate analysis of prediction
- This is so that we can preserve the testnames to plot them back in Exensio for analysis

In [11]:
#Drop the tests with non-deterministic distributions or that are not needed for prediction
all_tests = list(HK_model['Name'])
#Tests that carry a distribution label but look odd and potentially belong to another distribution (to be removed)
unecessary = ['i_TOP-MISC-DLOG_x_x_x_x__Touch-Down-Num','p_postXfreq_QA.NFC.RINGO.fe.nand2.long.0_x_nom_x','p_postXfreq_QA.NFC.RINGO.fe.nor2.long.0_x_nom_x','p_postXfreq_QA.NFC.RINGO.be.m24.rc.line_x_nom_x']
#Tests whose distribution type is the literal label 'Untagged'
untagged = list(HK_model[HK_model['Distribution Type']=='Untagged']['Name'])
print(len(untagged))
#Remove both the hand-picked and the untagged tests
HK_model_dropped = HK_model[~HK_model['Name'].isin(unecessary+untagged)]
print(HK_model_dropped.head())
print(len(HK_model_dropped))

26
                                                Data  \
0  [0.7087378640776696, 0.737864077669903, 0.7864...   
1  [0.4456761723727496, 0.6582697960780024, 0.557...   
2  [0.8347090082415121, 0.6804384607235257, 0.677...   
3  [0.0, 0.1538461538461542, 0.4615384615384617, ...   
4  [0.6612903225806455, 0.7258064516129035, 0.709...   

                                               Name Distribution Type  
0  a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.13_x_x_x__           outlier  
1   p_open_PADS.DIG.OPENVDDIO.V_x_1mA_x__SE_SPI_CLK            normal  
2       a_anlXphase_CLIF.TX.BETA.40_x_VDDPA.3V3_x__           outlier  
3           a_vihXvtx_CLIF.LPDET.DIFF.SEL10_x_x_x__          discrete  
4   a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.6_x_x_x__           outlier  
1662


In [12]:
#Number of remaining tests per distribution type (shows the class imbalance)
HK_model_dropped['Distribution Type'].value_counts()

outlier       809
functional    292
longtail      184
normal        149
discrete      136
bimodal        92
Name: Distribution Type, dtype: int64

### Analysing the unbalanced dataset
- How many instances are there for each distribution

In [13]:
#Grouping the dataframe by the distribution type
grouped_df = HK_model_dropped.groupby('Distribution Type')
#Verifying that grouping is indeed done: print the first rows of every group
for group_name, group_data in grouped_df:
    print("Group: {}".format(group_name))
    print(group_data.head())
    print("------")

Group: bimodal
                                                  Data  \
11   [0.2741480578231607, 0.2869562789139079, 0.184...   
35   [0.3009006829558842, 0.265729385711893, 0.2835...   
102  [0.5335934209651043, 0.2199144370802059, 0.362...   
133  [0.0, 0.032258064516129, 1.0, 1.0, 0.032258064...   
135  [0.0, 0.1333333333333333, 0.0, 1.0, 1.0, 0.066...   

                                            Name Distribution Type  
11         a_clk_TXDC.DELAYCHAIN_x_VDDPA.3V3_x__           bimodal  
35       p_clc_PADS.SIG.DELTA.EOF.V_x_1mA_x__TX2           bimodal  
102           p_short_PADS.DIG.V_x_1mA_x__PMUVCC           bimodal  
133  ip_trim_TOP--DLOG-TRIMVALUE-VREF.LQ-WR.PFN2           bimodal  
135  ip_trim_TOP--DLOG-TRIMVALUE-IREF.HQ-WR.PFN2           bimodal  
------
Group: discrete
                                                 Data  \
3   [0.0, 0.1538461538461542, 0.4615384615384617, ...   
13  [0.0, 0.7692307692307692, 0.2307692307692308, ...   
20  [0.4000000000000001, 0.5

### Trimming data to allow model to be trained with balanced data set

In [14]:
#Function to trim the group
def trim_df (new_df,grouped_df,n_per_group=90,random_state=42):
    """Down-sample every group to the same size and stack the samples under ``new_df``.

    Groups with fewer than ``n_per_group`` rows cannot be sampled without
    replacement; they are reported with a printed warning and left out.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Frame the samples are appended to (typically empty, with the target columns).
    grouped_df : iterable of (name, DataFrame)
        For example the result of ``DataFrame.groupby``.
    n_per_group : int, default 90
        Number of rows drawn from each group.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        ``new_df`` followed by ``n_per_group`` sampled rows of every large-enough
        group; ``new_df`` itself if no group qualifies.
    """
    pieces = [new_df]
    for group_name,group in grouped_df:
        if len(group) < n_per_group:
            #Skip the group entirely; nothing from an earlier group is re-used in its place
            print("Group: {} has less than {}!".format(group_name,n_per_group))
            continue
        pieces.append(group.sample(n=n_per_group,random_state=random_state))
    if len(pieces) == 1:
        return new_df
    return pd.concat(pieces)
#Creating a new, empty dataframe (same columns as the model data) to store the trimmed data
trimmed_df = pd.DataFrame(columns=HK_model_dropped.columns)
print(trimmed_df.head())
#Applying the function: sample the same number of tests from every distribution group
trimmed_df = trim_df(trimmed_df,grouped_df)
#Resetting the index of the trimmed dataframe to 0..n-1
trimmed_df = trimmed_df.reset_index(drop=True)
print(trimmed_df)
#Check that every distribution now has the same number of instances
print(trimmed_df['Distribution Type'].value_counts())

Empty DataFrame
Columns: [Data, Name, Distribution Type]
Index: []
                                                  Data  \
0    [0.549284438224691, 0.6680452392509326, 0.3225...   
1    [0.6203717955893876, 0.5743185925235628, 0.438...   
2    [0.104463099767329, 0.4583138840177989, 0.3892...   
3    [0.4847588264545095, 0.2324714072600784, 0.307...   
4    [0.2741480578231607, 0.2869562789139079, 0.184...   
..                                                 ...   
535  [0.5370823145884271, 0.5215973920130399, 0.449...   
536  [0.3209093280119366, 0.2417547091500351, 0.162...   
537  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...   
538  [0.0026848040093073, 0.0035797386790764, 0.003...   
539  [0.9999942766938472, 0.9999944533970044, 0.999...   

                                                  Name Distribution Type  
0               p_lkg_PADS.IILEND_x_1V8_x__NFC_CLK_REQ           bimodal  
1                p_lkg_PADS.IIH_x_1V8_x__NFC_SIM_SWIO1           bimodal  
2          

### Training
- We extracted 90 instances of each distribution
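- We trained the model on 80% of them: 432 instances in total, i.e. about 72 per distribution on average (the split is not stratified, so the per-class counts can differ slightly)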

In [15]:
#Prepare the training and testing data
from sklearn.model_selection import train_test_split
def support_vector(model_df):
    """Build the train/test arrays for the SVM.

    The 'Data' entries are sorted via ``sort_df``, the rows are split 80/20
    with a fixed seed, and the feature rows are stacked into float64 arrays.
    The labels are returned as produced by the split (not encoded).

    Parameters
    ----------
    model_df : pandas.DataFrame
        Needs the columns 'Data' and 'Distribution Type'.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``; the X parts are 2-D float64
        arrays (one row per test), the Y parts are the label Series.
    """
    #Sort once and take both features and labels from the same sorted frame
    sorted_model = sort_df(model_df)
    xData = sorted_model['Data']
    yData = sorted_model['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42)
    X_train = np.array(convert_data_rows(X_train)).astype('float64')
    X_test = np.array(convert_data_rows(X_test)).astype('float64')
    return X_train,X_test,Y_train,Y_test

In [16]:
#Train/test split of the balanced (trimmed) data for the SVM
X_train_SV, X_test_SV, Y_train_SV, Y_test_SV = support_vector(trimmed_df)

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import joblib
#Linear-kernel SVM trained on the string distribution labels
svm_classifier = SVC(kernel='linear',random_state=42)
svm_classifier.fit(X_train_SV,Y_train_SV)
pred_classes = svm_classifier.predict(X_test_SV)
#Fraction of test instances whose predicted label equals the true label
print("Accuracy: {}".format(accuracy_score(pred_classes,Y_test_SV)))

Accuracy: 0.7777777777777778


## Modelling - Random Forest
- We then repeated the whole process of training using the random forest model

In [18]:
#Prepare the training and testing data
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
#Module-level encoder: fitted inside train_test_RF and read again later to map integers back to label names
label_encoder = LabelEncoder()
def train_test_RF(model_df):
    """Build the train/test arrays for the random forest.

    Same pipeline as the SVM preparation (sort 'Data', 80/20 split with a
    fixed seed, stack rows as float64), but the labels are integer-encoded
    with the module-level ``label_encoder`` and then one-hot encoded into
    ``num_classes`` columns.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Needs the columns 'Data' and 'Distribution Type'.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``; X parts are 2-D float64 arrays,
        Y parts are one-hot matrices.
    """
    #Sort once and take both features and labels from the same sorted frame
    sorted_model = sort_df(model_df)
    xData = sorted_model['Data']
    yData = sorted_model['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42)
    X_train = np.array(convert_data_rows(X_train)).astype('float64')
    X_test = np.array(convert_data_rows(X_test)).astype('float64')
    Y_train = label_encoder.fit_transform(Y_train)
    Y_train = to_categorical(Y_train,num_classes)
    #Re-use the mapping learned on the training labels; re-fitting on the test
    #labels could assign different integers if a class is missing from the test set
    Y_test = label_encoder.transform(Y_test)
    Y_test = to_categorical(Y_test,num_classes)
    return X_train,X_test,Y_train,Y_test

In [19]:
#Train/test split of the balanced (trimmed) data for the random forest (labels come back one-hot encoded)
X_train_RF,X_test_RF,Y_train_RF,Y_test_RF = train_test_RF(trimmed_df)

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the Random Forest classifier (targets are the one-hot label matrix)
rf_classifier.fit(X_train_RF, Y_train_RF)
# Predict the labels for the test set
y_pred = rf_classifier.predict(X_test_RF)
# Evaluate the accuracy of the classifier
# NOTE: `accuracy` is the element-wise agreement over all entries; it is not the value printed below
accuracy = np.mean(y_pred == Y_test_RF)
# The printed figure comes from accuracy_score on the prediction and label matrices
print("Accuracy: {}".format(accuracy_score(y_pred,Y_test_RF)))

Accuracy: 0.7777777777777778


### Unpacking the encoding of Random Forest model
- This is so that we can identify the labels as strings instead of integers and analyse them accordingly

In [21]:
#Classes stored on the encoder by its most recent fit; position i is the label encoded as integer i
class_labels = label_encoder.classes_
labels = []
for i,label in enumerate(class_labels):
    print("Integer {} corresponds to class labels: {}".format(i,label))
    labels.append(label)

Integer 0 corresponds to class labels: bimodal
Integer 1 corresponds to class labels: discrete
Integer 2 corresponds to class labels: functional
Integer 3 corresponds to class labels: longtail
Integer 4 corresponds to class labels: normal
Integer 5 corresponds to class labels: outlier


In [22]:
#Lookup returning the distribution name that corresponds to an integer code
def return_dist(arg):
    """Map an integer class code (0-5) to its distribution name.

    The codes follow the order bimodal, discrete, functional, longtail,
    normal, outlier. Any value that equals none of the codes yields None.
    """
    dist_names = ("bimodal","discrete","functional","longtail","normal","outlier")
    for code,dist_name in enumerate(dist_names):
        if arg == code:
            return dist_name
    return None
#Function to translate a sequence of integer codes into distribution names
def dist_array(num_array):
    """Return a list with ``return_dist`` applied to every element of ``num_array``."""
    return [return_dist(num) for num in num_array]

### Saving RF and SVM to prep for ensemble methods
- This is for future uses (eg if we were to import these trained models into Exensio instead of training them in Exensio which is slow and inefficient)

In [23]:
#For RF: persist the trained random forest to disk with joblib
import joblib
joblib.dump(rf_classifier,'RF_relabelled.joblib')

['RF_relabelled.joblib']

In [24]:
#For SVM: persist the trained SVM to disk with pickle (note: the RF model above is saved with joblib instead)
import pickle
model_filename = "SVM_relabelled.pkl"
with open(model_filename,'wb') as model_file_sv:
    pickle.dump(svm_classifier,model_file_sv)

## Ensemble Learning Attempt
- We aim to raise the accuracy of the model by allowing the models to work WITH each other instead of AGAINST. i.e, the models will be deployed on the same set of the test data
- Only the instances on which both models agree (i.e. they predict the same distribution) are accepted and then cross-checked against the actual labels

### Further explanation on ensemble learning attempt
The test data holds 108 instances in total (about 18 per distribution on average; the split is not stratified). We only keep and cross-check the results where both models give the same prediction. There are therefore 3 outcomes to analyse for this model: Correct Prediction, Wrong Prediction, and No Prediction (the two models cannot agree on a prediction). Note that the test set must be standardised: both models must be evaluated on the same test data (first on the test data used to build the RF, then the same again for the SVM).


In [25]:
#Print both test feature matrices to check visually that the RF and SVM test sets match
print(X_test_RF)
print(X_test_SV)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.08381006 0.08658729 ... 0.95219994 0.99017419 1.        ]
 [0.         0.         1.         ... 1.         1.         1.        ]
 ...
 [0.         0.         0.00495776 ... 0.87113901 0.92132537 1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.04061062 0.07057405 ... 0.9658833  0.96824759 1.        ]]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.08381006 0.08658729 ... 0.95219994 0.99017419 1.        ]
 [0.         0.         1.         ... 1.         1.         1.        ]
 ...
 [0.         0.         0.00495776 ... 0.87113901 0.92132537 1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.04061062 0.07057405 ... 0.9658833  0.96824759 1.        ]]


In [26]:
#Finding out prediction for RF; using RF's test
RF_pred = rf_classifier.predict(X_test_RF)
#Collapse each predicted row to the position of its largest entry, then translate that code to a distribution name
#NOTE(review): argmax returns 0 ("bimodal") for a row with no positive entry - confirm the forest never predicts an all-zero row
RF_pred = np.argmax(RF_pred,axis=1)
RF_pred = dist_array(RF_pred)

#Finding out prediction for SVM; using RF's test
SVM_pred = svm_classifier.predict(X_test_RF)

#Create dataframe to facilitate comparison between prediction of the two models
comp_df = pd.DataFrame({'Random Forest':RF_pred,'Support Vector':SVM_pred})
print(comp_df)

#Extract the rows in which the predictions of random forest and support vector are the same
converge_df = comp_df[comp_df['Random Forest'] == comp_df['Support Vector']]
print(converge_df)

    Random Forest Support Vector
0      functional     functional
1          normal         normal
2         outlier        outlier
3         bimodal        bimodal
4         outlier        outlier
..            ...            ...
103       outlier        outlier
104      discrete       discrete
105       bimodal        bimodal
106       bimodal         normal
107        normal         normal

[108 rows x 2 columns]
    Random Forest Support Vector
0      functional     functional
1          normal         normal
2         outlier        outlier
3         bimodal        bimodal
4         outlier        outlier
..            ...            ...
101      longtail       longtail
103       outlier        outlier
104      discrete       discrete
105       bimodal        bimodal
107        normal         normal

[80 rows x 2 columns]


### Analysing the prediction of ensemble method
So now that we have 80 out of 108 rows where the two models have the same prediction, let's analyse the accuracy of these rows.

In [27]:
#Decode the one-hot RF test labels back to distribution names
#The decoded labels go into a new variable so that Y_test_RF is left untouched and this cell can be re-run
Y_test_RF_labels = dist_array(np.argmax(Y_test_RF,axis=1))
Y_test_RF_df = pd.DataFrame({'Y test RF':Y_test_RF_labels})
print(Y_test_RF_df)

      Y test RF
0    functional
1       bimodal
2       outlier
3       bimodal
4       outlier
..          ...
103     outlier
104    discrete
105     bimodal
106    discrete
107      normal

[108 rows x 1 columns]


In [28]:
#Extract the actual labels of the rows on which both models agree
index_keys = converge_df.index
Y_test_rf_converge = Y_test_RF_df.loc[index_keys]
#print(Y_test_rf_converge)

#Dataframe to compare the agreed (converged) prediction with the actual label
accuracy_df = pd.DataFrame({'Converge prediction':converge_df['Random Forest'],'Actual':Y_test_rf_converge['Y test RF']})
print(accuracy_df)
#Rows where the agreed prediction is correct
accuracy_analyse_df = accuracy_df[accuracy_df['Converge prediction']==accuracy_df['Actual']]
print(accuracy_analyse_df)
#Percentage of correct predictions among the agreed rows only (rows without agreement are not counted)
print("Accuracy is: ", (accuracy_analyse_df.shape[0]/accuracy_df.shape[0])*100)

    Converge prediction      Actual
0            functional  functional
1                normal     bimodal
2               outlier     outlier
3               bimodal     bimodal
4               outlier     outlier
..                  ...         ...
101            longtail    longtail
103             outlier     outlier
104            discrete    discrete
105             bimodal     bimodal
107              normal      normal

[80 rows x 2 columns]
    Converge prediction      Actual
0            functional  functional
2               outlier     outlier
3               bimodal     bimodal
4               outlier     outlier
6              longtail    longtail
..                  ...         ...
101            longtail    longtail
103             outlier     outlier
104            discrete    discrete
105             bimodal     bimodal
107              normal      normal

[76 rows x 2 columns]
Accuracy is:  95.0


### Analysing the testnames of:
- the discarded pile (instances where both models are not able to agree on)
- the converged but wrongly classified instances

In [29]:
#Prepare the training and testing data, keeping the test names alongside the data
from sklearn.model_selection import train_test_split
def support_vector_tests(model_df):
    """Repeat the 80/20 split of the modelling data while keeping the test names.

    Uses the same ``test_size`` and ``random_state`` as the other split
    helpers, but the X parts stay DataFrames with the columns 'Name' and
    'Data' (sorted via ``sort_df``) instead of being stacked into arrays.

    Parameters
    ----------
    model_df : pandas.DataFrame
        Needs the columns 'Name', 'Data' and 'Distribution Type'.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``; X parts are DataFrames,
        Y parts are the label Series.
    """
    #Sort once and take both features and labels from the same sorted frame
    sorted_model = sort_df(model_df)
    X = pd.DataFrame({'Name':model_df['Name'],'Data':sorted_model['Data']})
    yData = sorted_model['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(X,yData,test_size = 0.2,random_state=42)
    return X_train,X_test,Y_train,Y_test

In [30]:
#Redo the split with the test names attached
#NOTE: this rebinds X_train_SV/X_test_SV/Y_train_SV/Y_test_SV, replacing the arrays created for the SVM earlier
X_train_SV, X_test_SV, Y_train_SV, Y_test_SV = support_vector_tests(trimmed_df)
print(X_test_SV)

                                                  Name  \
229               xf_bf_TOP--CLOCK.MODULAR-MEAS.VERIFY   
73       a_icc_CLIF.CLKBUF.HP.VBAT_x_VBAT.VNOM_x__post   
521  f_trustp_TOP-NFC-SE-INIT-JCOP.SMX.APDU.Pow.Boo...   
86   p_short_PADS.SIG.shortDPS.detect.V_x_1mA_x__VDDPA   
469                      a_iref_XTAL.OVERBOOST_x_x_x__   
..                                                 ...   
498              a_clcXrftrim_CLIF.TX2.RON.LS0_x_x_x__   
148             a_vihXvtx_CLIF.LPDET.DIFF.SEL9_x_x_x__   
46                p_dcspec_PADS.VOL_x_1V8_x__NFC_GPIO1   
93          a_iccXinlXtrim.zone_PMUGPADC_x_512.639_x__   
406  a_vrf_PMUVDDPA_x_VSUP-PWR.3V65-VDDPA.3V3-0mA_x...   

                                                  Data  
229  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
73   [0.0, 0.0838100611060084, 0.0865872942158563, ...  
521  [0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...  
86   [0.0, 0.0407371923510133, 0.0509198720853447, ...  
469  [0.0, 0.89854

In [31]:
#Check if same as previous train test split: show the test row that carries the original index 406
print(X_test_SV.loc[406])

Name    a_vrf_PMUVDDPA_x_VSUP-PWR.3V65-VDDPA.3V3-0mA_x...
Data    [0.0, 0.0406106172372489, 0.0705740495555176, ...
Name: 406, dtype: object


In [32]:
#Renumber the rows 0..n-1 so that they line up with the row positions of the prediction dataframes
#reset_index adapts to the actual number of test rows and leaves an already renumbered frame unchanged
X_test_SV = X_test_SV.reset_index(drop=True)
print(X_test_SV)

                                                  Name  \
0                 xf_bf_TOP--CLOCK.MODULAR-MEAS.VERIFY   
1        a_icc_CLIF.CLKBUF.HP.VBAT_x_VBAT.VNOM_x__post   
2    f_trustp_TOP-NFC-SE-INIT-JCOP.SMX.APDU.Pow.Boo...   
3    p_short_PADS.SIG.shortDPS.detect.V_x_1mA_x__VDDPA   
4                        a_iref_XTAL.OVERBOOST_x_x_x__   
..                                                 ...   
103              a_clcXrftrim_CLIF.TX2.RON.LS0_x_x_x__   
104             a_vihXvtx_CLIF.LPDET.DIFF.SEL9_x_x_x__   
105               p_dcspec_PADS.VOL_x_1V8_x__NFC_GPIO1   
106         a_iccXinlXtrim.zone_PMUGPADC_x_512.639_x__   
107  a_vrf_PMUVDDPA_x_VSUP-PWR.3V65-VDDPA.3V3-0mA_x...   

                                                  Data  
0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
1    [0.0, 0.0838100611060084, 0.0865872942158563, ...  
2    [0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...  
3    [0.0, 0.0407371923510133, 0.0509198720853447, ...  
4    [0.0, 0.89854

In [33]:
#Test rows that are absent from accuracy_df, i.e. the rows on which the two models did not agree
discarded_index = [indices for indices in list(X_test_SV.index) if indices not in list(accuracy_df.index)]
#Names of those tests
discarded_pile = X_test_SV.loc[discarded_index]['Name']
print(discarded_pile)

5              a_clcXrftrim_BBA.GainOffset.0_x_x_x__
9          p_open_PADS.DIG.EOF.V_x_1mA_x__SE_ISO_RST
10           p_dcspec_PADS.VOL_x_1V8_x__NFC_GPIO3_AO
11            a_vihXatt_CLIF.LPDET.DIFF.SEL8_x_x_x__
17          a_iccXinlXtrim.zone_PMUGPADC_x_0.127_x__
23             a_vihXvtx_CLIF.LPDET.SE.SEL15_x_x_x__
34         p_clc_PADS.IILDELTA_x_1V8_x__NFC_GPIO2_AO
37       ip_trim_TOP--DLOG-TRIMVALUE-VREF.HQ-WR.PFN2
42              a_clcXrftrim_AGCPhase.Crx.39_x_x_x__
45           p_clc_PADS.SIG.DELTA.EOF.V_x_1mA_x__TX2
55                  p_short_PADS.DIG.V_x_1mA_x__VDDA
56              ip_nvmXutil_TOP--DLOG-W1-TESTED-HOUR
59             a_trimXiccal_CLIF.RSSI.OFFSET_x_x_x__
61                  p_short_PADS.DIG.V_x_1mA_x__RSTN
62               p_lkg_PADS.IIL_x_1V8_x__NFC_CLK_REQ
64                 p_lkg_PADS.IIL_x_1V8_x__NFC_GPIO1
65       p_short_PADS.DIG.EOF.V_x_1mA_x__SE_SPI_MISO
67                      a_anlXlsb_CLIF.VTUNE_x_x_x__
70           p_lkg_PADS.IILEND_x_1V8_x__NFC_GP

In [34]:
#Adding the test names to the mismatching predictions for comparison
notconverge_df = comp_df[comp_df['Random Forest'] != comp_df['Support Vector']]
#print(notconverge_df)
#NOTE(review): the names are attached by position (.values) while the predictions keep their index; this assumes discarded_pile and notconverge_df list the same rows in the same order - confirm
notconverge_test = pd.DataFrame({'Test':discarded_pile.values,'Random Forest':notconverge_df['Random Forest'],'Support Vector':notconverge_df['Support Vector']})
print(notconverge_test)

                                              Test Random Forest  \
5            a_clcXrftrim_BBA.GainOffset.0_x_x_x__       bimodal   
9        p_open_PADS.DIG.EOF.V_x_1mA_x__SE_ISO_RST        normal   
10         p_dcspec_PADS.VOL_x_1V8_x__NFC_GPIO3_AO      longtail   
11          a_vihXatt_CLIF.LPDET.DIFF.SEL8_x_x_x__       bimodal   
17        a_iccXinlXtrim.zone_PMUGPADC_x_0.127_x__       bimodal   
23           a_vihXvtx_CLIF.LPDET.SE.SEL15_x_x_x__      discrete   
34       p_clc_PADS.IILDELTA_x_1V8_x__NFC_GPIO2_AO       bimodal   
37     ip_trim_TOP--DLOG-TRIMVALUE-VREF.HQ-WR.PFN2       bimodal   
42            a_clcXrftrim_AGCPhase.Crx.39_x_x_x__      discrete   
45         p_clc_PADS.SIG.DELTA.EOF.V_x_1mA_x__TX2       bimodal   
55                p_short_PADS.DIG.V_x_1mA_x__VDDA      longtail   
56            ip_nvmXutil_TOP--DLOG-W1-TESTED-HOUR      discrete   
59           a_trimXiccal_CLIF.RSSI.OFFSET_x_x_x__      discrete   
61                p_short_PADS.DIG.V_x_1mA_x__RS

In [35]:
#Do the same for the agreed predictions that turned out wrong: rows of accuracy_df that are absent from accuracy_analyse_df
wrong_ensem_index = [indices for indices in list(accuracy_df.index) if indices not in list(accuracy_analyse_df.index)]
#Names of those tests
wrong_ensem_pred = X_test_SV.loc[wrong_ensem_index]['Name']
print(wrong_ensem_pred)

1     a_icc_CLIF.CLKBUF.HP.VBAT_x_VBAT.VNOM_x__post
63              a_clcXrftrim_AGCGain.Crx.61_x_x_x__
87               p_lkg_PADS.IIHEND_x_1V8_x__NFC_IRQ
88         p_clc_PADS.DIG.DELTA.EOF.V_x_1mA_x__RSTN
Name: Name, dtype: object


In [36]:
#Adding the test names to the wrong predictions for comparison
wrong_pred = accuracy_df[accuracy_df['Converge prediction']!=accuracy_df['Actual']]
#NOTE(review): names are attached by position (.values); this assumes wrong_ensem_pred and wrong_pred list the same rows in the same order - confirm
wrong_pred_test = pd.DataFrame({'Test':wrong_ensem_pred.values,'Models Prediction':wrong_pred['Converge prediction'],'Actual Prediction':wrong_pred['Actual']})
print(wrong_pred_test)

                                             Test Models Prediction  \
1   a_icc_CLIF.CLKBUF.HP.VBAT_x_VBAT.VNOM_x__post            normal   
63            a_clcXrftrim_AGCGain.Crx.61_x_x_x__          longtail   
87             p_lkg_PADS.IIHEND_x_1V8_x__NFC_IRQ            normal   
88       p_clc_PADS.DIG.DELTA.EOF.V_x_1mA_x__RSTN           outlier   

   Actual Prediction  
1            bimodal  
63           outlier  
87           bimodal  
88          longtail  


Testnames that fall under (1) discarded pile or (2) wrongly predicted are extracted for further analysis. 
Plot these back in Exensio. For tests under (1), check which of RF or SVM made the correct decision. For tests under (2), analyse why both models were wrong; possible causes are inconsistent labelling or the inability of the models to recognise the unique features of that particular distribution.

### Conclusion
- The ensemble learning model achieves an accuracy of 95% on the 80 test instances where both models agree (compared to about 78% for RF and SVM individually on all 108 test instances), but this comes with a trade-off: the two models disagree on the remaining 28 instances, which therefore receive no prediction.
- Further comments are in 'Ensemble Learning Attempt' pdf report.