### Importing Required Packages

In [1]:
import pandas as pd
import numpy as np

### Necessary functions
- These functions are explained in the 'data cleaning' file
- They are necessary to format the data for modelling

In [2]:
#Converting systematic sampling (FOR FLOATS) to a function
def float_sample_2(population,sample_size):
    base_interval = len(population) // sample_size
    remainder = len(population)%sample_size
    #indices = []
    indices = {}
    #Add in min and max points in array
    min_index = population.argmin()
    max_index = population.argmax()
    indices[0] = min_index
    indices[1] = max_index
    start_index = np.random.randint(0,base_interval)
    for i in range(2,sample_size):
        interval = base_interval + 1 if i < remainder else base_interval
        #print("Interval is: ", interval)
        index = (start_index + i * interval) % len(population)
        #print("Index is: ",index)
        #indices.append(index)
        indices[i] = index
    #print("The dict is: ",indices)
    unique_indices = list(indices.values())
    unique_indices = [*set(unique_indices)]
    systematic_sample = population[unique_indices]
    print("Before delete")
    #print("systematic_sample: ",systematic_sample)
    population = np.delete(population,unique_indices)
    print("After delete")
    if len(systematic_sample) < sample_size:
        remaining_samples = sample_size - len(systematic_sample)
        remaining_indices = np.random.choice(len(population),remaining_samples,replace=False)
        systematic_sample = np.concatenate((systematic_sample,population[remaining_indices]))
    return systematic_sample
#Function to apply the systematic sampling for the function 
def apply_sampling(df):
    new_df = pd.DataFrame()
    for colName,colData in df.items():
        data = df.dropna(subset=colName)
        newCol = list(data[colName])
        newCol = np.array(newCol)
        newCol = pd.Series(float_sample_2(newCol,50000))
        #Can only concat after sampling!!
        new_df = pd.concat([new_df,newCol.rename(colName)],axis=1)
    return new_df
#Import relevant packages 
from sklearn import preprocessing
min_max = preprocessing.MinMaxScaler()
#Function to normalise all the columns in a dataframe
def normalise(df):
    final_df = pd.DataFrame()
    for colName,colData in df.items():
        c_scaled = min_max.fit_transform(df[[colName]])
        c_scaled = pd.Series(c_scaled.ravel())
        final_df = pd.concat([final_df,c_scaled.rename(colName)],axis=1)
    return final_df
#Function to convert all the data (individual tuple) into arrays/list for modelling
def convert_data_rows(df):
    data_points = []
    for colName,colData in df.items():
        data_points.append(np.asarray(colData))
    return data_points
#Function to drop columns (tests) in the raw data file which are NOT tests (i.e metadata like start time, lot number etc)
def test_only(datafile,labelfile):
    keys = list(labelfile['Name'])
    df = datafile[keys]
    return df
def identify_col(df):
    null_perCol = df.isnull().sum()
    insuff_col = {}
    for index in range(len(null_perCol)):
        if(df.shape[0]-null_perCol[index]<50000):
            insuff_col[null_perCol.index[index]] = null_perCol[index]
    return list(insuff_col.keys())
#Function to find out non applicable rows (no distribution, not part of training data)
def non_applicable (df_label):
    null_rowsLabel = df_label[df_label['Distribution Type'].isnull()]
    null_rowsLabel = list(null_rowsLabel['Name'])
    return null_rowsLabel
def drop_col(df_dropped,less_50000):
    pd.options.mode.chained_assignment = None 
    df = df_dropped.drop(columns=less_50000)
    return df
#Check if everything is replaced correctly (no more null values for the columns with <50000) data points
def check_nonull(df):
    null_perCol = df.isnull().sum()
    insuff_col = {}
    for index in range(len(null_perCol)):
        if(df.shape[0]-null_perCol[index]<50000):
            insuff_col[null_perCol.index[index]] = null_perCol[index]
    print(insuff_col)
    print(len(insuff_col))     

### Importing our data 
- HK data is the data containing array instances, and they are already cleaned
- HK label is the relabelled labels of the corresponding array instances

In [3]:
HK_data = pd.read_csv('HK_cleaned_EY.csv',low_memory=False)

In [4]:
HK_normalised = HK_data

In [5]:
import chardet
filename = 'HK_labels_31_10_23.csv'
with open(filename, 'rb') as file:
    print(chardet.detect(file.read()))

{'encoding': 'UTF-16', 'confidence': 1.0, 'language': ''}


In [6]:
HK_label = pd.read_csv(r'HK_labels_31_10_23.csv',encoding = 'utf-16')

### Data cleaning and pre-preparation for modelling

In [7]:
#Extract the corresponding columns from the label file
good_cols = list(HK_normalised.columns)
keys = HK_label['Name']
extract = [i for i in keys if i in good_cols]
print(len(extract))
HK_corr_label = HK_label[HK_label['Name'].isin(extract)]

1689


In [8]:
#Function to concat the dataframes (raw data and the name of the tests) -> row format. i.e 2 columns: data, name
def concat_xy(df):
    cols = ['Data','Name']
    concat_df = pd.DataFrame(columns=cols,index=range(df.shape[1]))
    index = 0
    for colName, colData in df.items():
        #variable index is to retrieve the index of the columns in YK raw (final_df)
        vals = np.array(colData.values)
        concat_df.loc[index].Data = vals
        concat_df.loc[index].Name = colName
        index+=1
    return concat_df
#Function concatenating the Y (distribution) with the raw data (X) -> dataframe passed in is the label final! concat_df is testname_data
def model_xy(label_df,concat_df):
    dist_labels = label_df[['Distribution Type','Name']]
    model_df = pd.merge(concat_df,dist_labels,on='Name')
    #model_df_xy = model_df[['Data','Distribution Type']]
    return model_df

In [9]:
#Sorting the data beforehand before modelling
num_classes = 6
def sort_df(df):
    #Make a copy then sort
    sorted_df = df
    for index in range(sorted_df.shape[0]):
        sorted_df['Data'].iloc[index].sort()
    return sorted_df
#Function to convert all the data (individual tuple) into arrays/list for modelling
def convert_data_rows(df):
    data_points = []
    for colName,colData in df.items():
        data_points.append(np.asarray(colData))
    return data_points

In [10]:
HK_testname_data = concat_xy(HK_normalised)
print(HK_testname_data.head())
HK_model = model_xy(HK_corr_label,HK_testname_data)
print(HK_model.head())
print(HK_model['Distribution Type'].value_counts())
print(HK_model['Distribution Type'])

                                                Data  \
0  [0.7087378640776696, 0.737864077669903, 0.7864...   
1  [0.4456761723727496, 0.6582697960780024, 0.557...   
2  [0.8347090082415121, 0.6804384607235257, 0.677...   
3  [0.0, 0.1538461538461542, 0.4615384615384617, ...   
4  [0.6612903225806455, 0.7258064516129035, 0.709...   

                                               Name  
0  a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.13_x_x_x__  
1   p_open_PADS.DIG.OPENVDDIO.V_x_1mA_x__SE_SPI_CLK  
2       a_anlXphase_CLIF.TX.BETA.40_x_VDDPA.3V3_x__  
3           a_vihXvtx_CLIF.LPDET.DIFF.SEL10_x_x_x__  
4   a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.6_x_x_x__  
                                                Data  \
0  [0.7087378640776696, 0.737864077669903, 0.7864...   
1  [0.4456761723727496, 0.6582697960780024, 0.557...   
2  [0.8347090082415121, 0.6804384607235257, 0.677...   
3  [0.0, 0.1538461538461542, 0.4615384615384617, ...   
4  [0.6612903225806455, 0.7258064516129035, 0.709...   

  

## Modelling - Support Vector Machine

### Separate them based on distribution type for separate analysis of prediction
- This is so that we can preserve the testnames to plot them back in Exensio for analysis

In [11]:
#Drop the tests with undeterministic distributions/not necessary for prediction
all_tests = list(HK_model['Name'])
#Those that are weird (to be removed) and potentially belong to another dist but are labelled to have a dist
unecessary = ['i_TOP-MISC-DLOG_x_x_x_x__Touch-Down-Num','p_postXfreq_QA.NFC.RINGO.fe.nand2.long.0_x_nom_x','p_postXfreq_QA.NFC.RINGO.fe.nor2.long.0_x_nom_x','p_postXfreq_QA.NFC.RINGO.be.m24.rc.line_x_nom_x']
#Remove untagged or unecessary (those that have tagged but undeterminstic)
untagged = list(HK_model[HK_model['Distribution Type']=='Untagged']['Name'])
print(len(untagged))
HK_model_dropped = HK_model[~HK_model['Name'].isin(unecessary+untagged)]
print(HK_model_dropped.head())
print(len(HK_model_dropped))

26
                                                Data  \
0  [0.7087378640776696, 0.737864077669903, 0.7864...   
1  [0.4456761723727496, 0.6582697960780024, 0.557...   
2  [0.8347090082415121, 0.6804384607235257, 0.677...   
3  [0.0, 0.1538461538461542, 0.4615384615384617, ...   
4  [0.6612903225806455, 0.7258064516129035, 0.709...   

                                               Name Distribution Type  
0  a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.13_x_x_x__           outlier  
1   p_open_PADS.DIG.OPENVDDIO.V_x_1mA_x__SE_SPI_CLK            normal  
2       a_anlXphase_CLIF.TX.BETA.40_x_VDDPA.3V3_x__           outlier  
3           a_vihXvtx_CLIF.LPDET.DIFF.SEL10_x_x_x__          discrete  
4   a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.6_x_x_x__           outlier  
1662


In [12]:
HK_model_dropped['Distribution Type'].value_counts()

outlier       809
functional    292
longtail      184
normal        149
discrete      136
bimodal        92
Name: Distribution Type, dtype: int64

### Analysing the unbalanced dataset
- How many instances are there for each distribution

In [13]:
#Grouping the dataframe by the distribution type
grouped_df = HK_model_dropped.groupby('Distribution Type')
#Verifying that grouping is indeed done
for group_name, group_data in grouped_df:
    print("Group: {}".format(group_name))
    print(group_data.head())
    print("------")

Group: bimodal
                                                  Data  \
11   [0.2741480578231607, 0.2869562789139079, 0.184...   
35   [0.3009006829558842, 0.265729385711893, 0.2835...   
102  [0.5335934209651043, 0.2199144370802059, 0.362...   
133  [0.0, 0.032258064516129, 1.0, 1.0, 0.032258064...   
135  [0.0, 0.1333333333333333, 0.0, 1.0, 1.0, 0.066...   

                                            Name Distribution Type  
11         a_clk_TXDC.DELAYCHAIN_x_VDDPA.3V3_x__           bimodal  
35       p_clc_PADS.SIG.DELTA.EOF.V_x_1mA_x__TX2           bimodal  
102           p_short_PADS.DIG.V_x_1mA_x__PMUVCC           bimodal  
133  ip_trim_TOP--DLOG-TRIMVALUE-VREF.LQ-WR.PFN2           bimodal  
135  ip_trim_TOP--DLOG-TRIMVALUE-IREF.HQ-WR.PFN2           bimodal  
------
Group: discrete
                                                 Data  \
3   [0.0, 0.1538461538461542, 0.4615384615384617, ...   
13  [0.0, 0.7692307692307692, 0.2307692307692308, ...   
20  [0.4000000000000001, 0.5

### Trimming data to allow model to be trained with balanced data set

In [14]:
#Function to trim the group
def trim_df (new_df,grouped_df):
    trimmed_group = None
    for _,group in grouped_df:
        count = len(group)
        #If the number of instance for any group is below limit (100), print warning statement
        if count <= 90:
            print("Group: {} has less than 90!".format(group))
        else:
            trimmed_group = group.sample(n=90,random_state=42)
        trimmed_df = pd.concat([new_df,trimmed_group])
        new_df = trimmed_df
    return trimmed_df
#Creating a new dataframe to store the trimmed data
trimmed_df = pd.DataFrame(columns=HK_model_dropped.columns)
print(trimmed_df.head())
#Applying the function
trimmed_df = trim_df(trimmed_df,grouped_df)
#Resetting the index of the trimmed dataframe
trimmed_df = trimmed_df.reset_index(drop=True)
print(trimmed_df)
print(trimmed_df['Distribution Type'].value_counts())

Empty DataFrame
Columns: [Data, Name, Distribution Type]
Index: []
                                                  Data  \
0    [0.549284438224691, 0.6680452392509326, 0.3225...   
1    [0.6203717955893876, 0.5743185925235628, 0.438...   
2    [0.104463099767329, 0.4583138840177989, 0.3892...   
3    [0.4847588264545095, 0.2324714072600784, 0.307...   
4    [0.2741480578231607, 0.2869562789139079, 0.184...   
..                                                 ...   
535  [0.5370823145884271, 0.5215973920130399, 0.449...   
536  [0.3209093280119366, 0.2417547091500351, 0.162...   
537  [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...   
538  [0.0026848040093073, 0.0035797386790764, 0.003...   
539  [0.9999942766938472, 0.9999944533970044, 0.999...   

                                                  Name Distribution Type  
0               p_lkg_PADS.IILEND_x_1V8_x__NFC_CLK_REQ           bimodal  
1                p_lkg_PADS.IIH_x_1V8_x__NFC_SIM_SWIO1           bimodal  
2          

### Training
- We extracted 90 instances of each distribution
- We trained the model with 72 instances of each distribution (80%)

In [86]:
#Prepare the training and sorting data 
from sklearn.model_selection import train_test_split
def support_vector(model_df):
    #label_encoder = LabelEncoder()
    xData = sort_df(model_df)['Data']
    yData = sort_df(model_df)['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42) 
    X_train = convert_data_rows(X_train)
    X_test = convert_data_rows(X_test)
    X_train = np.array(X_train).astype('float64')
    X_test= np.array(X_test).astype('float64')
    #num_samples, num_timesteps = X_train.shape
    #X_train = X_train.reshape(num_samples,num_timesteps,1)
    #X_test = X_test.reshape(X_test.shape[0],num_timesteps,1)
    #Y_train = label_encoder.fit_transform(Y_train)
    #Y_train = to_categorical(Y_train,num_classes)
    #Y_test = label_encoder.fit_transform(Y_test)
    #Y_test = to_categorical(Y_test,num_classes)
    return X_train,X_test,Y_train,Y_test

In [87]:
X_train_SV, X_test_SV, Y_train_SV, Y_test_SV = support_vector(trimmed_df)

In [90]:
print(X_test_SV[28])

[0.         0.00724202 0.00724202 ... 0.94834142 0.94834142 1.        ]


In [17]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import joblib
svm_classifier = SVC(kernel='linear',random_state=42)
svm_classifier.fit(X_train_SV,Y_train_SV)
pred_classes = svm_classifier.predict(X_test_SV)
print("Accuracy: {}".format(accuracy_score(pred_classes,Y_test_SV)))

Accuracy: 0.7777777777777778


### Extracting data based on distribution types for further analysis

In [18]:
HK_model = HK_model_dropped
outlier_df = HK_model[HK_model['Distribution Type']=='outlier']
normal_df =  HK_model[HK_model['Distribution Type']=='normal']
bimodal_df =  HK_model[HK_model['Distribution Type']=='bimodal']
longtail_df =  HK_model[HK_model['Distribution Type']=='longtail']
functional_df =  HK_model[HK_model['Distribution Type']=='functional']
discrete_df =  HK_model[HK_model['Distribution Type']=='discrete']
#Check if all the cases are covered
print(len(outlier_df)+len(normal_df)+len(bimodal_df)+len(longtail_df)+len(functional_df)+len(discrete_df)==len(HK_model))

True


In [19]:
#Function to conserve the test name 
def class_model(HK_model):
    HK_arrays = sort_df(HK_model)['Data']
    HK_arrays = convert_data_rows(HK_arrays)
    HK_arrays = np.array(HK_arrays).astype('float64')
    HK_testlabel = HK_model['Distribution Type']
    return HK_arrays, HK_testlabel
outlier_arrays, outlier_label = class_model(outlier_df)
normal_arrays, normal_label = class_model(normal_df)
bimodal_arrays, bimodal_label = class_model(bimodal_df)
longtail_arrays, longtail_label = class_model(longtail_df)
functional_arrays, functional_label = class_model(functional_df)
discrete_arrays, discrete_label = class_model(discrete_df)

### Retrieving the class accuracy for SVM

In [20]:
#Creating a function to test accuracy of classes 
def class_accuracy(dist,model,arrays,testlabels):
    pred_classes = model.predict(arrays)
    print("Accuracy for {} is: {}".format(dist,accuracy_score(pred_classes,testlabels)))
    return pred_classes
#sv_model = svm_classifier
outlier_pred = class_accuracy("outlier",svm_classifier,outlier_arrays,outlier_label)
normal_pred = class_accuracy("normal",svm_classifier,normal_arrays,normal_label)
bimodal_pred = class_accuracy("bimodal",svm_classifier,bimodal_arrays,bimodal_label)
longtail_pred = class_accuracy("longtail",svm_classifier,longtail_arrays,longtail_label)
functional_pred = class_accuracy("functional",svm_classifier,functional_arrays,functional_label)
discrete_pred = class_accuracy("discrete",svm_classifier,discrete_arrays,discrete_label)

Accuracy for outlier is: 0.9245982694684796
Accuracy for normal is: 0.9261744966442953
Accuracy for bimodal is: 0.8478260869565217
Accuracy for longtail is: 0.8043478260869565
Accuracy for functional is: 0.9315068493150684
Accuracy for discrete is: 0.7647058823529411


## Finding out misclassification for SVM

In [21]:
from collections import Counter
def missclass(distclass,pred_class,testlabel):
    pred_counter = Counter(pred_class)
    for dist, times in pred_counter.items():
        print("{}: Model predicted {} for {} times".format(distclass,dist,times))
    print("------------------------------------------------------------")
missclass("outlier",outlier_pred,outlier_label)
missclass("normal",normal_pred,normal_label)
missclass("bimodal",bimodal_pred,bimodal_label)
missclass("longtail",longtail_pred,longtail_label)
missclass("functional",functional_pred,functional_label)
missclass("discrete",discrete_pred,discrete_label)

outlier: Model predicted outlier for 748 times
outlier: Model predicted normal for 21 times
outlier: Model predicted longtail for 32 times
outlier: Model predicted bimodal for 6 times
outlier: Model predicted discrete for 2 times
------------------------------------------------------------
normal: Model predicted normal for 138 times
normal: Model predicted longtail for 8 times
normal: Model predicted bimodal for 3 times
------------------------------------------------------------
bimodal: Model predicted bimodal for 78 times
bimodal: Model predicted longtail for 4 times
bimodal: Model predicted normal for 8 times
bimodal: Model predicted discrete for 2 times
------------------------------------------------------------
longtail: Model predicted longtail for 148 times
longtail: Model predicted normal for 27 times
longtail: Model predicted outlier for 8 times
longtail: Model predicted discrete for 1 times
------------------------------------------------------------
functional: Model pred

In [22]:
#Creating a function to get a predicted column
def prediction_SVM(df,arrays,testlabels,model):
    model_df = pd.DataFrame()
    pred_classes = model.predict(arrays)
    model_df['Name'] = df['Name']
    model_df['Actual Distribution'] = testlabels
    model_df['Predicted Distribution'] = pred_classes 
    return model_df
#Sieve out those the specific tests that model misclassified
def sieve_test(model_df,actual,predicted):
    sieved = model_df[(model_df['Actual Distribution']==actual) & (model_df['Predicted Distribution']==predicted)]
    return sieved

## Misclassified Outliers

In [23]:
#Try for only outlier df
outlier_predicted = prediction_SVM(outlier_df,outlier_arrays,outlier_label,svm_classifier)
print(outlier_predicted.head())

                                                Name Actual Distribution  \
0   a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.13_x_x_x__             outlier   
2        a_anlXphase_CLIF.TX.BETA.40_x_VDDPA.3V3_x__             outlier   
4    a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.6_x_x_x__             outlier   
5                      a_vrf_CLIF.VCM.HP.RXN_x_x_x__             outlier   
6  f_trustp_TOP-NFC-SE-INIT-JCOP.CASAppletID.ans2...             outlier   

  Predicted Distribution  
0                outlier  
2                outlier  
4                outlier  
5                outlier  
6                outlier  


In [24]:
#Outliers predicted correctly as outliers
correct = sieve_test(outlier_predicted,"outlier","outlier")
print(correct)
print(correct.shape)

                                                   Name Actual Distribution  \
0      a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.13_x_x_x__             outlier   
2           a_anlXphase_CLIF.TX.BETA.40_x_VDDPA.3V3_x__             outlier   
4       a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.6_x_x_x__             outlier   
5                         a_vrf_CLIF.VCM.HP.RXN_x_x_x__             outlier   
6     f_trustp_TOP-NFC-SE-INIT-JCOP.CASAppletID.ans2...             outlier   
...                                                 ...                 ...   
1682                 p_dcspec_PADS.VOH_x_1V2_x__NFC_IRQ             outlier   
1684  f_trustp_TOP-NFC-SE-INIT-JCOP.ConfigAppletID.a...             outlier   
1685  f_trustp_TOP-NFC-SE-INIT-JCOP.SMX.APDU.Pow.Boo...             outlier   
1686  a_vrf_PMUVDDPA_x_VSUP-PWR.4V25-VDDPA.3V3-530mA...             outlier   
1687  a_icc_PMUVDDPA_x_VSUP-PWR.4V0-VDDPA.3V45.320mA...             outlier   

     Predicted Distribution  
0                   o

In [25]:
#Outliers predicted correctly as outliers
correct = sieve_test(outlier_predicted,"outlier","outlier")
print(len(correct))
correct = sieve_test(outlier_predicted,"outlier","normal")
print(len(correct))
correct = sieve_test(outlier_predicted,"outlier","longtail")
print(len(correct))
correct = sieve_test(outlier_predicted,"outlier","discrete")
print(len(correct))
correct = sieve_test(outlier_predicted,"outlier","functional")
print(len(correct))




748
21
32
2
0


## Misclassified Normal

In [26]:
#Try for only outlier df
outlier_predicted = prediction_SVM(outlier_df,outlier_arrays,outlier_label,svm_classifier)
print(outlier_predicted.head())

                                                Name Actual Distribution  \
0   a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.13_x_x_x__             outlier   
2        a_anlXphase_CLIF.TX.BETA.40_x_VDDPA.3V3_x__             outlier   
4    a_clcXrftrimm_CLIF.TX.PHASE.VDDPA.66C.6_x_x_x__             outlier   
5                      a_vrf_CLIF.VCM.HP.RXN_x_x_x__             outlier   
6  f_trustp_TOP-NFC-SE-INIT-JCOP.CASAppletID.ans2...             outlier   

  Predicted Distribution  
0                outlier  
2                outlier  
4                outlier  
5                outlier  
6                outlier  


### Note - Only Normal and Outliers are shown. 
- Repeat and reuse codes to analyse other distributions accordingly

## Modelling - Random Forest
- We then repeated the whole process of training using the random forest model

In [27]:
#Prepare the training and sorting data 
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
def train_test_RF(model_df):
    #label_encoder = LabelEncoder()
    xData = sort_df(model_df)['Data']
    yData = sort_df(model_df)['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42)
    X_train = convert_data_rows(X_train)
    X_test = convert_data_rows(X_test)
    X_train = np.array(X_train).astype('float64')
    X_test= np.array(X_test).astype('float64')
    #num_samples, num_timesteps = X_train.shape
    #X_train = X_train.reshape(num_samples,num_timesteps,1)
    #X_test = X_test.reshape(X_test.shape[0],num_timesteps,1)
    Y_train = label_encoder.fit_transform(Y_train)
    Y_train = to_categorical(Y_train,num_classes)
    Y_test = label_encoder.fit_transform(Y_test)
    Y_test = to_categorical(Y_test,num_classes)
    return X_train,X_test,Y_train,Y_test

In [63]:
X_train_RF,X_test_RF,Y_train_RF,Y_test_RF = train_test_RF(trimmed_df)

In [29]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the Random Forest classifier
rf_classifier.fit(X_train_RF, Y_train_RF)
# Predict the labels for the test set
y_pred = rf_classifier.predict(X_test_RF)
# Evaluate the accuracy of the classifier
accuracy = np.mean(y_pred == Y_test_RF)
print("Accuracy: {}".format(accuracy_score(y_pred,Y_test_RF)))

Accuracy: 0.7777777777777778


### Unpacking the encoding of Random Forest model
- This is so that we can identify the labels as strings instead of integers and analyse them accordingly

In [30]:
class_labels = label_encoder.classes_
labels = []
for i,label in enumerate(class_labels):
    print("Integer {} corresponds to class labels: {}".format(i,label))
    labels.append(label)

Integer 0 corresponds to class labels: bimodal
Integer 1 corresponds to class labels: discrete
Integer 2 corresponds to class labels: functional
Integer 3 corresponds to class labels: longtail
Integer 4 corresponds to class labels: normal
Integer 5 corresponds to class labels: outlier


In [31]:
#Switch statements to return corresponding distributions
def return_dist(arg):
    match arg:
        case 0:
            return "bimodal"
        case 1:
            return "discrete"
        case 2:
            return "functional"
        case 3:
            return "longtail"
        case 4:
            return "normal"
        case 5:
            return "outlier"
#Function to concat the distributions together
def dist_array(num_array):
    dist = []
    for num in num_array:
        dist_name = return_dist(num)
        dist.append(dist_name)
    return dist

### Retrieving the class accuracy for RF

In [32]:
#Creating a function to test accuracy of classes 
def class_accuracy_RF(dist,model,arrays,testlabels):
    arrays = np.array(arrays).astype('float64')
    pred_classes = model.predict(arrays)
    pred_classes = np.argmax(pred_classes,axis=1)
    pred_classes = dist_array(pred_classes)
    print("Accuracy for {} is: {}".format(dist,accuracy_score(pred_classes,testlabels)))
    return pred_classes
#sv_model = rf_classifier
outlier_pred_RF = class_accuracy_RF("outlier",rf_classifier,outlier_arrays,outlier_label)
normal_pred_RF = class_accuracy_RF("normal",rf_classifier,normal_arrays,normal_label)
bimodal_pred_RF = class_accuracy_RF("bimodal",rf_classifier,bimodal_arrays,bimodal_label)
longtail_pred_RF = class_accuracy_RF("longtail",rf_classifier,longtail_arrays,longtail_label)
functional_pred_RF = class_accuracy_RF("functional",rf_classifier,functional_arrays,functional_label)
discrete_pred_RF = class_accuracy_RF("discrete",rf_classifier,discrete_arrays,discrete_label)

Accuracy for outlier is: 0.9184177997527813
Accuracy for normal is: 0.9664429530201343
Accuracy for bimodal is: 0.9347826086956522
Accuracy for longtail is: 0.8641304347826086
Accuracy for functional is: 0.9623287671232876
Accuracy for discrete is: 0.8455882352941176


## Finding out where misclassification went wrong RF

In [44]:
from collections import Counter
def missclass(distclass,pred_class,testlabel):
    pred_counter = Counter(pred_class)
    for dist, times in pred_counter.items():
        print("{}: Model predicted {} for {} times".format(distclass,dist,times))
    print("------------------------------------------------------------")
missclass("outlier",outlier_pred_RF,outlier_label)
missclass("normal",normal_pred_RF,normal_label)
missclass("bimodal",bimodal_pred_RF,bimodal_label)
missclass("longtail",longtail_pred_RF,longtail_label)
missclass("functional",functional_pred_RF,functional_label)
missclass("discrete",discrete_pred_RF,discrete_label)

outlier: Model predicted outlier for 743 times
outlier: Model predicted normal for 10 times
outlier: Model predicted discrete for 5 times
outlier: Model predicted bimodal for 28 times
outlier: Model predicted longtail for 23 times
------------------------------------------------------------
normal: Model predicted normal for 144 times
normal: Model predicted bimodal for 5 times
------------------------------------------------------------
bimodal: Model predicted bimodal for 86 times
bimodal: Model predicted longtail for 2 times
bimodal: Model predicted discrete for 1 times
bimodal: Model predicted normal for 2 times
bimodal: Model predicted outlier for 1 times
------------------------------------------------------------
longtail: Model predicted longtail for 159 times
longtail: Model predicted normal for 5 times
longtail: Model predicted outlier for 8 times
longtail: Model predicted bimodal for 12 times
------------------------------------------------------------
functional: Model pred

In [45]:
#Accuracy in proportion
from collections import Counter
def missclass_prop(distclass,pred_class,testlabel):
    pred_counter = Counter(pred_class)
    for dist, times in pred_counter.items():
        print("{}: Model predicted {} for {} percent".format(distclass,dist, (times/len(testlabel)*100) ))
    print("------------------------------------------------------------")
#For old
'''
missclass_prop("outlier",outlier_pred_RF_old,outlier_label_old)
missclass_prop("normal",normal_pred_RF_old,normal_label_old)
missclass_prop("bimodal",bimodal_pred_RF_old,bimodal_label_old)
missclass_prop("longtail",longtail_pred_RF_old,longtail_label_old)
missclass_prop("functional",functional_pred_RF_old,functional_label_old)
missclass_prop("discrete",discrete_pred_RF_old,discrete_label_old)
'''
print("~~~~~~~~~~~~~~~~~~~~RELABELLED MODELLLLL~~~~~~~~~~~~~~~~~~~~~~~~")
#For new
missclass_prop("outlier",outlier_pred_RF,outlier_label)
missclass_prop("normal",normal_pred_RF,normal_label)
missclass_prop("bimodal",bimodal_pred_RF,bimodal_label)
missclass_prop("longtail",longtail_pred_RF,longtail_label)
missclass_prop("functional",functional_pred_RF,functional_label)
missclass_prop("discrete",discrete_pred_RF,discrete_label)

outlier: Model predicted outlier for 81.81818181818183 percent
outlier: Model predicted bimodal for 9.87012987012987 percent
outlier: Model predicted longtail for 6.363636363636363 percent
outlier: Model predicted normal for 1.8181818181818181 percent
outlier: Model predicted discrete for 0.12987012987012986 percent
------------------------------------------------------------
normal: Model predicted normal for 82.5 percent
normal: Model predicted outlier for 2.5 percent
normal: Model predicted bimodal for 8.333333333333332 percent
normal: Model predicted longtail for 5.833333333333333 percent
normal: Model predicted functional for 0.8333333333333334 percent
------------------------------------------------------------
bimodal: Model predicted bimodal for 100.0 percent
------------------------------------------------------------
longtail: Model predicted longtail for 51.55807365439094 percent
longtail: Model predicted bimodal for 31.444759206798867 percent
longtail: Model predicted outli

### Conclusion 
- Random Forests and Support Vector Machine both perform relatively well >75%
- Random Forests predicts best in: 1. Normal, 2. Functional, 3. Bimodal
- Support Vector Machine predicts best in: 1. Functional, 2. Normal, 3. Outlier
- We can exploit these data for future steps (for further explanation, refer to 'Ensemble Learning Attempt' pdf report