## Importing necessary libraries and files

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Loading the cleaned, concatenated using the npz file 
loaded_data = np.load('YK_TY_df.npz')
print(loaded_data)
loaded_data_arrays = loaded_data['data']
loaded_labels = loaded_data['labels']
loaded_df = pd.DataFrame({'Data': loaded_data_arrays.tolist(), 'Distribution Type': loaded_labels.tolist()})

<numpy.lib.npyio.NpzFile object at 0x000002B52FCC60B0>


In [3]:
#Visualing the data we have imported
print(loaded_df.head())
print(loaded_df['Distribution Type'].value_counts())

                                                Data Distribution Type
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...        functional
1  [0.399193055385744, 0.3974405999103395, 0.3969...          longtail
2  [0.0, 0.0002055076037813, 0.000308261405672, 0...          longtail
3  [0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, ...        functional
4  [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, ...        functional
functional    1186
outlier        373
normal         327
longtail       290
bimodal        131
discrete       120
Name: Distribution Type, dtype: int64


In [25]:
#Checking shape 
print(len(loaded_df['Data'].iloc[1]))

[0.399193055385744, 0.3974405999103395, 0.3969922973468639, 0.5864612625830379, 0.5864612625830379, 0.5864612625830379, 0.5929412723641847, 0.5929412723641847, 0.5929412723641847, 0.5929412723641847, 0.5735827525777398, 0.5735827525777398, 0.5735827525777398, 0.5735827525777398, 0.5970575049924604, 0.5970575049924604, 0.5970575049924604, 0.5970575049924604, 0.5799404980233933, 0.5799404980233933, 0.5799404980233933, 0.5929820271426826, 0.5929820271426826, 0.5929820271426826, 0.5929820271426826, 0.5770469087500509, 0.5770469087500509, 0.5770469087500509, 0.5770469087500509, 0.5912295716672781, 0.5912295716672781, 0.5912295716672781, 0.5912295716672781, 0.5839344663161756, 0.5839344663161756, 0.5839344663161756, 0.5926152341362025, 0.5926152341362025, 0.5926152341362025, 0.5926152341362025, 0.5789216285609489, 0.5789216285609489, 0.5789216285609489, 0.5789216285609489, 0.5899254187553491, 0.5899254187553491, 0.5899254187553491, 0.5899254187553491, 0.578147287769491, 0.578147287769491, 0.

As seen above, the classes are unbalanced. It is not possible to oversample the classes with fewer numbers (eg discrete, bimodal etc) as it might introduce bias if we input 50,000 data points each for instance we oversample. Hence, we will try to balance the class by limiting each class to ~100 

## Addressing the unbalanced classes
We shall cut each class to 100 instances

In [4]:
#Grouping the dataframe by the distribution type
grouped_df = loaded_df.groupby('Distribution Type')
#Verifying that grouping is indeed done
for group_name, group_data in grouped_df:
    print("Group: {}".format(group_name))
    print(group_data.head())
    print("------")

Group: bimodal
                                                 Data Distribution Type
21  [0.5733333333333334, 0.3333333333333333, 0.52,...           bimodal
22  [0.5441176470588236, 0.5, 0.6323529411764706, ...           bimodal
23  [0.5789473684210528, 0.3157894736842106, 0.543...           bimodal
32  [0.4029850746268657, 0.373134328358209, 0.6567...           bimodal
69  [0.4519230769230768, 0.3846153846153846, 0.423...           bimodal
------
Group: discrete
                                                  Data Distribution Type
97   [0.526276733389146, 0.4399402542050392, 0.5414...          discrete
102  [0.5361310054553652, 0.4429104227188603, 0.547...          discrete
118  [0.3999999999999999, 0.3999999999999999, 0.399...          discrete
119  [0.25, 0.5, 0.25, 0.25, 0.3333333333333335, 0....          discrete
147  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...          discrete
------
Group: functional
                                                Data Distribution T

In [5]:
#Function to trim the group
def trim_df (new_df,grouped_df):
    for _,group in grouped_df:
        count = len(group)
        #If the number of instance for any group is below limit (100), print warning statement
        if count <= 100:
            print("Group: {} has less than 100!".format(group))
        else:
            trimmed_group = group.sample(n=100,random_state=42)
        trimmed_df = pd.concat([new_df,trimmed_group])
        new_df = trimmed_df
    return trimmed_df

In [6]:
#Creating a new dataframe to store the trimmed data
trimmed_df = pd.DataFrame(columns=loaded_df.columns)
print(trimmed_df.head())
#Applying the function
trimmed_df = trim_df(trimmed_df,grouped_df)
#Resetting the index of the trimmed dataframe
trimmed_df = trimmed_df.reset_index(drop=True)
print(trimmed_df)
print(trimmed_df['Distribution Type'].value_counts())

Empty DataFrame
Columns: [Data, Distribution Type]
Index: []
                                                  Data Distribution Type
0    [0.7591974135354557, 0.1181593030329548, 0.193...           bimodal
1    [0.7519267102526808, 0.2311100841364126, 0.316...           bimodal
2    [0.6567157154958743, 0.6949298779531272, 0.253...           bimodal
3    [0.6688971655829814, 0.6919859857332238, 0.257...           bimodal
4    [1.2104049194514532e-24, 1.202252104985288e-24...           bimodal
..                                                 ...               ...
595  [0.128440366972477, 0.128440366972477, 0.13761...           outlier
596  [0.7654995657651831, 0.7636762084978036, 0.757...           outlier
597  [0.6666455123336696, 0.7699331037568015, 0.790...           outlier
598  [0.74529091888708, 0.7469155875998774, 0.73799...           outlier
599  [0.992110753094851, 0.9927640028385896, 0.9931...           outlier

[600 rows x 2 columns]
bimodal       100
discrete      100
fun

## Modelling 
We will start modelling LSTM 

In [9]:
#Importing relevant modules to build model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [21]:
#Initiating the model we will be using
num_classes = 6
model = Sequential()
model.add(LSTM(64, input_shape=(50000, 1)))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
#Sorting the data beforehand before modelling
def sort_df(df):
    #Make a copy then sort
    sorted_df = df
    for index in range(sorted_df.shape[0]):
        sorted_df['Data'].iloc[index].sort()
    return sorted_df
#Function to convert all the data (individual tuple) into arrays/list for modelling
def convert_data_rows(df):
    data_points = []
    for colName,colData in df.items():
        data_points.append(np.asarray(colData))
    return data_points
#Prepare the training and sorting data 
def train_test(model_df):
    label_encoder = LabelEncoder()
    xData = sort_df(model_df)['Data']
    yData = sort_df(model_df)['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42)
    X_train = convert_data_rows(X_train)
    X_test = convert_data_rows(X_test)
    X_train = np.array(X_train).astype('float64')
    X_test= np.array(X_test).astype('float64')
    num_samples, num_timesteps = X_train.shape
    X_train = X_train.reshape(num_samples,num_timesteps,1)
    X_test = X_test.reshape(X_test.shape[0],num_timesteps,1)
    Y_train = label_encoder.fit_transform(Y_train)
    Y_train = to_categorical(Y_train,num_classes)
    Y_test = label_encoder.fit_transform(Y_test)
    Y_test = to_categorical(Y_test,num_classes)
    return X_train,X_test,Y_train,Y_test

In [23]:
#Preparing training data 
X_train,X_test,Y_train,Y_test = train_test(trimmed_df)

In [57]:
print(X_train.shape)
print(X_test.shape)

(360, 50000, 1)
(120, 50000, 1)


In [24]:
#Running the model
import time
start_time = time.time()
model.fit(X_train, Y_train, epochs=10, batch_size=32)
end_time = time.time()
elasped_time = end_time - start_time
print("Time taken: {}".format(elasped_time))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Time taken: 54526.24233055115


In [26]:
from tensorflow.keras.models import save_model, load_model
save_model(model,'LSTM_trained.h5')

In [41]:
#Try load model
loaded_model_LSTM = load_model('LSTM_trained.h5')

## Attempting another model (CNN)

In [28]:
#Use same x train y train except further split the train into validation
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.25, random_state=42)  # 60% train, 20% validation, 20% test

In [30]:
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
modelCnn = Sequential()
modelCnn.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
modelCnn.add(MaxPooling1D(pool_size=2))
modelCnn.add(Flatten())
modelCnn.add(Dense(64, activation='relu'))
modelCnn.add(Dense(num_classes, activation='softmax'))
modelCnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [32]:
# Train the model
modelCnn.fit(X_train, Y_train, epochs=10, batch_size=32, validation_data=(X_val, Y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a0020c5510>

In [34]:
#Saving the model
save_model = save_model(modelCnn,'Cnn_trained.h5')

In [35]:
#Load the CNN model and try it with test
cnn_model = load_model('Cnn_trained.h5')

In [39]:
y_pred_cnn = cnn_model.predict(X_test)



In [40]:
y_pred_classes_cnn = np.argmax(y_pred_cnn,axis=1)
print(y_pred_classes_cnn)

[1 4 1 1 1 1 1 4 3 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 4 5 1 0 1 1 4 4
 1 4 1 1 1 5 1 5 1 4 1 0 4 1 1 1 1 1 0 1 1 1 1 1 4 4 1 1 1 5 1 1 1 1 1 1 4
 2 1 3 1 5 1 1 4 1 1 3 0 1 2 4 1 1 5 0 4 1 1 1 0 1 1 1 1 1 1 1 1 1 1 2 1 1
 1 1 1 1 1 1 1 1 4]


In [42]:
y_pred_lstm = loaded_model_LSTM.predict(X_test)



In [43]:
y_pred_classes_lstm = np.argmax(y_pred_lstm,axis=1)
print(y_pred_classes_lstm)

[2 4 4 2 2 5 2 4 4 4 4 4 4 4 4 4 4 2 4 2 4 4 4 4 4 4 2 4 2 4 4 5 4 5 4 4 4
 4 4 4 4 4 2 4 2 4 4 2 4 4 2 2 2 1 4 4 4 4 4 4 4 4 4 4 5 4 4 4 2 4 4 4 4 4
 4 4 4 4 2 4 1 4 4 4 4 4 4 4 4 4 5 2 4 4 2 4 2 4 4 4 4 4 4 4 5 2 2 4 4 5 4
 2 4 4 4 4 4 4 4 4]


#Check the actual labels/ Y test
Y_test_check = np.argmax(Y_test,axis=1)
print(Y_test_check)

In [45]:
#Accuracy for CNN
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y_test_check,y_pred_classes_cnn)
print(accuracy)

0.325


In [47]:
#Accuracy for LSTM
accuracy = accuracy_score(Y_test_check,y_pred_classes_lstm)
print(accuracy)

0.16666666666666666


## XGBoost

In [38]:
!pip install xgboost
import xgboost as xgb



In [39]:
params = {
'objective': 'multi:softmax',  # Multi-class classification objective
'num_class': 6,  # Number of classes
'eta': 0.3,  # Learning rate
'max_depth': 6,  # Maximum depth of the trees
'subsample': 0.8,  # Subsample ratio of the training instances
'colsample_bytree': 0.8,  # Subsample ratio of columns when constructing each tree
'seed': 42  # Random seed for reproducibility
}

In [40]:
#Prepare the training and sorting data 
def train_test_XGB(model_df):
    label_encoder = LabelEncoder()
    xData = sort_df(model_df)['Data']
    yData = sort_df(model_df)['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42) 
    X_train = convert_data_rows(X_train)
    X_test = convert_data_rows(X_test)
    X_train = np.array(X_train).astype('float64')
    X_test= np.array(X_test).astype('float64')
    #num_samples, num_timesteps = X_train.shape
    #X_train = X_train.reshape(num_samples,num_timesteps,1)
    #X_test = X_test.reshape(X_test.shape[0],num_timesteps,1)
    Y_train = label_encoder.fit_transform(Y_train)
    Y_train = to_categorical(Y_train,num_classes)
    Y_test = label_encoder.fit_transform(Y_test)
    Y_test = to_categorical(Y_test,num_classes)
    return X_train,X_test,Y_train,Y_test

In [41]:
#Get training, validation and test data sets
X_train_XGB, X_test_XGB, Y_train_XGB, Y_test_XGB = train_test_XGB(trimmed_df)
#X_train_XGB, X_val_XGB, Y_train_XGB, Y_val_XGB = train_test_split(X_train_XGB, Y_train_XGB, test_size=0.25, random_state=42)  # 70% train, 15% validation, 15% test

In [42]:
#Ensure shape of the y is correst
print(Y_train_XGB.shape)
print(Y_test_XGB.shape)
Y_train_XGB = np.argmax(Y_train_XGB,axis=1)
Y_test_XGB = np.argmax(Y_test_XGB,axis=1)
print(Y_train_XGB.shape)
print(Y_test_XGB.shape)

(480, 6)
(120, 6)
(480,)
(120,)


In [43]:
#Check shape of x 
print(X_train_XGB.shape)

(480, 50000)


In [44]:
dtrain = xgb.DMatrix(X_train_XGB, label=Y_train_XGB)
dtest = xgb.DMatrix(X_test_XGB, label=Y_test_XGB)

In [45]:
model_XGB = xgb.train(params, dtrain, num_boost_round=100)

In [80]:
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_XGB,Y_train_XGB)

In [81]:
pred_classes = xgb_classifier.predict(X_test_XGB)

In [82]:
print(pred_classes)
accuracy = accuracy_score(pred_classes,Y_test_XGB)
print("accuracy: ",accuracy)

[1 4 3 0 1 2 0 3 0 3 0 1 1 5 0 2 0 1 2 0 0 5 5 5 3 5 0 1 0 4 0 1 0 2 2 4 1
 5 4 2 3 0 5 5 0 1 4 1 0 4 1 2 1 5 5 3 2 1 1 1 0 1 0 5 5 0 5 3 0 3 0 0 4 3
 5 5 3 2 5 3 2 4 5 3 1 0 1 2 3 2 2 5 0 4 1 2 1 0 5 0 4 2 5 3 0 0 1 2 2 0 1
 1 3 5 3 3 3 2 0 0]
accuracy:  0.775


In [92]:
import pickle
model_filename = "XGB_model_try.pkl"
with open(model_filename,'wb') as model_file_2:
    pickle.dump(xgb_classifier,model_file_2)

In [83]:
print(Y_test_XGB)

[1 4 5 0 1 2 0 4 0 3 0 1 1 5 3 2 0 2 1 0 0 5 5 5 3 5 0 1 5 4 5 1 0 2 1 4 1
 5 4 2 3 0 5 5 0 1 4 1 0 4 2 2 1 5 5 3 2 1 1 1 0 1 3 5 5 0 5 5 0 3 3 0 4 3
 4 5 3 2 5 4 2 4 5 3 0 0 1 2 3 2 2 5 0 3 2 2 2 0 4 0 0 2 5 3 0 4 1 2 2 4 1
 4 5 5 3 3 3 1 4 4]


In [86]:
#After tuning hyperparameters
xgb_classifier_tuned = xgb.XGBClassifier({'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200})
xgb_classifier_tuned.fit(X_train_XGB,Y_train_XGB)



In [87]:
pred_classes = xgb_classifier_tuned.predict(X_test_XGB)
print(pred_classes)
accuracy = accuracy_score(pred_classes,Y_test_XGB)
print("accuracy: ",accuracy)

[1 4 3 0 1 2 0 3 0 3 0 1 1 5 0 2 0 1 2 0 0 5 5 5 3 5 0 1 0 4 0 1 0 2 2 4 1
 5 4 2 3 0 5 5 0 1 4 1 0 4 1 2 1 5 5 3 2 1 1 1 0 1 0 5 5 0 5 3 0 3 0 0 4 3
 5 5 3 2 5 3 2 4 5 3 1 0 1 2 3 2 2 5 0 4 1 2 1 0 5 0 4 2 5 3 0 0 1 2 2 0 1
 1 3 5 3 3 3 2 0 0]
accuracy:  0.775


In [76]:
#Findng out inference time 
import time
start_time = time.time()
y_pred_XGB = model_XGB.predict(dtest)
end_time = time.time()
elapsed_time = end_time - start_time
print("Time taken: {:.2f}".format(elapsed_time))

Time taken: 0.00


In [77]:
pred_test

array([1., 4., 5., 0., 1., 2., 0., 3., 0., 3., 0., 1., 1., 5., 0., 2., 0.,
       1., 2., 0., 0., 5., 5., 5., 3., 5., 0., 1., 0., 4., 1., 1., 0., 2.,
       2., 4., 1., 5., 4., 2., 3., 0., 5., 0., 0., 1., 4., 1., 0., 4., 1.,
       1., 1., 5., 5., 3., 2., 1., 1., 1., 0., 1., 0., 5., 1., 0., 5., 3.,
       0., 3., 0., 0., 4., 3., 5., 5., 3., 2., 5., 3., 2., 4., 5., 3., 1.,
       0., 1., 2., 3., 2., 2., 5., 0., 4., 1., 2., 1., 0., 5., 0., 4., 2.,
       5., 3., 0., 0., 1., 2., 2., 0., 1., 1., 3., 5., 3., 3., 3., 2., 0.,
       0.], dtype=float32)

In [78]:
pred_test = model_XGB.predict(dtest)
accuracy_test = accuracy_score(Y_test_XGB, pred_test)
print("Test Accuracy:", accuracy_test)

Test Accuracy: 0.7583333333333333


## Tuning hyperparameters of XGBoost


In [49]:
classifier_XGB = xgb.XGBClassifier()
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [3, 4, 5],
'learning_rate': [0.1, 0.01, 0.001]
}

In [53]:
search = HalvingGridSearchCV(classifier_XGB, param_grid, scoring='accuracy', cv=3, factor=2, random_state=42,n_jobs=-1)
search.fit(X_train_XGB, Y_train_XGB)

In [54]:
print("Best Parameters: ", search.best_params_)
print("Best Accuracy: ", search.best_score_)

Best Parameters:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best Accuracy:  0.6597222222222223


In [122]:
best_model = search.best_estimator_
pred_test = best_model.predict(X_test_XGB)
accuracy_test = accuracy_score(Y_test_XGB, pred_test)
print("Test Accuracy: ", accuracy_test)

Test Accuracy:  0.7833333333333333


print(accuracy_test)

In [66]:
import pickle
model_filename = "xgb_model.pkl"
with open(model_filename,'wb') as model_file:
    pickle.dump(best_model,model_file)

In [67]:
print("{}".format(model_filename))

xgb_model.pkl


In [65]:
import os
print("Current working directory:",os.getcwd())

Current working directory: C:\Users\nxg00371\Desktop\Coding Projs


## Support vector machine 

In [7]:
#Prepare the training and sorting data 
def support_vector(model_df):
    #label_encoder = LabelEncoder()
    xData = sort_df(model_df)['Data']
    yData = sort_df(model_df)['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42) 
    X_train = convert_data_rows(X_train)
    X_test = convert_data_rows(X_test)
    X_train = np.array(X_train).astype('float64')
    X_test= np.array(X_test).astype('float64')
    #num_samples, num_timesteps = X_train.shape
    #X_train = X_train.reshape(num_samples,num_timesteps,1)
    #X_test = X_test.reshape(X_test.shape[0],num_timesteps,1)
    #Y_train = label_encoder.fit_transform(Y_train)
    #Y_train = to_categorical(Y_train,num_classes)
    #Y_test = label_encoder.fit_transform(Y_test)
    #Y_test = to_categorical(Y_test,num_classes)
    return X_train,X_test,Y_train,Y_test

In [8]:
print(trimmed_df)
print(trimmed_df['Distribution Type'].value_counts())

                                                  Data Distribution Type
0    [0.7591974135354557, 0.1181593030329548, 0.193...           bimodal
1    [0.7519267102526808, 0.2311100841364126, 0.316...           bimodal
2    [0.6567157154958743, 0.6949298779531272, 0.253...           bimodal
3    [0.6688971655829814, 0.6919859857332238, 0.257...           bimodal
4    [1.2104049194514532e-24, 1.202252104985288e-24...           bimodal
..                                                 ...               ...
595  [0.128440366972477, 0.128440366972477, 0.13761...           outlier
596  [0.7654995657651831, 0.7636762084978036, 0.757...           outlier
597  [0.6666455123336696, 0.7699331037568015, 0.790...           outlier
598  [0.74529091888708, 0.7469155875998774, 0.73799...           outlier
599  [0.992110753094851, 0.9927640028385896, 0.9931...           outlier

[600 rows x 2 columns]
bimodal       100
discrete      100
functional    100
longtail      100
normal        100
outlier   

In [96]:
X_train_SV, X_test_SV, Y_train_SV, Y_test_SV = support_vector(trimmed_df)

In [97]:
from sklearn.svm import SVC
import joblib

In [98]:


svm_classifier = SVC(kernel='linear',random_state=42)
svm_classifier.fit(X_train_SV,Y_train_SV)

In [103]:
pred_classes = svm_classifier.predict(X_test_XGB)

array(['discrete', 'normal', 'outlier', 'bimodal', 'discrete',
       'functional', 'bimodal', 'normal', 'bimodal', 'longtail',
       'bimodal', 'discrete', 'discrete', 'outlier', 'bimodal',
       'functional', 'bimodal', 'discrete', 'bimodal', 'bimodal',
       'bimodal', 'outlier', 'outlier', 'outlier', 'normal', 'outlier',
       'bimodal', 'longtail', 'bimodal', 'normal', 'outlier', 'discrete',
       'bimodal', 'functional', 'longtail', 'normal', 'outlier',
       'outlier', 'normal', 'functional', 'longtail', 'bimodal',
       'bimodal', 'outlier', 'bimodal', 'discrete', 'normal', 'discrete',
       'bimodal', 'normal', 'discrete', 'outlier', 'discrete', 'outlier',
       'outlier', 'longtail', 'functional', 'discrete', 'discrete',
       'discrete', 'bimodal', 'discrete', 'bimodal', 'outlier', 'outlier',
       'bimodal', 'outlier', 'outlier', 'bimodal', 'outlier', 'longtail',
       'bimodal', 'outlier', 'longtail', 'functional', 'outlier',
       'normal', 'functional', 'out

In [114]:
print(pred_classes)
print(len(pred_classes))
print(type(pred_classes))

['discrete' 'normal' 'outlier' 'bimodal' 'discrete' 'functional' 'bimodal'
 'normal' 'bimodal' 'longtail' 'bimodal' 'discrete' 'discrete' 'outlier'
 'bimodal' 'functional' 'bimodal' 'discrete' 'bimodal' 'bimodal' 'bimodal'
 'outlier' 'outlier' 'outlier' 'normal' 'outlier' 'bimodal' 'longtail'
 'bimodal' 'normal' 'outlier' 'discrete' 'bimodal' 'functional' 'longtail'
 'normal' 'outlier' 'outlier' 'normal' 'functional' 'longtail' 'bimodal'
 'bimodal' 'outlier' 'bimodal' 'discrete' 'normal' 'discrete' 'bimodal'
 'normal' 'discrete' 'outlier' 'discrete' 'outlier' 'outlier' 'longtail'
 'functional' 'discrete' 'discrete' 'discrete' 'bimodal' 'discrete'
 'bimodal' 'outlier' 'outlier' 'bimodal' 'outlier' 'outlier' 'bimodal'
 'outlier' 'longtail' 'bimodal' 'outlier' 'longtail' 'functional'
 'outlier' 'normal' 'functional' 'outlier' 'normal' 'functional' 'normal'
 'outlier' 'normal' 'normal' 'bimodal' 'discrete' 'functional' 'normal'
 'functional' 'functional' 'outlier' 'bimodal' 'normal' 'discr

In [112]:
print(Y_test_SV[:30])
print(type(Y_test_SV))
print(len(Y_test_SV))

110      discrete
419        normal
565       outlier
77        bimodal
181      discrete
284    functional
10        bimodal
469        normal
78        bimodal
349      longtail
55        bimodal
118      discrete
109      discrete
588       outlier
369      longtail
234    functional
30        bimodal
212    functional
184      discrete
86        bimodal
2         bimodal
587       outlier
535       outlier
596       outlier
368      longtail
539       outlier
72        bimodal
135      discrete
556       outlier
437        normal
Name: Distribution Type, dtype: object
<class 'pandas.core.series.Series'>
120


In [116]:
compare_df = pd.DataFrame()
compare_df['predicted'] = pred_classes
compare_df['actual'] = np.array(Y_test_SV)
print(compare_df[10:50])

     predicted      actual
10     bimodal     bimodal
11    discrete    discrete
12    discrete    discrete
13     outlier     outlier
14     bimodal    longtail
15  functional  functional
16     bimodal     bimodal
17    discrete  functional
18     bimodal    discrete
19     bimodal     bimodal
20     bimodal     bimodal
21     outlier     outlier
22     outlier     outlier
23     outlier     outlier
24      normal    longtail
25     outlier     outlier
26     bimodal     bimodal
27    longtail    discrete
28     bimodal     outlier
29      normal      normal
30     outlier     outlier
31    discrete    discrete
32     bimodal     bimodal
33  functional  functional
34    longtail    discrete
35      normal      normal
36     outlier    discrete
37     outlier     outlier
38      normal      normal
39  functional  functional
40    longtail    longtail
41     bimodal     bimodal
42     bimodal     outlier
43     outlier     outlier
44     bimodal     bimodal
45    discrete    discrete
4

In [118]:
correct_pred = sum(1 for pred,act in zip(Y_test_SV,pred_classes) if pred==act)
print(correct_pred)
print(correct_pred/120)

85
0.7083333333333334


## K-nearest neighbour

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [131]:
#Prepare the training and sorting data 
def knn(model_df):
    #label_encoder = LabelEncoder()
    xData = sort_df(model_df)['Data']
    yData = sort_df(model_df)['Distribution Type']
    X_train, X_test, Y_train, Y_test = train_test_split(xData,yData,test_size = 0.2,random_state=42) 
    X_train = convert_data_rows(X_train)
    X_test = convert_data_rows(X_test)
    X_train = np.array(X_train).astype('float64')
    X_test= np.array(X_test).astype('float64')
    #num_samples, num_timesteps = X_train.shape
    #X_train = X_train.reshape(num_samples,num_timesteps,1)
    #X_test = X_test.reshape(X_test.shape[0],num_timesteps,1)
    #Y_train = label_encoder.fit_transform(Y_train)
    #Y_train = to_categorical(Y_train,num_classes)
    #Y_test = label_encoder.fit_transform(Y_test)
    #Y_test = to_categorical(Y_test,num_classes)
    return X_train,X_test,Y_train,Y_test

In [129]:
print(trimmed_df['Distribution Type'].value_counts())

bimodal       100
discrete      100
functional    100
longtail      100
normal        100
outlier       100
Name: Distribution Type, dtype: int64


In [132]:
X_train_K, X_test_K, Y_train_K, Y_test_K = knn(trimmed_df)
# Create a KNN classifier
k = 6
knn_classifier = KNeighborsClassifier(n_neighbors=k)
# Fit the classifier on the training data
knn_classifier.fit(X_train_K, Y_train_K)
# Make predictions on the test data
predictions = knn_classifier.predict(X_test_K)
# Calculate accuracy
accuracy = accuracy_score(Y_test_K, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.61
