In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import tensorflow as tf

tf.test.gpu_device_name()


'/device:GPU:0'

# Data Preprocessing

## Import Statements and Tensorflow functions



In [5]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Conv2D,MaxPooling2D, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix, classification_report

In [6]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


## Import and view data

In [7]:
# test_path = '/content/drive/MyDrive/CS/intelligent-systems/p2/UNSW_NB15_test-set.csv'
# train_path = '/content/drive/MyDrive/CS/intelligent-systems/p2/UNSW_NB15_training-set.csv'

test_path2 = './UNSW_NB15_test-set.csv'
train_path2 = './UNSW_NB15_training-set.csv'

In [8]:
# test = pd.read_csv(test_path)
# train = pd.read_csv(train_path)

test = pd.read_csv(test_path2)
train = pd.read_csv(train_path2)

In [9]:
print("test shape: ", test.shape)
print("train shape: ", train.shape)

test shape:  (82332, 45)
train shape:  (175341, 45)


In [10]:
train.head().iloc[:, :20]

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,sloss,dloss,sinpkt,dinpkt,sjit,djit
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,252,254,14158.94238,8495.365234,0,0,24.2956,8.375,30.177547,11.830604
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,252,8395.112305,503571.3125,2,17,49.915,15.432865,61.426934,1387.77833
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,252,1572.271851,60929.23047,1,6,231.875571,102.737203,17179.58686,11420.92623
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,252,2740.178955,3358.62207,1,3,152.876547,90.235726,259.080172,4991.784669
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,254,252,8561.499023,3987.059814,2,1,47.750333,75.659602,2415.837634,115.807


In [11]:
train.head().iloc[:, 20:40]

Unnamed: 0,swin,stcpb,dtcpb,dwin,tcprtt,synack,ackdat,smean,dmean,trans_depth,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd
0,255,621772692,2202533631,255,0.0,0.0,0.0,43,43,0,0,1,0,1,1,1,1,0,0,0
1,255,1417884146,3077387971,255,0.0,0.0,0.0,52,1106,0,0,43,1,1,1,1,2,0,0,0
2,255,2116150707,2963114973,255,0.111897,0.061458,0.050439,46,824,0,0,7,1,2,1,1,3,0,0,0
3,255,1107119177,1047442890,255,0.0,0.0,0.0,52,64,0,0,1,1,2,1,1,3,1,1,0
4,255,2436137549,1977154190,255,0.128381,0.071147,0.057234,53,45,0,0,43,1,2,2,1,40,0,0,0


In [12]:
train.head().iloc[:, 40:]

Unnamed: 0,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1,0,Normal,0
1,1,6,0,Normal,0
2,2,6,0,Normal,0
3,2,1,0,Normal,0
4,2,39,0,Normal,0


In [13]:
train.columns

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')

In [14]:
test.columns

Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')

In [15]:
categorical_cols = ['service', 'state', 'proto', 'stcpb', 'dtcpb',
                    'is_sm_ips_ports', 'is_ftp_login']

In [16]:
not_matching = []
for col in categorical_cols:


  train_set = set(train[col])
  test_set = set(test[col])

  if train_set != test_set:
    print("Not Matching: ", col)
    print(f"Unique values in train data for {col}: {train[col].unique()}")
    print(f"Unique values in test data for {col}: {test[col].unique()}")
    print()

    not_matching.append(col)

print(not_matching)

Not Matching:  state
Unique values in train data for state: ['FIN' 'INT' 'CON' 'ECO' 'REQ' 'RST' 'PAR' 'URN' 'no']
Unique values in test data for state: ['INT' 'FIN' 'REQ' 'ACC' 'CON' 'RST' 'CLO']

Not Matching:  proto
Unique values in train data for proto: ['tcp' 'udp' 'arp' 'ospf' 'icmp' 'igmp' 'rtp' 'ddp' 'ipv6-frag' 'cftp'
 'wsn' 'pvp' 'wb-expak' 'mtp' 'pri-enc' 'sat-mon' 'cphb' 'sun-nd' 'iso-ip'
 'xtp' 'il' 'unas' 'mfe-nsp' '3pc' 'ipv6-route' 'idrp' 'bna' 'swipe'
 'kryptolan' 'cpnx' 'rsvp' 'wb-mon' 'vmtp' 'ib' 'dgp' 'eigrp' 'ax.25'
 'gmtp' 'pnni' 'sep' 'pgm' 'idpr-cmtp' 'zero' 'rvd' 'mobile' 'narp' 'fc'
 'pipe' 'ipcomp' 'ipv6-no' 'sat-expak' 'ipv6-opts' 'snp' 'ipcv'
 'br-sat-mon' 'ttp' 'tcf' 'nsfnet-igp' 'sprite-rpc' 'aes-sp3-d' 'sccopmce'
 'sctp' 'qnx' 'scps' 'etherip' 'aris' 'pim' 'compaq-peer' 'vrrp' 'iatp'
 'stp' 'l2tp' 'srp' 'sm' 'isis' 'smp' 'fire' 'ptp' 'crtp' 'sps'
 'merit-inp' 'idpr' 'skip' 'any' 'larp' 'ipip' 'micp' 'encap' 'ifmp'
 'tp++' 'a/n' 'ipv6' 'i-nlsp' 'ipx-n-ip'

## Option #1: Remove all records that only appear in either

In [None]:
# for the training data we want to drop all
print("Before: ", train.shape)
training_data = train.dropna()

print("After: ", training_data.shape)

Before:  (175341, 45)
After:  (175341, 45)


In [None]:
# for the training data we want to drop all
print("Before: ", test.shape)
test_data = test.dropna()

print("After: ", test_data.shape)

Before:  (82332, 45)
After:  (82332, 45)


In [None]:
training_data = training_data.drop('id', axis=1)
training_data = training_data.drop('attack_cat', axis=1)

test_data = test_data.drop('id', axis=1)
test_data = test_data.drop('attack_cat', axis=1)

In [None]:
# Remove all the records that are not common between test and train
for i in range(0, 3):
  for col in categorical_cols:
    # print(f"Unique values in train data for {col}: {training_data[col].unique()}")
    # print(f"Unique values in test data for {col}: {test_data[col].unique()}")
    common_col_elements = set(training_data[col].unique()).intersection(set(test_data[col].unique()))

    # print(f"Unique values in both data for {col}: {test_data[col].unique()}")


    training_data = training_data[training_data[col].isin(common_col_elements)]
    test_data = test_data[test_data[col].isin(common_col_elements)]

print("Final Training Data Shape:", training_data.shape)
print("Final Test Data Shape:", test_data.shape)

# Why is it that I had to run it multiple times to keep filtering the data out
# Drop features that have mismatch

Final Training Data Shape: (39163, 43)
Final Test Data Shape: (40890, 43)


In [None]:
# Make sure all columns match
print("Train attributes: ", len(training_data.columns))
print("Test attributes: ", len(test_data.columns))

cols = set(training_data.columns).intersection(test_data.columns)

print("Matching columns: ", len(cols))


Train attributes:  43
Test attributes:  43
Matching columns:  43


In [None]:
test_data = pd.get_dummies(test_data, columns=categorical_cols)
training_data = pd.get_dummies(training_data, columns=categorical_cols)

In [None]:
print("Training Data Shape:", training_data.shape)
print("Test Data Shape:", test_data.shape)

Training Data Shape: (39163, 200)
Test Data Shape: (40890, 200)


In [None]:
for cols in test.columns:
  if col in categorical_cols or col == 'label':
    continue

  encode_numeric_zscore(training_data, col)
  encode_numeric_zscore(test_data, col)

In [None]:
print("Training Data Shape:", training_data.shape)
print("Test Data Shape:", test_data.shape)

Training Data Shape: (39163, 200)
Test Data Shape: (40890, 200)


## Option #2 : Remove all features that only appear in either

In [17]:
training_data2 = train.dropna()
test_data2 = test.dropna()

In [18]:
training_data2 = training_data2.drop('attack_cat', axis=1)
test_data2 = test_data2.drop('attack_cat', axis=1)

training_data2 = training_data2.drop('id', axis=1)
test_data2 = test_data2.drop('id', axis=1)

# Drop all not matching categorical columns
for col in not_matching:
  training_data2 = training_data2.drop(col, axis = 1)
  test_data2 = test_data2.drop(col, axis = 1)

In [19]:
print(training_data2.shape, test_data2.shape)

(175341, 38) (82332, 38)


In [20]:
# One hot encoding for all of the columns remaining
for col in categorical_cols:
  if col not in not_matching:
    test_data2 = pd.get_dummies(test_data2, columns=[col])
    training_data2 = pd.get_dummies(training_data2, columns=[col])

In [21]:
print(training_data2.shape, test_data2.shape)

(175341, 51) (82332, 51)


In [22]:
for cols in test.columns:
  if col in categorical_cols:
    continue

  encode_numeric_zscore(training_data2, col)
  encode_numeric_zscore(test_data2, col)

In [23]:
print(training_data2.shape, test_data2.shape)

(175341, 51) (82332, 51)


# Model Preparation

## Option 1: Test/Train Split

In [24]:
x_train, y_train = to_xy(training_data, 'label')
x_test, y_test = to_xy(test_data, 'label')

print("Train Split :", x_train.shape, y_train.shape)
print("Test Split :", x_test.shape, y_test.shape)

NameError: name 'training_data' is not defined

In [None]:
y_train[:-5]

## Option 2: Test/Train Split

In [25]:
x_train2, y_train2 = to_xy(training_data2, 'label')
x_test2, y_test2 = to_xy(test_data2, 'label')

print("Train Split :", x_train2.shape, y_train2.shape)
print("Test Split :", x_test2.shape, y_test2.shape)

Train Split : (175341, 50) (175341, 2)
Test Split : (82332, 50) (82332, 2)


# Dense Model Training

## Dense Model #1

In [None]:
print(x_train2.shape, y_train2.shape)
print(x_test2.shape, y_test2.shape)

(175341, 50) (175341, 2)
(82332, 50) (82332, 2)


In [None]:
# Define ModelCheckpoint outside the loop
# filepath = '/content/drive/MyDrive/CS/intelligent-systems/p2/best_weights.keras'
filepath = './best_weights.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

# otherwise new model will override after each loop
for i in range(5):
    print(i)

    # Build network
    model = Sequential()
    model.add(Dense(20, input_dim=x_train2.shape[1], activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(y_train2.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    # add early stopping within the loop
    model.fit(x_train2,y_train2,validation_data=(x_test2,y_test2),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')
print()
model.load_weights('./best_weights.keras') # load weights from best model

# Measure accuracy
pred = model.predict(x_test2)
pred = np.argmax(pred,axis=1)

y_true = np.argmax(y_test2,axis=1)

score = metrics.accuracy_score(y_true, pred)

0


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
5480/5480 - 18s - 3ms/step - accuracy: 0.8401 - loss: 11502.7656 - val_accuracy: 0.7425 - val_loss: 36.7234
Epoch 2/100
5480/5480 - 14s - 3ms/step - accuracy: 0.8702 - loss: 27.2310 - val_accuracy: 0.7187 - val_loss: 3.2978
Epoch 3/100


KeyboardInterrupt: 

In [None]:
print("Final accuracy: {}".format(score))
print(classification_report(y_true, pred))

Final accuracy: 0.7389957732109022
              precision    recall  f1-score   support

           0       0.99      0.42      0.59     37000
           1       0.68      1.00      0.81     45332

    accuracy                           0.74     82332
   macro avg       0.84      0.71      0.70     82332
weighted avg       0.82      0.74      0.71     82332



## Dense Model #2

In [None]:
# Define ModelCheckpoint outside the loop
# filepath = '/content/drive/MyDrive/CS/intelligent-systems/p2/best_weights.keras'
filepath = './best_weights2.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

# otherwise new model will override after each loop
for i in range(5):
    print(i)

    # Build network
    model = Sequential()
    model.add(Dense(128, input_dim=x_train2.shape[1], activation='sigmoid'))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(32, activation='sigmoid'))
    model.add(Dense(y_train2.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    # add early stopping within the loop
    model.fit(x_train2,y_train2,validation_data=(x_test2,y_test2),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')
print()
model.load_weights('./best_weights2.keras') # load weights from best model

# Measure accuracy
pred2 = model.predict(x_test2)
pred2 = np.argmax(pred2,axis=1)

y_true2 = np.argmax(y_test2,axis=1)

score2 = metrics.accuracy_score(y_true2, pred2)

0
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


5480/5480 - 15s - 3ms/step - loss: 0.3494 - val_loss: 0.5432
Epoch 2/100
5480/5480 - 18s - 3ms/step - loss: 0.3403 - val_loss: 0.5315
Epoch 3/100
5480/5480 - 22s - 4ms/step - loss: 0.3334 - val_loss: 0.5734
Epoch 4/100
5480/5480 - 12s - 2ms/step - loss: 0.3367 - val_loss: 0.5525
Epoch 5/100
5480/5480 - 19s - 4ms/step - loss: 0.3236 - val_loss: 0.5167
Epoch 6/100
5480/5480 - 14s - 2ms/step - loss: 0.3090 - val_loss: 0.6050
Epoch 7/100
5480/5480 - 11s - 2ms/step - loss: 0.3107 - val_loss: 0.5642
Epoch 8/100
5480/5480 - 9s - 2ms/step - loss: 0.3014 - val_loss: 0.5345
Epoch 9/100
5480/5480 - 12s - 2ms/step - loss: 0.3054 - val_loss: 0.5441
Epoch 10/100
5480/5480 - 11s - 2ms/step - loss: 0.3037 - val_loss: 0.5242
Epoch 10: early stopping
1
Epoch 1/100
5480/5480 - 13s - 2ms/step - loss: 0.3623 - val_loss: 0.5273
Epoch 2/100
5480/5480 - 20s - 4ms/step - loss: 0.3171 - val_loss: 0.5166
Epoch 3/100
5480/5480 - 21s - 4ms/step - loss: 0.3048 - val_loss: 0.5729
Epoch 4/100
5480/5480 - 9s - 2ms/ste

In [None]:
print("Final accuracy: {}".format(score2))
print(classification_report(y_true2, pred2))

Final accuracy: 0.7284409464120877
              precision    recall  f1-score   support

           0       0.85      0.48      0.61     37000
           1       0.69      0.93      0.79     45332

    accuracy                           0.73     82332
   macro avg       0.77      0.71      0.70     82332
weighted avg       0.76      0.73      0.71     82332



## Dense Model #3

In [None]:
# Define ModelCheckpoint outside the loop
# filepath = '/content/drive/MyDrive/CS/intelligent-systems/p2/best_weights.keras'
filepath = './best_weights3.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

# otherwise new model will override after each loop
for i in range(5):
    print(i)

    # Build network
    model = Sequential()
    model.add(Dense(64, input_dim=x_train2.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='sigmoid'))
    model.add(Dense(y_train2.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    # add early stopping within the loop
    model.fit(x_train2,y_train2,validation_data=(x_test2,y_test2),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')
print()
model.load_weights('./best_weights3.keras') # load weights from best model

# Measure accuracy
pred3 = model.predict(x_test2)
pred3 = np.argmax(pred3,axis=1)

y_true3 = np.argmax(y_test2,axis=1)

score = metrics.accuracy_score(y_true3, pred3)

0


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
5480/5480 - 36s - 7ms/step - accuracy: 0.8204 - loss: 0.4522 - val_accuracy: 0.6877 - val_loss: 0.6789
Epoch 2/100
5480/5480 - 21s - 4ms/step - accuracy: 0.7245 - loss: 0.5703 - val_accuracy: 0.5626 - val_loss: 0.7914
Epoch 3/100
5480/5480 - 20s - 4ms/step - accuracy: 0.7021 - loss: 0.6020 - val_accuracy: 0.5627 - val_loss: 0.7082
Epoch 4/100
5480/5480 - 11s - 2ms/step - accuracy: 0.6967 - loss: 0.6057 - val_accuracy: 0.5626 - val_loss: 0.7170
Epoch 5/100
5480/5480 - 21s - 4ms/step - accuracy: 0.6983 - loss: 0.5869 - val_accuracy: 0.5625 - val_loss: 0.7140
Epoch 6/100
5480/5480 - 21s - 4ms/step - accuracy: 0.6976 - loss: 0.5960 - val_accuracy: 0.5625 - val_loss: 0.7187
Epoch 6: early stopping
1
Epoch 1/100
5480/5480 - 13s - 2ms/step - accuracy: 0.7492 - loss: 0.5457 - val_accuracy: 0.7021 - val_loss: 0.6501
Epoch 2/100
5480/5480 - 9s - 2ms/step - accuracy: 0.7117 - loss: 0.5888 - val_accuracy: 0.5626 - val_loss: 0.7317
Epoch 3/100
5480/5480 - 12s - 2ms/step - accuracy: 0.67

In [None]:
score3 = metrics.accuracy_score(y_true3, pred3)
print("Final accuracy: {}".format(score3))
print(classification_report(y_true3, pred3))

Final accuracy: 0.5920055385512316
              precision    recall  f1-score   support

           0       0.71      0.16      0.26     37000
           1       0.58      0.95      0.72     45332

    accuracy                           0.59     82332
   macro avg       0.64      0.55      0.49     82332
weighted avg       0.64      0.59      0.51     82332



## Dense Model #4

In [None]:
# Define ModelCheckpoint outside the loop
# filepath = '/content/drive/MyDrive/CS/intelligent-systems/p2/best_weights.keras'
filepath = './best_weights4.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

# otherwise new model will override after each loop
for i in range(5):
    print(i)

    # Build network
    model = Sequential()
    model.add(Dense(128, input_dim=x_train2.shape[1], activation='tanh'))
    model.add(Dense(64, activation='tanh'))
    model.add(Dense(32, activation='tanh'))
    model.add(Dense(y_train2.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    # add early stopping within the loop
    model.fit(x_train2,y_train2,validation_data=(x_test2,y_test2),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')
print()
model.load_weights('./best_weights4.keras') # load weights from best model

# Measure accuracy
pred4 = model.predict(x_test2)
pred4 = np.argmax(pred4,axis=1)

y_true4 = np.argmax(y_test2,axis=1)

score4 = metrics.accuracy_score(y_true4, pred4)

0


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
5480/5480 - 17s - 3ms/step - accuracy: 0.8739 - loss: 0.3150 - val_accuracy: 0.7289 - val_loss: 0.5440
Epoch 2/100
5480/5480 - 12s - 2ms/step - accuracy: 0.8760 - loss: 0.3094 - val_accuracy: 0.7590 - val_loss: 0.5083
Epoch 3/100
5480/5480 - 11s - 2ms/step - accuracy: 0.8796 - loss: 0.2992 - val_accuracy: 0.7610 - val_loss: 0.5069
Epoch 4/100
5480/5480 - 12s - 2ms/step - accuracy: 0.8824 - loss: 0.2950 - val_accuracy: 0.7213 - val_loss: 0.5696
Epoch 5/100
5480/5480 - 10s - 2ms/step - accuracy: 0.8747 - loss: 0.3073 - val_accuracy: 0.7322 - val_loss: 0.5699
Epoch 6/100
5480/5480 - 21s - 4ms/step - accuracy: 0.8774 - loss: 0.3087 - val_accuracy: 0.7581 - val_loss: 0.5419
Epoch 7/100
5480/5480 - 19s - 4ms/step - accuracy: 0.8750 - loss: 0.3140 - val_accuracy: 0.7283 - val_loss: 0.5839
Epoch 8/100
5480/5480 - 14s - 2ms/step - accuracy: 0.8756 - loss: 0.3099 - val_accuracy: 0.7251 - val_loss: 0.5410
Epoch 8: early stopping
1
Epoch 1/100
5480/5480 - 66s - 12ms/step - accuracy: 0.

In [None]:
print("Final accuracy: {}".format(score4))
print(classification_report(y_true4, pred4))

Final accuracy: 0.7609920808434145
              precision    recall  f1-score   support

           0       0.90      0.52      0.66     37000
           1       0.71      0.95      0.81     45332

    accuracy                           0.76     82332
   macro avg       0.81      0.74      0.74     82332
weighted avg       0.80      0.76      0.75     82332



## Dense Model #5

In [None]:
# Define ModelCheckpoint outside the loop
# filepath = '/content/drive/MyDrive/CS/intelligent-systems/p2/best_weights.keras'
filepath = './best_weights5.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

# otherwise new model will override after each loop
for i in range(5):
    print(i)

    # Build network
    model = Sequential()

    model.add(Dense(512, input_dim=x_train2.shape[1], activation='tanh'))

    model.add(Dense(64, activation='tanh'))

    model.add(Dense(32, activation='tanh'))

    model.add(Dense(y_train2.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train2,y_train2,validation_data=(x_test2,y_test2),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')
print()
model.load_weights('./best_weights5.keras') # load weights from best model

# Measure accuracy
pred5 = model.predict(x_test2)
pred5 = np.argmax(pred5,axis=1)

y_true5 = np.argmax(y_test2,axis=1)

score5 = metrics.accuracy_score(y_true5, pred5)

0


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
5480/5480 - 19s - 3ms/step - accuracy: 0.8489 - loss: 0.3537 - val_accuracy: 0.6771 - val_loss: 0.6900
Epoch 2/100
5480/5480 - 17s - 3ms/step - accuracy: 0.8329 - loss: 0.3764 - val_accuracy: 0.6013 - val_loss: 0.7931
Epoch 3/100
5480/5480 - 14s - 2ms/step - accuracy: 0.8340 - loss: 0.3699 - val_accuracy: 0.7165 - val_loss: 0.5825
Epoch 4/100
5480/5480 - 21s - 4ms/step - accuracy: 0.8298 - loss: 0.3780 - val_accuracy: 0.7487 - val_loss: 0.5126
Epoch 5/100
5480/5480 - 14s - 3ms/step - accuracy: 0.8333 - loss: 0.3790 - val_accuracy: 0.7192 - val_loss: 0.5725
Epoch 6/100
5480/5480 - 14s - 3ms/step - accuracy: 0.8275 - loss: 0.3821 - val_accuracy: 0.7292 - val_loss: 0.5889
Epoch 7/100
5480/5480 - 21s - 4ms/step - accuracy: 0.8167 - loss: 0.3943 - val_accuracy: 0.7065 - val_loss: 0.6019
Epoch 8/100
5480/5480 - 21s - 4ms/step - accuracy: 0.8224 - loss: 0.3908 - val_accuracy: 0.7199 - val_loss: 0.5998
Epoch 9/100
5480/5480 - 13s - 2ms/step - accuracy: 0.8317 - loss: 0.3792 - val_a

In [None]:
print("Final accuracy: {}".format(score5))
print(classification_report(y_true5, pred5))

Final accuracy: 0.738509935383569
              precision    recall  f1-score   support

           0       0.79      0.57      0.66     37000
           1       0.72      0.87      0.79     45332

    accuracy                           0.74     82332
   macro avg       0.75      0.72      0.72     82332
weighted avg       0.75      0.74      0.73     82332



# CNN Model Training

## CNN Model Prep

In [30]:
# CNN input must be 4 dimensions
x_train_cnn = x_train2.reshape((x_train2.shape[0], 1, x_train2.shape[1], 1))
x_test_cnn = x_test2.reshape((x_test2.shape[0], 1, x_train2.shape[1], 1))

# One hot encoded output
y_train_cnn = y_train2
y_test_cnn = y_test2


In [31]:
print("Train Split :", x_train_cnn.shape, y_train_cnn.shape)
print("Test Split :", x_test_cnn.shape, y_test_cnn.shape)

Train Split : (175341, 1, 50, 1) (175341, 2)
Test Split : (82332, 1, 50, 1) (82332, 2)


## CNN Model # 1

In [None]:
filepath = './cnn_best_weights1.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

for i in range(2):
  cnn = Sequential()
  # First Conv2D layer: 64 filters, kernel size (1, 3) - moving along width
  cnn.add(Conv2D(64, kernel_size=(1, 3), strides=(1, 1), activation='relu',
                input_shape=(1, x_train_cnn.shape[2], 1)))  # input shape matches your data

  cnn.add(MaxPooling2D(pool_size=(1, 2)))
  cnn.add(Conv2D(128, kernel_size=(1, 3), strides=(1, 1), activation='relu'))
  cnn.add(MaxPooling2D(pool_size=(1, 2)))


  cnn.add(Flatten())

  cnn.add(Dense(128, activation="sigmoid"))

  cnn.add(Dense(64, activation="tanh"))

  cnn.add(Dense(32, activation="tanh"))

  # Output layer: 2 classes for binary classification
  cnn.add(Dense(y_test_cnn.shape[1], activation="softmax"))

  cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
  cnn.fit(x_train_cnn,y_train_cnn,validation_data=(x_test_cnn,y_test_cnn),callbacks=[monitor,checkpointer],verbose=2,epochs=100)

print('CNN - Training finished...Loading the best model')
print()
cnn.load_weights('./cnn_best_weights1.keras') # load weights from best model

# Measure accuracy
pred_cnn = cnn.predict(x_test_cnn)
pred_cnn = np.argmax(pred_cnn,axis=1)

y_true_cnn = np.argmax(y_test_cnn,axis=1)

score_cnn = metrics.accuracy_score(y_true_cnn, pred_cnn)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
5480/5480 - 26s - 5ms/step - accuracy: 0.6882 - loss: 0.6195 - val_accuracy: 0.5614 - val_loss: 0.7329
Epoch 2/100
5480/5480 - 18s - 3ms/step - accuracy: 0.6952 - loss: 0.6091 - val_accuracy: 0.5618 - val_loss: 0.7009
Epoch 3/100
5480/5480 - 21s - 4ms/step - accuracy: 0.6855 - loss: 0.6220 - val_accuracy: 0.5506 - val_loss: 0.7467
Epoch 4/100
5480/5480 - 19s - 3ms/step - accuracy: 0.6806 - loss: 0.6275 - val_accuracy: 0.5506 - val_loss: 0.7300
Epoch 5/100
5480/5480 - 20s - 4ms/step - accuracy: 0.6806 - loss: 0.6276 - val_accuracy: 0.5506 - val_loss: 0.7257
Epoch 6/100
5480/5480 - 19s - 3ms/step - accuracy: 0.6806 - loss: 0.6275 - val_accuracy: 0.5506 - val_loss: 0.7173
Epoch 7/100
5480/5480 - 20s - 4ms/step - accuracy: 0.6806 - loss: 0.6275 - val_accuracy: 0.5506 - val_loss: 0.7483
Epoch 7: early stopping
Epoch 1/100
5480/5480 - 25s - 5ms/step - accuracy: 0.6828 - loss: 0.6254 - val_accuracy: 0.5506 - val_loss: 0.7253
Epoch 2/100
5480/5480 - 19s - 3ms/step - accuracy: 0.681

In [None]:
print("Final accuracy: {}".format(score_cnn))
print(classification_report(y_true_cnn, pred_cnn))

Final accuracy: 0.5618350094738377
              precision    recall  f1-score   support

           0       1.00      0.03      0.05     37000
           1       0.56      1.00      0.72     45332

    accuracy                           0.56     82332
   macro avg       0.78      0.51      0.38     82332
weighted avg       0.76      0.56      0.42     82332



## CNN Model #2

In [None]:
filepath = './cnn_best_weights1.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

for i in range(2):
  cnn = Sequential()
  cnn.add(Conv2D(64, kernel_size=(1, 5), strides=(1, 1), activation='tanh',
                input_shape=(1, x_train_cnn.shape[2], 1)))  # input shape matches your data

  cnn.add(MaxPooling2D(pool_size=(1, 2)))
  cnn.add(Conv2D(128, kernel_size=(1, 5), strides=(1, 1), activation='tanh'))
  cnn.add(MaxPooling2D(pool_size=(1, 2)))


  cnn.add(Flatten())

  cnn.add(Dense(512, activation="relu"))

  cnn.add(Dense(128, activation="relu"))

  cnn.add(Dense(16, activation="relu"))

  cnn.add(Dense(y_test_cnn.shape[1], activation="softmax"))

  cnn.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
  cnn.fit(x_train_cnn,y_train_cnn,validation_data=(x_test_cnn,y_test_cnn),callbacks=[monitor,checkpointer],verbose=2,epochs=100)

print('CNN - Training finished...Loading the best model')
print()
cnn.load_weights('./cnn_best_weights1.keras') # load weights from best model

# Measure accuracy
pred_cnn = cnn.predict(x_test_cnn)
pred_cnn = np.argmax(pred_cnn,axis=1)

y_true_cnn = np.argmax(y_test_cnn,axis=1)

score_cnn = metrics.accuracy_score(y_true_cnn, pred_cnn)

In [None]:
print("Final accuracy: {}".format(score_cnn))
print(classification_report(y_true_cnn, pred_cnn))

Final accuracy: 0.5618350094738377
              precision    recall  f1-score   support

           0       1.00      0.03      0.05     37000
           1       0.56      1.00      0.72     45332

    accuracy                           0.56     82332
   macro avg       0.78      0.51      0.38     82332
weighted avg       0.76      0.56      0.42     82332



## CNN Model #3

In [35]:
filepath = './cnn_best_weights1.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

for i in range(5):
  cnn = Sequential()
  cnn.add(Conv2D(64, kernel_size=(1, 5), strides=(1, 1), activation='sigmoid',
                input_shape=(1, x_train_cnn.shape[2], 1)))  # input shape matches your data

  cnn.add(MaxPooling2D(pool_size=(1, 2)))

  cnn.add(Conv2D(128, kernel_size=(1, 5), strides=(1, 1), activation='sigmoid'))

  cnn.add(MaxPooling2D(pool_size=(1, 2)))


  cnn.add(Flatten())

  cnn.add(Dense(512, activation="relu"))

  cnn.add(Dense(128, activation="relu"))

  cnn.add(Dense(16, activation="relu"))

  cnn.add(Dense(y_test_cnn.shape[1], activation="softmax"))

  cnn.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
  cnn.fit(x_train_cnn,y_train_cnn,validation_data=(x_test_cnn,y_test_cnn),callbacks=[monitor,checkpointer],verbose=2,epochs=100)

print('CNN - Training finished...Loading the best model')
print()
cnn.load_weights('./cnn_best_weights1.keras') # load weights from best model

# Measure accuracy
pred_cnn = cnn.predict(x_test_cnn)
pred_cnn = np.argmax(pred_cnn,axis=1)

y_true_cnn = np.argmax(y_test_cnn,axis=1)

score_cnn = metrics.accuracy_score(y_true_cnn, pred_cnn)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
5480/5480 - 19s - 4ms/step - accuracy: 0.8690 - loss: 0.3023 - val_accuracy: 0.7860 - val_loss: 0.4915
Epoch 2/100
5480/5480 - 18s - 3ms/step - accuracy: 0.9139 - loss: 0.2056 - val_accuracy: 0.8328 - val_loss: 0.3491
Epoch 3/100
5480/5480 - 20s - 4ms/step - accuracy: 0.9215 - loss: 0.1809 - val_accuracy: 0.8117 - val_loss: 0.3962
Epoch 4/100
5480/5480 - 16s - 3ms/step - accuracy: 0.9269 - loss: 0.1645 - val_accuracy: 0.8123 - val_loss: 0.3942
Epoch 5/100
5480/5480 - 20s - 4ms/step - accuracy: 0.9304 - loss: 0.1537 - val_accuracy: 0.8319 - val_loss: 0.3488
Epoch 6/100
5480/5480 - 16s - 3ms/step - accuracy: 0.9343 - loss: 0.1441 - val_accuracy: 0.8151 - val_loss: 0.4457
Epoch 7/100
5480/5480 - 16s - 3ms/step - accuracy: 0.9363 - loss: 0.1394 - val_accuracy: 0.8178 - val_loss: 0.3408
Epoch 8/100
5480/5480 - 16s - 3ms/step - accuracy: 0.9373 - loss: 0.1365 - val_accuracy: 0.8259 - val_loss: 0.3756
Epoch 9/100
5480/5480 - 15s - 3ms/step - accuracy: 0.9376 - loss: 0.1349 - val_a

In [36]:
print("Final accuracy: {}".format(score_cnn))
print(classification_report(y_true_cnn, pred_cnn))

Final accuracy: 0.9010105426808531
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     37000
           1       0.90      0.92      0.91     45332

    accuracy                           0.90     82332
   macro avg       0.90      0.90      0.90     82332
weighted avg       0.90      0.90      0.90     82332



In [None]:
cnn.summary()

## CNN Model #4

In [33]:
filepath = './cnn_best_weights1.keras'


checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True) # save best model

for i in range(5):
  cnn = Sequential()
  cnn.add(Conv2D(64, kernel_size=(1, 5), strides=(1, 1), activation='sigmoid',
                input_shape=(1, x_train_cnn.shape[2], 1)))  # input shape matches your data

  cnn.add(MaxPooling2D(pool_size=(1, 2)))

  cnn.add(Conv2D(128, kernel_size=(1, 5), strides=(1, 1), activation='sigmoid'))

  cnn.add(MaxPooling2D(pool_size=(1, 2)))

  cnn.add(Conv2D(256, kernel_size=(1, 3), strides=(1, 1), activation='sigmoid'))

  cnn.add(MaxPooling2D(pool_size=(1, 2)))

  cnn.add(Flatten())

  cnn.add(Dense(512, activation="sigmoid"))

  cnn.add(Dense(256, activation="tanh"))

  cnn.add(Dense(32, activation="tanh"))

  cnn.add(Dense(y_test_cnn.shape[1], activation="softmax"))

  cnn.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
  cnn.fit(x_train_cnn,y_train_cnn,validation_data=(x_test_cnn,y_test_cnn),callbacks=[monitor,checkpointer],verbose=2,epochs=100)

print('CNN - Training finished...Loading the best model')
print()
cnn.load_weights('./cnn_best_weights1.keras') # load weights from best model

# Measure accuracy
pred_cnn = cnn.predict(x_test_cnn)
pred_cnn = np.argmax(pred_cnn,axis=1)

y_true_cnn = np.argmax(y_test_cnn,axis=1)

score_cnn = metrics.accuracy_score(y_true_cnn, pred_cnn)

Epoch 1/100
5480/5480 - 24s - 4ms/step - accuracy: 0.6802 - loss: 0.6264 - val_accuracy: 0.5506 - val_loss: 0.6832
Epoch 2/100
5480/5480 - 35s - 6ms/step - accuracy: 0.7539 - loss: 0.4946 - val_accuracy: 0.7500 - val_loss: 0.5215
Epoch 3/100
5480/5480 - 20s - 4ms/step - accuracy: 0.8774 - loss: 0.2967 - val_accuracy: 0.7675 - val_loss: 0.5739
Epoch 4/100
5480/5480 - 15s - 3ms/step - accuracy: 0.9019 - loss: 0.2427 - val_accuracy: 0.8293 - val_loss: 0.3711
Epoch 5/100
5480/5480 - 15s - 3ms/step - accuracy: 0.9118 - loss: 0.2139 - val_accuracy: 0.8024 - val_loss: 0.4427
Epoch 6/100
5480/5480 - 16s - 3ms/step - accuracy: 0.9172 - loss: 0.1979 - val_accuracy: 0.8012 - val_loss: 0.4458
Epoch 7/100
5480/5480 - 16s - 3ms/step - accuracy: 0.9218 - loss: 0.1878 - val_accuracy: 0.8476 - val_loss: 0.3582
Epoch 8/100
5480/5480 - 16s - 3ms/step - accuracy: 0.9267 - loss: 0.1747 - val_accuracy: 0.7988 - val_loss: 0.4791
Epoch 9/100
5480/5480 - 20s - 4ms/step - accuracy: 0.9302 - loss: 0.1640 - val_a

In [34]:
print("Final accuracy: {}".format(score_cnn))
print(classification_report(y_true_cnn, pred_cnn))

Final accuracy: 0.9060511101394355
              precision    recall  f1-score   support

           0       0.92      0.87      0.89     37000
           1       0.90      0.93      0.92     45332

    accuracy                           0.91     82332
   macro avg       0.91      0.90      0.90     82332
weighted avg       0.91      0.91      0.91     82332

