In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras import Sequential
from keras import layers
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.optimizers import adam_v2
from tensorflow.keras.utils import to_categorical
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
import random
from tqdm.notebook import tqdm
import collections

In [2]:
# Load the headerless capture table and order it by column 1, then column 0,
# so the positional train/test split below always selects the same rows.
df = pd.read_csv('F:/data_firefox/3_data/data.csv', header=None).sort_values([1, 0])

# Training split: the first 7000 sorted rows, shuffled by sampling all of them
# without replacement (unseeded, so the order differs from run to run).
train = np.array(df.iloc[:7000].sample(7000))
train_data = train[:, 2:502]  # columns 2..501 -> shape (7000, 500); per original note: 100 websites x 70 repeats
train_index = train[:, 0]  # column 0, shape (7000,); per original note: labels 0-99, in shuffled order

# Test split: the next 3000 sorted rows, re-ordered by column 0 then column 1.
test = np.array(df.iloc[7000:10000].sort_values([0, 1]))
test_data = test[:, 2:502]  # columns 2..501 -> shape (3000, 500); presumably 100 websites x 30 repeats (original note said 70) - TODO confirm
test_index = test[:, 0]  # column 0, shape (3000,); per original note: labels 0-99, in sorted order

In [3]:
def get_data_length(data_vector, max_length=500):
    """Return the index of the first zero entry in ``data_vector``.

    Zero is treated as the padding sentinel, so the returned index is the
    number of leading non-zero entries.

    Args:
        data_vector: 1-D sequence (NumPy array or list) of numeric values.
        max_length: Maximum number of leading entries to inspect. Defaults to
            500, the window the original implementation scanned.

    Returns:
        int: Position of the first zero within the inspected window, or the
        size of the inspected window (``min(max_length, len(data_vector))``)
        when it contains no zero.
    """
    # Slicing bounds the scan by the real length, so vectors shorter than
    # ``max_length`` no longer raise IndexError.
    window = np.asarray(data_vector)[:max_length]
    zero_positions = np.flatnonzero(window == 0)
    if zero_positions.size:
        return int(zero_positions[0])
    return len(window)

def add_noise(data_vector, data_length, count, max_length=500):
    """Randomly split entries of ``data_vector`` into two adjacent entries.

    Each of the ``count`` attempts picks a random position inside the first
    ``data_length`` entries. If the value ``v`` there satisfies ``|v| > 5`` it
    is replaced by two neighbouring values ``first`` and ``v - first`` that
    share its sign and sum to ``v``; the rest of the vector shifts right by
    one. Values with ``|v| <= 5`` are left untouched (the attempt is still
    consumed). Previously such values were overwritten with 0 and an extra 0
    was inserted, corrupting the sequence with the padding sentinel.

    Args:
        data_vector: 1-D NumPy array of values. It is not modified.
        data_length: Number of leading entries eligible for splitting
            (typically the result of ``get_data_length``).
        count: Number of split attempts.
        max_length: Length the result is truncated to, and the cap on the
            eligible region as it grows. Defaults to 500.

    Returns:
        The first ``max_length`` entries after all splits. A new array with
        the input's dtype when any attempt is made; a plain slice of the
        input when there is nothing to do.
    """
    # Nothing to split: also avoids random.randint(0, -1) raising ValueError
    # when the vector has no leading data.
    if count <= 0 or data_length <= 0:
        return data_vector[:max_length]

    source = np.asarray(data_vector)
    # Work on a Python list: insertion is cheap and the caller's array is
    # never written to.
    sizes = source.tolist()
    for _ in range(count):
        location = random.randint(0, data_length - 1)
        value = sizes[location]
        if value > 5:
            first = random.randint(1, int(value) - 1)
        elif value < -5:
            first = random.randint(int(value) + 1, -1)
        else:
            # Too small to split into two non-zero parts; leave it as is.
            continue
        sizes[location] = value - first
        sizes.insert(location, first)
        # One more entry is now eligible, up to the truncation limit.
        if data_length < max_length:
            data_length += 1
    return np.array(sizes[:max_length], dtype=source.dtype)

In [4]:
# Build the augmented training set: the untouched samples followed by five
# independently noised copies, with the labels repeated to stay aligned.
augmented_blocks = [train_data.copy()]
label_blocks = [train_index.copy()]
for _ in range(5):
    train_data_aug_one = train_data.copy()
    for row in range(7000):
        length = get_data_length(train_data_aug_one[row])
        # Noise intensity: a random fraction (0.00-1.00) of the sample's length
        # determines how many split attempts add_noise makes.
        rate = random.randint(0, 100) / 100
        train_data_aug_one[row] = add_noise(train_data_aug_one[row], length, int(length * rate))
    augmented_blocks.append(train_data_aug_one)
    label_blocks.append(train_index)
# A single concatenation gives the same result as appending block by block.
train_data_aug = np.concatenate(augmented_blocks, axis=0)
train_index_aug = np.concatenate(label_blocks, axis=0)

In [5]:
# 1-D CNN classifier: four stages of (Conv1D, Conv1D, MaxPooling1D) with
# 32/64/128/256 filters, then a dense head ending in a 100-way softmax.
model = Sequential()
for stage, n_filters in enumerate((32, 64, 128, 256)):
    # Only the very first layer declares the input shape: 500 steps, 1 channel.
    input_kwargs = {'input_shape': (500, 1)} if stage == 0 else {}
    model.add(Conv1D(filters=n_filters,
                     kernel_size=5,
                     padding='same',
                     strides=1,
                     activation='relu',
                     **input_kwargs))
    model.add(Conv1D(filters=n_filters,
                     kernel_size=5,
                     padding='same',
                     strides=1,
                     activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(100, activation='softmax'))
# `metrics` is passed as a list: Keras documents it as a list of metrics, and
# newer Keras versions reject a bare string.
model.compile(loss='categorical_crossentropy',
              optimizer=adam_v2.Adam(learning_rate=0.001),
              metrics=['accuracy'])

In [6]:
# Scale the inputs by the fixed divisor 1600 (presumably chosen to bring the
# raw values to roughly [-1, 1] - TODO confirm) and one-hot encode the labels.
train_inputs = train_data_aug.astype('float32') / 1600
train_targets = to_categorical(train_index_aug)
# NOTE(review): Keras takes the validation split from the END of the arrays,
# before shuffling. Given how train_data_aug is assembled (originals first,
# noised copies after), the held-out 30% consists of noised variants of
# samples that are also trained on, so val metrics are likely optimistic.
model.fit(train_inputs, train_targets, epochs=10, validation_split=0.3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2010009b6c8>

In [7]:
# Evaluate on the held-out test split, using the same 1/1600 input scaling and
# one-hot label encoding as in training. The cell's value is the list
# [loss, accuracy], matching the loss and metric passed to compile().
test_inputs = test_data.astype('float32') / 1600
test_targets = to_categorical(test_index)
model.evaluate(test_inputs, test_targets)



[0.7297779321670532, 0.9100000262260437]

In [8]:
# Persist the trained network to an HDF5 file; the path is relative to the
# notebook's working directory.
model_path = 'data/model_tcp_split_aug.h5'
model.save(model_path)