In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): later cells open 'drive/My Drive/...' with a relative path, which
# presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.utils import shuffle

In [0]:
# Memory-map the feature matrix and the label vector stored on the mounted Drive.
# mode='r' opens both files read-only: np.memmap defaults to 'r+', which would
# let an accidental in-place write modify the dataset files on disk.
x_train = np.memmap('drive/My Drive/Xtrain.dat', shape=(600000, 2351), dtype=np.float32, mode='r')
y_train = np.memmap('drive/My Drive/Ytrain.dat', dtype=np.float32, mode='r')
# The label file is mapped without an explicit shape, so check that it holds
# exactly one label per feature row before anything is trained on it.
assert y_train.shape[0] == x_train.shape[0], "label count does not match feature row count"
# Label encoding (per the original author's note):
# 0 for benign (nb of elm => 300000)
# 1 for malicious (nb of elm => 300000)

In [0]:
# Standard normalization of the features, done in two explicit steps:
# first learn the per-feature statistics, then apply them.
# NOTE(review): the statistics are learned from every row, including the rows
# that are later sliced off as test / validation data — confirm this is intended.
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(x_train)
x_train = std_scaler.transform(x_train)

In [0]:
# Shuffle features and labels with the same permutation so rows stay aligned.
# A fixed random_state makes the permutation (and therefore the train / test /
# validation split made afterwards) reproducible across kernel restarts.
x_train, y_train = shuffle(x_train, y_train, random_state=42)

In [14]:
# Test set: every row from index 550000 onward. The rows before that index
# stay in x_train / y_train.
split_at = 550000
x_test, y_test = x_train[split_at:], y_train[split_at:]
x_train, y_train = x_train[:split_at], y_train[:split_at]

for part in (x_train, y_train, x_test, y_test):
  print(part.shape)

# Validation set: the rows of the truncated training arrays from index 500000
# onward. These rows are not removed from x_train / y_train.
val_from = 500000
x_validation = x_train[val_from:]
y_validation = y_train[val_from:]
for part in (x_validation, y_validation):
  print(part.shape)

(550000, 2351)
(550000,)
(50000, 2351)
(50000,)
(50000, 2351)
(50000,)


In [0]:
# Training-loop settings.
# it         -> number of passes (epochs) over the training rows
# batch_size -> number of rows handed to the model per mini-batch
it, batch_size = 10, 10_000
# Number of mini-batches per epoch: 500000 rows cut into batch_size-sized chunks.
nb_batch = 500_000 // batch_size

In [0]:
# function for prediction
def predict_model(datas, labels, model):
  '''
  Compute the accuracy of `model` on a labelled data set.

  All rows are classified with a single `model.predict` call instead of one
  call per row, which removes the per-sample call overhead.

  Parameters
  ----------
  datas : array-like of samples, one row per sample; `len(datas)` is the
      number of samples evaluated.
  labels : array-like of expected labels; the first `len(datas)` entries are
      compared against the predictions.
  model : object exposing `predict(X)` that returns one label per row of `X`.

  Returns
  -------
  float
      Fraction of samples whose predicted label equals the expected label.

  Raises
  ------
  ZeroDivisionError
      If `datas` is empty (accuracy is undefined for zero samples).
  '''
  m = len(datas) # total nbr of data
  if m == 0:
    # Same exception type an empty input has always produced, raised up
    # front with an explicit message instead of from the final division.
    raise ZeroDivisionError("cannot compute accuracy of an empty data set")
  predictions = np.asarray(model.predict(np.asarray(datas)))
  expected = np.asarray(labels)[:m]
  # Element-wise comparison, then count the matches (nbr of good prediction).
  good_pred = int(np.count_nonzero(predictions == expected))
  return good_pred / m

In [0]:
# Create the model: a linear classifier trained with stochastic gradient descent.
# loss='hinge' is the objective of a linear SVM classifier.
# shuffle=True lets the estimator shuffle the samples it is fitted on;
# random_state pins that shuffling so training runs are repeatable.
model = linear_model.SGDClassifier(shuffle=True, loss='hinge', random_state=42)

In [0]:
def fit_model(model):
  '''
  Train `model` incrementally on mini-batches and report validation accuracy.

  Runs `it` epochs. Each epoch feeds `nb_batch` consecutive slices of
  `batch_size` rows from the module-level `x_train` / `y_train` to
  `model.partial_fit`, then prints the accuracy that `predict_model`
  measures on `x_validation` / `y_validation`.

  Parameters
  ----------
  model : estimator exposing `partial_fit(X, y, classes=...)` and `predict(X)`.

  Returns
  -------
  None
      Progress and validation accuracy are printed; nothing is returned.
  '''
  # The label set is identical for every batch, so compute it once instead of
  # rescanning all of y_train on each partial_fit call.
  classes = np.unique(y_train)
  # fit the model
  for i in range(it):
    print("Epoch nbr {}".format(i + 1))
    for j in range(nb_batch):
      print("=", end='')
      # Both slice bounds are derived from batch_size; the upper bound was
      # previously seeded with a literal 10000, which only produced correct
      # batches while batch_size happened to equal 10000.
      inf = j * batch_size
      supp = inf + batch_size
      x_train_sub, y_train_sub = x_train[inf:supp], y_train[inf:supp]
      model.partial_fit(x_train_sub, y_train_sub, classes=classes)
    print()
    # validation set
    acc_val = predict_model(x_validation, y_validation, model)
    print("validation accuracy: {}".format(acc_val))
    print('--------------------')
    print('--------------------')

In [24]:
# Run the training loop; the validation accuracy of each epoch is printed.
fit_model(model)

Epoch nbr 1
validation accuracy: 0.95402
--------------------
--------------------
Epoch nbr 2
validation accuracy: 0.95918
--------------------
--------------------
Epoch nbr 3
validation accuracy: 0.95532
--------------------
--------------------
Epoch nbr 4
validation accuracy: 0.9541
--------------------
--------------------
Epoch nbr 5
validation accuracy: 0.96152
--------------------
--------------------
Epoch nbr 6
validation accuracy: 0.95688
--------------------
--------------------
Epoch nbr 7
validation accuracy: 0.96264
--------------------
--------------------
Epoch nbr 8
validation accuracy: 0.96382
--------------------
--------------------
Epoch nbr 9
validation accuracy: 0.95924
--------------------
--------------------
Epoch nbr 10
validation accuracy: 0.9654
--------------------
--------------------


In [0]:
# Accuracy of the trained model on the test split (x_test / y_test).
acc = predict_model(x_test, y_test, model)

In [26]:
# Show the test accuracy stored in `acc`.
print(acc)

0.96482
