#### Importing packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import regularizers, Sequential, layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import load_model, save_model
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from keras.wrappers.scikit_learn import KerasClassifier  # Import KerasClassifier
from sklearn.model_selection import cross_validate
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import cross_validate

#### Load dataset

In [2]:
# Read the validation CSV into `data`, then preview its first rows.
csv_path = '../raw_data/validation_dataset_2.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,domain,ranking,isIp,valid,activeDuration,urlLen,is@,isredirect,haveDash,domainLen,nosOfSubdomain,label
0,www.voting-yahoo.com,10000000,0,0,0,20,0,0,1,20,2,1
1,www.zvon.org/xxl/WSDL1.1/Output/index.html,194914,0,1,7305,42,0,0,0,12,2,0
2,tecportais.com/file-security-update-infonfmati...,10000000,0,0,0,155,0,0,0,14,1,1
3,bima.astro.umd.edu/nemo/linuxastro/,7001,0,0,0,35,0,0,0,18,3,0
4,huarui-tec.com/js/?us.battle.net/login/en/?ref...,10000000,0,1,730,79,0,0,1,14,1,1


#### Preprocessing data

##### Drop irrelevant columns

In [3]:
# Show the column labels of the loaded frame.
data.columns

Index(['domain', 'ranking', 'isIp', 'valid', 'activeDuration', 'urlLen', 'is@',
       'isredirect', 'haveDash', 'domainLen', 'nosOfSubdomain', 'label'],
      dtype='object')

In [4]:
# Remove the 'ranking' column. Rebinding `data` (instead of dropping in place)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# is a no-op rather than a KeyError.
data = data.drop(columns=['ranking'], errors='ignore')

In [5]:
# Re-check the column labels after dropping 'ranking'.
data.columns

Index(['domain', 'isIp', 'valid', 'activeDuration', 'urlLen', 'is@',
       'isredirect', 'haveDash', 'domainLen', 'nosOfSubdomain', 'label'],
      dtype='object')

##### Balance dataset

In [6]:
# Take the index labels of the rows whose label is 1, keep the first 16000 of
# them, and drop those rows from `data` in place.
# NOTE(review): not idempotent - every re-run of this cell removes a further
# 16000 label-1 rows.
indices_to_remove = data.index[(data['label'] == 1).to_numpy()][:16000]
data.drop(index=indices_to_remove, inplace=True)

#### Define X and y

In [7]:
# Target is the 'label' column; features are every column except 'domain'
# and 'label'.
y = data['label']
X = data.drop(['domain', 'label'], axis=1)

#### Split data into train and test sets

In [8]:
# Hold out 30% of the rows for testing. Stratifying on y and fixing the seed
# are passed through to train_test_split unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [9]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((55937, 9), (23973, 9), (55937,), (23973,))

#### Initialize the model

In [22]:
def initialize_model(input_dim=9, learning_rate=0.00009):
    """Build and compile the feed-forward binary classifier.

    Layers, in order: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) ->
    Dropout(0.2) -> Dense(1, sigmoid). The model is compiled with the Adam
    optimizer, binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    input_dim : int, default 9
        Number of input features expected by the first layer.
    learning_rate : float, default 0.00009
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # L1 penalty (factor 0.02) applied to the bias of the second hidden layer.
    # NOTE(review): regularising only the bias is unusual - confirm whether
    # kernel_regularizer was intended.
    reg_l1 = regularizers.L1(0.02)

    model = Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(input_dim,)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(32, activation='relu', bias_regularizer=reg_l1))
    model.add(layers.Dropout(0.2))  # closing parenthesis was missing here
    model.add(layers.Dense(1, activation='sigmoid'))

    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

#### Fitting the model

In [25]:
model = initialize_model()

# Stop once val_loss has gone one epoch without improving, and roll the model
# back to the weights of its best epoch.
es = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

# The callback has to be passed to fit(); without it training always runs the
# full 75 epochs.
# NOTE(review): the test split is also used as validation data here, so early
# stopping is tuned on the same rows later used for evaluation - consider a
# separate validation split.
history = model.fit(X_train, y_train,
          epochs=75, batch_size=32,
          verbose=1, validation_data=(X_test, y_test),
          callbacks=[es])

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75

KeyboardInterrupt: 

#### Testing the model

In [24]:
# Evaluate on the held-out split and keep the element at index 1 of the
# returned list as the test accuracy.
evaluation = model.evaluate(X_test, y_test)
test_accuracy = evaluation[1]



#### Save the model

In [None]:
# Put the test accuracy into the file name as a whole-number percentage.
# Scale first, then round: int(round(acc, 2) * 100) truncated after the float
# multiplication, e.g. 0.57 * 100 == 56.99999999999999 became 56.
accuracy_pct = round(float(test_accuracy) * 100)

model.save(f'../saved_models/validation_model_{accuracy_pct}_2.h5')