In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import string
import nltk

from nltk.util import ngrams
from itertools import islice
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
plt.style.use('seaborn-v0_8')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pickle

with open('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Tokenizer/tokenizer_char_level.pkl', 'rb') as f:
    tokenizer_char_level = pickle.load(f)

with open('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Tokenizer/tokenizer_word_level.pkl', 'rb') as f:
    tokenizer_word_level = pickle.load(f)

In [5]:
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K, initializers, regularizers, constraints


class Attention_layer(Layer):
    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention_layer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                initializer=self.init,
                                name='{}_W'.format(self.name),  # Hanya satu kali penambahan argumen name
                                regularizer=self.W_regularizer,
                                constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                    initializer='zero',
                                    name='{}_b'.format(self.name),
                                    regularizer=self.b_regularizer,
                                    constraint=self.b_constraint)

        super(Attention_layer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        uit = K.dot(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)

        a = K.exp(uit)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [6]:
from tensorflow.keras.models import load_model

model = load_model('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Model/CNN 1D+BiLSTM+Attention.h5', custom_objects={'Attention_layer': Attention_layer})
# model = load_model('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Model/CNN 1D.h5')

In [7]:
def preprocessing(url):

    url = url.lower()

    url = re.sub(r'https?://', '', url)

    url = re.sub(r'www\.', '', url)

    url = re.sub(r'\.+', ' ', url)

    url = re.sub(r'[^a-zA-Z0-9\s]', ' ', url) #remove emoji

    url = url.strip()

    url = word_tokenize(url)

    listStopwords = set(stopwords.words('english'))

    filtered = []

    for link in url:

        if link not in listStopwords:

            filtered.append(link)

    url = filtered

    lemmatizer = WordNetLemmatizer()

    lemmatized_text = []

    for link in url:

        lemmatized_text.append(lemmatizer.lemmatize(link))

    data = ' '.join(tokens for tokens in lemmatized_text)

    return data

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_char_level(url):

    # Menggunakan tokenizer yang telah dimuat
    tokenizer = tokenizer_char_level

    # Konversi teks menjadi urutan angka
    X_sequences = tokenizer.texts_to_sequences(url)

    # Padding sequence
    X_padded = pad_sequences(X_sequences, padding='post', truncating='post', maxlen=500)

    return X_padded

def preprocess_word_level(url):

    # Menggunakan tokenizer yang telah dimuat
    tokenizer = tokenizer_word_level

    # Konversi teks menjadi urutan angka
    X_sequences = tokenizer.texts_to_sequences(url)

    # Padding sequence
    X_padded = pad_sequences(X_sequences, padding='post', truncating='post', maxlen=500)

    return X_padded


In [9]:
preprocessing('https://www.youtube.com/watch?v=bN9WTxzLBRE')

'youtube com watch v bn9wtxzlbre'

In [14]:
URL = 'https://huggingface.co/blog/gradio-spaces'

In [15]:
cleaned_url = preprocessing(URL)

cleaned_url

'huggingface co blog gradio space'

In [16]:
url_char_level = preprocess_char_level(URL)
url_word_level = preprocess_word_level(URL)

display(url_char_level)
display(url_word_level)

array([[16,  0,  0, ...,  0,  0,  0],
       [ 7,  0,  0, ...,  0,  0,  0],
       [ 7,  0,  0, ...,  0,  0,  0],
       ...,
       [ 6,  0,  0, ...,  0,  0,  0],
       [ 3,  0,  0, ...,  0,  0,  0],
       [12,  0,  0, ...,  0,  0,  0]], dtype=int32)

array([[ 463,    0,    0, ...,    0,    0,    0],
       [4513,    0,    0, ...,    0,    0,    0],
       [4513,    0,    0, ...,    0,    0,    0],
       ...,
       [  20,    0,    0, ...,    0,    0,    0],
       [  40,    0,    0, ...,    0,    0,    0],
       [1019,    0,    0, ...,    0,    0,    0]], dtype=int32)

In [17]:
new_predictions_prob = model.predict([url_char_level, url_word_level])
new_predictions = (new_predictions_prob > 0.5).astype(int)

predicted_category = "Phishing" if new_predictions[0] == 1 else "Non-phishing"
print(predicted_category)
# predicted_category

Non-phishing
