In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import string
import nltk

from nltk.util import ngrams
from itertools import islice
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
plt.style.use('seaborn-v0_8')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pickle

with open('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Tokenizer/tokenizer_char_level.pkl', 'rb') as f:
    tokenizer_char_level = pickle.load(f)

with open('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Tokenizer/tokenizer_word_level.pkl', 'rb') as f:
    tokenizer_word_level = pickle.load(f)

In [5]:
from tensorflow.keras.models import load_model

# model = load_model('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Model/CNN 1D+BiLSTM+Attention.h5', custom_objects={'Attention_layer': Attention_layer})
model = load_model('/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Model/CNN 1D.h5')

In [6]:
def preprocessing(url):

    url = url.lower()

    url = re.sub(r'https?://', '', url)

    url = re.sub(r'www\.', '', url)

    url = re.sub(r'\.+', ' ', url)

    url = re.sub(r'[^a-zA-Z0-9\s]', ' ', url) #remove emoji

    url = url.strip()

    url = word_tokenize(url)

    listStopwords = set(stopwords.words('english'))

    filtered = []

    for link in url:

        if link not in listStopwords:

            filtered.append(link)

    url = filtered

    lemmatizer = WordNetLemmatizer()

    lemmatized_text = []

    for link in url:

        lemmatized_text.append(lemmatizer.lemmatize(link))

    data = ' '.join(tokens for tokens in lemmatized_text)

    return data

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_char_level(url):

    # Menggunakan tokenizer yang telah dimuat
    tokenizer = tokenizer_char_level

    # Konversi teks menjadi urutan angka
    X_sequences = tokenizer.texts_to_sequences(url)

    # Padding sequence
    X_padded = pad_sequences(X_sequences, padding='post', truncating='post', maxlen=500)

    return X_padded

def preprocess_word_level(url):

    # Menggunakan tokenizer yang telah dimuat
    tokenizer = tokenizer_word_level

    # Konversi teks menjadi urutan angka
    X_sequences = tokenizer.texts_to_sequences(url)

    # Padding sequence
    X_padded = pad_sequences(X_sequences, padding='post', truncating='post', maxlen=500)

    return X_padded


In [8]:
preprocessing('https://www.youtube.com/watch?v=bN9WTxzLBRE')

'youtube com watch v bn9wtxzlbre'

In [68]:
URL = 'https://updateservicesaol11.weeblysite.com/'

In [69]:
cleaned_url = preprocessing(URL)

cleaned_url

'updateservicesaol11 weeblysite com'

In [70]:
url_char_level = preprocess_char_level(URL)
url_word_level = preprocess_word_level(URL)

display(url_char_level)
display(url_word_level)

array([[16,  0,  0, ...,  0,  0,  0],
       [ 7,  0,  0, ...,  0,  0,  0],
       [ 7,  0,  0, ...,  0,  0,  0],
       ...,
       [ 4,  0,  0, ...,  0,  0,  0],
       [ 9,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0]], dtype=int32)

array([[ 463,    0,    0, ...,    0,    0,    0],
       [4513,    0,    0, ...,    0,    0,    0],
       [4513,    0,    0, ...,    0,    0,    0],
       ...,
       [1466,    0,    0, ...,    0,    0,    0],
       [ 605,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0]], dtype=int32)

In [71]:
new_predictions_prob = model.predict([url_char_level, url_word_level])
new_predictions = (new_predictions_prob > 0.5).astype(int)

predicted_category = "Phishing" if new_predictions[0] == 1 else "Non-phishing"
print(predicted_category)
# predicted_category

Phishing


In [59]:
new_predictions_prob

array([[0.7205458 ],
       [0.95144635],
       [0.95144635],
       [0.4601572 ],
       [0.5348026 ],
       [0.5524718 ],
       [0.5524718 ],
       [0.5524718 ],
       [0.6219497 ],
       [0.6219497 ],
       [0.6219497 ],
       [0.5524718 ],
       [0.5590148 ],
       [0.75190306],
       [0.5494252 ],
       [0.95144635],
       [0.5494252 ],
       [0.56664765],
       [0.56694686],
       [0.5524718 ],
       [0.6167655 ],
       [0.75190306],
       [0.65837765],
       [0.5524718 ],
       [0.6219497 ],
       [0.62802035],
       [0.95144635],
       [0.6167655 ],
       [0.7205458 ],
       [0.5524718 ],
       [0.29572994],
       [0.5524718 ],
       [0.3446688 ],
       [0.7082865 ],
       [0.5348026 ],
       [0.2986326 ],
       [0.48985603],
       [0.5348026 ],
       [0.06938756],
       [0.2986326 ],
       [0.43335918],
       [0.7205458 ],
       [0.47738943]], dtype=float32)