# 📢 File explanation!!
Di dalam file ini, saya akan melakukan preprocessing dataset dengan menghapus data yang terduplikat.

# 🎯 **Step 0: Import library**
---

In [1]:
!pip -q install tldextract

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m92.2/97.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.7/97.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import ipaddress
import re

from urllib.parse import urlparse, urlencode

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 🎯 **Step 1: Load Dataset**
---

In [4]:
df_url = pd.read_csv('/content/drive/MyDrive/data_url_after_preprocess.csv')

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...
3,spam,benjim.com/all,"['benjim', 'com']",benjim com
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru


# 🎯 **Step 2: Feature extraction**
---

- [1] https://eprints.hud.ac.uk/id/eprint/24330/6/MohammadPhishing14July2015.pdf
- [2] https://www.sciencedirect.com/science/article/pii/S0965997822001892#:~:text=To%20create%20a%20machine%20learning,model%20is%20the%20next%20step.
- [3] https://thesai.org/Publications/ViewPaper?Volume=11&Issue=4&Code=IJACSA&SerialNo=77
- [4] https://ijstm.inarah.co.id/index.php/ijstm/article/view/498

## ✨ 2.1 Check IP address

🔎 Berdasarkan jurnal nomor [2], jika menemukan alamat IP dalam URL, itu bisa menjadi tanda bahwa URL tersebut bersifat berbahaya. Penjahat dunia maya menggunakan alamat IP dalam URL untuk mencuri informasi sensitif. Jadi, jika ada alamat IP dalam URL, bisa menganggapnya sebagai URL yang berbahaya dan memberikan nilai 1, sedangkan jika tidak ada alamat IP, kita memberikan nilai 0 sebagai tanda bahwa URL tersebut aman. Contoh:
```
http://125.98.3.123/fake.html
```

In [5]:
def having_ip(url):
    try:
        ipaddress.ip_address(url)
        ip = 1
    except:
        ip = 0
    return ip

In [6]:
df_url['have_ip'] = df_url['Data'].apply(having_ip)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0


## ✨ 2.2 Check @ symbol

🔎 Berdasarkan jurnal [3], suatu URL memiliki simbol '@', maka nilai atau indikatornya dianggap 1, yang menunjukkan kemungkinan adanya tindakan phishing. Sebaliknya, jika tidak ada simbol '@', maka nilai atau indikatornya dianggap 0, yang menandakan bahwa situs tersebut dianggap sah atau tidak mencurigakan. Contoh:
```
https://user@example.com
```

In [7]:
def have_at_sign(url):
    if "@" in url:
        at = 1
    else:
        at = 0
    return at

In [8]:
df_url['have_at_sign'] = df_url['Data'].apply(have_at_sign)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0


## ✨ 2.3 Check URL length

🔎 Berdasarkan jurnal [3], seringkali penjahat dunia maya menggunakan URL yang panjang untuk menyembunyikan bagian yang bersifat anonim. Jika panjang URL lebih dari 54 karakter, maka dianggap sebagai situs phishing dengan nilai 1, sedangkan jika kurang dari itu dianggap sebagai situs yang tidak mencurigakan dengan nilai 0. Contoh:

```
http://federmacedoadv.com.br/3f/aze/ab51e2e319e51502f416dbe46b773a5e/?cmd=_home&ampdispatch=11004d58f5b74f8dc1e7c2e8dd4105e811004d58f5b74f8dc1e7c2e8dd4105e8@phishing.website.html
```

In [9]:
def get_length(url):
    if len(url) < 54:
        length = 0
    else:
        length = 1
    return length

In [10]:
df_url['get_length'] = df_url['Data'].apply(get_length)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign,get_length
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0,1
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0,1
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0,1
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0,0


## ✨ 2.4 Check URL depth

🔎 Berdasarkan jurnal [3], semakin dalam kedalaman URL, semakin kompleks struktur situs web, dan ini dapat menjadi faktor yang perlu diperhatikan ketika menilai keamanan suatu situs. Contoh:

```
https://www.example.com/subpage1/subpage2/subpage3
```


In [11]:
def get_depth(url):
    s = urlparse(url).path.split('/')
    depth = 0
    for j in range(len(s)):
        if len(s[j]) != 0:
            depth = depth+1
    return depth

In [12]:
df_url['get_depth'] = df_url['Data'].apply(get_depth)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign,get_length,get_depth
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0,1,2
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0,1,5
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0,1,2
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0,0,2
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0,0,1


## ✨ 2.5 Check redirect '//' the URL

🔎 Berdasarkan jurnal [3], tanda "//" harus muncul pada posisi keenam jika URL dimulai dengan "HTTP", atau pada posisi ketujuh jika URL dimulai dengan "HTTPS". Jika tanda "//" ditemukan di tempat lain dalam URL, maka nilai fitur ini harus menjadi 1 (phishing) atau 0 (benign) sesuai dengan kondisi tertentu. Dengan kata lain, posisi "//" dalam URL dapat menjadi indikator apakah URL tersebut mencurigakan (phishing) atau tidak (benign). Contoh:

```
http://www.legitimate.com//http://www.phishing.com
```

In [13]:
def redirection(url):
    pos = url.rfind('//')
    if pos > 6:
        if pos > 7:
            return 1
        else:
            return 0
    else:
        return 0

In [14]:
df_url['redirection'] = df_url['Data'].apply(redirection)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign,get_length,get_depth,redirection
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0,1,2,0
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0,1,5,0
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0,1,2,0
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0,0,2,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0,0,1,0


## ✨ 2.6 Check HTTP/HTTPS in domain

🔎 Berdasarkan jurnal [3], jika URL memiliki "http/https" dalam bagian domain, maka nilainya adalah 1 (phishing), sedangkan jika tidak terdapat "http/https" dalam bagian domain, maka nilainya adalah 0 (benign). Contoh:

```
http://https-www-paypal-it-webapps-mpp-home.soft-hair.com/
```

In [15]:
def http_domain(url):
    domain = urlparse(url).netloc
    if 'https' in domain:
        return 1
    else:
        return 0

In [16]:
df_url['http_domain'] = df_url['Data'].apply(http_domain)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign,get_length,get_depth,redirection,http_domain
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0,1,2,0,0
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0,1,5,0,0
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0,1,2,0,0
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0,0,2,0,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0,0,1,0,0


## ✨ 2.7 Check short URL with TinyURL

In [17]:
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

In [18]:
def tiny_url(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

In [19]:
df_url['tiny_url'] = df_url['Data'].apply(tiny_url)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign,get_length,get_depth,redirection,http_domain,tiny_url
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0,1,2,0,0,0
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0,1,5,0,0,0
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0,1,2,0,0,0
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0,0,2,0,0,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0,0,1,0,0,0


## ✨ 2.8 Check Preffix suffix '-' in domain

🔎 Berdasarkan jurnal [3], menyatakan bahwa meskipun URL sebenarnya tidak mengandung tanda hubung ("-"), para penjahat dunia maya atau peretas bisa menambahkan tanda hubung tersebut ke dalam URL. Sebagai hasilnya, nilai fitur ini akan diberikan 1, yang menunjukkan kemungkinan adanya phishing atau penipuan. Sebaliknya, jika tidak terdapat tanda hubung dalam URL, nilai fitur tersebut akan diberikan 0, menandakan bahwa URL tersebut dianggap tidak mencurigakan atau benign. Contoh:

```
http://www.Confirme-paypal.com/.
```

In [20]:
def prefix_suffix(url):
    if '-' in urlparse(url).netloc:
        return 1
    else:
        return 0

In [21]:
df_url['prefix_suffix'] = df_url['Data'].apply(prefix_suffix)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign,get_length,get_depth,redirection,http_domain,tiny_url,prefix_suffix
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0,1,2,0,0,0,0
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0,1,5,0,0,0,0
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0,1,2,0,0,0,0
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0,0,2,0,0,0,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0,0,1,0,0,0,0


## ✨ 2.9 Check total dots in domain

🔎 Berdasarkan jurnal [1], menyatakan jika jumlah dots pada domain lebih dari 1 maka terindikasi sebagai phishing tetapi jika sebaliknya maka dapat dikatakan aman.

In [22]:
import tldextract

def total_dots_domain(url):
    extract_info = tldextract.extract(url)

    total_dot = extract_info.domain.count('.')

    if total_dot > 1:
        return 1
    else:
        return 0

In [23]:
df_url['total_dots_domain'] = df_url['Data'].apply(total_dots_domain)

df_url.head()

Unnamed: 0,Category,Data,clean_url,clean_url_sentence,have_ip,have_at_sign,get_length,get_depth,redirection,http_domain,tiny_url,prefix_suffix,total_dots_domain
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"['logtelstra2021', 'ddnsking', 'com', '0dfa1b5...",logtelstra2021 ddnsking com 0dfa1b53b835500696...,0,0,1,2,0,0,0,0,0
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"['dvsber', 'ru', 'module', 'mod', 'ariimagesli...",dvsber ru module mod ariimageslidersa krenk3n4...,0,0,1,5,0,0,0,0,0
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"['kimsinc564', '000webhostapp', 'com', 'notifi...",kimsinc564 000webhostapp com notification php ...,0,0,1,2,0,0,0,0,0
3,spam,benjim.com/all,"['benjim', 'com']",benjim com,0,0,0,2,0,0,0,0,0
4,spam,www.m.micesrunescape.com-we.ru/,"['micesrunescape', 'com', 'ru']",micesrunescape com ru,0,0,0,1,0,0,0,0,0


## ✨ 2.10 Check HTTPS certificate

- https://developer.visa.com/pages/trusted_certifying_authorities
- https://community.rsa.com/s/article/List-of-Trusted-Certificate-Authorities-for-HFED-and-Trusted-Headers-Applications-0453c48e

In [24]:
# import ssl
# import socket
# from datetime import datetime

# def check_https_certificate(domain):
#     try:
#         # Membuka koneksi ke server
#         with socket.create_connection((domain, 443)) as sock:
#             # Membuat koneksi SSL
#             with ssl.create_default_context().wrap_socket(sock, server_hostname=domain) as ssock:
#                 # Mendapatkan informasi sertifikat
#                 cert = ssock.getpeercert()

#                 certificate_issuer = ["GeoTrust", "GoDaddy", "Network Solutions", "Thawte", "Comodo", "Doster", "VeriSign", "Actalis", "Add Trust", "Certum", "DigiCert", "Entrust", "GlobalSign", "QuoVadis", "SecureTrust", "USERTrust", "DigiCert Inc", "Comodo CA Limited", "ACCV", "EDICOM", "AddTrust AB", " AffirmTrust", "America Online Inc", "Greater Manchester", "Digital Signature Trust C", "America Online Inc", 'QuoVadis Limited', 'GeoTrust Inc.', 'WoSign CA Limited', 'AddTrust AB', 'AffirmTrust', 'Swisscom', 'RSA Security LLC', 'EMC Corporation', 'AS Sertifitseerimiskeskus', 'IdenTrust', 'VISA', 'RSA Security', 'Certplus', 'China Internet Network Information Center', 'China Financial Certification Authority', 'Certinomis', 'Autorité Racine - Certinomis', 'Dhimyotis']
#                 value_certificate_issuer = cert['issuer'][1][0][1]
#                 certificate_trusted = value_certificate_issuer in certificate_issuer

#                 # Menampilkan informasi usia sertifikat
#                 not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y %Z")
#                 certificate_age = (not_after - datetime.now()).days

#                 print(f'certicate_issuer: {value_certificate_issuer}')
#                 print(f'certicate_age: {certificate_age}')


#     except Exception as e:
#         print(f"Error: {e}")

# # Mengganti 'example.com' dengan domain yang ingin Anda periksa
# check_https_certificate('www.fs.fed.us')


# 🎯Step 4: Save feature extraction and pre-processed data into CSV
---

In [26]:
df_url.to_csv('/content/drive/MyDrive/data_url_after_feature_extraction.csv', index=False)