# 📢 File explanation!!
This notebook preprocesses the dataset based on the conclusions drawn in 📈 01_Exploratory Data Analysis.


# 🎯 **Step 0: Import library**
---

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
import string
import nltk

from nltk.util import ngrams
from itertools import islice
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize

In [22]:
# Use the seaborn look for every matplotlib figure drawn in this notebook.
plt.style.use('seaborn-v0_8')

# NLTK resources needed by the preprocessing steps below:
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases, 3.9+, load
#                        'punkt_tab' instead of the pickled 'punkt' models)
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# Colab-only step: mount Google Drive so the dataset paths under
# /content/drive used later in the notebook can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 🎯 **Step 1: Load dataset**
---

## ✨ 1.1 - Dataset URL Combined

In [24]:
# Location of the combined URL dataset on the mounted Google Drive.
path = '/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Dataset/URL/URL.xlsx'

# Read the Excel workbook into a DataFrame.
df_combined = pd.read_excel(path)

# Preview the first rows.
df_combined.head()

Unnamed: 0,Category,Data
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...
2,spam,kimsinc564.000webhostapp.com/notification.php?...
3,spam,benjim.com/all
4,spam,www.m.micesrunescape.com-we.ru/


In [25]:
# Show column dtypes and non-null counts to check for missing values.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45373 entries, 0 to 45372
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  45373 non-null  object
 1   Data      45373 non-null  object
dtypes: object(2)
memory usage: 709.1+ KB


In [26]:
# Rename the raw 'Data' column to 'URL'; every later cell refers to it by that name.
df_combined.rename(columns={'Data': 'URL'}, inplace=True)

# df_combined['URL'].loc[0]

In [27]:
# Look at one raw URL (row label 4).
df_combined.loc[4, 'URL']

'www.m.micesrunescape.com-we.ru/'

## ✨ 1.2 - Dataset Non-Phishing from Alexa

In [28]:
# path = '/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Dataset/URL/nonphishing_alexa_top-1m.csv'

# df_alexa = pd.read_csv(path)

# df_alexa.head()

# 🎯 **Step 2: Preprocessing**

---


## ✨ 2.1 - Remove Duplicate Data

Berdasarkan step 4 dari Exploratory Data Analysis (EDA), saya menghapus data yang terduplikasi dan hanya mempertahankan 1 dari data terduplikasi. Data terduplikasi dari EDA teridentifikasi sebanyak 1720 data

In [29]:
# Drop rows that are exact duplicates across all columns, keeping the first occurrence.
# The index is not reset here, so surviving rows keep their original labels.
df_combined.drop_duplicates(keep='first', inplace=True)

In [30]:
# Sanity check: count URLs that still occur more than once after de-duplication.
duplicate = df_combined['URL']

duplicate.duplicated().sum()

0

In [31]:
# Rows per category, smallest class first, to check the class balance.
url_counts = df_combined['Category'].value_counts(ascending=True)

url_counts

spam    21391
ham     22687
Name: Category, dtype: int64

🚧 Berdasarkan hasil di atas dapat dilihat bahwasanya jumlah distribusi data setiap kategori masih tidak seimbang

## ✨ 2.2 - Balance the amount of data for each category

🔎 Karena data yang  kurang ialah data dari kategori non-phishing maka saya hanya tinggal menambahkan data kategori non-phishing dari data Alexa dengan pengambilannya secara acak

In [32]:
# Number of rows the smallest class is missing compared with the largest one.
# Derived from `url_counts` instead of hard-coded totals, so the value always
# matches the dataset that is actually loaded.
number_of_data_added = int(url_counts.max() - url_counts.min())

In [33]:
# additional_non_phishing_urls = df_alexa.sample(n=number_of_data_added, replace=True)

In [34]:
# additional_non_phishing_urls.head()

In [35]:
# additional_non_phishing_urls['Category'] = 'non-phishing'

# additional_non_phishing_urls.head()

In [36]:
# additional_non_phishing_urls.rename(columns={'google.com': 'URL'}, inplace=True)

# additional_non_phishing_urls.head()

In [37]:
# additional_non_phishing_urls = additional_non_phishing_urls[['URL', 'Category']]

# additional_non_phishing_urls.head()

In [38]:
# df_combined = pd.concat([df_combined[['Category', 'URL']], additional_non_phishing_urls[['Category', 'URL']]])

# df_combined.head()

In [39]:
# url_counts = df_combined['Category'].value_counts(ascending=True)

# url_counts

## ✨ 2.3 - Change to Lowercase


In [40]:
def case_folding_url(url):
    """Return ``url`` converted to lowercase."""
    return url.lower()

In [41]:
# Create 'clean_URL' as a lowercased copy of 'URL'; the raw column is left untouched.
df_combined['clean_URL'] = df_combined['URL'].apply(case_folding_url)

df_combined.head()

Unnamed: 0,Category,URL,clean_URL
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,logtelstra2021.ddnsking.com/0dfa1b53b835500696...
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,dvsber.ru/modules/mod_ariimageslidersa/krenk3n...
2,spam,kimsinc564.000webhostapp.com/notification.php?...,kimsinc564.000webhostapp.com/notification.php?...
3,spam,benjim.com/all,benjim.com/all
4,spam,www.m.micesrunescape.com-we.ru/,www.m.micesrunescape.com-we.ru/


In [42]:
# Show the raw and the lowercased URL of row label 1, one after the other.
display(
    df_combined.loc[1, 'URL'],
    df_combined.loc[1, 'clean_URL'],
)

'dvsber.ru/modules/mod_ariimageslidersa/KRENK3N453/e2d9ebfbc01e29f3061836c0dd7ce5dd'

'dvsber.ru/modules/mod_ariimageslidersa/krenk3n453/e2d9ebfbc01e29f3061836c0dd7ce5dd'

## ✨ 2.4 - Clean Up URL

🔎 Pada step ini saya menghapus http/https dan www karena setelah saya lihat data-datanya ada yang terdapat kedua hal tersebut dan ada yang tidak sehingga saya menyeragamkan semua data yang ada

In [43]:
def cleaning_url_char(url):
    """Normalise a URL into space-separated alphanumeric chunks.

    Steps:
      1. Strip surrounding whitespace.
      2. Drop a leading ``http://`` / ``https://`` scheme.
      3. Drop a leading ``www.`` host prefix.
      4. Replace each run of dots with a single space.
      5. Replace every remaining character that is not an ASCII letter, a
         digit or whitespace (slashes, ``?``, ``_``, emoji, ...) with a space.

    The scheme and ``www.`` patterns are anchored to the start of the string.
    Unanchored, they also matched in the middle of a URL and silently fused
    the surrounding text (``awww.site.com`` was turned into ``asite com``).

    Parameters
    ----------
    url : str
        URL to clean. The prefix patterns are lowercase, so the URL is
        expected to be lowercased beforehand.

    Returns
    -------
    str
        Cleaned text without leading or trailing whitespace.
    """
    # Strip first so the anchored prefix patterns see the real start of the URL.
    url = url.strip()

    url = re.sub(r'^https?://', '', url)
    url = re.sub(r'^www\.', '', url)

    # Collapse dot runs before the generic pass so 'a..b' yields one separator.
    url = re.sub(r'\.+', ' ', url)

    # Everything else that is not alphanumeric or whitespace becomes a separator.
    url = re.sub(r'[^a-zA-Z0-9\s]', ' ', url)

    return url.strip()

# def cleaning_url_word(url):

#     url = re.sub(r'https?://', '', url)

#     url = re.sub(r'www\.', '', url)

#     url = re.sub(r'\.+', '/', url)

#     url = re.sub(r'[^a-zA-Z0-9\s]', '/', url) #remove emoji

#     url = url.strip()

#     return url

In [44]:
# Replace the lowercased URLs with their cleaned, space-separated form.
df_combined['clean_URL'] = df_combined['clean_URL'].apply(cleaning_url_char)

# df_combined['clean_URL_char'] = df_combined['clean_URL'].apply(cleaning_url_char)
# df_combined['clean_URL_word'] = df_combined['clean_URL'].apply(cleaning_url_word)

df_combined.head()

Unnamed: 0,Category,URL,clean_URL
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,logtelstra2021 ddnsking com 0dfa1b53b835500696...
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,dvsber ru modules mod ariimageslidersa krenk3n...
2,spam,kimsinc564.000webhostapp.com/notification.php?...,kimsinc564 000webhostapp com notification php ...
3,spam,benjim.com/all,benjim com all
4,spam,www.m.micesrunescape.com-we.ru/,m micesrunescape com we ru


In [45]:
# display(df_combined['URL'].loc[1])
# display(df_combined['clean_URL_char'].loc[1])
# display(df_combined['clean_URL_word'].loc[1])

## ✨ 2.5 - Breaks URL into tokens

🔎 At this stage I use tokenization from the NLTK library

In [46]:
# def tokenizing_url(url):

#     url_tokens = re.findall(r'\w+|[\W]', url)

#     return url_tokens


def tokenizing_url(url):
    """Split a cleaned URL string into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(url)

In [47]:
# Turn each cleaned string into a list of tokens.
# NOTE: this overwrites 'clean_URL' (strings -> lists), so the cell is meant to
# run exactly once, right after the cleaning step.
df_combined['clean_URL'] = df_combined['clean_URL'].apply(tokenizing_url)

# df_combined['clean_URL_char'] = df_combined['clean_URL'].apply(cleaning_url_char)
# df_combined['clean_URL_word'] = df_combined['clean_URL'].apply(cleaning_url_word)

df_combined.head()

Unnamed: 0,Category,URL,clean_URL
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"[logtelstra2021, ddnsking, com, 0dfa1b53b83550..."
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"[dvsber, ru, modules, mod, ariimageslidersa, k..."
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"[kimsinc564, 000webhostapp, com, notification,..."
3,spam,benjim.com/all,"[benjim, com, all]"
4,spam,www.m.micesrunescape.com-we.ru/,"[m, micesrunescape, com, we, ru]"


In [48]:
# display(df_combined['URL'].loc[1])
# display(df_combined['clean_URL_char'].loc[1])
# display(df_combined['clean_URL_word'].loc[1])

In [49]:
# display(df_combined['URL'].loc[2])
# display(df_combined['clean_URL_char'].loc[2])
# display(df_combined['clean_URL_word'].loc[2])

## ✨ 2.6 -  Remove Stopwords

In [50]:
def filtering_stopwords(url):
    """Remove English stopwords from a list of URL tokens.

    Parameters
    ----------
    url : list of str
        Tokens of a single URL.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stopword list, in their
        original order.
    """
    # Build the stopword set once and keep it on the function object: this
    # function is applied to every row, and rebuilding the set from the NLTK
    # corpus for each row is repeated, loop-invariant work.
    stopword_set = getattr(filtering_stopwords, '_stopword_set', None)
    if stopword_set is None:
        stopword_set = frozenset(stopwords.words('english'))
        filtering_stopwords._stopword_set = stopword_set

    return [token for token in url if token not in stopword_set]

In [51]:
# Drop English stopwords from every token list.
df_combined['clean_URL'] = df_combined['clean_URL'].apply(filtering_stopwords)

# df_combined['clean_URL_char'] = df_combined['clean_URL'].apply(cleaning_url_char)
# df_combined['clean_URL_word'] = df_combined['clean_URL'].apply(cleaning_url_word)

df_combined.head()

Unnamed: 0,Category,URL,clean_URL
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"[logtelstra2021, ddnsking, com, 0dfa1b53b83550..."
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"[dvsber, ru, modules, mod, ariimageslidersa, k..."
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"[kimsinc564, 000webhostapp, com, notification,..."
3,spam,benjim.com/all,"[benjim, com]"
4,spam,www.m.micesrunescape.com-we.ru/,"[micesrunescape, com, ru]"


In [52]:
# display(df_combined['URL'].loc[1])
# display(df_combined['clean_URL_char'].loc[1])
# display(df_combined['clean_URL_word'].loc[1])

## ✨ 2.7 - Change the word to its base word

In [53]:
def lemmatize_url(url):
    """Return the tokens of ``url`` with each one run through WordNet lemmatization."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in url]

In [54]:
# Lemmatize every token in each token list.
df_combined['clean_URL'] = df_combined['clean_URL'].apply(lemmatize_url)

# df_combined['clean_URL_char'] = df_combined['clean_URL'].apply(cleaning_url_char)
# df_combined['clean_URL_word'] = df_combined['clean_URL'].apply(cleaning_url_word)

df_combined.head()

Unnamed: 0,Category,URL,clean_URL
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"[logtelstra2021, ddnsking, com, 0dfa1b53b83550..."
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"[dvsber, ru, module, mod, ariimageslidersa, kr..."
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"[kimsinc564, 000webhostapp, com, notification,..."
3,spam,benjim.com/all,"[benjim, com]"
4,spam,www.m.micesrunescape.com-we.ru/,"[micesrunescape, com, ru]"


In [55]:
# display(df_combined['URL'].loc[1])
# display(df_combined['clean_URL_char'].loc[1])
# display(df_combined['clean_URL_word'].loc[1])

## ✨ 2.8 - Recombine the tokenized words into sentences

In [56]:
# Join each token list back into one space-separated string per row.
df_combined['clean_URL_sentence'] = df_combined['clean_URL'].map(' '.join)
df_combined.head()

Unnamed: 0,Category,URL,clean_URL,clean_URL_sentence
0,spam,logtelstra2021.ddnsking.com/0dfa1b53b835500696...,"[logtelstra2021, ddnsking, com, 0dfa1b53b83550...",logtelstra2021 ddnsking com 0dfa1b53b835500696...
1,spam,dvsber.ru/modules/mod_ariimageslidersa/KRENK3N...,"[dvsber, ru, module, mod, ariimageslidersa, kr...",dvsber ru module mod ariimageslidersa krenk3n4...
2,spam,kimsinc564.000webhostapp.com/notification.php?...,"[kimsinc564, 000webhostapp, com, notification,...",kimsinc564 000webhostapp com notification php ...
3,spam,benjim.com/all,"[benjim, com]",benjim com
4,spam,www.m.micesrunescape.com-we.ru/,"[micesrunescape, com, ru]",micesrunescape com ru


In [57]:
# display(df_combined['URL'].loc[1])
# display(df_combined['clean_URL_char'].loc[1])
# display(df_combined['clean_URL_char_sentence'].loc[1])

# 🎯 **Step 3: Save pre-processed data into CSV**

---



In [58]:
# Destination of the pre-processed dataset on Google Drive.
# Note: this rebinds `path`, which previously pointed at the input workbook.
path = '/content/drive/MyDrive/SKILL ACADEMY/FINAL PROJECT/Dataset/URL/combined_dataset_url_after_preprocess_v3.csv'

# Write all columns without the DataFrame index. 'clean_URL' holds Python
# lists at this point, so the CSV stores their text representation.
df_combined.to_csv(path, index=False)