In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from ydata_profiling import ProfileReport

In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
phishing_websites = fetch_ucirepo(id=327) 
  
# data (as pandas dataframes) 
X = phishing_websites.data.features 
y = phishing_websites.data.targets 
  
# metadata 
print(phishing_websites.metadata) 
  
# variable information 
print(phishing_websites.variables) 

{'uci_id': 327, 'name': 'Phishing Websites', 'repository_url': 'https://archive.ics.uci.edu/dataset/327/phishing+websites', 'data_url': 'https://archive.ics.uci.edu/static/public/327/data.csv', 'abstract': 'This dataset collected mainly from: PhishTank archive, MillerSmiles archive, Googleâ€™s searching operators.', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 11055, 'num_features': 30, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['result'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2012, 'last_updated': 'Tue Mar 05 2024', 'dataset_doi': '10.24432/C51W2X', 'creators': ['Rami Mohammad', 'Lee McCluskey'], 'intro_paper': {'ID': 396, 'type': 'NATIVE', 'title': 'An assessment of features related to phishing websites using an automated technique', 'authors': 'R. Mohammad, F. Thabtah, L. Mccluskey', 'venue': 'International Conference for Internet Tec

In [5]:
dataset = pd.concat([X,y], ignore_index=True)

### Statistical summary

In [6]:
dataset.head()

Unnamed: 0,having_ip_address,url_length,shortining_service,having_at_symbol,double_slash_redirecting,prefix_suffix,having_sub_domain,sslfinal_state,domain_registration_length,favicon,...,popupwindow,iframe,age_of_domain,dnsrecord,web_traffic,page_rank,google_index,links_pointing_to_page,statistical_report,result
0,-1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,
1,1.0,1.0,1.0,1.0,1.0,-1.0,0.0,1.0,-1.0,1.0,...,1.0,1.0,-1.0,-1.0,0.0,-1.0,1.0,1.0,1.0,
2,1.0,0.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,...,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,0.0,-1.0,
3,1.0,0.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,...,1.0,1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,
4,1.0,0.0,-1.0,1.0,1.0,-1.0,1.0,1.0,-1.0,1.0,...,-1.0,1.0,-1.0,-1.0,0.0,-1.0,1.0,1.0,1.0,


In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22110 entries, 0 to 22109
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   having_ip_address           11055 non-null  float64
 1   url_length                  11055 non-null  float64
 2   shortining_service          11055 non-null  float64
 3   having_at_symbol            11055 non-null  float64
 4   double_slash_redirecting    11055 non-null  float64
 5   prefix_suffix               11055 non-null  float64
 6   having_sub_domain           11055 non-null  float64
 7   sslfinal_state              11055 non-null  float64
 8   domain_registration_length  11055 non-null  float64
 9   favicon                     11055 non-null  float64
 10  port                        11055 non-null  float64
 11  https_token                 11055 non-null  float64
 12  request_url                 11055 non-null  float64
 13  url_of_anchor               110

In [8]:
num_features = dataset.select_dtypes(include=np.number)

In [9]:
num_features.describe().style.format(precision=2)

Unnamed: 0,having_ip_address,url_length,shortining_service,having_at_symbol,double_slash_redirecting,prefix_suffix,having_sub_domain,sslfinal_state,domain_registration_length,favicon,port,https_token,request_url,url_of_anchor,links_in_tags,sfh,submitting_to_email,abnormal_url,redirect,on_mouseover,rightclick,popupwindow,iframe,age_of_domain,dnsrecord,web_traffic,page_rank,google_index,links_pointing_to_page,statistical_report,result
count,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0,11055.0
mean,0.31,-0.63,0.74,0.7,0.74,-0.73,0.06,0.25,-0.34,0.63,0.73,0.68,0.19,-0.08,-0.12,-0.6,0.64,0.71,0.12,0.76,0.91,0.61,0.82,0.06,0.38,0.29,-0.48,0.72,0.34,0.72,0.11
std,0.95,0.77,0.67,0.71,0.67,0.68,0.82,0.91,0.94,0.78,0.69,0.74,0.98,0.72,0.76,0.76,0.77,0.71,0.32,0.65,0.41,0.79,0.58,1.0,0.93,0.83,0.88,0.69,0.57,0.69,0.99
min,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,-1.0,-1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,-1.0,-1.0,0.0,-1.0,1.0,0.0,1.0,-1.0
50%,1.0,-1.0,1.0,1.0,1.0,-1.0,0.0,1.0,-1.0,1.0,1.0,1.0,1.0,0.0,0.0,-1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,1.0,0.0,1.0,1.0
75%,1.0,-1.0,1.0,1.0,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,-1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
profile = ProfileReport(num_features,
                        title='Phishing Websites Dataset'
                        )
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
profile.to_file('../outputs/profile_phishing_websites.html')

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
skewness = (
    num_features
    .skew()
    .reset_index()
    .rename(columns={"index": "variable", 0: "skewness"})
)
kurtosis = (
    num_features
    .kurtosis()
    .reset_index()
    .rename(columns={"index": "variable", 0: "kurtosis"})
)
skewness_kurtosis = pd.concat([skewness, kurtosis["kurtosis"]], axis=1)
skewness_kurtosis.round(2).style.format(precision=2).background_gradient(
    vmax=3, vmin=-1, cmap="Blues"
)

Unnamed: 0,variable,skewness,kurtosis
0,having_ip_address,-0.66,-1.56
1,url_length,1.64,0.71
2,shortining_service,-2.19,2.81
3,having_at_symbol,-1.96,1.86
4,double_slash_redirecting,-2.21,2.89
5,prefix_suffix,2.17,2.7
6,having_sub_domain,-0.12,-1.49
7,sslfinal_state,-0.52,-1.6
8,domain_registration_length,0.72,-1.49
9,favicon,-1.62,0.61


In [17]:
fig, axes = plt.subplots(3, 2, figsize=(20, 10), dpi=200)
for ax, feature in zip(axes.flat, num_features.columns):
    sns.histplot(x=num_features[feature], ax=ax)
    ax.set_title(f'Histogram - {feature}')
plt.tight_layout()
plt.show()

  plt.show()


In [19]:
dataset['result'].value_counts()

result
 1.0    6157
-1.0    4898
Name: count, dtype: int64