In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import joblib

In [4]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
   ---------------------------------------- 0.0/238.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/238.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/238.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/238.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/238.4 kB ? eta -:--:--
   - -------------------------------------- 10.2/238.4 kB ? eta -:--:--
   ----- --------------------------------- 30.7/238.4 kB 108.9 kB/s eta 0:00:02
   ----- --------------------------------- 30.7/238.4 kB 108.9 kB/s eta 0:00:02
   ----- --------------------------------- 30.7/238.4 kB 108.9 kB/s eta 0:00:02
   ----- -------------------------------


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Load dataset
df = pd.read_csv('dataset_phishing.csv')

# Map labels
mapping = {'legitimate':0, 'phishing':1}
df['status'] = df['status'].map(mapping)

# Define features
selected_features = [
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_qm',
    'nb_and', 'nb_eq', 'nb_slash', 'nb_www', 'nb_com', 'shortening_service'
]

# Select features and target
selected_columns = selected_features + ['status']
df_selected = df[selected_columns]

In [7]:
# Add new features
def add_new_features(df):
    df['url_length_ratio'] = df['length_url'] / df['length_hostname']
    df['special_chars'] = df['nb_dots'] + df['nb_qm'] + df['nb_and'] + df['nb_eq']
    df['special_char_ratio'] = df['special_chars'] / df['length_url']
    return df

df_selected = add_new_features(df_selected)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['url_length_ratio'] = df['length_url'] / df['length_hostname']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['special_chars'] = df['nb_dots'] + df['nb_qm'] + df['nb_and'] + df['nb_eq']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['special_char_ratio'] = df['special_chars'] / df['length_