In [1]:
import torch
print (torch.cuda.is_available())

True


In [2]:
import tensorflow as tf
print (tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
!pip install scikit-learn pandas numpy matplotlib seaborn



In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from google.colab import drive
drive.mount('/content/drive')

#Đọc dữ liệu từ file CSV
data = pd.read_csv('/content/drive/MyDrive/Project 2/PhiUSIIL_Phishing_URL_Dataset.csv')
print("Dataset shape:", data.shape)
print("Columns:", data.columns.tolist())
print("First 5 rows:")
print(data.head())

Mounted at /content/drive
Dataset shape: (235795, 53)
Columns: ['FILENAME', 'URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP', 'TLD', 'CharContinuationRate', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'HasTitle', 'Title', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef', 'label']
First 5 rows:
     FILENAME                                 URL  URLLength  

In [5]:
# Loại bỏ cột không cần thiết:

drop_columns = ['FILENAME', 'URL', 'Domain', 'TLD', 'Title']
data = data.drop(columns=drop_columns)

# Kiểm tra lại dữ liệu

print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235795 entries, 0 to 235794
Data columns (total 48 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   URLLength                   235795 non-null  int64  
 1   DomainLength                235795 non-null  int64  
 2   IsDomainIP                  235795 non-null  int64  
 3   CharContinuationRate        235795 non-null  float64
 4   TLDLength                   235795 non-null  int64  
 5   NoOfSubDomain               235795 non-null  int64  
 6   HasObfuscation              235795 non-null  int64  
 7   NoOfObfuscatedChar          235795 non-null  int64  
 8   ObfuscationRatio            235795 non-null  float64
 9   NoOfLettersInURL            235795 non-null  int64  
 10  LetterRatioInURL            235795 non-null  float64
 11  NoOfDegitsInURL             235795 non-null  int64  
 12  DegitRatioInURL             235795 non-null  float64
 13  NoOfEqualsInUR

In [6]:
# Kiểm tra dữ liệu còn thiếu

print(data.isnull().sum().sort_values(ascending=False).head(10))

URLLength               0
DomainLength            0
IsDomainIP              0
CharContinuationRate    0
TLDLength               0
NoOfSubDomain           0
HasObfuscation          0
NoOfObfuscatedChar      0
ObfuscationRatio        0
NoOfLettersInURL        0
dtype: int64


In [7]:
# Chia tập dữ liệu để train và thử nghiệm
X = data.drop(columns = ['label']) # cột nhãn
Y = data['label']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Thông tin kích thước tập dữ liệu
print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# Cấu hình Random Forest
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    bootstrap=True,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Cross-Validation trước khi train
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X_train, Y_train, cv=cv, scoring='accuracy')

# In kết quả Cross-Validation
print(f" Mean CV Accuracy: {np.mean(cv_scores) * 100:.2f}%")
print(f" CV Accuracy Scores: {cv_scores * 100}")

# Train mô hình
clf.fit(X_train, Y_train)

Train size: (165056, 47), Test size: (70739, 47)
 Mean CV Accuracy: 99.99%
 CV Accuracy Scores: [99.97879559 99.98182424 99.98182424 99.99697071 99.98788283]


In [8]:
# Dự đoán trên tập kiểm tra
Y_pred = clf.predict(X_test)

# Tính độ chính xác
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# In báo cáo chi tiết
print(classification_report(Y_test, Y_pred))

Accuracy: 99.98%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30151
           1       1.00      1.00      1.00     40588

    accuracy                           1.00     70739
   macro avg       1.00      1.00      1.00     70739
weighted avg       1.00      1.00      1.00     70739



In [9]:
# Lưu và tải mô hình
import joblib
from google.colab import files
joblib.dump(clf, 'random_forest_model.pkl')
files.download('random_forest_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>