In [None]:
import glob
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Dataset locations, expressed relative to the notebook's working directory.
# Both class folders live under the same root.
_DATASET_ROOT = '../../phishingdatasets'
BENIGN_PATH = f'{_DATASET_ROOT}/benign'
MAL_PATH = f'{_DATASET_ROOT}/malicious'

In [None]:
# Recursively collect every .txt sample below each dataset folder.
# glob returns paths in an arbitrary, filesystem-dependent order; sort them so
# that later slices of these lists pick the same files on every run and machine.
benign_files = sorted(glob.glob(f'{BENIGN_PATH}/**/*.txt', recursive=True))
mal_files = sorted(glob.glob(f'{MAL_PATH}/**/*.txt', recursive=True))

In [None]:
# https://www.nature.com/articles/s41598-022-10841-5.pdf

class Features:
    """Count HTML elements of a page and classify the references they carry.

    The markup is parsed once with BeautifulSoup's ``html.parser``; every
    ``get_attributes`` call queries that same parse tree.
    """

    # Prefixes (compared case-insensitively) that mark a reference as pointing
    # to an absolute web URL. '//' covers protocol-relative URLs such as
    # '//cdn.example.com/app.js'.
    _EXTERNAL_PREFIXES = ('http://', 'https://', '//')

    def __init__(self, body):
        """Keep the raw markup and build the parse tree.

        Args:
            body: HTML source of the page to analyse.
        """
        self.body = body
        self.soup = BeautifulSoup(body, 'html.parser')

    @classmethod
    def _is_external(cls, reference):
        """Return True when ``reference`` is an absolute or protocol-relative web URL."""
        return reference.strip().lower().startswith(cls._EXTERNAL_PREFIXES)

    def get_attributes(self, attribute, alias, filter=None, **kwargs):
        """Count the ``attribute`` tags and split them by the kind of reference.

        Args:
            attribute: Tag name handed to ``find_all`` (for example ``'img'``).
            alias: Suffix used to build the keys of the returned dict.
            filter: Name of the HTML attribute that holds the tag's reference
                (for example ``'src'`` or ``'href'``). The name shadows the
                builtin but is kept because callers pass it by keyword.
            **kwargs: Extra search criteria forwarded unchanged to ``find_all``.

        Returns:
            A dict with the keys ``total_<alias>``, ``with_int_ref_<alias>``,
            ``with_ext_ref_<alias>``, ``with_contents_<alias>`` and ``'ref'``
            (the ``filter`` value that was used). A tag with a non-empty
            reference is counted as external when the reference starts with
            ``http://``, ``https://`` or ``//`` and as internal otherwise. A
            tag without a reference is counted under ``with_contents`` when it
            has child nodes.
        """
        features = {f'total_{alias}': 0,
                    f'with_int_ref_{alias}': 0,
                    f'with_ext_ref_{alias}': 0,
                    f'with_contents_{alias}': 0,
                    'ref': filter
                    }
        all_attributes = self.soup.find_all(attribute, **kwargs)
        features[f'total_{alias}'] = len(all_attributes)
        for att in all_attributes:
            content = att.get(filter)
            # BeautifulSoup returns a list for multi-valued attributes
            # (e.g. 'rel', 'class'); flatten it so the prefix test works.
            if isinstance(content, (list, tuple)):
                content = ' '.join(content)
            if content:
                if self._is_external(content):
                    features[f'with_ext_ref_{alias}'] += 1
                else:
                    features[f'with_int_ref_{alias}'] += 1
            elif att.contents:
                features[f'with_contents_{alias}'] += 1
        return features


In [None]:
def gen_features(file: str) -> dict:
    """Build one feature row for the HTML document stored at ``file``.

    Args:
        file: Path of the text file that contains the page markup.

    Returns:
        A dict holding the source path under ``'file'`` plus the counters
        produced by ``Features.get_attributes`` for images, stylesheets,
        scripts, anchors and forms. Every ``get_attributes`` result also
        carries a ``'ref'`` key; because the dicts are merged, only the value
        from the last call (``'action'``) survives.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Scraped pages come in mixed encodings. Decode as UTF-8 and replace
    # undecodable bytes instead of depending on the platform default encoding
    # and failing on the first stray byte; tag structure is unaffected.
    with open(file, 'r', encoding='utf-8', errors='replace') as fileob:
        file_data = fileob.read()

    all_features = Features(file_data)
    merged_features = {
        'file': file,
        **all_features.get_attributes('img', 'img', filter='src'),
        **all_features.get_attributes('link', 'css', filter='href', rel='stylesheet'),
        **all_features.get_attributes('script', 'script', filter='src'),
        **all_features.get_attributes('a', 'a', filter='href'),
        **all_features.get_attributes('form', 'form', filter='action')
    }
    return merged_features



In [None]:
# Extract features for at most the first 10,000 benign samples.
# Errors are handled per file: previously the try/except wrapped the whole
# loop, so the first failing file silently dropped every file after it.
ben_files = []
for benign_file in benign_files[:10000]:
    try:
        features = gen_features(benign_file)
    except Exception as err:
        # Report which sample failed and carry on with the next one.
        print(f'{benign_file}: {err}')
        continue
    ben_files.append(features)

benign_df = pd.DataFrame(ben_files)
benign_df['result'] = 'benign'

In [None]:
# Extract features for at most the first 10,000 phishing samples.
# Errors are handled per file: previously the try/except wrapped the whole
# loop, so the first failing file silently dropped every file after it.
phish_files = []
for mal_file in mal_files[:10000]:
    try:
        features = gen_features(mal_file)
    except Exception as err:
        # Report which sample failed and carry on with the next one.
        print(f'{mal_file}: {err}')
        continue
    phish_files.append(features)

phish_df = pd.DataFrame(phish_files)
phish_df['result'] = 'phishing'

In [None]:
# Stack both classes into a single frame with a fresh index and discard the
# bookkeeping 'ref' column, which is not a model feature.
merged_df = (
    pd.concat([phish_df, benign_df], ignore_index=True)
    .drop(columns=['ref'])
)
merged_df.head()

In [None]:
# Feature matrix: every column except the first (the 'file' path) and the
# last (the 'result' label). Target vector: the 'result' label.
X = merged_df[merged_df.columns[1:-1]]
y = merged_df.loc[:, 'result']

In [None]:
# Hold out 10% of the samples for evaluation; stratifying on the label keeps
# the class ratio equal in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=84,
    stratify=y,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to the held-out split, so no test statistics leak in.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the classifier. The seed is fixed (same value as the data split) so
# that re-running the notebook trains the same model and reports the same scores.
classifier = GradientBoostingClassifier(n_estimators=100, random_state=84)
# Train the model using the training set
classifier.fit(X_train, y_train)

# Predict on the held-out data and report per-class precision/recall/F1
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))