In [11]:
import pandas as pd
import re
import tldextract
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
from sklearn.feature_extraction import FeatureHasher

# Input CSV locations, relative to the working directory. Forward slashes are
# used because they resolve on Windows as well as on POSIX systems, whereas a
# backslash-separated relative path is only split into directories on Windows.
file_path1 = 'data/dt/url.csv'
file_path2 = 'data/dt/urls.csv'

df1 = pd.read_csv(file_path1)
df2 = pd.read_csv(file_path2)

# Stack both datasets into a single frame with a fresh 0..n-1 index so that
# rows from the two files do not share index labels.
df = pd.concat([df1, df2], ignore_index=True)

# Quick look at the schema and the first few rows.
print(df.columns)
print(df.head())

# Dotted-quad pattern behind the `has_ip` feature, compiled once because
# extract_features is applied to every URL in the dataset. It is searched
# across the whole URL string and does not restrict each octet to 0-255.
_IP_PATTERN = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')

# Suffixes flagged by the `suspicious_tld` feature.
# NOTE(review): 'xyzzy' may be a typo for 'xyz' -- confirm before changing it,
# since editing this collection alters the feature the model is trained on.
_SUSPICIOUS_TLDS = frozenset(['top', 'info', 'xyzzy'])


def extract_features(url):
    """Build the per-URL feature dictionary used for training.

    Args:
        url: The URL to analyse. Any value that is not a ``str`` (for
            example a NaN produced by an empty CSV cell) is treated as an
            empty URL instead of raising inside the regex or tldextract.

    Returns:
        dict: A mapping with these keys, in this order:
            ``domain``, ``subdomain``, ``suffix``: the corresponding parts
                reported by ``tldextract.extract``.
            ``has_ip``: True when a dotted-quad number sequence appears
                anywhere in the URL string.
            ``suspicious_tld``: True when the extracted suffix is one of
                the suffixes in ``_SUSPICIOUS_TLDS``.
    """
    if not isinstance(url, str):
        url = ''

    ext = tldextract.extract(url)
    return {
        'domain': ext.domain,
        'subdomain': ext.subdomain,
        'suffix': ext.suffix,
        'has_ip': bool(_IP_PATTERN.search(url)),
        'suspicious_tld': ext.suffix in _SUSPICIOUS_TLDS,
    }

# Fail fast: confirm the target column exists before running the per-URL
# feature extraction, which is the expensive step of this script.
if 'label_binary' not in df.columns:
    raise KeyError("'label_binary' column is missing from the DataFrame")

# One feature dict per URL, expanded into one column per feature.
features = df['url'].apply(extract_features)
features_df = pd.DataFrame(list(features))
features_df['label'] = df['label_binary']

# Cap the cardinality of each categorical column: keep only its most frequent
# values and collapse everything else into a single 'other' bucket.
# NOTE(review): the frequency tables are computed on all rows, i.e. before
# the train/test split below.
top_domains = features_df['domain'].value_counts().nlargest(1000).index
top_subdomains = features_df['subdomain'].value_counts().nlargest(1000).index
top_suffixes = features_df['suffix'].value_counts().nlargest(100).index
for column, frequent_values in (
    ('domain', top_domains),
    ('subdomain', top_subdomains),
    ('suffix', top_suffixes),
):
    features_df[column] = features_df[column].where(
        features_df[column].isin(frequent_values), 'other'
    )

# Hash the three categorical strings of every row into a fixed-width sparse
# matrix so the classifier receives numeric input.
# NOTE(review): the strings are hashed without a column prefix, so the same
# string appearing in different columns maps to the same hashed feature.
hasher = FeatureHasher(input_type='string', n_features=1024)
hashed_features = hasher.transform(features_df[['domain', 'subdomain', 'suffix']].astype(str).values)
hashed_df = pd.DataFrame.sparse.from_spmatrix(hashed_features, columns=[f'hash_{i}' for i in range(hashed_features.shape[1])])

# Swap the raw categorical columns for their hashed representation.
features_df = pd.concat([features_df.drop(columns=['domain', 'subdomain', 'suffix']), hashed_df], axis=1)

X = features_df.drop(columns=['label'])
y = features_df['label']

# Stratify on the label so the train and test sets keep the same class
# proportions; the classes are imbalanced, and an unstratified split lets
# the minority share drift between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Seed the forest so repeated runs yield the same model and metrics.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the model.
# NOTE(review): only the classifier is stored; the top_domains /
# top_subdomains / top_suffixes tables needed to rebuild the same 'other'
# bucketing at prediction time are not saved alongside it.
with open(r'model.pkl', 'wb') as f:
    pickle.dump(clf, f)


Index(['url', 'label', 'label_binary'], dtype='object')
                                                 url       label  label_binary
0                                   br-icloud.com.br    phishing             0
1                mp3raid.com/music/krizz_kaliko.html      benign             1
2                    bopsecrets.org/rexroth/cr/1.htm      benign             1
3  http://www.garage-pirenne.be/index.php?option=...  defacement             0
4  http://adventure-nicaragua.net/index.php?optio...  defacement             0




              precision    recall  f1-score   support

           0       0.91      0.67      0.78     55401
           1       0.94      0.99      0.96    286037

    accuracy                           0.94    341438
   macro avg       0.93      0.83      0.87    341438
weighted avg       0.94      0.94      0.93    341438

