In [22]:
import pandas as pd
from urllib.parse import urlparse
import re

In [23]:
df = pd.read_csv("malicious_url.csv")
# df.head()

# df_sample = df.head(5000) 
df_sample = df.groupby("type").sample(n=250, random_state=42)
print(df_sample["type"].value_counts()) 

type
benign        250
defacement    250
malware       250
phishing      250
Name: count, dtype: int64


In [24]:
def extract_features(url):

    feature = {}

    # URL Length
    feature['url_length'] = len(url)

    # Count Special Character
    feature['num_special_chars'] = sum(1 for c in url if c in['?', '=', '&', '%', '@', '-', '_'])

    # Count Digits
    feature['num_digit'] = sum(c.isdigit() for c in url)

    # Count subdomains (number of dots in URL)
    feature['num_subdomains'] = url.count('.')

    # Check if https is present
    # feature['https_present'] = 1 if url.startswith("https") else 0

    # Extract domain & path 
    parsed_url = urlparse(url)
    # feature['domain'] = parsed_url.netloc # Extract domain
    feature['path_length'] = len(parsed_url.path) # Path length
    feature['num_path_segments'] = parsed_url.path.count('/') # Count path segments

    # Check if the URL contains an IP address
    # feature['contains_ip'] = 2 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0

    # Check if URL contains suspicious keywords
    suspicious_keywords = ['pay', '.io', 'login', 'secure', 'wallet', 'auth', 'support', 'block']
    feature['num_suspicious_keywords'] = sum(1 for keyword in suspicious_keywords if keyword in url)

    # Check if URL contains brand name
    # brand_names = ['facebook', 'google', 'paypal', 'amazon']
    # feature['contains_brand'] = sum(1 for brand in brand_names if brand in url.lower())



    return feature

In [25]:
# Apply feature extraction to all URLs
features_df = df_sample['url'].apply(lambda x: extract_features(x)).apply(pd.Series)

# Merge extracted features with the sampled dataset
df_sample = pd.concat([df_sample, features_df], axis=1)

# Drop the 'url' column from df_sample
df_sample = df_sample.drop(columns=['url'])

# Save the cleaned sampled dataset
df_sample.to_csv("sample_malicious_url.csv", index=False)

# Display the first few rows of the cleaned dataset
df_sample.head()

Unnamed: 0,type,url_length,num_special_chars,num_digit,num_subdomains,path_length,num_path_segments,num_suspicious_keywords
514768,benign,48,2,0,2,48,3,0
222536,benign,44,0,0,3,44,3,0
276748,benign,31,2,0,1,17,1,0
50193,benign,30,0,0,1,30,3,0
339822,benign,32,1,0,1,32,2,0
