In [11]:
import pandas as pd
from urllib.parse import urlparse
import re

In [12]:
df = pd.read_csv("malicious_url.csv")
# df.head()

# df_sample = df.head(5000) 
df_sample = df.groupby("type").sample(n=1000)
print(df_sample["type"].value_counts()) 

type
benign        1000
defacement    1000
malware       1000
phishing      1000
Name: count, dtype: int64


In [13]:
def extract_features(url):

    feature = {}

    # URL Length
    feature['url_length'] = len(url)

    # Count Special Character
    feature['num_special_chars'] = sum(1 for c in url if c in['?', '=', '&', '%', '@', '-', '_'])

    # Count Digits
    feature['num_digit'] = sum(c.isdigit() for c in url)

    # Count subdomains (number of dots in URL)
    feature['num_subdomains'] = url.count('.')
    
    # Extract domain & path 
    parsed_url = urlparse(url)
    # feature['domain'] = parsed_url.netloc # Extract domain
    feature['path_length'] = len(parsed_url.path) # Path length
    feature['num_path_segments'] = parsed_url.path.count('/') # Count path segments

    # Check if URL contains suspicious keywords
    suspicious_keywords = ['pay', '.io', 'login', 'secure', 'wallet', 'auth', 'support', 'block']
    feature['num_suspicious_keywords'] = sum(1 for keyword in suspicious_keywords if keyword in url)

    return feature

In [14]:
# Apply feature extraction to all URLs
features_df = df_sample['url'].apply(lambda x: extract_features(x)).apply(pd.Series)

# Merge extracted features with the sampled dataset
df_sample = df_sample.reset_index(drop=True)  
features_df = features_df.reset_index(drop=True) 
df_sample = pd.concat([df_sample, features_df], axis=1)

# Drop the 'url' column from df_sample
df_sample = df_sample.drop(columns=['url'])

# Save the cleaned sampled dataset
df_sample.to_csv("sample_malicious_url.csv", index=False)

# Display the first few rows of the cleaned dataset
df_sample.head()

Unnamed: 0,type,url_length,num_special_chars,num_digit,num_subdomains,path_length,num_path_segments,num_suspicious_keywords
0,benign,33,1,0,2,33,1,0
1,benign,156,13,3,2,156,6,0
2,benign,101,8,7,1,101,5,0
3,benign,26,0,0,2,26,1,0
4,benign,69,5,8,2,69,5,0
