In [14]:
import pandas as pd
from urllib.parse import urlparse
import re

In [15]:
df = pd.read_csv("malicious_url.csv")
# df.head()

df_sample = df.head(1000) 

In [16]:
def extract_features(url):

    feature = {}

    # URL Length
    feature['url_length'] = len(url)

    # Count Special Character
    feature['num_special_chars'] = sum(1 for c in url if c in['?', '=', '&', '%', '@', '-', '_'])

    # Count Digits
    feature['num_digit'] = sum(c.isdigit() for c in url)

    # Count subdomains (number of dots in URL)
    feature['num_subdomains'] = url.count('.')

    # Check if https is present
    feature['https_present'] = 1 if url.startswith("https") else 0

    # Extract domain & path 
    parsed_url = urlparse(url)
    # feature['domain'] = parsed_url.netloc # Extract domain
    feature['path_length'] = len(parsed_url.path) # Path length
    feature['num_path_segments'] = parsed_url.path.count('/') # Count path segments

    # Check if the URL contains an IP address
    feature['contains_ip'] = 2 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0

    return feature

In [17]:
# Apply feature extraction to all URLs
features_df = df_sample['url'].apply(lambda x: extract_features(x)).apply(pd.Series)

# Merge extracted features with the sampled dataset
df_sample = pd.concat([df_sample, features_df], axis=1)

# Drop the 'url' column from df_sample
df_sample = df_sample.drop(columns=['url'])

# Save the cleaned sampled dataset
df_sample.to_csv("sample_malicious_url.csv", index=False)

# Display the first few rows of the cleaned dataset
df_sample.head()

Unnamed: 0,type,url_length,num_special_chars,num_digit,num_subdomains,https_present,path_length,num_path_segments,contains_ip
0,phishing,16,1,0,2,0,16,0,0
1,benign,35,1,1,2,0,35,2,0
2,benign,31,0,1,2,0,31,3,0
3,defacement,88,11,7,3,0,10,1,0
4,defacement,235,8,22,2,0,10,1,0
