In [2]:
import pandas as pd
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv("malicious_url.csv")
df.head()

# Randomly sample 10% of the dataset 
df_sample = df.head(1000) 

In [4]:
def extract_features(url):

    feature = {}

    # URL Length
    feature['url_length'] = len(url)

    # Count Special Character
    feature['num_special_chars'] = sum(1 for c in url if c in['?', '=', '&', '%', '@', '-', '_'])

    # Count Digits
    feature['num_digit'] = sum(c.isdigit() for c in url)

    # Count subdomains (number of dots in URL)
    feature['num_subdomains'] = url.count('.')

    # Check if https is present
    feature['https_present'] = 1 if url.startswith("https") else 0

    # Extract domain & path 
    parsed_url = urlparse(url)
    feature['domain'] = parsed_url.netloc # Extract domain
    feature['path_length'] = len(parsed_url.path) # Path length
    feature['num_path_segments'] = parsed_url.path.count('/') # Count path segments

    # Check if the URL contains an IP address
    feature['contains_ip'] = 2 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0

    return feature

In [None]:
#Apply feature extraction to all URLS
features_df = df_sample['url'].apply(lambda x: extract_features(x)).apply(pd.Series)
X_train, X_test, y_train, y_test = train_test_split(df_sample.drop('type', axis=1), df_sample['type'], test_size=0.2, random_state=42)

# Merge extracted features with the sampled dataset
df_sample = pd.concat([df_sample, features_df], axis=1)

# Save the cleaned sampled dataset
df_sample.to_csv("sample_malicious_url.csv", index=False)

df_sample.head()


Unnamed: 0,url,type,url_length,num_special_chars,num_digit,num_subdomains,https_present,domain,path_length,num_path_segments,contains_ip
0,br-icloud.com.br,phishing,16,1,0,2,0,,16,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,1,1,2,0,,35,2,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,0,1,2,0,,31,3,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,11,7,3,0,www.garage-pirenne.be,10,1,0
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,8,22,2,0,adventure-nicaragua.net,10,1,0
