In [4]:
import pandas as pd
from urllib.parse import urlparse
import re

In [5]:
# Load the labelled URL dataset.
df = pd.read_csv("u_malicious_url.csv")
# df.head()

# Draw a balanced sample: 900 rows from each value of "Type".
# A fixed random_state makes the sample (and the training CSV derived from it)
# identical on every Restart & Run All.
# df_sample = df.head(5000)
df_sample = df.groupby("Type").sample(n=900, random_state=42)
print(df_sample["Type"].value_counts())

Type
harmful    900
safe       900
Name: count, dtype: int64


In [6]:
# Matches an explicit "scheme://" prefix at the start of a URL.
_SCHEME_PREFIX = re.compile(r'^[A-Za-z][A-Za-z0-9+.\-]*://')

# Characters counted by the 'num_special_chars' feature.
_SPECIAL_CHARS = frozenset('?=&%@-_')

# Substrings counted (at most once each) by 'num_suspicious_keywords'.
_SUSPICIOUS_KEYWORDS = ('pay', '.io', 'login', 'secure', 'wallet', 'auth', 'support', 'block')


def _url_path(url):
    """Return the path component of ``url``, tolerating schemeless and malformed URLs.

    ``urlparse`` only recognises a host when it is introduced by ``//``; for a
    bare ``example.com/a/b`` it would report the whole string as the path. Such
    inputs are therefore parsed with a ``//`` prefix so the host is not counted
    as path. If ``urlparse`` rejects the authority part with ``ValueError``
    (e.g. an unbalanced ``[``), the path is cut out by hand instead of raising.
    """
    if url.startswith('//') or _SCHEME_PREFIX.match(url):
        candidate = url
    else:
        candidate = '//' + url

    try:
        return urlparse(candidate).path
    except ValueError:
        # candidate always contains '//' at this point; everything after it is
        # "authority + path + query + fragment".
        remainder = candidate.split('//', 1)[1]
        remainder = remainder.split('#', 1)[0].split('?', 1)[0]
        slash = remainder.find('/')
        return remainder[slash:] if slash != -1 else ''


def extract_features(url):
    """Compute simple lexical features for a single URL string.

    Parameters
    ----------
    url : str
        The URL to analyse. It may or may not carry a ``scheme://`` prefix.

    Returns
    -------
    dict
        Integer-valued features, in this key order:

        - ``url_length``: number of characters in ``url``.
        - ``num_special_chars``: count of ``? = & % @ - _`` characters.
        - ``num_digit``: count of digit characters.
        - ``num_subdomains``: number of ``.`` characters in the whole URL
          (a dot count, not a true sub-domain count).
        - ``path_length``: length of the URL path component.
        - ``num_path_segments``: number of ``/`` characters in the path.
        - ``num_suspicious_keywords``: how many of the suspicious keywords
          occur in the URL, compared case-insensitively.
    """
    feature = {}

    # URL length
    feature['url_length'] = len(url)

    # Count special characters
    feature['num_special_chars'] = sum(1 for c in url if c in _SPECIAL_CHARS)

    # Count digits
    feature['num_digit'] = sum(c.isdigit() for c in url)

    # Count dots in the whole URL (kept under its historical feature name)
    feature['num_subdomains'] = url.count('.')

    # Path-based features; the host is never counted as part of the path
    path = _url_path(url)
    feature['path_length'] = len(path)
    feature['num_path_segments'] = path.count('/')

    # Suspicious keywords, matched case-insensitively so that e.g. "LOGIN"
    # or "PayPal" are not missed
    lowered = url.lower()
    feature['num_suspicious_keywords'] = sum(
        1 for keyword in _SUSPICIOUS_KEYWORDS if keyword in lowered
    )

    return feature

In [7]:
# Apply feature extraction to all URLs.
# The frame is built once from a list of per-URL dicts; this avoids creating
# one pandas Series per row, which is what `.apply(pd.Series)` does.
feature_rows = df_sample['URL'].map(extract_features).tolist()
features_df = pd.DataFrame(feature_rows)
assert len(features_df) == len(df_sample), "one feature row expected per sampled URL"

# Merge extracted features with the sampled dataset.
# Both frames get a fresh 0..n-1 index so concat aligns them row by row.
df_sample = df_sample.reset_index(drop=True)
features_df = features_df.reset_index(drop=True)
df_sample = pd.concat([df_sample, features_df], axis=1)

# Drop the 'URL' column from df_sample.
# NOTE: df_sample is rebound here without 'URL', so the sampling cell must be
# re-run before this cell can be executed again.
df_sample = df_sample.drop(columns=['URL'])

# Save the cleaned sampled dataset
# df_sample.to_csv("sample_malicious_url.csv", index=False)
df_sample.to_csv("training_malicious_url.csv", index=False)

# Display the first few rows of the cleaned dataset
df_sample.head()

Unnamed: 0,Type,url_length,num_special_chars,num_digit,num_subdomains,path_length,num_path_segments,num_suspicious_keywords
0,harmful,40,0,0,2,24,3,0
1,harmful,77,4,7,3,33,5,0
2,harmful,48,0,0,3,15,2,0
3,harmful,91,6,9,3,62,4,0
4,harmful,35,1,12,4,15,1,0


In [8]:
# import pandas as pd

# # Read the data
# df = pd.read_csv("malicious_url.csv")

# # Define the number of samples to take for each class
# samples_per_class = {
#     "benign": 5000,
#     "malware": 1666,
#     "phishing": 1666,  # Example for another class, adjust as needed
#     "defacement": 1666  # Example for another class, adjust as needed
# }

# # Create a list to store the resampled dataframes
# df_sampled = []

# # Loop through each class and sample the defined number of records
# for label, num_samples in samples_per_class.items():
#     df_class = df[df['type'] == label]
#     df_class_sampled = df_class.sample(n=num_samples, random_state=42)
#     df_sampled.append(df_class_sampled)

# # Concatenate all the resampled dataframes into one
# df_sample = pd.concat(df_sampled)

# # Check the class distribution of the sampled data
# print(df_sample['type'].value_counts())

# df_sample.to_csv("new_url_dataset.csv", index=False)


In [9]:
# df = pd.read_csv("malicious_url.csv")
# df.head()

# df_sample = df.groupby("type").sample(n=1250)
# print(df_sample["type"].value_counts()) 

# df_sample.to_csv("new_url_dataset.csv", index=False)

In [2]:
import pandas as pd


def _sample_per_class(frame, n_per_class, random_state=42):
    """Return ``n_per_class`` randomly chosen rows for each value of "Type".

    The per-class counts of the sample are printed. A fixed ``random_state``
    keeps the selection identical across runs.
    """
    sampled = frame.groupby("Type").sample(n=n_per_class, random_state=random_state)
    print(sampled["Type"].value_counts())
    return sampled


df1 = pd.read_csv("../c_spam_text.csv")
df2 = pd.read_csv("../c_malicious_url.csv")

# 250 rows per class from each source dataset
df1_sample = _sample_per_class(df1, 250)
df2_sample = _sample_per_class(df2, 250)

# Concatenate the datasets
df_combined = pd.concat([df1_sample, df2_sample], ignore_index=True)

# Save the combined dataset
df_combined.to_csv("combined_dataset.csv", index=False)

print("Combined dataset shape:", df_combined.shape)
print(df_combined.head())


Type
harmful    250
safe       250
Name: count, dtype: int64
Type
harmful    250
safe       250
Name: count, dtype: int64
Combined dataset shape: (1000, 2)
                                             Content     Type
0  URGENT. Important information for 02 user. Tod...  harmful
1  Do you want a NEW video phone750 anytime any n...  harmful
2  Promotion Number: 8714714 - UR awarded a City ...  harmful
3  Congrats! 2 mobile 3G Videophones R yours. cal...  harmful
4  Twinks, bears, scallies, skins and jocks are c...  harmful
