In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled URL dataset; later cells read its "url" and "type" columns.
df =pd.read_csv("malicious_phish.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
## URL length feature: number of characters in each url string

df["url_len"] = df["url"].apply(len)

In [None]:
## URL part count: number of "."-separated pieces in each url (dots + 1)

df["url_parts"] = df["url"].apply(lambda url: url.count(".") + 1)

In [None]:
# Show the row that has the most dot-separated parts.
# idxmax() returns the index *label* of that row; .max() would return the part
# count itself, which .loc would then misread as a row label.
df.loc[df["url_parts"].idxmax()]

In [None]:
# Frequency of each dot-separated part count across all URLs.
df["url_parts"].value_counts()

In [None]:
# Suffix length: characters after the last "." (the whole url when it has no dot).
df["suffix_len"] = df["url"].apply(lambda url: len(url.rpartition(".")[2]))

In [None]:
# Frequency of each suffix length (characters after the last ".").
df["suffix_len"].value_counts()

In [None]:
# Display df, which now carries url_len, url_parts and suffix_len next to the loaded columns.
df

In [None]:
def query_domain_length(query):
    """Return the length of the second-to-last "."-separated piece of `query`.

    Parameters
    ----------
    query : str
        URL (or any dotted string) to inspect.

    Returns
    -------
    int
        Length of the piece just before the final "."; 0 when `query` is not a
        string or contains no "." at all.
    """
    # Non-string cells (e.g. NaN from a missing value) have no pieces to measure.
    if not isinstance(query, str):
        return 0
    pieces = query.split('.')
    # A string without any "." has a single piece, so there is no second-to-last one.
    if len(pieces) < 2:
        return 0
    return len(pieces[-2])

# Element-wise domain-length feature computed by query_domain_length.
df["domain_len"] = df["url"].map(query_domain_length)

In [None]:
# Preview df after adding the domain_len column.
df.head()

In [None]:
# Lift the column-width limit so URLs are shown in full.
pd.set_option('display.max_colwidth', None)
# All URLs whose label is "malware", selected in a single .loc call.
df.loc[df["type"] == "malware", "url"]

In [None]:
"[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}"

In [None]:
import re

def is_ip_address_present(s):
    """Return 1 if `s` contains an IP-address-looking token, otherwise 0.

    Two shapes are recognised: four dot-separated groups of 1-3 digits, and
    eight colon-separated groups of 1-4 hex digits (full-form IPv6 only).
    Digit groups are not range-checked.
    """
    address_patterns = (
        r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b",
        r"\b([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b",
    )
    hits = [re.search(candidate, s) for candidate in address_patterns]
    return int(any(hits))


In [None]:
# 1/0 flag per URL: does it embed an IP-address-like token?
df["ip_present"] = df["url"].map(is_ip_address_present)

In [None]:
# Malware rows only, restricted to the label, the URL and the new IP flag.
df.loc[df["type"] == "malware", ["type", "url", "ip_present"]]

In [None]:
# Preview df after adding the ip_present column.
df.head()

In [None]:
# Extensions treated as risky downloads (each listed once).
_MALICIOUS_EXTENSIONS = (
    "exe", "dll", "bat", "cmd", "msi", "vbs", "ps1", "psm1", "js", "jse",
    "wsh", "wsf", "hta", "scr", "pif", "cpl", "ade", "adp", "bas", "chm",
    "com", "crt", "csh", "hlp", "inf", "ins", "isp", "job", "lnk", "mda",
    "mdb", "mde", "mdt", "mdw", "mdz", "msc", "msp", "mst", "nws", "pcd",
    "prf", "reg", "scf", "shb", "shs", "tmp", "url", "vb", "vbe", "wsc",
)
# Matches when the string ends with "." followed by one of the extensions above.
_MALICIOUS_EXTENSION_RE = re.compile(
    r"\.(?:" + "|".join(_MALICIOUS_EXTENSIONS) + r")$", re.IGNORECASE
)
# Leading "scheme://" prefix, e.g. "http://".
_URL_SCHEME_RE = re.compile(r"^[a-z][a-z0-9+.\-]*://", re.IGNORECASE)


def has_malicious_file_extension(s):
    """Return 1 if the URL `s` ends with a risky file extension, otherwise 0.

    The extension only counts when the URL has something after the host, i.e.
    a "/" follows the optional scheme. Without that guard, ".com" in the
    extension list would flag every bare host such as "example.com".

    Parameters
    ----------
    s : str
        URL, with or without a leading scheme.

    Returns
    -------
    int
        1 when a "/" follows the host and `s` ends (case-insensitively) with
        one of the listed extensions, else 0.
    """
    host_and_rest = _URL_SCHEME_RE.sub("", s, count=1)
    # No "/" after the host means the trailing ".xyz" belongs to the host name.
    if "/" not in host_and_rest:
        return 0
    return 1 if _MALICIOUS_EXTENSION_RE.search(s) else 0

In [None]:
# 1/0 flag per URL produced by has_malicious_file_extension.
df["file_extension"] = df["url"].map(has_malicious_file_extension)

In [None]:
import re

# Known URL-shortening hosts (each listed once).
_SHORTENING_SERVICES = (
    "bit.ly", "t.co", "tinyurl.com", "ow.ly", "goo.gl", "is.gd", "buff.ly",
    "adcrun.ch", "qr.net", "adf.ly", "bc.vc", "po.st", "tr.im", "v.gd", "x.co",
    "tiny.cc", "tinyurl.co.uk", "tinyurl.de", "tinyurl.fr", "tinyurl.pl",
    "tinylink.in", "tinyuri.ca", "tinyurl.dk", "url.ie", "zi.pe",
)
# One pre-compiled alternation. re.escape makes the "." in each host literal;
# unescaped it matches any character, so "bit.ly" would also hit "bitfly".
_SHORTENING_SERVICE_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(host) for host in _SHORTENING_SERVICES) + r")\b",
    re.IGNORECASE,
)


def has_shortened_link(url):
    """Return 1 if `url` mentions a known URL-shortening host, otherwise 0.

    Parameters
    ----------
    url : str
        URL text to scan.

    Returns
    -------
    int
        1 when any host from the shortener list occurs in `url` as a whole
        word (case-insensitive, dots matched literally), else 0.
    """
    return 1 if _SHORTENING_SERVICE_RE.search(url) else 0

# 1/0 flag per URL: does it mention a known shortening service?
df["shorted_link"] = df["url"].map(has_shortened_link)

In [None]:
# How many URLs were flagged (1) vs. not flagged (0) as using a shortening service.
df["shorted_link"].value_counts()

In [None]:
# Number of "%" characters in each URL.
df["count_percents"] = df["url"].map(lambda url: url.count("%"))

In [None]:
# Number of "=" characters in each URL.
df["count_equals"] = df["url"].map(lambda url: url.count("="))
# Number of "@" characters in each URL.
df["count_ats"] = df["url"].map(lambda url: url.count("@"))

In [None]:
# Print the first ten URLs of every label for a quick side-by-side look.
for value in df["type"].unique():
    print(f'{value} --------------------------------------------------------------------------')
    print(df.loc[df["type"] == value, "url"].head(10))
    print("\n"*2)

## EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Preview df with all engineered columns before exploring it.
df.head()

In [None]:
# Show every row of the (long) transposed summary instead of a truncated view.
pd.set_option('display.max_rows', None)
# Summary statistics per label, flipped so that each label becomes a column.
df.groupby("type").describe().T

In [None]:
# Restore every pandas option to its default, undoing the display.max_colwidth
# and display.max_rows overrides set in earlier cells.
pd.reset_option("all")

In [None]:
# Bare expression: builds and echoes a NumPy-vectorized wrapper around
# has_shortened_link; the wrapper is not stored anywhere.
np.vectorize(has_shortened_link)

In [None]:
# Load a second, untouched copy of the CSV (none of the columns added to df are present).
new_df = pd.read_csv("malicious_phish.csv")

In [None]:
# Add only the shortener flag to new_df by calling the vectorized wrapper on the url column.
new_df["shorted_link"] = np.vectorize(has_shortened_link)(new_df["url"])

In [None]:
# Display new_df (the loaded columns plus shorted_link).
new_df

In [None]:
!pip install pycaret pandas shap

In [None]:
from pycaret.classification import *

In [None]:
# Initialise the PyCaret classification experiment on new_df with "type" as the target.
# NOTE(review): new_df still contains the raw "url" text column — confirm how setup()
# is expected to treat it.
reg1 = setup(data = new_df, target="type")

In [None]:
# Inspect the object returned by setup(). The original cell ended in a dangling
# "." (an unfinished attribute access), which is a SyntaxError.
reg1

In [None]:
# Run PyCaret's model comparison for the experiment configured by setup().
compare_models()

In [None]:
# Class balance of the target column. Name the column through x= because current
# seaborn releases take the first positional argument as `data`, not as `x`.
sns.countplot(x="type", data=df)

In [None]:
# List the column names currently in df.
df.columns

In [None]:
# Random 1000-row subset for plotting; seeded (same random_state as the train/test
# split) so the figures are identical on every run.
df_sample = df.sample(1000, random_state=101)

In [None]:
# URL length against number of dot-separated parts, coloured by label.
sns.scatterplot(data=df_sample, x="url_len", y="url_parts", hue="type")

In [None]:
# Suffix length against domain length, coloured by label; translucent markers show overlap.
sns.scatterplot(data=df_sample, x="suffix_len", y="domain_len", hue="type", alpha=0.3)

In [None]:
# Pairwise relationships between the sampled columns, coloured by label.
sns.pairplot(data=df_sample, hue="type")

In [None]:
# Smaller 500-row subset for the correlation heatmap; seeded so the result is repeatable.
df_sample = df.sample(500, random_state=101)

In [None]:
# Correlation matrix of the numeric features only. df_sample still holds the string
# columns "url" and "type", and corr() raises on non-numeric data in pandas >= 2.
sns.heatmap(df_sample.select_dtypes("number").corr(), annot=True)

## Train | Test Split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Remove the raw URL text before modelling. errors="ignore" keeps this cell safe to
# re-run: without it a second execution raises KeyError because "url" is already gone.
df = df.drop(columns="url", errors="ignore")

In [None]:
# Feature matrix: every remaining column except the label.
X = df.drop(columns="type")
# Target vector: the label column.
y = df["type"]

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [None]:
# Unfitted scaler instance; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then reuse those fitted statistics
# to transform the test split.
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# One-vs-rest logistic regression using the SAGA solver, capped at 50 iterations.
# NOTE(review): 50 iterations may be too few for SAGA to converge here — check for
# a ConvergenceWarning when fitting.
log_model = LogisticRegression(solver='saga',multi_class="ovr",max_iter=50)

In [None]:
# Penalty Type
# NOTE(review): neither `penalty` nor `C` is used by any later cell shown in this
# notebook (GridSearchCV is imported but never called) — confirm whether a grid
# search was intended.
penalty = ['l1', 'l2']

# Use logarithmically spaced C values (recommended in official docs)
# Ten values from 10**0 to 10**4.
C = np.logspace(0, 4, 10)

In [None]:
# Fit the logistic regression on the standardized training features.
log_model.fit(scaled_X_train,y_train)

In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; import it where it still exists.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # Same positional call shape: (estimator, X, y).
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

In [None]:
# Predict class labels for the standardized test features.
y_pred = log_model.predict(scaled_X_test)

In [None]:
# Fraction of test rows whose predicted label matches y_test.
accuracy_score(y_test,y_pred)

In [None]:
# Confusion matrix for the logistic regression: true labels as rows, predictions as columns.
confusion_matrix(y_test,y_pred)

In [None]:
# Plot the logistic-regression confusion matrix on the standardized test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# removed in scikit-learn 1.2.
ConfusionMatrixDisplay.from_estimator(log_model, scaled_X_test, y_test)

In [None]:
# Echo the estimator's repr.
log_model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Small random forest (10 trees), seeded for repeatability.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself was
# deprecated in scikit-learn 1.1 and removed in 1.3.
model = RandomForestClassifier(n_estimators=10, max_features='sqrt', random_state=101)

In [None]:
# Fit the forest on the standardized training features (scaled_X_train).
model.fit(scaled_X_train,y_train)

In [None]:
# The forest was fitted on scaled_X_train, so it must be scored on the identically
# scaled test split; the raw X_test has a different scale than the training data.
preds = model.predict(scaled_X_test)

In [None]:
# Confusion matrix for the random-forest predictions: true labels as rows, predictions as columns.
confusion_matrix(y_test,preds)

In [None]:
# Plot the random-forest confusion matrix. Two fixes: the scaled test split is used
# (the forest was fitted on scaled_X_train), and ConfusionMatrixDisplay.from_estimator
# replaces plot_confusion_matrix, which was removed in scikit-learn 1.2.
ConfusionMatrixDisplay.from_estimator(model, scaled_X_test, y_test)

In [None]:
# Importance score of each feature in the fitted forest, in the column order of X.
model.feature_importances_

In [None]:
# Per-class precision / recall / F1 for the random-forest predictions; print() is
# used because the report is returned as a pre-formatted string.
print(classification_report(y_test,preds))