# end to end

In [1]:
import pandas as pd
# NOTE: machine-specific absolute path; point it at your own copy of
# links_v5.2.csv before running this notebook elsewhere.
path = r"C:\Users\exusille\Documents\school\thesis\vsc thesis workspace\NEW_WORKSPACE\dataset_v5\links_v5.2.csv"
df = pd.read_csv(path)
# Shuffle all rows once. The fixed seed makes the row order (and therefore the
# cross-validation folds built from it later) identical on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

## feature extraction

In [7]:
#from urlparse import urlparse
from urllib import parse, request
#from urllib.parse import urlparse
from typing import Optional, Dict

def parse_url(url: str) -> Optional[Dict[str, str]]:
    """Split a URL into its components.

    URLs that do not start with ``http://`` or ``https://`` are parsed as if
    they were prefixed with ``http://`` so that the host ends up in ``netloc``.
    For those URLs the ``scheme`` key is left out of the result, because the
    real scheme is unknown.

    Args:
        url: Raw URL string.

    Returns:
        A dict with the keys ``netloc``, ``domain``, ``path``, ``params``,
        ``query``, ``fragment``, ``directory`` and ``file`` (preceded by
        ``scheme`` when the URL carried one), or ``None`` when ``url`` is not
        a string or is rejected by the parser.
    """
    try:
        has_scheme = url.startswith(('https://', 'http://'))
        # `parse` is the urllib submodule imported at the top of this cell.
        parsed_url = parse.urlparse(url if has_scheme else f"http://{url}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input (e.g. NaN or bytes).
        # ValueError: urlparse refuses the netloc (e.g. unbalanced '[').
        return None

    url_dict = {}
    if has_scheme:
        url_dict["scheme"] = parsed_url.scheme
    url_dict.update({
        "netloc": parsed_url.netloc,
        "domain": parsed_url.netloc.split(':')[0],  # extract domain from netloc
        "path": parsed_url.path,
        "params": parsed_url.params,
        "query": parsed_url.query,
        "fragment": parsed_url.fragment,
    })

    # Split path into directory and file. rpartition also copes with an empty
    # path (no '/' at all), yielding two empty strings instead of failing.
    directory, _, file = parsed_url.path.rpartition('/')
    url_dict['directory'] = directory
    url_dict['file'] = file

    return url_dict

In [3]:
# Parse every URL, then widen the resulting dicts into one column per URL
# component and swap them in for the temporary 'parsed_url' column.
df['parsed_url'] = df['url'].apply(parse_url)
url_components = df['parsed_url'].apply(pd.Series)
df = pd.concat([df.drop(columns='parsed_url'), url_components], axis=1)

In [4]:
def get_length(row):
    """Return the character lengths of a row's 'url' and 'domain' values.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'url' and
            'domain' entries that support ``len``.

    Returns:
        A pandas Series indexed by 'url_length' and 'domain_length'.
    """
    lengths = {f'{field}_length': len(row[field]) for field in ('url', 'domain')}
    return pd.Series(lengths)
# Compute the two length features row by row, then attach them by index.
length_df = df.apply(get_length, axis=1)
df = pd.merge(df, length_df, left_index=True, right_index=True)

In [5]:
import tldextract
# tld: the suffix tldextract reports for each netloc; an empty suffix is
# stored as the literal string 'None' so the column never holds ''.
df['tld'] = (
    df['netloc']
    .apply(lambda nl: tldextract.extract(nl).suffix)
    .replace('', 'None')
)

In [6]:
# For each symbol, count its occurrences in the full URL and in the domain.
symbols = '.-'
for char in symbols:
    def count_symbol(text, symbol=char):
        """Number of times `symbol` occurs in `text`."""
        return text.count(symbol)

    df[f'qnty_{char}_url'] = df['url'].apply(count_symbol)
    df[f'qnty_{char}_domain'] = df['domain'].apply(count_symbol)

In [7]:
##server domain
def extract_server_client_domain(domains):
    """Classify each domain by its dot-separated shape.

    Args:
        domains: Iterable of domain strings.

    Returns:
        A list with one flag per domain:
        1  - exactly three labels and the first one is 'server';
        0  - exactly two labels and the first one is not 'www';
        -1 - anything else.
    """
    def classify(domain):
        labels = domain.split('.')
        label_count = len(labels)
        if label_count == 3 and labels[0] == 'server':
            return 1
        if label_count == 2 and labels[0] != 'www':
            return 0
        return -1  # invalid domain format

    return [classify(domain) for domain in domains]
# One flag per row, in row order (1 / 0 / -1 as returned by the classifier above).
server_flags = extract_server_client_domain(df['domain'])
df['is_server_domain'] = server_flags

In [8]:
def get_num_subdomains(netloc: str) -> int:
    """Count the dot-separated labels in the subdomain part of `netloc`.

    The subdomain is whatever tldextract reports as ``.subdomain``; an empty
    subdomain counts as 0.
    """
    subdomain = tldextract.extract(netloc).subdomain
    return subdomain.count('.') + 1 if subdomain else 0


df['num_subdomains'] = df['netloc'].apply(get_num_subdomains)

In [9]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer that keeps runs of ASCII letters only.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')


def tokenize_domain(netloc: str) -> str:
    """Return the letter-only tokens of the host, without its suffix.

    The subdomain and domain parts reported by tldextract are joined with a
    dot, tokenized, and the tokens are returned as one space-separated string.
    """
    host_parts = tldextract.extract(netloc)
    host_without_tld = f"{host_parts.subdomain}.{host_parts.domain}"
    return " ".join(tokenizer.tokenize(host_without_tld))


df['domain_tokens'] = df['netloc'].apply(tokenize_domain)
df['path_tokens'] = df['path'].apply(lambda path: " ".join(tokenizer.tokenize(path)))

In [10]:
from nltk.tokenize import RegexpTokenizer
# Alphanumeric tokenizer for the whole URL.
tok = RegexpTokenizer(r'[A-Za-z0-9]+')
# Store the tokens as ONE space-separated string per URL (same shape as
# 'domain_tokens'). This column is later fed to a TfidfVectorizer, which
# expects each document to be a string, not a list of tokens.
df['tokenized_url'] = df['url'].map(lambda x: " ".join(tok.tokenize(x)))

In [11]:
# Drop unnecessary columns: the raw URL components were only needed to derive
# the engineered features above.
cols_to_drop = ['netloc', 'domain', 'path', 'params', 'query', 'scheme',
       'fragment', 'directory', 'file', 'path_tokens']
# errors='ignore' keeps this cell re-runnable and tolerates a missing 'scheme'
# column (parse_url omits that key for URLs given without a scheme, so the
# column does not exist when no URL had one).
df = df.drop(columns=cols_to_drop, errors='ignore')

## model

### before training

In [12]:
#label_id
from sklearn.preprocessing import LabelEncoder
# Encode the class labels as integer ids; `le` keeps the fitted mapping so the
# ids can be translated back to the original labels.
le = LabelEncoder()
y = le.fit_transform(df['label'])
df['label_id'] = y

In [13]:
# Features: every column except the raw URL and the two label columns.
non_feature_cols = ['label', 'label_id', 'url']
X = df.drop(non_feature_cols, axis=1)
# Target: the integer-encoded label.
y = df['label_id']

In [14]:
#pipeline
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class Converter(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens its input into a 1-D array.

    Placed in front of the text vectorizers in this notebook so that a
    selected column reaches them as a flat sequence of values.
    """

    def fit(self, x, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, data_frame):
        """Return ``data_frame.values`` flattened to one dimension."""
        flat_values = data_frame.values.ravel()
        return flat_values
    
    
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
# Numeric branch: every int64 column of X, rescaled to [0, 1].
numeric_features = X.select_dtypes(include=['int64']).columns
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

from sklearn.feature_extraction.text import TfidfVectorizer
# Categorical branch: one-hot encode the TLD; categories unseen during fit are
# encoded as all zeros instead of raising.
categorical_features = ['tld']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

from sklearn.preprocessing import OneHotEncoder
# Domain-token branch: flatten the column, then TF-IDF over its tokens.
vectorizer_features = ['domain_tokens']
vectorizer_transformer = Pipeline(steps=[
    ('con', Converter()),
    ('tf', TfidfVectorizer())])

# URL-token branch: unigram TF-IDF capped at 1000 terms, then rescaled.
# TfidfVectorizer returns a SciPy sparse matrix, which MinMaxScaler rejects;
# MaxAbsScaler accepts sparse input and, for non-negative TF-IDF columns whose
# minimum is 0, yields the same [0, 1] values MinMaxScaler would.
urltoken_features = ['tokenized_url']
urltoken_transformer = Pipeline(steps=[
    ('con', Converter()),
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), max_features=1000)),
    ('scaler', MaxAbsScaler())
])

from sklearn.compose import ColumnTransformer
# Route each column group through its own branch. Columns of X that are not
# listed in any branch are not passed on (no `remainder` is configured).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('domvec', vectorizer_transformer, vectorizer_features),
        ('urlvec', urltoken_transformer, urltoken_features)
    ])

### training

In [16]:
from sklearn.svm import SVC

# Fixed SVC hyper-parameters for the final model.
# NOTE(review): per the scikit-learn SVC docs, `degree` and `coef0` only matter
# for the 'poly'/'sigmoid' kernels, so they look inert with kernel='rbf' - confirm.
svc_hyp = dict(
    C=25.406936492978463,
    coef0=6,
    degree=2,
    gamma=1.3190980166240491,
    kernel='rbf',
    probability=True,
    shrinking=True,
)
# End-to-end estimator: feature preprocessing followed by the seeded SVC.
classifier = SVC(random_state=42, **svc_hyp)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
import pandas as pd

# Number of cross-validation folds.
CV = 10
# Result accumulator; nothing in this cell appends to it.
entries = []

# Perform cross-validation training.
# cv_scores is not read again in this cell.
cv_scores = cross_val_score(model, X, y, cv=CV)

# Fit the model on the entire dataset
model.fit(X, y)

# Make predictions and calculate scores on the SAME rows the model was just
# fitted on: hard labels, decision-function values, and the second column of
# predict_proba (presumably the probability of the class encoded as 1 - verify).
y_pred = model.predict(X)
y_scores = model.decision_function(X)
probs = model.predict_proba(X)[:, 1]

# Calculate evaluation metrics
# NOTE(review): all metrics below are in-sample (computed on the training
# data); cv_scores is the only held-out estimate in this cell - confirm that
# this is what should be reported.
# NOTE(review): f1/precision/recall are called with their default arguments,
# which presumably assumes a two-class target - confirm.
conf_matrix = confusion_matrix(y, y_pred)
accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_scores)
f1 = f1_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)

In [None]:
# # Store the results in the entries list
# entries.append((accuracy, roc_auc, f1, precision, recall))

# # Print confusion matrix
# print("Confusion matrix for SVM")
# print(conf_matrix)

# # Create a DataFrame with the results
# bayes_df = pd.DataFrame(entries, columns=['accuracy', 'roc_auc', 'f1', 'precision', 'recall'])
# bayes_df

### after training

In [None]:
import numpy as np
# Probability cut-off: rows whose probability in `probs` exceeds it are labelled 1.
threshold = 1-0.758840
predictions = (probs > threshold).astype(int)

# Confusion matrix, and the error rates derived from its four cells
confusion_mat = confusion_matrix(y, predictions)
tn, fp, fn, tp = confusion_mat.ravel()
false_negative_rate = fn / (fn + tp)
false_positive_rate = fp / (fp + tn)

# Evaluate the thresholded predictions using various metrics
precision = precision_score(y, predictions)
recall = recall_score(y, predictions)
f1 = f1_score(y, predictions)
accuracy = accuracy_score(y, predictions)
roc_auc = roc_auc_score(y, probs)

# Print the evaluation metrics
metric_report = (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("Accuracy:", accuracy),
    ("ROC AUC Score:", roc_auc),
    ("False Negative Rate:", false_negative_rate),
    ("False Positive Rate:", false_positive_rate),
)
for metric_name, metric_value in metric_report:
    print(metric_name, metric_value)

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_mat)

Precision: 0.9997414461777127
Recall: 0.9993969156543465
F1-score: 0.999569151227919
Accuracy: 0.9995619414753811
ROC AUC Score: 0.9999207361039587
False Negative Rate: 0.0006030843456534849
False Positive Rate: 0.0002673558506371981
Confusion Matrix:
[[11218     3]
 [    7 11600]]


In [None]:
# Show the labels predicted by model.predict(X), i.e. on the rows the model was fitted on.
y_pred

array([1, 0, 0, ..., 1, 1, 1])

# save model

In [None]:
import joblib
# Persist the fitted pipeline together with its decision threshold as one tuple.
# NOTE(review): the pipeline contains the notebook-defined Converter class, so
# loading the file elsewhere presumably needs that class defined first - confirm.
model_bundle = (model, threshold)
joblib.dump(model_bundle, 'svm_model.joblib')

['svm_model.joblib']

# load the model

In [None]:
# joblib.load unpickles arbitrary objects: only load artifacts you created or trust.
saved_bundle = joblib.load('svm_model.joblib')
loaded_model, thresh = saved_bundle

In [None]:
# Sanity check: run the reloaded pipeline on the training features X.
# NOTE(review): the `thresh` value loaded with the model is not applied here;
# predict() uses the classifier's own decision rule - confirm that is intended.
loaded_model.predict(X)

array([1, 0, 0, ..., 1, 1, 1])