In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import importlib
import preprocessing
importlib.reload(preprocessing)
from preprocessing import FEATURE_NAMES

def model_train(X, y):
    """Fit a logistic-regression classifier on the given features and labels.

    Args:
        X: Feature matrix passed straight to ``LogisticRegression.fit``.
        y: Target labels aligned with the rows of ``X``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # max_iter is raised to 1000 (as in the original code) to give the
    # solver more iterations; ``fit`` returns the estimator itself.
    return LogisticRegression(max_iter=1000).fit(X, y)

def model_predict(model, X_test, threshold=0.05):
    """Turn predicted probabilities into hard 0/1 labels with a custom cut-off.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix to score.
        threshold: Minimum probability in the second ``predict_proba`` column
            for a sample to be labelled 1. Defaults to 0.05, the value that
            was previously hard-coded.

    Returns:
        Integer array with 1 where the probability is >= ``threshold`` and
        0 elsewhere.
    """
    # Column 1 of predict_proba is used as the "positive" probability.
    # NOTE(review): this presumes the model's classes are ordered [0, 1].
    positive_proba = model.predict_proba(X_test)[:, 1]
    return (positive_proba >= threshold).astype(int)

def cost(tn, fp, fn, tp, fp_weight=1, fn_weight=10):
    """Average weighted misclassification cost per sample.

    Args:
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tp: Number of true positives.
        fp_weight: Cost charged for each false positive (default 1).
        fn_weight: Cost charged for each false negative (default 10).

    Returns:
        ``(fp * fp_weight + fn * fn_weight)`` divided by the total number of
        samples in the confusion matrix. Correct predictions cost nothing.
    """
    total_samples = tn + fp + fn + tp
    weighted_errors = fp * fp_weight + fn * fn_weight
    return weighted_errors / total_samples

def information_criterium(cost, features_num):
    """Penalise a cost value by the fraction of features that are in use.

    Args:
        cost: Cost value to be penalised (note: this parameter shadows the
            module-level ``cost`` function inside this body).
        features_num: Number of features used by the evaluated model.

    Returns:
        ``cost`` plus 0.003 times the ratio of ``features_num`` to the total
        number of entries in ``FEATURE_NAMES``.
    """
    used_fraction = features_num / len(FEATURE_NAMES)
    complexity_penalty = 0.003 * used_fraction
    return cost + complexity_penalty

def eval(data, verbose=True, feature_names=None, n_splits=5):
    """Cross-validate the logistic-regression pipeline and return its mean cost.

    Note: this function shadows Python's built-in ``eval``; the name is kept
    because other code calls it.

    Args:
        data: DataFrame holding the feature columns and a ``'Labels'`` column.
        verbose: When True, print per-fold metrics and a pooled summary.
        feature_names: Columns to use as model inputs. Defaults to
            ``FEATURE_NAMES`` when None. Any sequence of column names works.
        n_splits: Number of stratified folds.

    Returns:
        The mean of the per-fold custom costs (see ``cost``).
    """
    if feature_names is None:
        feature_names = FEATURE_NAMES

    # list(...) so that a tuple of names selects several columns instead of
    # being interpreted by pandas as one (multi-level) key.
    X = data[list(feature_names)]
    y = data['Labels']

    # Fixed seed keeps the fold assignment identical between calls, so costs
    # of different feature subsets are comparable.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_costs = []
    accuracies = []
    all_y_true = []
    all_y_pred = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = model_train(X_train, y_train)
        y_pred = model_predict(model, X_test)

        # labels=[0, 1] guarantees a 2x2 matrix even if a fold's predictions
        # contain a single class, so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
        curr_cost = cost(tn, fp, fn, tp)
        acc = accuracy_score(y_test, y_pred)

        fold_costs.append(curr_cost)
        accuracies.append(acc)
        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)

        if verbose:
            print(f"Fold {fold} Accuracy: {acc:.4f} | FP={fp} FN={fn} | Fold cost={curr_cost}")

    mean_cost = sum(fold_costs) / n_splits

    if verbose:
        print("\n=== Cross-validated results ===")
        print(f"Mean Accuracy: {sum(accuracies)/n_splits:.4f}")
        print(f"Mean custom cost per sample: {mean_cost:.4f}\n")
        print("Overall Confusion Matrix:")
        # Same explicit label order as the per-fold matrices above.
        print(confusion_matrix(all_y_true, all_y_pred, labels=[0, 1]))
        print("\nClassification Report:")
        print(classification_report(all_y_true, all_y_pred))

    return mean_cost

def select_features(data, feature_names, verbose=True, n_splits=5, threshold_info=None):
    """Greedy backward elimination driven by ``information_criterium``.

    Starting from all of ``feature_names``, each round evaluates the model
    with every single feature left out, drops the one whose removal gives the
    lowest criterion, and repeats while that strictly improves on the best
    value so far.

    Args:
        data: DataFrame passed through to ``eval``.
        feature_names: Initial candidate features. Any iterable of column
            names is accepted; the caller's object is never modified.
        verbose: When True, print each removal and the final selection.
        n_splits: Number of cross-validation folds used by ``eval``.
        threshold_info: Optional early-stop value; the search ends as soon as
            the best criterion is <= this threshold.

    Returns:
        List of the features that remain after elimination.
    """
    # list(...) rather than .copy(): works for tuples / pandas Index objects
    # too, and guarantees the .remove() call below is available.
    current_features = list(feature_names)

    base_cost = eval(data, feature_names=current_features, verbose=False, n_splits=n_splits)
    best_info = information_criterium(base_cost, len(current_features))

    # Every path that fails to improve breaks out explicitly, so the loop
    # only needs to guard against dropping the last remaining feature.
    while len(current_features) > 1:
        info_if_removed = {}

        for feat in current_features:
            test_features = [f for f in current_features if f != feat]
            cost_without_feat = eval(data, feature_names=test_features, verbose=False, n_splits=n_splits)
            info_if_removed[feat] = information_criterium(cost_without_feat, len(test_features))

        # Ties resolve to the earliest feature in current_features.
        feat_to_remove = min(info_if_removed, key=info_if_removed.get)
        min_info = info_if_removed[feat_to_remove]

        if min_info < best_info:
            if verbose:
                print(f"Removing '{feat_to_remove}' improves info criterium: {best_info:.6f} → {min_info:.6f}")
            current_features.remove(feat_to_remove)
            best_info = min_info
        else:
            if verbose:
                print("No further improvement by removing any single feature.")
            break

        if threshold_info is not None and best_info <= threshold_info:
            if verbose:
                print(f"Threshold information criterium reached: {best_info:.6f} <= {threshold_info:.6f}")
            break

    if verbose:
        print("\nSelected features:")
        print(current_features)
        print(f"Final information criterium: {best_info:.6f}")

    return current_features


# Load the pre-computed feature table.
# NOTE: pickle files can execute arbitrary code when loaded; only read
# files that were produced by a trusted source.
data = pd.read_pickle("data_with_features.pkl")

# Backward feature elimination over the full candidate set.
selected_feats = select_features(
    data,
    FEATURE_NAMES,
    verbose=True,
    n_splits=5,
)

# Verbose cross-validation on the selected subset; as the last expression of
# the cell, the returned mean cost is also displayed as the cell output.
eval(data, verbose=True, feature_names=selected_feats)


Removing 'domain_url_length_ratio' improves info criterium: 0.103182 → 0.102757
No further improvement by removing any single feature.

Selected features:
['length', 'slash_num', 'dot_num', 'dash_num', 'https', 'digits_in_url_num', 'subdomain_num', 'domains_num', 'suffix_num', 'domain_length', 'digits_in_domain', 'digits_in_subdomain', 'digits_in_suffix', 'is_www', 'is_com', 'has_com', 'has_org', 'num_freaky_tld', 'num_country_tld_in_subdomain', 'num_country_tld_in_domain', 'num_country_tld_in_suffix', 'popular_domain_in_domain', 'popular_domain_in_subdomain', 'popular_suffix_in_domain', 'popular_suffix_in_subdomain']
Final information criterium: 0.102757
Fold 1 Accuracy: 0.9384 | FP=625 FN=53 | Fold cost=0.105
Fold 2 Accuracy: 0.9446 | FP=559 FN=50 | Fold cost=0.09627272727272727
Fold 3 Accuracy: 0.9430 | FP=577 FN=50 | Fold cost=0.0979090909090909
Fold 4 Accuracy: 0.9435 | FP=569 FN=53 | Fold cost=0.0999090909090909
Fold 5 Accuracy: 0.9439 | FP=563 FN=54 | Fold cost=0.100272727272727

np.float64(0.09987272727272727)

In [26]:
from features import FEATURES

def get_model(selected_features):
    """Train the final classifier on the whole training table.

    Reads the module-level ``data`` DataFrame (features plus ``'Labels'``).

    Args:
        selected_features: Column names to use as model inputs.

    Returns:
        The model returned by ``model_train`` fitted on all rows of ``data``.
    """
    # Use the argument, not the notebook-global ``selected_feats``: the
    # original ignored its parameter and silently trained on whatever the
    # global happened to contain.
    X = data[selected_features]
    y = data['Labels']
    return model_train(X, y)
    
# Name of the column holding the raw URL strings. Defined at module level so
# that both helpers below can see it (it used to be a local of get_data only,
# which made add_feature fail with NameError on a fresh kernel).
TARGET = "URLs"


def add_feature(df, feature, target=TARGET):
    """Compute one feature and store it as a new column of ``df`` (in place).

    Args:
        df: DataFrame to extend; it is modified in place.
        feature: Callable that receives the ``target`` column and returns the
            feature values. The new column is named ``feature.__name__``.
        target: Name of the input column handed to ``feature``.
    """
    df[feature.__name__] = feature(df[target])


def get_data(df, target=TARGET):
    """Add every feature listed in ``FEATURES`` to ``df`` and return it.

    Args:
        df: DataFrame containing the ``target`` column; modified in place.
        target: Name of the column the feature functions are applied to.

    Returns:
        The same ``df`` object, now carrying one extra column per feature.
    """
    for feature in FEATURES:
        add_feature(df, feature, target)
    return df

def eval_model(model, data, feature_names=None):
    """Print accuracy, confusion matrix and classification report for a model.

    Args:
        model: Fitted classifier accepted by ``model_predict``.
        data: DataFrame with the feature columns and a ``"Labels"`` column
            (this parameter shadows the notebook-global ``data``).
        feature_names: Columns to feed to the model. When None, falls back to
            the notebook-global ``selected_feats`` as the original code did.
    """
    if feature_names is None:
        # Backward-compatible default; pass feature_names explicitly to avoid
        # depending on notebook state.
        feature_names = selected_feats

    y_pred = model_predict(model, data[feature_names])
    y_test = data["Labels"]

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Explicit label order keeps the matrix 2x2 with rows/columns [0, 1].
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    print("Confusion Matrix:")
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

# Fit the final model on the full training table with the selected features.
final_model = get_model(selected_feats)

# First external data set: load, derive the feature columns, evaluate.
out_of_db = get_data(pd.read_csv('Many-urls.csv'))
eval_model(final_model, out_of_db)

# Second external data set, processed the same way.
# NOTE(review): in the recorded output the confusion matrix for this file is
# the first file's matrix with its two rows swapped, which would be consistent
# with the same URLs carrying inverted labels. Confirm the label encoding of
# both CSV files before trusting either accuracy figure.
out_of_db2 = get_data(pd.read_csv('Many-many-urls.csv'))
eval_model(final_model, out_of_db2)

Accuracy: 0.5241189279935767
Confusion Matrix:
[[ 48607 378421]
 [ 12758 382224]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.11      0.20    427028
           1       0.50      0.97      0.66    394982

    accuracy                           0.52    822010
   macro avg       0.65      0.54      0.43    822010
weighted avg       0.65      0.52      0.42    822010

Accuracy: 0.47588107200642327
Confusion Matrix:
[[ 12758 382224]
 [ 48607 378421]]

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.03      0.06    394982
           1       0.50      0.89      0.64    427028

    accuracy                           0.48    822010
   macro avg       0.35      0.46      0.35    822010
weighted avg       0.36      0.48      0.36    822010

