In [38]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import importlib
import preprocessing
importlib.reload(preprocessing)
from preprocessing import FEATURE_NAMES

def model_train(X, y):
    """Fit a logistic-regression classifier on the given training data.

    Args:
        X: Training features, forwarded unchanged to ``fit``.
        y: Training labels, forwarded unchanged to ``fit``.

    Returns:
        The fitted ``LogisticRegression`` estimator (``max_iter=1000``).
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    return LogisticRegression(max_iter=1000).fit(X, y)

def model_predict(model, X_test, threshold=0.05):
    """Predict binary labels by thresholding the model's class probabilities.

    Args:
        model: Any object exposing ``predict_proba(X)`` that returns a 2-D
            array; column 1 is used as the score to threshold.
        X_test: Samples to score, forwarded unchanged to ``predict_proba``.
        threshold: Minimum column-1 probability (inclusive) for a sample to
            be labelled 1. Defaults to 0.05, the value previously hard-coded
            in the body.

    Returns:
        An integer array of 0/1 predictions, one per row of ``X_test``.
    """
    # Column 1 of predict_proba is the probability of the second class
    # (presumably label 1 here -- confirm against ``model.classes_``).
    positive_proba = model.predict_proba(X_test)[:, 1]
    return (positive_proba >= threshold).astype(int)

def cost(tn, fp, fn, tp, fp_cost=1, fn_cost=10):
    """Return the average misclassification cost per sample.

    Args:
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tp: Number of true positives.
        fp_cost: Penalty charged for each false positive (default 1).
        fn_cost: Penalty charged for each false negative (default 10).

    Returns:
        ``(fp * fp_cost + fn * fn_cost)`` divided by the total sample count.
        Correct predictions contribute nothing to the numerator.
    """
    total_samples = tn + fp + fn + tp
    weighted_errors = fp * fp_cost + fn * fn_cost
    return weighted_errors / total_samples

def information_criterium(cost, features_num):
    """Combine a cost value with a penalty for the number of features used.

    Args:
        cost: Base cost value to be penalised.
        features_num: Number of features the model was given.

    Returns:
        ``cost`` plus 0.003 times the fraction of ``FEATURE_NAMES`` in use,
        so using every feature adds exactly 0.003.
    """
    used_fraction = features_num / len(FEATURE_NAMES)
    complexity_penalty = 0.003 * used_fraction
    return cost + complexity_penalty

def eval(data, verbose=True, feature_names=None, n_splits=5):
    """Cross-validate the classifier and return its mean per-fold cost.

    The data is split with a shuffled ``StratifiedKFold`` (fixed
    ``random_state=42``). For every fold a model is trained with
    ``model_train``, predictions are made with ``model_predict`` and the
    fold's confusion matrix is turned into a cost with ``cost``.

    NOTE: this function shadows the builtin ``eval``; the name is kept
    because other code in this file calls it.

    Args:
        data: Table indexable by ``feature_names`` and by ``'Labels'``
            (the label column is expected to hold 0/1 values, since the
            confusion matrix is built with ``labels=[0, 1]``).
        verbose: When true, print per-fold results and an overall summary.
        feature_names: Columns to use as features; defaults to
            ``FEATURE_NAMES`` when ``None``.
        n_splits: Number of cross-validation folds.

    Returns:
        The sum of the per-fold costs divided by ``n_splits``.
    """
    if feature_names is None:
        feature_names = FEATURE_NAMES

    X = data[feature_names]
    y = data['Labels']

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    total_cost = 0

    accuracies = []
    all_y_true = []
    all_y_pred = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = model_train(X_train, y_train)
        y_pred = model_predict(model, X_test)

        # labels=[0, 1] forces a 2x2 matrix even if a fold lacks one class,
        # so the four-way unpacking below is always valid.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
        curr_cost = cost(tn, fp, fn, tp)
        total_cost += curr_cost

        # Computed once and reused for both the running list and the log line.
        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)
        all_y_true.extend(y_test)
        all_y_pred.extend(y_pred)

        if verbose:
            print(f"Fold {fold} Accuracy: {acc:.4f} | FP={fp} FN={fn} | Fold cost={curr_cost}")

    mean_cost = total_cost / n_splits

    if verbose:
        print("\n=== Cross-validated results ===")
        print(f"Mean Accuracy: {sum(accuracies)/n_splits:.4f}")
        print(f"Mean custom cost per sample: {mean_cost:.4f}\n")
        print("Overall Confusion Matrix:")
        print(confusion_matrix(all_y_true, all_y_pred))
        print("\nClassification Report:")
        print(classification_report(all_y_true, all_y_pred))

    return mean_cost

def select_features(data, feature_names, verbose=True, n_splits=5, threshold_info=None):
    """Greedy backward feature elimination driven by the information criterium.

    Starting from ``feature_names``, each round evaluates the model with
    every single feature left out in turn and removes the feature whose
    removal gives the lowest information criterium, as long as that value is
    strictly lower than the current best. The search stops when no removal
    improves the score, when only one feature is left, or when the score
    drops to ``threshold_info`` or below.

    Args:
        data: Table passed unchanged to ``eval``.
        feature_names: Iterable of candidate feature names. It is copied
            into a new list, so the caller's object is never modified.
        verbose: When true, print each removal and the final selection.
        n_splits: Number of cross-validation folds used for every evaluation.
        threshold_info: Optional score at or below which the search stops
            early; it is only checked after a feature has been removed.

    Returns:
        A list with the features that survived elimination.
    """
    # list(...) instead of .copy() so tuples, pandas Index objects and other
    # iterables work too, and .remove() below is always available.
    current_features = list(feature_names)

    base_cost = eval(data, feature_names=current_features, verbose=False, n_splits=n_splits)
    best_info = information_criterium(base_cost, len(current_features))

    while len(current_features) > 1:
        # Score of the model with each individual feature left out.
        info_if_removed = {}
        for feat in current_features:
            test_features = [f for f in current_features if f != feat]
            cost_without_feat = eval(data, feature_names=test_features, verbose=False, n_splits=n_splits)
            info_if_removed[feat] = information_criterium(cost_without_feat, len(test_features))

        feat_to_remove = min(info_if_removed, key=info_if_removed.get)
        min_info = info_if_removed[feat_to_remove]

        if min_info < best_info:
            if verbose:
                print(f"Removing '{feat_to_remove}' improves info criterium: {best_info:.6f} → {min_info:.6f}")
            current_features.remove(feat_to_remove)
            best_info = min_info
        else:
            if verbose:
                print("No further improvement by removing any single feature.")
            break

        if threshold_info is not None and best_info <= threshold_info:
            if verbose:
                print(f"Threshold information criterium reached: {best_info:.6f} <= {threshold_info:.6f}")
            break

    if verbose:
        print("\nSelected features:")
        print(current_features)
        print(f"Final information criterium: {best_info:.6f}")

    return current_features


# Load the dataset (presumably the URL table with pre-computed feature
# columns). NOTE: unpickling can execute arbitrary code -- only load
# pickle files from a trusted source.
data = pd.read_pickle("data_with_features.pkl")

# Run backward elimination from the full feature set, then print a verbose
# cross-validated report using only the surviving features.
selected_feats = select_features(
    data,
    feature_names=FEATURE_NAMES,
    verbose=True,
    n_splits=5,
)
eval(data, verbose=True, feature_names=selected_feats)

   Labels                                               URLs  length  \
0       1   https://www.ujhyjhujhyjhyuj.ga/CC_POSTALE/f2a83/      48   
1       1  https://stitch-statichosting-prod.s3.amazonaws...     156   
2       0            https://www.reservoirgroup.com/careers/      39   
3       0  https://www.camosy.com/themes/juicy/images/doo...      64   
4       0                       https://www.liveapps.com.au/      28   

   slash_num  dot_num  dash_num  https  http  digits_in_url_num  \
0          5       48         0      1     0                  3   
1          5      156         2      1     0                 33   
2          4       39         0      1     0                  0   
3          6       64         0      1     0                  0   
4          3       28         0      1     0                  0   

   subdomain_num  domains_num  suffix_num  digits_in_domain  \
0              1            1           1                 0   
1              4            1         

ValueError: too many values to unpack (expected 2)