In [1]:
from pathlib import Path
import pandas as pd
import json

# Location of the artifacts written by the preprocessing notebook.
# Keep this in sync with the output path used there.
data_dir = Path("processed_data")

# Preprocessed splits saved by the preprocessing notebook as train.csv / test.csv.
train_df, test_df = (
    pd.read_csv(data_dir / file_name) for file_name in ("train.csv", "test.csv")
)

# Named feature lists (all_features, selected_features, etc.).
# The JSON file holds four different feature lists plus the target_features.
with (data_dir / "feature_sets.json").open("r", encoding="utf-8") as f:
    feature_sets = json.load(f)


In [2]:
# Unpack the four named feature lists from the loaded JSON mapping in one step.
(
    all_features,
    features_without_targets,
    selected_features,
    selected_features_without_targets,
) = (
    feature_sets[key]
    for key in (
        "all_features",
        "features_without_targets",
        "selected_features",
        "selected_features_without_targets",
    )
)

# Name of the column holding the class label to predict.
target_column = "label"

print(features_without_targets, selected_features_without_targets)

['URLLength', 'DomainLength', 'IsDomainIP', 'URLSimilarityIndex', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength', 'HasTitle', 'DomainTitleMatchScore', 'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef', 'TLD'] ['URLSimilarityIndex', 'HasSocialNet', 'HasCopyrightInfo', 'HasDescription', 'IsHTTPS', 'DomainTitleMatchScore', 'HasSubmitButton', 'IsResponsive', 'HasHiddenFields', 'HasFavicon', 'HasTitle', 'DegitRatioInURL'

In [3]:
# Lookup table from feature-set name to its list of column names.
feature_set_map = dict(
    all_features=all_features,
    features_without_targets=features_without_targets,
    selected_features=selected_features,
    selected_features_without_targets=selected_features_without_targets,
)

# Helper that returns the train/test data restricted to one named feature set
def get_data(feature_set_name: str):
    """
    Return the train/test data restricted to one named feature set.

    Parameters
    ----------
    feature_set_name : str
        Key of ``feature_set_map`` selecting which feature columns to use.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` frames hold the
        selected feature columns of ``train_df`` / ``test_df`` and the ``y_*``
        values are the ``target_column`` of the same frames.

    Raises
    ------
    KeyError
        If ``feature_set_name`` is not a key of ``feature_set_map``. The message
        lists the valid names so a typo is easy to spot.
    """
    if feature_set_name not in feature_set_map:
        valid_names = ", ".join(repr(key) for key in feature_set_map)
        raise KeyError(
            f"Unknown feature set {feature_set_name!r}; expected one of: {valid_names}"
        )
    feature_names = feature_set_map[feature_set_name]

    X_train = train_df[feature_names]
    X_test = test_df[feature_names]
    y_train = train_df[target_column]
    y_test = test_df[target_column]

    return X_train, y_train, X_test, y_test

In [6]:
from pathlib import Path
import json
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Directory to save evaluation results for tuned logistic regression.
# parents=True keeps this working if the path is later changed to a nested one.
results_dir = Path("results_logreg_tuned")
results_dir.mkdir(parents=True, exist_ok=True)


def build_logreg_pipeline(X_sample):
    """
    Build a preprocessing + logistic regression pipeline based on the dtypes of X_sample.

    Numeric and boolean columns are standardized with StandardScaler.
    Every other column is treated as categorical and one-hot encoded with
    OneHotEncoder (categories unseen during fit are ignored at transform time).

    Parameters
    ----------
    X_sample : pandas.DataFrame
        Frame whose column names and dtypes decide how each column is routed.
        Only its schema is inspected here; nothing is fitted.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps "preprocessor" and "classifier".
    """
    numeric_cols = X_sample.select_dtypes(include=["number", "bool"]).columns.tolist()

    # Treat everything that is not numeric as categorical. Selecting only
    # "object"/"category" dtypes would leave columns of any other dtype out of
    # both lists, and ColumnTransformer's default remainder="drop" would then
    # discard those features without any warning.
    numeric_set = set(numeric_cols)
    categorical_cols = [col for col in X_sample.columns if col not in numeric_set]

    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_cols),
            ("cat", categorical_transformer, categorical_cols),
        ]
    )

    # max_iter is raised above the library default to give the solver more
    # iterations to converge.
    clf = LogisticRegression(max_iter=1000)

    # Step names matter: the hyperparameter grid addresses "classifier__<param>".
    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", clf),
        ]
    )
    return model


# Search space for logistic regression. The "classifier__" prefix routes each
# parameter to the pipeline step named "classifier".
param_grid = dict(
    classifier__C=[0.1, 1.0, 10.0],
    classifier__class_weight=[None, "balanced"],
)

# Feature sets to tune and evaluate, in reporting order.
feature_set_names = [
    "all_features",
    "features_without_targets",
    "selected_features",
    "selected_features_without_targets",
]

results = []
total_sets = len(feature_set_names)
bar_len = 20

for idx, name in enumerate(feature_set_names, start=1):
    # Text progress bar: '#' for the completed share, padded with '-' to bar_len.
    filled = bar_len * idx // total_sets
    bar = "[" + ("#" * filled).ljust(bar_len, "-") + "]"
    print(f"\n{bar} {idx}/{total_sets}  feature_set = {name}")

    # Train/test data restricted to the current feature set.
    X_train, y_train, X_test, y_test = get_data(name)

    # Unfitted preprocessing + classifier pipeline matched to these columns.
    base_model = build_logreg_pipeline(X_train)

    # 3-fold cross-validated grid search, ranked by macro-averaged F1.
    grid_search = GridSearchCV(
        base_model,
        param_grid,
        scoring="f1_macro",
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(X_train, y_train)

    # Persist the complete CV table for this feature set (used later in the report).
    cv_results_df = pd.DataFrame(grid_search.cv_results_)
    cv_results_path = results_dir / f"logreg_cv_results_{name}.csv"
    cv_results_df.to_csv(cv_results_path, index=False)

    # Compact view: one row per (C, class_weight) combination, best score first.
    short_table = cv_results_df.loc[
        :,
        [
            "param_classifier__C",
            "param_classifier__class_weight",
            "mean_test_score",
            "std_test_score",
        ],
    ].sort_values("mean_test_score", ascending=False)

    print("CV results (sorted by mean_test_score):")
    print(short_table.to_string(index=False))

    # Score the refitted best configuration on the held-out test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # One summary row per feature set; best_params is stored as a JSON string.
    metrics = dict(
        feature_set=name,
        best_params=json.dumps(grid_search.best_params_),
        cv_best_score_f1_macro=grid_search.best_score_,
        test_accuracy=accuracy_score(y_test, y_pred),
        test_precision_macro=precision_score(y_test, y_pred, average="macro", zero_division=0),
        test_recall_macro=recall_score(y_test, y_pred, average="macro", zero_division=0),
        test_f1_macro=f1_score(y_test, y_pred, average="macro", zero_division=0),
    )
    results.append(metrics)

    print("Best params:", grid_search.best_params_)
    print(
        "Test metrics -> "
        f"accuracy: {metrics['test_accuracy']:.6f}, "
        f"precision_macro: {metrics['test_precision_macro']:.6f}, "
        f"recall_macro: {metrics['test_recall_macro']:.6f}, "
        f"f1_macro: {metrics['test_f1_macro']:.6f}"
    )

# Assemble the per-feature-set summary, write it to disk, and display it.
results_df = pd.DataFrame(results)
results_df.to_csv(results_dir / "logreg_tuned_feature_sets_metrics.csv", index=False)
results_df



[#####---------------] 1/4  feature_set = all_features
CV results (sorted by mean_test_score):
 param_classifier__C param_classifier__class_weight  mean_test_score  std_test_score
                10.0                       balanced         0.999897        0.000028
                 1.0                       balanced         0.999892        0.000031
                10.0                           None         0.999881        0.000041
                 1.0                           None         0.999875        0.000033
                 0.1                       balanced         0.999816        0.000031
                 0.1                           None         0.999783        0.000043
Best params: {'classifier__C': 10.0, 'classifier__class_weight': 'balanced'}
Test metrics -> accuracy: 0.999894, precision_macro: 0.999907, recall_macro: 0.999876, f1_macro: 0.999892

[##########----------] 2/4  feature_set = features_without_targets
CV results (sorted by mean_test_score):
 param_classifier_

Unnamed: 0,feature_set,best_params,cv_best_score_f1_macro,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
0,all_features,"{""classifier__C"": 10.0, ""classifier__class_wei...",0.999897,0.999894,0.999907,0.999876,0.999892
1,features_without_targets,"{""classifier__C"": 10.0, ""classifier__class_wei...",0.999908,0.999915,0.999926,0.999901,0.999913
2,selected_features,"{""classifier__C"": 10.0, ""classifier__class_wei...",0.999827,0.999809,0.999827,0.999783,0.999805
3,selected_features_without_targets,"{""classifier__C"": 10.0, ""classifier__class_wei...",0.999805,0.999809,0.999833,0.999777,0.999805


In [8]:
# Quick check of the training frame's dimensions, displayed as (rows, columns).
train_df.shape

(188636, 52)

In [9]:
from pathlib import Path
import json
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Directory to save evaluation results for tuned random forest.
# parents=True keeps this working if the path is later changed to a nested one.
results_dir_rf = Path("results_rf_tuned")
results_dir_rf.mkdir(parents=True, exist_ok=True)


def build_rf_pipeline(X_sample):
    """
    Build a preprocessing + random forest pipeline based on the dtypes of X_sample.

    Numeric and boolean columns are passed through without scaling (random
    forest is scale-invariant). Every other column is treated as categorical
    and one-hot encoded with OneHotEncoder (categories unseen during fit are
    ignored at transform time).

    Parameters
    ----------
    X_sample : pandas.DataFrame
        Frame whose column names and dtypes decide how each column is routed.
        Only its schema is inspected here; nothing is fitted.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps "preprocessor" and "classifier".
    """
    numeric_cols = X_sample.select_dtypes(include=["number", "bool"]).columns.tolist()

    # Treat everything that is not numeric as categorical. Selecting only
    # "object"/"category" dtypes would leave columns of any other dtype out of
    # both lists, and ColumnTransformer's default remainder="drop" would then
    # discard those features without any warning.
    numeric_set = set(numeric_cols)
    categorical_cols = [col for col in X_sample.columns if col not in numeric_set]

    # For random forest, numeric features do not need scaling, so we use "passthrough"
    numeric_transformer = "passthrough"
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_cols),
            ("cat", categorical_transformer, categorical_cols),
        ]
    )

    # Fixed random_state makes the forest reproducible between runs.
    # NOTE(review): the grid search that wraps this pipeline also uses
    # n_jobs=-1; confirm the nested parallelism does not oversubscribe CPUs.
    clf = RandomForestClassifier(random_state=42, n_jobs=-1)

    # Step names matter: the hyperparameter grid addresses "classifier__<param>".
    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", clf),
        ]
    )
    return model


# Search space for the random forest. The "classifier__" prefix routes each
# parameter to the pipeline step named "classifier".
param_grid_rf = dict(
    classifier__n_estimators=[100, 300],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_leaf=[1, 5],
)

# Feature sets to tune and evaluate (the same ones used for logistic regression).
feature_set_names = [
    "all_features",
    "features_without_targets",
    "selected_features",
    "selected_features_without_targets",
]

rf_results = []
total_sets = len(feature_set_names)
bar_len = 20

for idx, name in enumerate(feature_set_names, start=1):
    # Text progress bar: '#' for the completed share, padded with '-' to bar_len.
    filled = bar_len * idx // total_sets
    bar = "[" + ("#" * filled).ljust(bar_len, "-") + "]"
    print(f"\n{bar} {idx}/{total_sets}  feature_set = {name}")

    # Train/test data restricted to the current feature set.
    X_train, y_train, X_test, y_test = get_data(name)

    # Unfitted preprocessing + classifier pipeline matched to these columns.
    base_model = build_rf_pipeline(X_train)

    # 3-fold cross-validated grid search, ranked by macro-averaged F1.
    grid_search_rf = GridSearchCV(
        base_model,
        param_grid_rf,
        scoring="f1_macro",
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    grid_search_rf.fit(X_train, y_train)

    # Persist the complete CV table for this feature set (used later in the report).
    cv_results_rf_df = pd.DataFrame(grid_search_rf.cv_results_)
    cv_results_path = results_dir_rf / f"rf_cv_results_{name}.csv"
    cv_results_rf_df.to_csv(cv_results_path, index=False)

    # Compact view: one row per (n_estimators, max_depth, min_samples_leaf)
    # combination, best score first.
    short_table_rf = cv_results_rf_df.loc[
        :,
        [
            "param_classifier__n_estimators",
            "param_classifier__max_depth",
            "param_classifier__min_samples_leaf",
            "mean_test_score",
            "std_test_score",
        ],
    ].sort_values("mean_test_score", ascending=False)

    print("CV results for Random Forest (sorted by mean_test_score):")
    print(short_table_rf.to_string(index=False))

    # Score the refitted best configuration on the held-out test split.
    best_rf_model = grid_search_rf.best_estimator_
    y_pred_rf = best_rf_model.predict(X_test)

    # One summary row per feature set; best_params is stored as a JSON string.
    metrics_rf = dict(
        feature_set=name,
        best_params=json.dumps(grid_search_rf.best_params_),
        cv_best_score_f1_macro=grid_search_rf.best_score_,
        test_accuracy=accuracy_score(y_test, y_pred_rf),
        test_precision_macro=precision_score(y_test, y_pred_rf, average="macro", zero_division=0),
        test_recall_macro=recall_score(y_test, y_pred_rf, average="macro", zero_division=0),
        test_f1_macro=f1_score(y_test, y_pred_rf, average="macro", zero_division=0),
    )
    rf_results.append(metrics_rf)

    print("Best params:", grid_search_rf.best_params_)
    print(
        "Test metrics -> "
        f"accuracy: {metrics_rf['test_accuracy']:.6f}, "
        f"precision_macro: {metrics_rf['test_precision_macro']:.6f}, "
        f"recall_macro: {metrics_rf['test_recall_macro']:.6f}, "
        f"f1_macro: {metrics_rf['test_f1_macro']:.6f}"
    )

# Assemble the per-feature-set summary, write it to disk, and display it.
rf_results_df = pd.DataFrame(rf_results)
rf_results_df.to_csv(results_dir_rf / "rf_tuned_feature_sets_metrics.csv", index=False)
rf_results_df



[#####---------------] 1/4  feature_set = all_features
CV results for Random Forest (sorted by mean_test_score):
 param_classifier__n_estimators param_classifier__max_depth  param_classifier__min_samples_leaf  mean_test_score  std_test_score
                            100                        None                                   1         0.999973    7.655545e-06
                            300                        None                                   1         0.999968    1.325957e-05
                            100                          20                                   1         0.999968    1.325957e-05
                            300                          20                                   1         0.999962    1.531108e-05
                            300                          20                                   5         0.999946    2.025540e-05
                            100                        None                                   5         0.999946

Unnamed: 0,feature_set,best_params,cv_best_score_f1_macro,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
0,all_features,"{""classifier__max_depth"": null, ""classifier__m...",0.999973,0.999979,0.999981,0.999975,0.999978
1,features_without_targets,"{""classifier__max_depth"": null, ""classifier__m...",0.999962,1.0,1.0,1.0,1.0
2,selected_features,"{""classifier__max_depth"": 20, ""classifier__min...",0.999913,0.99983,0.999852,0.999802,0.999827
3,selected_features_without_targets,"{""classifier__max_depth"": null, ""classifier__m...",0.999913,0.999852,0.99987,0.999827,0.999848
