# Prepare the setup

In [3]:
# Import packages
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay

In [2]:
# Load the dataset from the CSV file; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/old_training_dataset.csv")

First, we manually converted the dataset to CSV format because it is much easier to work with.

We are already familiar with tabular data, and since we ran into some issues with the original ARFF format, we found this solution to be quick and suitable.

Initially we tried to make the conversion in code, but it did not work either.

# Initial exploration

In [None]:
# Preview the first 5 rows (the default for head()) to check that the columns loaded as expected
df.head()

From the initial dataset, we decided to remove the first column that represented the index of each row, as it was unnecessary for the later implementation.

In [None]:
# Print column names, non-null counts and dtypes for the whole DataFrame
df.info()

We can see each column name with all the non-null values and their type. As we already noted in the dataset description, we expected everything to be integer.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

We can see basic statistical information such as count, mean, standard deviation, minimum value, the quartiles, and maximum value.

In [None]:
# Check for null values: number of missing entries in each column
df.isna().sum()

In this case, we don't have any missing values, which we had already observed in the `df.info()` output above.

In [None]:
# Check for duplicates: number of rows that exactly repeat an earlier row
# (the first occurrence of each row is not counted)
df.duplicated().sum()

We expected to have duplicates because the features only take values like -1, 0 and 1, so different rows can easily end up identical. If we dropped them, we would lose information that we need to work with.

In [None]:
# df.fillna(df.median(), inplace=True)  # or df.dropna() if very few rows

In [None]:
# Check target class: number of rows for each distinct value of 'Result'
df['Result'].value_counts()

In [None]:
# Convert types
# df = pd.get_dummies(df, drop_first=True)

# Remove constant or duplicate columns
# df = df.loc[:, df.nunique() > 1]

## Check for class imbalance

In [None]:
# Class distribution plot: one bar per distinct 'Result' value, bar height = number of rows
sns.countplot(x='Result', data=df)

We wanted to highlight and create a visualization for the information we just checked in the previous cell.

As we can see, there is not a big difference between -1 and 1 in our target column, therefore we move on.

# More visualizations

## Feature Correlation Heatmap

In [None]:
# Pairwise correlation between all columns (target included), drawn as a heatmap
corr_matrix = df.corr()
plt.figure(figsize=(14, 10))
heatmap_ax = sns.heatmap(corr_matrix, cmap="coolwarm", annot=False)
heatmap_ax.set_title("Feature Correlation Heatmap")
plt.show()

This visualization shows us the relationship between all features including the target 'Result'.

It is useful because it helps us detect multicollinearity. We learnt that too much correlation between features can affect some machine learning models, especially linear ones. It could impact the Logistic Regression model we want to implement later, because strongly correlated features make its coefficients unstable and harder to interpret.

This is not the case here. What we looked for were bright pink or red points outside the diagonal, because they would represent a strong correlation between two variables.

## Feature Correlation with 'Result'

In [None]:
# Correlation of every feature with the target 'Result', sorted in ascending order
target_column_corr = df.corr()['Result']
corr_with_target = target_column_corr.drop('Result').sort_values()
corr_with_target.plot.barh(figsize=(8, 10), title="Feature Correlation with 'Result'")
plt.show()

We selected a horizontal bar chart because it is easier to read when there are quite a lot of features.

What is important to mention:
- Some positively correlated features are 'SSLfinal_State', 'URL_of_Anchor', and 'Prefix_Suffix'
- Some negatively correlated features are 'Domain_registration_length' and 'Shortining_Service'

These features are candidates for being the stronger predictors.

What does not seem relevant at all are 'Favicon' and 'popUpWindnow'.

## Boxplot per class
### 'SSLfinal_State' by 'Result'

In [None]:
# Distribution of one feature, split by the value of the target 'Result'
feature = "SSLfinal_State"
box_ax = sns.boxplot(data=df, x='Result', y=feature)
box_ax.set_title(f"{feature} by Class")
plt.show()

In [None]:
# Number of rows for each distinct 'SSLfinal_State' value
sns.countplot(x='SSLfinal_State', data=df)

## Pairplot with two highly relevant features vs 'Result'

In [None]:
# Pairwise plots of the two selected features, colored by the 'Result' class
sns.pairplot(df[['SSLfinal_State','URL_of_Anchor','Result']], hue="Result")

This visualization shows how 'SSLfinal_State' and 'URL_of_Anchor' vary between 'Result' values.

- ... are grouped at -1
- ... shift toward 1

A strong separability between the two classes is visible.

# Split the data

In [None]:
# Separate the target first: y is the 'Result' column, X is every other column
y = df['Result']
X = df.drop(columns='Result')

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of the target the same in both parts; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

We applied a stratified 80%-20% split (both parts keep the same class proportions as the full dataset) using a random seed of 42. We fixed the seed because we want consistent results: in this way, we make sure that we get the same split every time we run the code.

In [None]:
# Plot distributions: one histogram per numeric column of the full DataFrame
df.hist(figsize=(12, 10))
plt.tight_layout()  # keep subplot titles and axis labels from overlapping
plt.show()

We mainly included this plot because we wanted to check if scaling is required. As we can see, it is not required for most of the models because all the features are already in the same numeric range.

However, we will scale the data because we also plan to use distance-based models.

## Prepare a scaled version

In [None]:
# Scale the features.
# The scaler learns its statistics from the training data only; the same fitted
# transformation is then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training phase

In [None]:
# One seed for every stochastic estimator. Passing it explicitly as
# random_state makes each model reproducible on its own, instead of depending
# on the state of the global NumPy RNG (which changes with cell execution order).
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)  # kept for any other code relying on the global RNG

# Models to train
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForestClassifier": RandomForestClassifier(random_state=RANDOM_STATE),
}

# Whether each model is trained on the standardized features (True) or on the
# raw features (False). The tree-based models are fed the raw features.
use_scaled = {
    "LogisticRegression": True,
    "KNeighborsClassifier": True,
    "DecisionTreeClassifier": False,
    "RandomForestClassifier": False,
}

# One dict of metrics per model, collected into a DataFrame at the end
results = []

# Train and evaluate
for name, model in models.items():
    print(f"\nTraining: {name}")

    # Pick the matching train/test feature matrices once, then fit and predict
    if use_scaled[name]:
        X_fit, X_input = X_train_scaled, X_test_scaled
    else:
        X_fit, X_input = X_train, X_test

    model.fit(X_fit, y_train)
    y_pred = model.predict(X_input)

    # Metrics (precision/recall/F1 are computed for the positive label 1,
    # scikit-learn's default pos_label)
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0),
    })

    # Confusion Matrix, built from the predictions already computed above so
    # the model does not have to predict the test set a second time
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    plt.title(f"{name} - Confusion Matrix")
    plt.show()

    # Feature Importance if available (only estimators exposing
    # feature_importances_ after fitting reach this branch)
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]  # most important first
        feature_names = X.columns  # scaling does not change the column order
        plt.figure(figsize=(10, 5))
        sns.barplot(x=importances[indices], y=feature_names[indices])
        plt.title(f"{name} - Feature Importance")
        plt.show()

# Show results
results_df = pd.DataFrame(results)
print(results_df)