In [1]:
import numpy as np
import pandas as pd

from sklearn.utils.class_weight import compute_class_weight
from autogluon.multimodal import MultiModalPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training and test tables from the local data directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
# Undo the "[.]" de-fanging so every URL contains plain dots again.
# The vectorised `.str` accessor passes missing values through unchanged instead of
# raising AttributeError the way `.apply(lambda x: x.replace(...))` does on NaN.
# `regex=False` is required for a literal match: as a regex, "[.]" is a character
# class that would match a bare ".".
train_df['URL'] = train_df['URL'].str.replace('[.]', '.', regex=False)
test_df['URL'] = test_df['URL'].str.replace('[.]', '.', regex=False)

In [4]:
# Binary classifier whose target is the `label` column.
predictor = MultiModalPredictor(
    problem_type='binary',
    label='label',
)

In [5]:
# Per-class weights from scikit-learn's 'balanced' scheme, rescaled to sum to 1.
# They are passed to the focal loss as `alpha` in the training cell.
label_values = train_df['label'].values
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['label']),
    y=label_values,
)
weights = list(balanced_weights / balanced_weights.sum())

In [6]:
# AutoMM overrides: start from the given Hugging Face text checkpoint, use a per-GPU
# batch size of 32 and a patience of 3, and train with focal loss whose `alpha` is the
# normalised class-weight list computed above.
fit_hyperparameters = {
    "model.hf_text.checkpoint_name": "kmack/malicious-url-detection",
    "env.per_gpu_batch_size": 32,
    "optimization.patience": 3,
    "optimization.loss_function": "focal_loss",
    "optimization.focal_loss.alpha": weights,
}

# NOTE(review): the output recorded below shows a KeyboardInterrupt during epoch 0 —
# confirm the predictor used for inference further down was trained to completion.
predictor.fit(
    train_data=train_df,
    presets='best_quality',
    column_types={'URL': 'text'},  # force the URL column to be handled as text
    hyperparameters=fit_hyperparameters,
    time_limit=None,  # no wall-clock cap on training
    seed=42,
)

No path specified. Models will be saved in: "AutogluonModels/ag-20250222_052123"
AutoGluon Version:  1.2
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2
CPU Count:          32
Pytorch Version:    2.5.1
CUDA Version:       12.1
Memory Avail:       61.06 GB / 78.38 GB (77.9%)
Disk Space Avail:   1556.34 GB / 1831.67 GB (85.0%)

AutoMM starts to create your model. ✨✨✨

To track the learning progress, you can open a terminal and launch Tensorboard:
    ```shell
    # Assume you have installed tensorboard
    tensorboard --logdir /home/apic/python/dacon_project/malicious_url/AutogluonModels/ag-20250222_052123
    ```

Seed set to 42
GPU Count: 1
GPU Count to be Used: 1
GPU 0 Name: NVIDIA GeForce RTX 3060
GPU 0 Memory: 1.02GB/12.0GB (Used/Total)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU ava

Epoch 0:   2%|▏         | 3738/216410 [03:18<3:08:03, 18.85it/s]           


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [9]:
# Class probabilities for every row of the test set.
preds = predictor.predict_proba(data=test_df)

Predicting: |                                                                                    | 0/? [00:00<…

In [11]:
# Load the submission template that the predictions will be written into.
submission_df = pd.read_csv(filepath_or_buffer='./data/sample_submission.csv')

In [13]:
# Take the probability column for class label 1.
# Assign by position, not by index label: assigning a Series aligns on the index and
# would silently produce NaN wherever `preds` and `submission_df` indices differ.
# `.to_numpy()` raises if the row counts do not match.
# Assumes the template rows are in the same order as the test set — TODO confirm.
submission_df['probability'] = preds[1].to_numpy()

In [16]:
# Write the filled-in template to disk without the DataFrame index column.
submission_df.to_csv(path_or_buf='submission.csv', index=False)