In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModel

from sklearn.utils import shuffle
import torch
import matplotlib.pyplot as plt

In [2]:
# Run on the GPU when one is present, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f'There are {torch.cuda.device_count()} GPU(s) available.')
if device == "cuda":
  # get_device_name() raises when no CUDA device exists, so it is only
  # queried on the GPU path.
  print('Device name:', torch.cuda.get_device_name(0))
else:
  print('No GPU detected; falling back to the CPU.')

There are 1 GPU(s) available.
Device name: NVIDIA RTX A5000


In [3]:
# Seed shared by the sampling and shuffling steps below. It is the same value
# that is passed as random_state to train_test_split, so the dataset that
# reaches the split is identical on every run.
SEED = 2018
SAMPLES_PER_CLASS = 25000

# TODO: point this at a project-relative or configurable location; an absolute
# home-directory path only resolves on the original author's machine.
DATA_DIR = "/home/vikrant/Desktop/Thesis/Thesis_Projects/URL_detection"

# read_csv already returns DataFrames, so no extra pd.DataFrame() wrap is needed.
benign_data = pd.read_csv(f"{DATA_DIR}/Benign.csv")
malicious_data = pd.read_csv(f"{DATA_DIR}/Malicious.csv")

# Draw the same number of rows from each class so the dataset is balanced.
benign_sample = benign_data.sample(SAMPLES_PER_CLASS, random_state=SEED)
malicious_sample = malicious_data.sample(SAMPLES_PER_CLASS, random_state=SEED)

# Stack both classes and shuffle so they are interleaved rather than blocked.
data = shuffle(pd.concat([benign_sample, malicious_sample], axis=0), random_state=SEED)
data

Unnamed: 0,url,label
11641,na1688b2140.standrewsacademy.org/nab/?memberid=1,1
34427,art-bin.com/art/or_weltypreface.html,0
38177,www.linuxconfig.org/Bash_scripting_Tutorial,0
14088,members.tripod.com/isczurko/index.htm,0
1639,home.comcast.net/~chgrimes/ukloginitefelgeiras...,1
...,...,...
13305,www.batt-vf.tk/cert/,1
697,www.c-siron.com/feeder_panels2.htm,0
16579,home.neopets.com/templates/homepage.phtml?pet_...,0
32844,www.baremetalsoft.com/baregrep/,0


In [4]:
# Pull the URL strings and their class labels out of the frame as arrays.
texts, labels = (data[column].values for column in ('url', 'label'))

In [5]:
# Hold out 20% of the samples as the final test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=2018
)

# Split the remaining 80% again: a quarter of it becomes the validation set,
# which gives a 60/20/20 train/validation/test partition overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=2018
)


In [6]:
# Load the supervised SimCSE RoBERTa-large checkpoint together with its
# tokenizer, then move the encoder onto the selected device.
SIMCSE_CHECKPOINT = 'princeton-nlp/sup-simcse-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(SIMCSE_CHECKPOINT)
model = AutoModel.from_pretrained(SIMCSE_CHECKPOINT)
model = model.to(device)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [7]:
# Mixed-precision utilities from PyTorch's CUDA AMP module.
from torch.cuda.amp import autocast, GradScaler
# Use mixed precision
# NOTE(review): no other cell visible in this notebook references `scaler`.
# Gradient scaling only matters when training with mixed precision, and the
# transformer is only used for inference here -- confirm whether this object
# can be removed.
scaler = GradScaler()

def _model_device(model):
    """Return the device holding the model's weights (CPU if it has no parameters)."""
    first_param = next(iter(model.parameters()), None)
    return first_param.device if first_param is not None else torch.device("cpu")


def embed_texts(texts, tokenizer, model, max_length=128, batch_size=16):
    """Encode texts into vectors using the model's first-token hidden state.

    The texts are tokenized and pushed through the model in batches with
    gradients disabled. Inputs are moved to whichever device the model's
    weights live on, so the function works on both GPU and CPU. Mixed
    precision (autocast) is only enabled when the model is on a CUDA device.

    Args:
        texts: Sliceable sequence of strings, e.g. a NumPy array or a list.
        tokenizer: Callable invoked as
            ``tokenizer(list_of_str, padding=True, truncation=True,
            max_length=..., return_tensors='pt')``; its result must offer
            ``.to(device)`` and be unpackable with ``**`` into ``model``.
        model: Module whose output exposes ``last_hidden_state``. The caller
            is expected to have put it in eval mode.
        max_length: Maximum number of tokens kept per text (longer inputs are
            truncated by the tokenizer).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A NumPy array with one row per input text. For empty input an array
        with zero rows is returned; its width is ``model.config.hidden_size``
        when that attribute exists, otherwise 0.
    """
    device = _model_device(model)
    on_cuda = device.type == "cuda"

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        # The tokenizer wants a plain list of str; accept arrays and lists alike.
        batch_texts = batch_texts.tolist() if hasattr(batch_texts, "tolist") else list(batch_texts)
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        ).to(device)

        with torch.no_grad():
            if on_cuda:
                with torch.autocast(device_type="cuda"):
                    outputs = model(**inputs)
            else:
                outputs = model(**inputs)

        # Position 0 of every sequence is the CLS-style token representation.
        embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    if on_cuda:
        # Release cached GPU blocks once at the end rather than after every
        # batch, which only slows the loop down.
        torch.cuda.empty_cache()

    if not embeddings:
        # np.vstack raises on an empty list, so build the empty result directly.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(embeddings)

In [8]:
# Embed the train, validation and test URLs (in that order) with the same
# tokenizer and encoder.
X_train_embeds, X_val_embeds, X_test_embeds = (
    embed_texts(split, tokenizer, model) for split in (X_train, X_val, X_test)
)

In [9]:
# Level-0 learners: two tree ensembles sharing the same seed.
random_forest = RandomForestClassifier(n_estimators=100, random_state=2018)
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=2018)
base_models = [('rf', random_forest), ('gb', gradient_boosting)]

# Level-1 learner: logistic regression that combines the base models' outputs.
meta_model = LogisticRegression(random_state=2018)

# Assemble the stack; cv=5 sets the number of folds used when fitting it.
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)

In [None]:
# Train the stacked ensemble on the training-set embeddings.
stacked_model.fit(X=X_train_embeds, y=y_train)

In [None]:
def report_split(heading, features, truth):
    """Predict on one split, print its classification report, and return the predictions."""
    predicted = stacked_model.predict(features)
    print(heading)
    print(classification_report(truth, predicted, digits=4))
    return predicted


# Validation split first, then the held-out test split.
val_predictions = report_split("Validation Classification Report:", X_val_embeds, y_val)
test_predictions = report_split("Test Classification Report:", X_test_embeds, y_test)