In [None]:
import urllib.request
import os

# Download the dataset 

if(not os.path.exists('url_svmlight.tar.gz')):
    download = 'https://www.sysnet.ucsd.edu/projects/url/url_svmlight.tar.gz'
    print('Downloading the dataset from the following URL: ', download )
    urllib.request.urlretrieve(download, 'url_svmlight.tar.gz')
    print('The dataset has been downloaded successfully.')
    # Unzip the downloaded file in the dataset directory
    os.system('tar -xvzf ' + 'url_svmlight.tar.gz')
    print('The dataset has been unzipped successfully.')
else:
    print('The dataset has already been downloaded and unzipped.')

In [1]:
import torch

# Load the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

Device: cuda


In [2]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_svmlight_file
import torch
from torch.utils.data import DataLoader, TensorDataset

print('Processing data...')

# Step 1: Read Data
data_dir = 'url_svmlight'
feature_types_file = os.path.join(data_dir, 'FeatureTypes')

# Only the first 10 days are used for training
svm_files = [os.path.join(data_dir, f'Day{i}.svm') for i in range(11)]

# Read feature types
with open(feature_types_file, 'r') as file:
    feature_indices = [int(idx) for idx in file.readlines()]

# Step 2: Process Data
all_labels = []
all_features = []
for svm_file in svm_files:
    labels, features = load_svmlight_file(svm_file)
    all_labels.extend(labels)
    all_features.extend(features)

Processing data...


In [3]:
all_labels = np.array(all_labels)
print(type(all_labels))
all_features = np.array(all_features).reshape(-1, 1).astype(np.float32)
print(type(all_features))

print('Data processed successfully.')

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
Data processed successfully.


In [4]:
print(all_labels)
print(all_features)

[<1x3231949 sparse matrix of type '<class 'numpy.float64'>'
 	with 111 stored elements in Compressed Sparse Row format>
 <1x3231949 sparse matrix of type '<class 'numpy.float64'>'
 	with 128 stored elements in Compressed Sparse Row format>
 <1x3231949 sparse matrix of type '<class 'numpy.float64'>'
 	with 104 stored elements in Compressed Sparse Row format> ...
 <1x3231953 sparse matrix of type '<class 'numpy.float64'>'
 	with 99 stored elements in Compressed Sparse Row format>
 <1x3231953 sparse matrix of type '<class 'numpy.float64'>'
 	with 113 stored elements in Compressed Sparse Row format>
 <1x3231953 sparse matrix of type '<class 'numpy.float64'>'
 	with 104 stored elements in Compressed Sparse Row format>]
[[-1.]
 [-1.]
 [-1.]
 ...
 [-1.]
 [ 1.]
 [ 1.]]


In [5]:

# Step 3: Split Data
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)

print('Train data shape:', X_train.shape, y_train.shape)
print('Test data shape:', X_test.shape, y_test.shape)

Train data shape: (172800, 1) (172800,)
Test data shape: (43200, 1) (43200,)


In [6]:
# Step 4: Normalize Data
scaler = StandardScaler(with_mean=False)  # With sparse data, we don't want to center it
X_train = scaler.fit_transform(X_train.reshape(-1, 1))
X_test = scaler.transform(X_test.reshape(-1, 1))

In [9]:
# Step 5: Create DataLoader
batch_size = 64

# Convert NumPy arrays to appropriate data types
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Convert NumPy arrays to PyTorch tensors
X_train_tensor = torch.Tensor(X_train)
y_train_tensor = torch.Tensor(y_train).unsqueeze(1)
X_test_tensor = torch.Tensor(X_test)
y_test_tensor = torch.sparse.FloatTensor(torch.LongTensor(y_test).unsqueeze(1))

# Create datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [37]:
import torch.nn as nn

# Define the neural network architecture
class URLClassifier(nn.Module):
    def __init__(self):
        super(URLClassifier, self).__init__()
        self.fc1 = nn.Linear(1, 128)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

# Create an instance of the neural network
model = URLClassifier()
model.to(device)
print(model)

URLClassifier(
  (fc1): Linear(in_features=1, out_features=128, bias=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [38]:
# Move data and labels to the same device
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train.unsqueeze(1))

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss for the current epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test.unsqueeze(1))
    print(f'Test Loss: {test_loss.item():.4f}')

AttributeError: 'numpy.ndarray' object has no attribute 'to'