# **Dataset and DataLoader**

In [1]:
from sklearn.datasets import make_classification
import torch

In [3]:
X, y = make_classification(
    n_samples=100, 
    n_features=10,
    n_informative=2, 
    n_redundant=0,
    n_classes=2,
    random_state=42
)

In [4]:
X

array([[ 1.3727107 , -2.42387933,  0.76041466,  1.07954187, -1.88954073,
        -0.44618343, -0.45230632,  0.78580016, -1.58390282,  0.42545756],
       [ 2.36867367, -0.53086877,  1.04416088,  2.25661188,  1.18839327,
        -0.0164229 ,  2.52693243,  0.68189149, -0.48943944,  1.84670733],
       [-0.05319823,  0.44426331,  1.1593298 ,  1.85605469, -0.37482081,
        -0.2403254 ,  0.71095997, -1.08106333, -0.36096617,  0.61593561],
       [ 0.93343952,  1.45338448, -0.52286003,  0.68811892, -0.36283856,
         2.29889812, -0.44550252, -0.42018682,  1.57957215, -0.28178461],
       [-1.15806823, -0.37144087, -0.77781669,  0.86561977, -2.07339023,
         1.24608519, -0.34268759, -1.11057585, -1.40751169,  1.75227044],
       [ 1.28008347,  0.8896308 ,  1.06548038,  1.28938375, -1.48556037,
         1.03184454,  0.26705027, -0.51728845,  0.08228399,  1.40934744],
       [-1.40735658,  0.62834551, -0.89725437, -1.56826626,  0.79166269,
         1.15811087,  0.62411982,  0.07580456

In [5]:
X.shape

(100, 10)

In [6]:
y

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1])

In [7]:
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

In [8]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):

    def __init__(self, features, labels):

        self.features = features
        self.labels = labels

    def __len__(self):
        return self.features.shape[0]
    
    def __getitem__(self, index):
        return self.features[index], self.labels[index]
    

In [10]:
dataset = CustomDataset(features=X, labels=y)


In [13]:
len(dataset), dataset[0]

(100,
 (tensor([ 1.3727, -2.4239,  0.7604,  1.0795, -1.8895, -0.4462, -0.4523,  0.7858,
          -1.5839,  0.4255]),
  tensor(1)))

In [14]:
dataloader = DataLoader(dataset, batch_size=5, shuffle=True)

In [17]:
for batch_features, batch_labels in dataloader:

    print(batch_features)
    print(batch_labels)
    print("-"*80)

tensor([[-0.1501, -1.4308, -0.6811, -0.1171,  1.8009, -0.6764, -0.0402,  0.8406,
          0.1281, -0.6526],
        [ 0.7786,  0.5434,  0.5706, -1.2095,  0.7514,  0.0993, -1.6694, -0.7633,
         -0.6626, -1.8049],
        [ 1.5723,  1.5775,  0.2790,  1.4992,  0.2192, -0.3078,  0.2494,  0.6079,
         -0.0953,  0.1866],
        [ 1.0827, -1.0352, -1.1979, -0.9802, -0.7929, -0.5303, -0.1070,  1.9647,
         -0.5536,  0.0353],
        [-2.5859, -0.0241, -0.1444, -0.4093,  0.0592,  2.5734,  0.0139, -0.5737,
          0.1981, -0.5469]])
tensor([1, 1, 1, 1, 0])
--------------------------------------------------------------------------------
tensor([[ 2.5912, -0.9076,  0.7554,  0.2447, -0.0502,  0.2705, -0.2389,  0.5009,
         -0.5768, -0.9776],
        [-2.1711,  0.7258,  0.2239, -0.0486,  0.1292,  2.4458,  0.1094, -0.7905,
          0.4810,  0.4715],
        [ 1.0226,  0.2544, -0.4119, -1.0832, -0.1557,  0.0866,  1.1678, -0.4876,
          0.3376, -0.4326],
        [-1.2076, -0.6

In [18]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [19]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [20]:
df.drop(columns=['id', 'Unnamed: 32'], inplace= True)
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### train test split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

### scaling

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
X_train, y_train

(array([[-0.33708191, -1.15375641, -0.34056741, ..., -0.89154804,
         -0.74701599,  0.93070081],
        [-0.57157448, -0.28486479, -0.57789321, ..., -0.9160658 ,
         -0.78846335, -0.37508138],
        [-0.47834249, -0.5546785 , -0.54100318, ..., -1.36136964,
         -1.56799258, -1.23509464],
        ...,
        [ 0.73367332, -0.10879991,  0.69317294, ...,  0.47072007,
         -0.14443512, -0.65216603],
        [-0.12236582,  1.02761888, -0.15775686, ..., -0.85231962,
         -0.37877213, -1.15379435],
        [ 0.06692337,  0.11299611,  0.08571727, ...,  0.61016483,
         -0.29109502,  0.52309324]], shape=(455, 30)),
 336    B
 480    B
 315    B
 239    M
 286    B
       ..
 427    B
 517    M
 167    M
 243    B
 99     M
 Name: diagnosis, Length: 455, dtype: object)

### Label Encoding 

In [24]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

### Numpy arrays to PyTorch tensors

In [25]:
X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

In [26]:
X_test_tensor.shape, y_train_tensor.shape

(torch.Size([114, 30]), torch.Size([455]))

In [28]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):

  def __init__(self, features, labels):

    self.features = features
    self.labels = labels

  def __len__(self):

    return len(self.features)

  def __getitem__(self, idx):

    return self.features[idx], self.labels[idx]

In [29]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

In [30]:
train_dataset[10]

(tensor([-0.7383, -0.0539, -0.7455, -0.7104, -0.8089, -0.5393, -0.4597, -0.9241,
          0.7984, -0.0364, -0.4372,  0.4157, -0.3387, -0.4387,  0.0593, -0.2998,
          0.3755, -0.5287, -0.2368, -0.1792, -0.6774,  0.2103, -0.6361, -0.6461,
         -0.3652, -0.4023, -0.1098, -0.7668,  0.2142, -0.0681]),
 tensor(0.))

In [31]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

### Defining the model

In [32]:
import torch.nn as nn


class MySimpleNN(nn.Module):

  def __init__(self, num_features):

    super().__init__()
    self.linear = nn.Linear(num_features, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, features):

    out = self.linear(features)
    out = self.sigmoid(out)

    return out

### Important Parameters

In [33]:
learning_rate = 0.1
epochs = 25

In [34]:
# create model
model = MySimpleNN(X_train_tensor.shape[1])

# define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# define loss function
loss_function = nn.BCELoss()

### Training Pipeline

In [35]:

# define loop
for epoch in range(epochs):

  for batch_features, batch_labels in train_loader:

    # forward pass
    y_pred = model(batch_features)

    # loss calculate
    loss = loss_function(y_pred, batch_labels.view(-1,1))

    # clear gradients
    optimizer.zero_grad()

    # backward pass
    loss.backward()

    # parameters update
    optimizer.step()

  # print loss in each epoch
  print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

Epoch: 1, Loss: 0.11706425249576569
Epoch: 2, Loss: 0.14067991077899933
Epoch: 3, Loss: 0.21245799958705902
Epoch: 4, Loss: 0.1404586136341095
Epoch: 5, Loss: 0.0688457041978836
Epoch: 6, Loss: 0.11112374067306519
Epoch: 7, Loss: 0.11461646854877472
Epoch: 8, Loss: 0.12712877988815308
Epoch: 9, Loss: 0.11883382499217987
Epoch: 10, Loss: 0.04554257169365883
Epoch: 11, Loss: 0.02412101812660694
Epoch: 12, Loss: 0.03562203049659729
Epoch: 13, Loss: 0.007184027694165707
Epoch: 14, Loss: 0.011932742781937122
Epoch: 15, Loss: 0.019152788445353508
Epoch: 16, Loss: 0.018185468390583992
Epoch: 17, Loss: 0.05057894438505173
Epoch: 18, Loss: 0.053384535014629364
Epoch: 19, Loss: 0.031292837113142014
Epoch: 20, Loss: 0.0071564712561666965
Epoch: 21, Loss: 0.2796573340892792
Epoch: 22, Loss: 0.15290233492851257
Epoch: 23, Loss: 0.10286828130483627
Epoch: 24, Loss: 0.16980186104774475
Epoch: 25, Loss: 0.024532949551939964


In [36]:
# Model evaluation using test_loader
model.eval()  # Set the model to evaluation mode
accuracy_list = []

with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        # Forward pass
        y_pred = model(batch_features)
        y_pred = (y_pred > 0.8).float()  # Convert probabilities to binary predictions

        # Calculate accuracy for the current batch
        batch_accuracy = (y_pred.view(-1) == batch_labels).float().mean().item()
        accuracy_list.append(batch_accuracy)

# Calculate overall accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f'Accuracy: {overall_accuracy:.4f}')


Accuracy: 0.9488
