In [31]:
# import libraries
import torch
import torch.nn as nn
import numpy as np

# Dataset

In [32]:
# Load the iris dataset that ships with seaborn (comes back as a pandas dataframe)
import seaborn as sns
iris = sns.load_dataset('iris')

# Features: the first four dataframe columns, converted to a float tensor
feature_values = iris[iris.columns[0:4]].values
data = torch.tensor(feature_values).float()

# Outcome variable: species encoded as an integer class label.
# Every label starts at 0, which is the code that 'setosa' keeps.
labels = torch.zeros(len(data), dtype=torch.long)
for species_code, species_name in enumerate(['versicolor', 'virginica'], start=1):
  labels[iris.species == species_name] = species_code

labels # Rows are grouped by species, i.e. the data is not randomized!

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2])

# Separate data into train and test

In [33]:
#  (no devset here)

# How many training examples
prop_training = .8 # In proportion, not percent
n_training = int(len(labels) * prop_training)

# Boolean selection vectors, one entry per row (sample) of the dataset | True: Go to training set, False: Go to test set
train_test_bool = np.zeros(shape=len(labels), dtype=bool)
train_test_bool0 = np.zeros(shape=len(labels), dtype=bool)

# Naive split (kept for comparison only): mark the first n_training rows as training data.
# The labels printed above are grouped by class, so here the held-out tail of
# the dataset contains samples of the last class only.
train_test_bool0[:n_training] = True
print(train_test_bool0, '\n')

# Random split: draw n_training distinct row indices, so samples of every class
# can land in both sets. A seeded, local generator makes the split reproducible
# after a kernel restart without touching NumPy's global random state.
split_seed = 42
split_rng = np.random.default_rng(split_seed)
item_2_use_4_train = split_rng.choice(len(labels), size=n_training, replace=False)
train_test_bool[item_2_use_4_train] = True
print(train_test_bool)


[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False] 

[ True  True  True  True  True  True  True  True  True  True  True  True
  True Fals

# Test whether it's balanced

In [34]:
# Mean class label per subset. Labels are coded 0/1/2, so a subset that holds
# the three classes in equal numbers averages exactly 1.
mean_label_all = labels.float().mean()
mean_label_train = labels[train_test_bool].float().mean()
mean_label_test = labels[~train_test_bool].float().mean()

print(f'Average of full data: {mean_label_all}') # = 1 by definition
print('')

print(f'Average of training data: {mean_label_train}') # Should be close to 1
print('')

print(f'Average of test data: {mean_label_test}') # Should be close to 1

Average of full data: 1.0

Average of training data: 1.0083333253860474

Average of test data: 0.9666666388511658


# Create the ANN model

In [35]:
# Model architecture: 4 input features -> two 64-unit ReLU layers -> 3 class scores
ANN_iris = nn.Sequential(
  nn.Linear(4, 64),   # Input layer
  nn.ReLU(),
  nn.Linear(64, 64),  # Hidden layer
  nn.ReLU(),
  nn.Linear(64, 3),   # Output layer: one raw score per class, no softmax here
)

# Loss function | the softmax step is part of CrossEntropyLoss, so the model outputs raw scores
loss_func = nn.CrossEntropyLoss()

# Optimizer: plain stochastic gradient descent over all model parameters
optimizer = torch.optim.SGD(ANN_iris.parameters(), lr=0.01)

# Evaluate Train/Test set

In [36]:
# Split the feature rows with the boolean selector (True -> training set, False -> test set)
train_set = data[train_test_bool, :]
test_set = data[~train_test_bool, :]

# Report the shapes: entire dataset, then training set, then test set
for split_data in (data, train_set, test_set):
  print(split_data.shape)

torch.Size([150, 4])
torch.Size([120, 4])
torch.Size([30, 4])


# Train and test the model

In [37]:
num_epochs = 1000

# Per-epoch training history
losses = torch.zeros(num_epochs)
ongoing_acc = []

# The training labels are the same in every epoch, so select them once up front
train_labels = labels[train_test_bool]

# Loop over epochs (each update uses the whole training set at once)
for epoch_i in range(num_epochs):
  # Forward pass
  y_hat = ANN_iris(train_set)

  # Compute loss
  loss = loss_func(y_hat, train_labels)
  # Store a detached value: writing the graph-attached loss into `losses`
  # would make the history tensor itself part of the autograd graph.
  losses[epoch_i] = loss.detach()

  # Backprop
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Compute accuracy from this epoch's forward pass (i.e. before the weight update above)
  matches = torch.argmax(y_hat, dim=1) == train_labels   # Booleans (True/False)
  matches_numeric = matches.float()                      # Convert to numbers (0/1)
  accuracy_pct = 100 * torch.mean(matches_numeric)       # Average and *100
  ongoing_acc.append(accuracy_pct)                       # Add to list of accuracies

# Compute train and test accuracies

In [38]:
# Evaluation only needs forward passes, so gradient tracking is switched off
with torch.no_grad():
  # Final forward pass using TRAINING data
  predictions = ANN_iris(train_set)
  pred_labels = torch.argmax(predictions, dim=1)     # Index of the largest value in each row (sample) = predicted class
  train_acc = 100 * torch.mean((pred_labels == labels[train_test_bool]).float())

  # Final forward pass using TEST data (rows the model was not trained on)
  predictions = ANN_iris(test_set)
  pred_labels = torch.argmax(predictions, dim=1)     # Index of the largest value in each row (sample) = predicted class
  test_acc = 100 * torch.mean((pred_labels == labels[~train_test_bool]).float())

# Report accuracies (percent of correctly classified samples)
print(f'Final TRAIN accuracy: {train_acc}')
print(f'Final TEST accuracy: {test_acc}')

Final TRAIN accuracy: 97.5
Final TEST accuracy: 100.0


In [39]:
# Normally you would also inspect the losses and the accuracy by epoch, and so on.

# Additional explorations
1) Randomly assigning data samples to be in the train vs test phase produced a statistical balance, but it was 
   not perfect. Write an algorithm that will guarantee a balance of flower types while also randomly assigning
   samples to be in train vs. test.

2) Revert the code to its original form -- with the strong imbalance in flower types. Then train the model. What are
   the train and test accuracies? Compute the accuracy separately for each type of flower to see whether the model
   learned some categories, or whether it performed equally on all three categories. Are you surprised at the results? 