<a href="https://colab.research.google.com/github/emilyzfliu/vis-sounds/blob/main/vis_sounds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os
import pickle
from math import pi
import torch
import imageio
from torch.autograd import Variable
from torch.utils.data import Dataset
from tqdm import tqdm
import cv2

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
!unzip /content/drive/MyDrive/vis_sound_files/vis-data-simple.zip
!cp /content/drive/MyDrive/vis_sound_files/train.txt .
!cp /content/drive/MyDrive/vis_sound_files/test.txt .

Archive:  /content/drive/MyDrive/vis_sound_files/vis-data-simple.zip
replace vis-data-simple/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


## Project Overview:

1. Predicting material from sound (spectrogram) only
2. Predicting material(s) from images only
3. Combined prediction

## Dataset
We use the Greatest Hits dataset at https://andrewowens.com/vis/.

In [4]:
class VisSoundsDataset(Dataset):
  def __init__(self, split = 'train', mode='sound'):
    super(Dataset, self).__init__()
    assert split in ['train', 'val', 'test']
    self.split = split
    assert mode in ['sound', 'image', 'fc_agg', 'tt_agg']
    self.mode = mode

    if mode == 'sound':
      self.transforms = transforms.Compose([
          transforms.ToPILImage(),
          transforms.Resize(224),
          transforms.ToTensor(),
          transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])
    elif mode == 'image':
      self.transforms = transforms.Compose([
          transforms.ToPILImage(),
          transforms.RandomHorizontalFlip(0.5),
          transforms.RandomVerticalFlip(0.5),
          transforms.Resize(224),
          transforms.ToTensor(),
          transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

    data = self.load_data()

    if self.split == 'train':
      data = data[:int(0.8*len(data))]
      labels = data[:int(0.8*len(data))]
    elif self.split == 'val':
      data = data[int(0.8*len(data)):]
      labels = data[int(0.8*len(data)):]

    self.n = len(data)
    print(f'{self.n} datapoints')
    self.data = data

  def load_data(self):
    if self.split in ['train', 'val']:
      splits = 'train.txt'
    else:
      assert self.split == 'test'
      splits = 'test.txt'

    with open(splits, 'r') as f:
      files = f.readlines()
    
    data = []

    if self.mode == 'image':
      for file in files:
        file = file.strip()
        data.append(file)
    elif self.mode == 'sound':
      for file in files:
        file = file.strip()  # remove line break
        try:
          with open(f'vis-data-simple/{file}/{file}_metadata.txt', 'r') as f:
            n_sounds = int(f.readline())
          for i in range(n_sounds):
            data.append((file, i))
        except:
          print(file)
          #assert False
    #print(data)
    
    return data
  
  def gen_labels(self, file, idx=None):
    if idx is None:
      ret = np.zeros((18,))
      with open(f'vis-data-simple/{file}/{file}_metadata.txt', 'r') as f:
        n_sounds = int(f.readline())
      for i in range(n_sounds):
        with open(f'vis-data-simple/{file}/{file}_labels_{i}.txt', 'r') as f:
          vals = f.readlines()
          mat_id = int(vals[0])
          ret[mat_id] = 1
      ret = torch.Tensor(ret)
    else:
      with open(f'vis-data-simple/{file}/{file}_labels_{idx}.txt', 'r') as f:
        vals = f.readlines()
        ret = int(vals[0])
    return ret
  
  def load_image_from_file(self, fname):
    img = imageio.imread(fname, pilmode='RGB')
    if self.transforms is not None:
      img = self.transforms(img)
    #img = cv2.resize(img, (224, 224))
    #img = np.moveaxis(img, 2, 0)
    #if np.max(img) > 1:
    #  img = img/255.
    return img#torch.Tensor(img)

  def __getitem__(self, item):
    if self.mode == 'sound':
      vid_id, hit_id = self.data[item]
      img = self.load_image_from_file(f'vis-data-simple/{vid_id}/{vid_id}_spec_{hit_id}.png')
      label = self.gen_labels(vid_id, hit_id)
      return img, label
    elif self.mode == 'image':
      vid_id = self.data[item]
      img = self.load_image_from_file(f'vis-data-simple/{vid_id}/{vid_id}_image.png')
      label = self.gen_labels(vid_id)
      return img, label
    elif self.mode == 'fc_agg':
      pass
    elif self.mode == 'tt_agg':
      pass
    return None
      
  def __len__(self):
      return self.n

# Training Setup

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
# You might not have tqdm, which gives you nice progress bars
!pip install tqdm
from tqdm.notebook import tqdm
import os
import copy
import pandas as pd
import PIL 
from torch.utils.data import DataLoader
  
# Detect if we have a GPU available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("Using the GPU!")
else:
    print("WARNING: Could not find GPU! Using CPU only")
    print("You may want to try to use the GPU in Google Colab by clicking in:")
    print("Runtime > Change Runtime type > Hardware accelerator > GPU.")

Using the GPU!


## Model Init

In [6]:
def initialize_model(model_name, num_classes, resume_from = None, use_pretrained = False):
    # Initialize these variables which will be set in this if statement. Each of these
    #   variables is model specific.
    # The model (nn.Module) to return
    model_ft = None
    # The input image is expected to be (input_size, input_size)
    input_size = 0
    
    # By default, all parameters will be trained (useful when you're starting from scratch)
    # Within this function you can set .requires_grad = False for various parameters, if you
    # don't want to learn them

    if model_name == "resnet":
        """ Resnet18
        """
        model_ft = models.resnet18(pretrained=use_pretrained)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224
        
    elif model_name == "resnet50":
        """ Resnet50
        """
        model_ft = models.resnet50(pretrained=use_pretrained)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "alexnet":
        """ Alexnet
        """
        model_ft = models.alexnet(pretrained=use_pretrained)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
        input_size = 224

    elif model_name == "vgg":
        """ VGG11_bn
        """
        model_ft = models.vgg11_bn(pretrained=use_pretrained)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
        input_size = 224

    elif model_name == "squeezenet":
        """ Squeezenet
        """
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1,1), stride=(1,1))
        model_ft.num_classes = num_classes
        input_size = 224

    elif model_name == "densenet":
        """ Densenet
        """
        model_ft = models.densenet121(pretrained=use_pretrained)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes) 
        input_size = 224

    else:
        raise Exception("Invalid model name!")
    
    if resume_from is not None:
        print("Loading weights from %s" % resume_from)
        model_ft.load_state_dict(torch.load(resume_from))
    
    return model_ft, input_size

## Dataloader Init

In [7]:
from torchvision.transforms.functional import to_grayscale

def get_image_transforms(mode):
    # How to transform the image when you are loading them.
    # you'll likely want to mess with the transforms on the training set.
    
    # we convert the image to a [C,H,W] tensor, then normalize it to values with a given mean/stdev. These normalization constants
    # are derived from aggregating lots of data and happen to produce better results.
    transform = transforms.Compose([
            #transforms.Grayscale(num_output_channels=3),
            #transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    return transform

def get_dataloaders(input_size, batch_size, mode, shuffle = True):
    train_data = VisSoundsDataset(split = 'train', mode = mode)
    train_data = VisSoundsDataset(split = 'val', mode = mode)
    test_data = VisSoundsDataset(split = 'test', mode = mode)

    train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = shuffle, num_workers = 4)
    val_loader = DataLoader(train_data, batch_size = batch_size, shuffle = False, num_workers = 4)
    test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = False, num_workers = 4)
    loaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }
    return loaders

## Model Train

In [23]:
def train_model(model, dataloaders, criterion, optimizer, save_dir = None, save_all_epochs=False, num_epochs=25, eval_mode='multiclass'):
    '''
    model: The NN to train
    dataloaders: A dictionary containing at least the keys 
                 'train','val' that maps to Pytorch data loaders for the dataset
    criterion: The Loss function
    optimizer: The algorithm to update weights 
               (Variations on gradient descent)
    num_epochs: How many epochs to train for
    save_dir: Where to save the best model weights that are found, 
              as they are found. Will save to save_dir/weights_best.pt
              Using None will not write anything to disk
    save_all_epochs: Whether to save weights for ALL epochs, not just the best
                     validation error epoch. Will save to save_dir/weights_e{#}.pt
    '''
    os.makedirs(save_dir, exist_ok=True)
    since = time.time()

    val_acc_history = []
    train_acc_history = []
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            # TQDM has nice progress bars
            for inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    outputs = model(inputs)
                    outputs = outputs.to(device)
                    loss = criterion(outputs, labels)

                    # torch.max outputs the maximum value, and its index
                    # Since the input is batched, we take the max along axis 1
                    # (the meaningful outputs)
                    if eval_mode == 'multiclass':
                      _, preds = torch.max(outputs, 1)
                    elif eval_mode == 'multilabel':
                      preds = torch.sigmoid(outputs > 0.5).long()

                    # backprop + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)
            

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'train':
                train_acc_history.append(epoch_acc)
            if phase == 'val':
                val_acc_history.append(epoch_acc)
            if save_all_epochs:
                torch.save(model.state_dict(), os.path.join(save_dir, f'weights_{epoch}.pt'))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # save and load best model weights
    torch.save(best_model_wts, os.path.join(save_dir, 'weights_best_val_acc.pt'))
    torch.save(model.state_dict(), os.path.join(save_dir, 'weights_last.pt'.format(epoch)))
    model.load_state_dict(best_model_wts)
    return model, val_acc_history, train_acc_history

## Training Params

In [9]:
def make_optimizer(model, learning_rate, print_parameters=False):
    # Get all the parameters
    params_to_update = model.parameters()
    if print_parameters:
      print("Params to learn:")
      for name, param in model.named_parameters():
          if param.requires_grad == True:
              print("\t",name)

 
    optimizer = optim.SGD(params_to_update, lr=learning_rate, momentum=0.9)
    return optimizer

def get_loss(mode='sound'):
    if mode=='image':
      criterion = nn.BCEWithLogitsLoss()
    # Create an instance of the loss function
    else:
      criterion = nn.CrossEntropyLoss()
    print(criterion)
    return criterion

In [10]:
# Models to choose from [resnet, alexnet, vgg, squeezenet, densenet]
# You can add your own, or modify these however you wish!
model_name = 'resnet'

# Number of classes in the dataset, normal, benign, malignant
num_classes = 18

# Batch size for training (change depending on how much memory you have)
batch_size = 32

# Shuffle the input data?
shuffle_datasets = True

# Number of epochs to train for 
num_epochs = 1

# Learning rate
learning_rate = 1e-3

### IO
# Path to a model file to use to start weights at
resume_from = None

# Whether to use a pretrained model, trained for classification in Imagenet-1k 
pretrained = False

# Save all epochs so that you can select the model from a particular epoch
save_all_epochs = False

# Whether to use early stopping (load the model with best accuracy), or not
early_stopping = True

# Directory to save weights to
save_dir = f'content/drive/MyDrive/vis_sound_files/'

## Train Sound

In [18]:
model_1, input_size = initialize_model(model_name = model_name, num_classes = num_classes, resume_from=resume_from, use_pretrained=pretrained)

In [19]:
dataloaders = get_dataloaders(input_size, batch_size, 'sound', shuffle_datasets)
criterion = get_loss(mode='sound')

# Move the model to the gpu if needed
model_1 = model_1.to(device)

optimizer_1 = make_optimizer(model_1, learning_rate)

16205 datapoints
4052 datapoints
6627 datapoints
CrossEntropyLoss()


In [22]:
#Train the model!
trained_model_1, validation_history_1, train_history_1 = train_model(model=model_1, 
                                                                     dataloaders=dataloaders, 
                                                                     criterion=criterion, 
                                                                     optimizer=optimizer_1,
                                                                     save_dir=f'{save_dir}/sound_models/{model_name}', 
                                                                     save_all_epochs=save_all_epochs, 
                                                                     num_epochs=num_epochs)
del model_1, optimizer_1, trained_model_1

Epoch 1/1
----------


  0%|          | 0/127 [00:00<?, ?it/s]

tensor([[ 4.4690e-01, -2.7075e-01,  3.0569e-01,  1.3659e-01,  8.6691e-01,
         -3.5627e-01, -7.7525e-01,  3.4409e-01, -3.9074e-03, -2.5021e-01,
          8.1025e-01,  2.5922e-01, -4.0711e-01, -1.1283e+00, -1.3343e-01,
          2.1299e-01, -1.1236e-01,  2.2437e-01],
        [ 3.9716e-01, -2.2104e-01,  4.2184e-01,  2.0639e-02,  8.8838e-01,
         -4.1274e-01, -9.8694e-01,  2.7872e-01,  1.8154e-01, -2.2712e-01,
          6.5148e-01,  2.5505e-01, -4.3202e-01, -1.1551e+00, -3.6576e-02,
          2.3404e-01, -1.8384e-01,  1.7510e-01],
        [ 4.0292e-01, -2.1852e-01,  4.7008e-01,  2.5264e-01,  9.6615e-01,
         -5.4299e-01, -1.1346e+00,  6.1670e-01,  6.9484e-02, -2.2224e-01,
          9.8058e-01,  4.7180e-02, -3.8635e-01, -1.3031e+00, -2.2950e-01,
          3.2052e-01, -1.7713e-01,  2.1226e-01],
        [ 4.1622e-01, -2.7679e-01,  5.3658e-01,  5.9987e-02,  8.9860e-01,
         -6.3005e-01, -1.0084e+00,  4.3337e-01,  2.3833e-01, -9.1285e-02,
          9.2201e-01,  1.5282e-01, -4.1

AssertionError: ignored

## Train Image

In [11]:
model_1, input_size = initialize_model(model_name = model_name, num_classes = num_classes, resume_from=resume_from, use_pretrained=pretrained)

In [15]:
dataloaders = get_dataloaders(input_size, batch_size, 'image', shuffle_datasets)
criterion = get_loss(mode='image')

# Move the model to the gpu if needed
model_1 = model_1.to(device)

optimizer_1 = make_optimizer(model_1, learning_rate)

586 datapoints
147 datapoints
244 datapoints
BCEWithLogitsLoss()


In [16]:
#Train the model!
trained_model_1, validation_history_1, train_history_1 = train_model(model=model_1, 
                                                                     dataloaders=dataloaders, 
                                                                     criterion=criterion, 
                                                                     optimizer=optimizer_1,
                                                                     save_dir=f'{save_dir}/image_models/{model_name}', 
                                                                     save_all_epochs=save_all_epochs, 
                                                                     num_epochs=num_epochs,
                                                                     eval_mode='multilabel')
del model_1, optimizer_1, trained_model_1

Epoch 1/1
----------


  0%|          | 0/5 [00:00<?, ?it/s]

train Loss: 0.6664 Acc: 16.0068


  0%|          | 0/5 [00:00<?, ?it/s]

val Loss: 0.6640 Acc: 16.0068

Training complete in 0m 2s
Best val Acc: 16.006803


In [None]:
# Load your final model, that we will use for the rest of the PSET.
if early_stopping:
  weights_file = save_dir + '/weights_best_val_acc.pt'
else:
  weights_file = save_dir + '/weights_last.pt'
model_yours, _ = initialize_model(model_name = model_name, num_classes = num_classes, resume_from=resume_from, use_pretrained=pretrained)

# Move the model to the gpu if needed
model_yours = model_yours.to(device)

# Load weights for model_yours
model_yours.load_state_dict(torch.load(weights_file))

# set models to eval mode
model_yours = model_yours.eval()



In [None]:
def evaluate(model, dataloader, criterion, is_labelled = False, generate_labels = True, k = 5):
    # If is_labelled, we want to compute loss, top-1 accuracy and top-5 accuracy
    # If generate_labels, we want to output the actual labels
    # Set the model to evaluate mode
    model.eval()
    running_loss = 0
    running_top1_correct = 0
    running_top5_correct = 0
    predicted_labels = []
    gt_labels = []

    # Iterate over data.
    # TQDM has nice progress bars
    for inputs, labels in tqdm(dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        tiled_labels = torch.stack([labels.data for i in range(k)], dim=1) 
        # Makes this to calculate "top 5 prediction is correct"
        # [[label1 label1 label1 label1 label1], [label2 label2 label2 label label2]]

        # forward
        # track history if only in train
        with torch.set_grad_enabled(False):
            # Get model outputs and calculate loss
            outputs = model(inputs)
            if is_labelled:
                loss = criterion(outputs, labels)

            # torch.topk outputs the maximum values, and their indices
            # Since the input is batched, we take the max along axis 1
            # (the meaningful outputs)
            _, preds = torch.topk(outputs, k=k, dim=1)
            if generate_labels:
                # We want to store these results
                nparr = preds.cpu().detach().numpy()
                predicted_labels.extend([list(nparr[i]) for i in range(len(nparr))])
                gt_labels.extend(np.array(labels.cpu()))

        if is_labelled:
            # statistics
            running_loss += loss.item() * inputs.size(0)
            # Check only the first prediction
            running_top1_correct += torch.sum(preds[:, 0] == labels.data)
            # Check all 5 predictions
            running_top5_correct += torch.sum(preds == tiled_labels)
        else:
            pass

    # Only compute loss & accuracy if we have the labels
    if is_labelled:
        epoch_loss = float(running_loss / len(dataloader.dataset))
        epoch_top1_acc = float(running_top1_correct.double() / len(dataloader.dataset))
        epoch_top5_acc = float(running_top5_correct.double() / len(dataloader.dataset))
    else:
        epoch_loss = None
        epoch_top1_acc = None
        epoch_top5_acc = None
    
    # Return everything
    return epoch_loss, epoch_top1_acc, gt_labels, predicted_labels  

In [None]:
# Get data on the validation set
# Setting this to false will be a little bit faster
generate_validation_labels = True
val_loss_yours, val_top1_yours, _, val_labels_yours = evaluate(model_yours, dataloaders['val'], criterion, is_labelled = True, generate_labels = generate_validation_labels, k = 1)
# Get predictions for the test set
test_loss_yours, test_top1_yours, _, test_labels_yours = evaluate(model_yours, dataloaders['test'], criterion, is_labelled = True, generate_labels = generate_validation_labels, k = 1)

print("Your Trained model: ")
print("Val Top-1 Accuracy: {}".format(val_top1_yours))
print("Test Top-1 Accuracy: {}".format(test_top1_yours))