<a href="https://colab.research.google.com/github/goromal/FANet_Evaluation/blob/main/fanet_eval_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FANet Evaluation - 6.862 Project



*   David Elatov
*   Dayne Howard
*   Andrew Torgesen



## Setup

### CUDA

1. Go to **Menu > Runtime > Change runtime type** and make sure that GPU is enabled.
2. Run the commands below to ensure that the GPU (and CUDA) is operational.

In [1]:
# Shell escape: query the NVIDIA driver for the attached GPU. The output below
# lists the driver/CUDA versions and the current GPU memory usage.
! nvidia-smi

Tue Apr  6 14:06:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.67       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
# Ask PyTorch whether it can see a CUDA device; as the last expression of the
# cell its value is displayed as the cell output.
torch.cuda.is_available()

True

### Repository

Mount drive, clone repo, navigate to repo, and change working directory to access repo files. **Run ONCE per computing session.**

In [1]:
import os
from google.colab import drive
drive.mount('/content/gdrive')
%cd gdrive/MyDrive/
if not os.path.exists('FANet_Evaluation'):
  print('Repo not present. Cloning...')
  ! git clone https://github.com/goromal/FANet_Evaluation.git
  %cd FANet_Evaluation/
else:
  print('Repo already present. Updating...')
  %cd FANet_Evaluation/
  ! git pull origin main
from model.test_model import *
test()

Mounted at /content/gdrive
/content/gdrive/MyDrive
Repo already present. Updating...
/content/gdrive/MyDrive/FANet_Evaluation
From https://github.com/goromal/FANet_Evaluation
 * branch            main       -> FETCH_HEAD
Already up to date.
SUCCESS


### Python Packages

In [None]:
! pip install oyaml
! pip install torchstat

Collecting oyaml
  Downloading https://files.pythonhosted.org/packages/37/aa/111610d8bf5b1bb7a295a048fc648cec346347a8b0be5881defd2d1b4a52/oyaml-1.0-py2.py3-none-any.whl
Installing collected packages: oyaml
Successfully installed oyaml-1.0
Collecting torchstat
  Downloading https://files.pythonhosted.org/packages/bc/fe/f483b907ca80c90f189cd892bb2ce7b2c256010b30314bbec4fc17d1b5f1/torchstat-0.0.7-py3-none-any.whl
Installing collected packages: torchstat
Successfully installed torchstat-0.0.7


## FANet-18 Initial FPS Testing

In [None]:
import torch
import oyaml as yaml
from torchstat import stat
import time,os

from model.fanet import FANet

In [None]:
# Benchmark configuration
NUM_WARMUP_RUNS = 2    # untimed forward passes executed before the measurement
NUM_TIMED_RUNS = 100   # forward passes that are averaged into the FPS figure

network = FANet(backbone='resnet18')
network.cuda()
network.eval()
t_cnt = 0.0
with torch.no_grad():
  # Random input batch of shape (1, 3, 1024, 2048), created on the GPU.
  # Named `dummy_input` so that the builtin `input` is not shadowed for the
  # rest of the notebook session.
  dummy_input = torch.rand((1,3,1024,2048)).cuda()

  # Warm-up passes, excluded from the timing
  torch.cuda.synchronize()
  for _ in range(NUM_WARMUP_RUNS):
    x = network(dummy_input)

  # Wait for all queued GPU work to finish before starting the clock
  torch.cuda.synchronize()
  # perf_counter is monotonic and high-resolution, unlike the wall clock
  start_ts = time.perf_counter()

  for _ in range(NUM_TIMED_RUNS):
    x = network(dummy_input)

  # ...and again before stopping it, so the timed passes have actually completed
  torch.cuda.synchronize()
  end_ts = time.perf_counter()

  t_cnt = end_ts-start_ts

print('FANet-18 Performance (FPS): %f' % (NUM_TIMED_RUNS/t_cnt))

FANet-18 Performance (FPS): 40.201780


# Evaluation Pipeline Training Decomposition

This section walks through how the evaluation pipeline (the code under `evaluation/`) trains a model, from start to finish.

In [13]:
import os
import sys
import time # for timing

sys.path.insert(0, '/content/gdrive/MyDrive/FANet_Evaluation/evaluation') # so that the evaluation pipeline's internal imports work

import numpy as np # array handling for data loading and metric averaging
import scipy.misc as misc # for image resizing
import tensorflow as tf
from tqdm import tqdm # progress bar visualization

from evaluation.utils.params import get_params
from evaluation.utils.dirs import create_exp_dirs
from evaluation.utils.misc import timeit

In [14]:
# Usable Models
from evaluation.models.dilation_mobilenet import DilationMobileNet # << using this network as an example
# etc...there's like 15 of them

# Metrics for measuring performance (mIoU, etc.)
from evaluation.metrics.metrics import Metrics

In [None]:
# Argument class to instantiate a model
class ModelTrainArgs(object):
    """Plain attribute container passed to the model constructor and read by the training cell.

    All values are hard-coded in `__init__`; edit them here to change the experiment.
    """
    def __init__(self):
        # MODEL ARGS
        self.img_width = 1024
        self.img_height = 512
        self.num_channels = 3 # 3 channels for color images
        # data dir contains pre-processed weights.npy, X_train.npy, Y_train.npy, X_val.npy, Y_val.npy
        # The trailing '/' is required: the loading code in this notebook builds file
        # paths by plain concatenation (`data_dir + "X_train.npy"`).
        self.data_dir = '/content/gdrive/MyDrive/full_cityscapes_res/' # DATA LOCATED IN "My Drive/full_cityscapes_res"
        self.weighted_loss = True
        self.batch_size = 4
        self.learning_rate = 0.0001

        # TRAIN ARGS
        self.data_mode = "experiment"
        self.num_classes = 20 # for CityScapes
        self.test_every = 10 # validation performed every 10 training epochs
        # Upper bound of the epoch loop in the training cell, which reads `args.num_epochs`.
        # TODO(review): placeholder equal to one validation cycle -- set to the real schedule.
        self.num_epochs = 10
        

In [None]:
# Experiment configuration shared by the model and the loops below
args = ModelTrainArgs()

# Clear the default graph before building the model
tf.reset_default_graph()

# Session with on-demand GPU memory allocation and soft device placement
gpu_options = tf.GPUOptions(allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
sess = tf.Session(config=session_config)

# Construct the model under the 'network' variable scope, with `sess` as default session
with sess.as_default(), tf.variable_scope('network') as scope:
    model = DilationMobileNet(args)
    model.build()

# Initialise global and local variables in a single run call
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)

# Placeholders for the training state; the data-loading code further down fills them in
train_data = train_data_len = None
val_data = val_data_len = None
num_iterations_training_per_epoch = num_iterations_validation_per_epoch = None

# Metric accumulator, sized by the number of classes
metrics = Metrics(args.num_classes)

# Training batch generator
def train_generator():
    """Yield `(x_batch, y_batch)` mini-batches for one training epoch.

    Reads the notebook-level `args`, `train_data`, `train_data_len` and
    `num_iterations_training_per_epoch`. The sample indices for the whole epoch
    are drawn up front, uniformly and with replacement, so a sample may appear
    several times (or not at all) within one epoch. At least one batch is always
    yielded; iteration stops once the running offset reaches `train_data_len`.
    """
    batch_size = args.batch_size
    sample_idx = np.random.choice(train_data_len, num_iterations_training_per_epoch * batch_size, replace=True)

    # max(..., 1) keeps the "always yield at least once" behaviour for an empty set
    for offset in range(0, max(train_data_len, 1), batch_size):
        chosen = sample_idx[offset:offset + batch_size]
        yield train_data['X'][chosen], train_data['Y'][chosen]

# Load training and validation data
# File paths are built with os.path.join so that `args.data_dir` works with or
# without a trailing separator (plain concatenation produced e.g.
# ".../full_cityscapes_resX_train.npy").
# NOTE(review): `scipy.misc.imresize` was removed in SciPy 1.3; this code needs an
# older SciPy (with Pillow) or a replacement resize routine -- confirm the runtime.
print("Loading Training data..")
train_data_wrongsize = {'X': np.load(os.path.join(args.data_dir, "X_train.npy")),
                        'Y': np.load(os.path.join(args.data_dir, "Y_train.npy"))}

# Resize every sample to the (height, width) the model is configured for.
# Labels use nearest-neighbour interpolation so that no new label values are blended in.
target_size = (args.img_height, args.img_width)
train_data = {
    'X': np.asarray([misc.imresize(image, target_size) for image in train_data_wrongsize['X']]),
    'Y': np.asarray([misc.imresize(label, target_size, 'nearest') for label in train_data_wrongsize['Y']]),
}
train_data_len = train_data['X'].shape[0]
# Ceiling division: a partial final batch still counts as an iteration
num_iterations_training_per_epoch = (train_data_len + args.batch_size - 1) // args.batch_size
print("Train-shape-x -- " + str(train_data['X'].shape) + " " + str(train_data_len))
print("Train-shape-y -- " + str(train_data['Y'].shape))
print("Num of iterations on training data in one epoch -- " + str(num_iterations_training_per_epoch))
print("Training data is loaded")

print("Loading Validation data..")
val_data = {'X': np.load(os.path.join(args.data_dir, "X_val.npy")),
            'Y': np.load(os.path.join(args.data_dir, "Y_val.npy"))}
# Second reference to the validation labels (previously `self.val_data`, which is
# undefined at notebook scope)
val_data['Y_large'] = val_data['Y']
# Drop the remainder so that validation only ever sees full batches
val_data_len = val_data['X'].shape[0] - val_data['X'].shape[0] % args.batch_size
num_iterations_validation_per_epoch = (val_data_len + args.batch_size - 1) // args.batch_size
print("Val-shape-x -- " + str(val_data['X'].shape) + " " + str(val_data_len))
print("Val-shape-y -- " + str(val_data['Y'].shape))
print("Num of iterations on validation data in one epoch -- " + str(num_iterations_validation_per_epoch))
print("Validation data is loaded")

# Train
# This loop runs at notebook scope, so it uses the module-level `sess`, `model`
# and `args` directly (there is no `self` here, and no separate `test_model`).
print("Training mode will begin NOW ..")

# Resume from the epoch counter stored in the graph.
first_epoch = model.global_epoch_tensor.eval(sess) + 1
# Fall back to a single validation cycle when `args` does not define `num_epochs`.
num_epochs = getattr(args, 'num_epochs', args.test_every)

try:
    for cur_epoch in range(first_epoch, num_epochs + 1):

        # init tqdm and get the epoch value
        tt = tqdm(train_generator(), total=num_iterations_training_per_epoch, desc="epoch-" + str(cur_epoch) + "-")

        # init the current iterations
        cur_iteration = 0

        # init acc and loss lists
        loss_list = []
        acc_list = []

        # loop by the number of iterations
        for x_batch, y_batch in tt:

            # get the cur_it for the global step update
            cur_it = model.global_step_tensor.eval(sess)

            # Feed data into the network
            feed_dict = {model.x_pl: x_batch,
                         model.y_pl: y_batch,
                         model.is_training: True}

            is_last_iteration = cur_iteration >= num_iterations_training_per_epoch - 1

            # The summary tensors are fetched as before but their values are discarded;
            # the last iteration additionally fetches the segmented-image summary.
            fetches = [model.train_op, model.loss, model.accuracy, model.merged_summaries]
            if is_last_iteration:
                fetches.append(model.segmented_summary)
            loss, acc = sess.run(fetches, feed_dict=feed_dict)[1:3]

            # log loss and acc
            loss_list.append(loss)
            acc_list.append(acc)

            # Update the Global step
            model.global_step_assign_op.eval(session=sess, feed_dict={model.global_step_input: cur_it + 1})

            if is_last_iteration:
                total_loss = np.mean(loss_list)
                total_acc = np.mean(acc_list)

                # Update the Cur Epoch tensor
                # it is the last thing because if it is interrupted it repeat this
                model.global_epoch_assign_op.eval(session=sess, feed_dict={model.global_epoch_input: cur_epoch + 1})

                # print in console
                tt.close()
                print("epoch-" + str(cur_epoch) + "-" + "loss:" + str(total_loss) + "-" + " acc:" + str(total_acc)[:6])

                # Break the loop to finalize this epoch
                break

            # update the cur_iteration
            cur_iteration += 1

        # Test the model on validation set
        if cur_epoch % args.test_every == 0:
            step = model.global_step_tensor.eval(sess)
            epoch = model.global_epoch_tensor.eval(sess)
            print("Validation at step:" + str(step) + " at epoch:" + str(epoch) + " ..")

            # init tqdm and get the epoch value
            tt = tqdm(range(num_iterations_validation_per_epoch), total=num_iterations_validation_per_epoch,
                      desc="Val-epoch-" + str(epoch) + "-")

            # init acc and inference-time lists
            acc_list = []
            inf_list = []

            # idx of minibatch
            idx = 0

            # reset metrics
            metrics.reset()

            # get the maximum iou so far to compare with
            max_iou = model.best_iou_tensor.eval(sess)

            # loop by the number of iterations
            for cur_iteration in tt:
                # load minibatches
                x_batch = val_data['X'][idx:idx + args.batch_size]
                y_batch = val_data['Y'][idx:idx + args.batch_size]

                # update idx of minibatch
                idx += args.batch_size

                # Feed this variables to the network
                feed_dict = {model.x_pl: x_batch,
                             model.y_pl: y_batch,
                             model.is_training: False}

                is_last_iteration = cur_iteration >= num_iterations_validation_per_epoch - 1

                # Same extra fetches as before (values discarded): loss + merged summaries
                # on regular iterations, the segmented-image summary on the last one.
                if is_last_iteration:
                    extra_fetches = [model.segmented_summary]
                else:
                    extra_fetches = [model.loss, model.merged_summaries]

                # time only the forward pass
                start = time.time()
                out_argmax, acc = sess.run([model.out_argmax, model.accuracy] + extra_fetches,
                                           feed_dict=feed_dict)[:2]
                end = time.time()

                # log acc and inference time
                acc_list.append(acc)
                inf_list.append(end - start)

                # log metrics
                metrics.update_metrics_batch(out_argmax, y_batch)

                if is_last_iteration:
                    # mean over batches
                    total_acc = np.mean(acc_list)
                    mean_iou = metrics.compute_final_metrics(num_iterations_validation_per_epoch)
                    mean_inference = str(np.mean(inf_list)) + '-seconds'

                    # print in console
                    tt.close()
                    print("Val-epoch-" + str(epoch) + "-" +
                          "acc:" + str(total_acc)[:6] + "-mean_iou:" + str(mean_iou))
                    print("Mean inference time per batch: " + mean_inference)
                    print("Last_max_iou: " + str(max_iou))
                    if mean_iou > max_iou:
                        print("This validation got a new best iou. so we will save this one")
                        # Set the new maximum
                        model.best_iou_assign_op.eval(session=sess, feed_dict={model.best_iou_input: mean_iou})
                    else:
                        print("Hmm, not the best validation epoch :/..")

                    # Break the loop to finalize this validation pass
                    break
finally:
    # Finish session -- also on errors/interrupts, so GPU resources are released
    sess.close()