In [None]:
import os
import cv2
import glob
import copy
import numpy as np
from collections import OrderedDict
import logging

import torch
import torch.nn as nn

from torchvision import transforms
from torch.utils.data import DataLoader
from torch2trt import torch2trt
from onnx2trt import get_engine, allocate_buffers, do_inference

from layers import disp_to_depth
from utils import readlines
import datasets
import networks
import time
from thop import profile, clever_format

import PIL.Image as pil
from PIL import ImageDraw, ImageFont
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.cm as cm
%matplotlib inline

In [None]:
class Net:
    """Bundle one depth network: its modules, raw checkpoints and metadata.

    A ``Net`` either holds an ``encoder``/``decoder`` pair or a single
    ready-made ``model`` (as used for the ``*_trt`` entries created by
    ``Model.set_Net``). ``height``/``width`` are read from the ``"height"``
    and ``"width"`` entries of ``encoder_pth`` when that checkpoint is given.
    """

    def __init__(self, name=None, encoder=None, decoder=None, encoder_pth=None, decoder_pth=None, model=None):
        # Assigned by to_device(); defined up front so the attribute always exists.
        self.device = None
        self.set_net(name, encoder, decoder, encoder_pth, decoder_pth, model)

    def set_net(self, name, encoder=None, decoder=None, encoder_pth=None, decoder_pth=None, model=None):
        """(Re)initialise every field of this container.

        Raises:
            KeyError: if ``encoder_pth`` is given but lacks "height" or "width".
        """
        self.name = name
        self.height = encoder_pth["height"] if encoder_pth is not None else None
        self.width = encoder_pth["width"] if encoder_pth is not None else None
        self.encoder_pth = encoder_pth
        self.decoder_pth = decoder_pth
        self.encoder = encoder
        self.decoder = decoder
        self.model = model

    @staticmethod
    def _warn_if_unset(value, label):
        """Log the class's standard warning when ``value`` is None; return ``value``."""
        # `is None` rather than `== None`: identity is the intended test and it
        # never dispatches to a custom __eq__.
        if value is None:
            logging.warning('Must set {} first'.format(label))
        return value

    def get_encoder(self):
        """Return the encoder module (warns and returns None if unset)."""
        return self._warn_if_unset(self.encoder, 'encoder')

    def get_decoder(self):
        """Return the decoder module (warns and returns None if unset)."""
        return self._warn_if_unset(self.decoder, 'decoder')

    def get_height(self):
        """Return the input height from the encoder checkpoint (warns if unset)."""
        return self._warn_if_unset(self.height, 'height')

    def get_width(self):
        """Return the input width from the encoder checkpoint (warns if unset)."""
        return self._warn_if_unset(self.width, 'width')

    def get_name(self):
        """Return the model name (warns and returns None if unset)."""
        return self._warn_if_unset(self.name, 'name')

    def get_model(self):
        """Return the end-to-end model.

        An explicitly supplied ``model`` takes precedence. Otherwise a new
        ``Depth`` wrapper is built around the encoder and decoder.
        """
        if self.model is not None:
            return self.model
        if self.encoder is None or self.decoder is None:
            logging.warning('Must set model first')
        # NOTE(review): `Depth` is not defined in this cell; it is expected to
        # exist in the notebook namespace before this method is called.
        return Depth(self.encoder, self.decoder)

    def _modules(self):
        """Return the modules that are actually set (encoder, decoder, model)."""
        return [m for m in (self.encoder, self.decoder, self.model) if m is not None]

    def eval(self):
        """Switch every module that is set to evaluation mode."""
        for module in self._modules():
            module.eval()

    def to_device(self, no_cuda=False):
        """Pick CUDA when available (unless ``no_cuda``) and move all set modules there."""
        if torch.cuda.is_available() and not no_cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        for module in self._modules():
            module.to(self.device)

In [None]:
class Model:
    def __init__(self):
        self.encoder_dict = {}
        self.decoder_dict = {}
        self.is_set_Net = False
        self.splits_dir = os.path.join(os.path.expanduser("~"), "depth", "monodepth2", "splits")
        self.data_path = "/work/garin0115/datasets/kitti_data/"
        # Models which were trained with stereo supervision were trained with a nominal
        # baseline of 0.1 units. The KITTI rig has a baseline of 54cm. Therefore,
        # to convert our stereo predictions to real-world scale we multiply our depths by 5.4.
        self.STEREO_SCALE_FACTOR = 5.4
        self.eval_split = "eigen"
        self.MIN_DEPTH = 1e-3
        self.MAX_DEPTH = 80
        self.disable_median_scaling = False
        self.pred_depth_scale_factor = 1
        self.CMAP = 'plasma'
        self.side_map = {"2": 2, "3": 3, "l": 2, "r": 3}
        
        self.no_cuda = False
        self.ext = "jpg"
        self.split_folder = os.path.join(os.path.expanduser("~"), "depth", "monodepth2", "splits", self.eval_split)
        if torch.cuda.is_available() and not self.no_cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
    
    def get_model_dict(self, encoder_dict, decoder_dict):
        self.encoder_dict = encoder_dict
        self.decoder_dict = decoder_dict
    

            
    
    def load_model(self, name, is_torch2trt=False):
        assert self.encoder_dict, "Must load encoder dict first"
        assert self.decoder_dict, "Must load decoder dict first"
        
        is_finetune = True if name.split("_")[-1] == "finetune" else False
        if is_finetune:
            load_weights_folder = self.get_modelPath(name[:-9], is_finetune)
        else:
            load_weights_folder = self.get_modelPath(name)
        
        encoder_path = os.path.join(load_weights_folder, "encoder.pth")
        decoder_path = os.path.join(load_weights_folder, "depth.pth")
        encoder_pth = torch.load(encoder_path)
        decoder_pth = torch.load(decoder_path)
        
        encoder = self.encoder_dict[name.split("_")[0]]
        decoder = self.decoder_dict[name]
        encoder.load_state_dict({k: v for k, v in encoder_pth.items() if k in encoder.state_dict()})
        decoder.load_state_dict(decoder_pth)
        if is_torch2trt == True:
            x = torch.ones((1, 3, 256, 832)).to(self.device)
            encoder = torch2trt(encoder.to(self.device), [x]).cpu()
        
        return encoder, decoder, encoder_pth, decoder_pth
    
        
    
    def get_modelPath(self, name, is_finetune=False):
        load_weights_folder = os.path.join("/work", "garin0115", "models", name+"_256x832", "models")
        if not os.path.isdir(load_weights_folder):
            load_weights_folder = os.path.join(os.path.expanduser("~"), 
                                               "depth", 
                                               "monodepth2",
                                               "models", 
                                               name+"_256x832", 
                                               "models")
        
        assert os.path.isdir(load_weights_folder), "Cannot find a folder at {}".format(load_weights_folder)

        print("[info] Loading weights from {}".format(load_weights_folder))
            
        if is_finetune:
            load_weights_folder = os.path.join(load_weights_folder, "weights_29")
        elif name == "resnet18_oneLayer":
            load_weights_folder = os.path.join(load_weights_folder, "weights_18")
        else:
            load_weights_folder = os.path.join(load_weights_folder, "weights_19")
        
        return load_weights_folder
    
    def get_dataLoader(self, height, width):
        filenames  = readlines(os.path.join(self.splits_dir, self.eval_split, "test_files.txt"))
        dataset    = datasets.KITTIRAWDataset(data_path =self.data_path, 
                                           filenames =filenames,
                                           height    =height, 
                                           width     =width,
                                           frame_idxs=[0], 
                                           num_scales=4, 
                                           is_train  =False)
        dataLoader = DataLoader(dataset    =dataset,
                                batch_size =16,
                                shuffle    =False,
                                num_workers=16,
                                pin_memory =True,
                                drop_last  =False)
        return dataLoader
    
    def batch_evaluate_depth(self, save_CSV=False, is_torch2trt=False):
        if self.is_set_Net == False:
            self.is_set_Net = True
            self.set_Net()
            
        results = []
        
        for net in self.nets:
            disps, time_min, time_avg = self.evaluate_depth(net)
            result = self.calculate_metric(net.get_name(), disps, time_min, time_avg)
            results.append(result)
            
            disps, time_min, time_avg = self.evaluate_onnx_depth(net.get_name(), fp16_mode=False)
            result = self.calculate_metric(net.get_name()+"_trt32", disps, time_min, time_avg)
            results.append(result)
            
            disps, time_min, time_avg = self.evaluate_onnx_depth(net.get_name(), fp16_mode=True)
            result = self.calculate_metric(net.get_name()+"_trt16", disps, time_min, time_avg)
            results.append(result)
        
        if save_CSV:
            import csv
            # 開啟輸出的 CSV 檔案
            with open('result.csv', 'w', newline='') as csvfile:
                # 建立 CSV 檔寫入器
                writer = csv.writer(csvfile)

                # 寫入一列資料
#                 writer.writerow(['Model', 'Height', 'Width', "abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3", 
#                               'Best FPS', 'Avg FPS', 'Parameters', 'params_enc', 'params_dec', 'FLOPs', 'fl_enc', 'fl_dec'])
                writer.writerow(['Model', 'Height', 'Width', "abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3", 
                              'Best FPS', 'Avg FPS'])

                # 寫入另外幾列資料
                for res in results:
                    writer.writerow(res)
            
    def calculate_metric(self, name, pred_disps, time_min, time_avg):
        """Score predicted disparities against the split's ground truth and print a report.

        Args:
            name: label used in the printed report and as first result column.
            pred_disps: array of predicted disparity maps, indexed along axis 0
                in the same order as the ground-truth file.
            time_min, time_avg: timings in seconds; their reciprocals are
                reported as best/average FPS.

        Returns:
            ``[name, 256, 832, abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3,
            1/time_min, 1/time_avg]``.
        """
        # NOTE: allow_pickle=True executes pickled objects; only load trusted files.
        gt_file = os.path.join(self.splits_dir, self.eval_split, "gt_depths.npz")
        gt_depths = np.load(gt_file, fix_imports=True, encoding='latin1', allow_pickle=True)["data"]

        use_median_scaling = not self.disable_median_scaling
        errors, ratios = [], []

        for idx, pred_disp in enumerate(pred_disps):
            gt_depth = gt_depths[idx]
            gt_height, gt_width = gt_depth.shape[:2]

            # Bring the prediction to ground-truth resolution, then invert to depth.
            pred_depth = 1 / cv2.resize(pred_disp, (gt_width, gt_height))

            if self.eval_split == "eigen":
                in_range = np.logical_and(gt_depth > self.MIN_DEPTH, gt_depth < self.MAX_DEPTH)

                # Fixed fractional crop window applied for the "eigen" split.
                top, bottom, left, right = np.array(
                    [0.40810811 * gt_height, 0.99189189 * gt_height,
                     0.03594771 * gt_width,  0.96405229 * gt_width]).astype(np.int32)
                crop_mask = np.zeros(in_range.shape)
                crop_mask[top:bottom, left:right] = 1
                mask = np.logical_and(in_range, crop_mask)
            else:
                mask = gt_depth > 0

            pred_depth = pred_depth[mask]
            gt_depth = gt_depth[mask]

            pred_depth *= self.pred_depth_scale_factor
            if use_median_scaling:
                ratio = np.median(gt_depth) / np.median(pred_depth)
                ratios.append(ratio)
                pred_depth *= ratio

            # Clamp predictions into the evaluated depth range.
            pred_depth[pred_depth < self.MIN_DEPTH] = self.MIN_DEPTH
            pred_depth[pred_depth > self.MAX_DEPTH] = self.MAX_DEPTH

            errors.append(self.compute_errors(gt_depth, pred_depth))

        if use_median_scaling:
            ratios = np.array(ratios)
            med = np.median(ratios)
            print(" Scaling ratios | med: {:0.3f} | std: {:0.3f}".format(med, np.std(ratios / med)))

        mean_errors = np.array(errors).mean(0)
        print("[info] {}".format(name))
        best_fps = 1/time_min
        print(" best FPS: ", best_fps)
        avg_fps = 1/time_avg
        print(" avg FPS: ", avg_fps)
        print("\n  " + ("{:>8} | " * 7).format("abs_rel", "sq_rel", "rmse", "rmse_log", "a1", "a2", "a3"))
        print(("&{: 8.3f}  " * 7).format(*mean_errors.tolist()) + "\\\\")
        print("\n-> Done!")

        # Parameter/FLOP profiling (thop) is not performed here, so the row
        # ends with the two FPS figures. Height and width are fixed at 256x832.
        return [name, 256, 832] + list(mean_errors) + [best_fps, avg_fps]
        

    
    def evaluate_depth(self, net):
        dataLoader = self.get_dataLoader(net.get_height(), net.get_width())
        print("[info] Model {}".format(net.get_name()))
        
        model = net.get_model()
        model.to(self.device)
        model.eval()
        
        pred_disps = []

        print("[info] Computing predictions with size {}x{}".format(
            net.get_width(), net.get_height()))
        
        time_min = float('inf')
        time_avg = 0
        avg_FPS = 0
        with torch.no_grad():
            for i in range(5): #跑十次算FPS
                for data in dataLoader:
                    input_color = data[("color", 0, 0)].cuda()

                    start_time = time.time()
                    output = model(input_color)
                    total_time = time.time() - start_time
                    pred_disp, _ = disp_to_depth(output[("disp", 0)], self.MIN_DEPTH, self.MAX_DEPTH)
                    pred_disp = pred_disp[:, 0].cpu().numpy()
            #                 pred_disp = pred_disp[:, 0].numpy()

                    if i == 0:
                        pred_disps.append(pred_disp)
                    time_avg += total_time
                    if total_time < time_min:
                        time_min = total_time

                time_avg /= len(dataLoader)
                avg_FPS += time_avg
            time_avg = avg_FPS / 10
            
        pred_disps = np.concatenate(pred_disps)
        
        return pred_disps, time_min, time_avg
    
    def evaluate_onnx_depth(self, name, fp16_mode=True):
        dataLoader = self.get_dataLoader(256, 832)
        print("[info] Model {}".format(name))
        

        pred_disps = []

        print("[info] Computing predictions with size {}x{}".format(
            256, 832))
        
        
        
        onnx_path = os.path.join("/work", 
                                  "garin0115", 
                                  "models", 
                                  name+"_256x832", 
                                  "models", 
                                  "weights_19", 
                                  name+".onnx")
        if fp16_mode:
            engine_path = os.path.join("/work", 
                                      "garin0115", 
                                      "models",
                                      "trt16_models",
                                      name+".trt")
        else:
            engine_path = os.path.join("/work", 
                                      "garin0115", 
                                      "models",
                                      "trt_models",
                                      name+".trt")
        #engine
        engine = get_engine(fp16_mode=False, onnx_file_path=onnx_path, engine_file_path=engine_path, save_engine=False)
        # Create the context for this engine
        context = engine.create_execution_context()
        # Allocate buffers for input and output
        inputs, outputs, bindings, stream = allocate_buffers(engine)
        time_min = float('inf')
        time_avg = 0
        avg_FPS = 0
        
        for data in dataLoader:
            input_images = data[("color", 0, 0)].numpy()
            batch_pred_disp = []
            total_time = 0
            for input_image in input_images:
                input_image = np.expand_dims(input_image, axis=0).reshape(-1)
                inputs[0].host = input_image
                start_time = time.time()
                trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
                end_time = time.time() - start_time
                total_time += end_time
                pred_disp, _ = disp_to_depth(trt_outputs[-1], self.MIN_DEPTH, self.MAX_DEPTH)
                pred_disp = pred_disp.reshape(1, 256, 832)
                batch_pred_disp.append(pred_disp)
                
                if end_time < time_min:
                    time_min = end_time
            total_time /= len(input_images)
            time_avg += total_time
                
            pred_disps.append(np.concatenate(batch_pred_disp, axis=0))
            
        time_avg /= len(dataLoader)
       
        
        
        pred_disps = np.concatenate(pred_disps)
        
        return pred_disps, time_min, time_avg
        

    
    def set_Net(self, is_torch2trt=False):
        """Build ``self.nets`` with one ``Net`` per entry of ``self.decoder_dict``.

        For every decoder name the weights are loaded through ``load_model``
        and deep copies of the modules and checkpoints are stored in a new
        ``Net``. When ``is_torch2trt`` is true, a second ``Net`` named
        ``<name>_trt`` is appended that carries only the checkpoints and the
        converted module in its ``model`` slot (its encoder/decoder stay None).

        Args:
            is_torch2trt: also create the ``<name>_trt`` entries.
        """
        # Any previously built list is discarded.
        self.nets = []
        for name in self.decoder_dict:
            print("[Info] Deal with {} model\n".format(name))
            net = Net()
            encoder, decoder, encoder_pth, decoder_pth = self.load_model(name)
            # Deep copies keep this Net independent of the module objects
            # returned by load_model().
            net.set_net(name, 
                        copy.deepcopy(encoder), 
                        copy.deepcopy(decoder), 
                        copy.deepcopy(encoder_pth), 
                        copy.deepcopy(decoder_pth))
            self.nets.append(net)
            print()
            if is_torch2trt:
                print("[Info] Deal with {} to trt model\n".format(name))
                # The dummy input is created without an explicit device.
                inputs = torch.ones((1, 3, net.get_height(), net.get_width()))#.to(self.device)
                # NOTE(review): `model` is assigned but never used below.
                model = net.get_model()
                #model.to(self.device)
                
                print("### Converting model to trt")
                # NOTE(review): only `encoder` (the object returned by
                # load_model, not the deep copy) is converted, yet the result
                # is stored as the whole "model" - confirm this is intended.
                model_trt = torch2trt(encoder, [inputs])
                print("### Convert complete")
                
                #model_trt.cpu()
                
        
                self.nets.append(Net(name+"_trt", 
                                 encoder_pth=copy.deepcopy(encoder_pth), 
                                 decoder_pth=copy.deepcopy(decoder_pth), 
                                 model=model_trt))
                print()
    
        
    
    def inference_depth(self, column=2, is_torch2trt=False, is_onnx=False):
        lines = readlines(os.path.join(self.split_folder, "test_files.txt"))
        if self.is_set_Net == False:
            self.is_set_Net = True
            self.set_Net(is_torch2trt)
        with torch.no_grad():
            for i in np.random.choice(len(lines), 10, replace=False):
                folder, frame_id, side = lines[i].split()
                frame_id = int(frame_id)  
                image_path = os.path.join(self.data_path, folder, 
                                          "image_0{}".format(self.side_map[side]), 
                                          "data", 
                                          "{:010d}.jpg".format(frame_id))
                input_image = pil.open(image_path).convert('RGB')
                original_width, original_height = input_image.size
                
                
                result = OrderedDict()
                result["Input"] = input_image
#                 result["Mask"] = self.seg_img(input_image)
                
                for net in self.nets:
                    net.to_device()
                    net.eval()
                    input_image = pil.open(image_path).convert('RGB')
                    input_image_resized = input_image.resize((net.get_width(), net.get_height()), pil.LANCZOS)
                    input_image_torch = transforms.ToTensor()(input_image_resized).unsqueeze(0)
                    input_image_torch = input_image_torch.to(self.device)
                    
                    features = net.get_encoder()(input_image_torch)
                    outputs = net.get_decoder()(*tuple(features))
                    
                    disp = outputs[("disp", 0)]
                    disp_resized = torch.nn.functional.interpolate(
                            disp, (original_height, original_width), mode="bilinear", align_corners=False)
                    disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                    result["{}".format(net.get_name())] = disp_resized_np
                    
                    if is_onnx:
                        name = net.get_name()
                        onnx_path = os.path.join("/work", 
                                      "garin0115", 
                                      "models", 
                                      name+"_256x832", 
                                      "models", 
                                      "weights_19", 
                                      name+".onnx")
                        engine16_path = os.path.join("/work", 
                                                  "garin0115", 
                                                  "models",
                                                  "trt16_models",
                                                  name+".trt")

                        engine_path = os.path.join("/work", 
                                                  "garin0115", 
                                                  "models",
                                                  "trt_models",
                                                  name+".trt")
                        
                        input_image = input_image.resize((832, 256), pil.LANCZOS)
                        input_image = np.array(input_image).transpose((2, 0, 1)).astype(np.float32) / 255.
                        print(input_image.shape)
                        input_image = np.expand_dims(input_image, axis=0).reshape(-1)


                        #engine16
                        engine16 = get_engine(fp16_mode=True, onnx_file_path=onnx_path, engine_file_path=engine16_path, save_engine=False)
                        # Create the context for this engine
                        context16 = engine16.create_execution_context()
                        # Allocate buffers for input and output
                        inputs16, outputs16, bindings16, stream16 = allocate_buffers(engine16)
                        inputs16[0].host = input_image
                        trt_outputs16 = do_inference(context16, bindings=bindings16, inputs=inputs16, outputs=outputs16, stream=stream16) # numpy data

                        #engine
                        engine = get_engine(fp16_mode=False, onnx_file_path=onnx_path, engine_file_path=engine_path, save_engine=False)
                        # Create the context for this engine
                        context = engine.create_execution_context()
                        # Allocate buffers for input and output
                        inputs, outputs, bindings, stream = allocate_buffers(engine)
                        inputs[0].host = input_image
                        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data

                        result["{}".format(net.get_name()+"_trt32")] = trt_outputs[-1].reshape((net.get_height(), net.get_width()))
                        result["{}".format(net.get_name()+"_trt16")] = trt_outputs16[-1].reshape((net.get_height(), net.get_width()))
                
                self.quick_show(result, column=column)
    
    def inference_segment_sky(self):
        """Display 10 random test images next to their ``seg_img`` masks.

        The images and masks are collected as ``Input_<i>`` / ``Mask_<i>``
        pairs and drawn in a four-column grid through ``quick_show``.
        """
        lines = readlines(os.path.join(self.split_folder, "test_files.txt"))
        result = OrderedDict()

        for i in np.random.choice(len(lines), 10, replace=False):
            folder, frame_id, side = lines[i].split()
            camera_dir = "image_0{}".format(self.side_map[side])
            frame_file = "{:010d}.jpg".format(int(frame_id))
            image_path = os.path.join(self.data_path, folder, camera_dir, "data", frame_file)

            input_image = pil.open(image_path).convert('RGB')
            result["Input_{}".format(i)] = input_image
            result["Mask_{}".format(i)] = self.seg_img(input_image)

        self.quick_show(result, column=4)
            
    
    def seg_img(self, image):
        """Return a binary mask for ``image`` built from thresholding and edges.

        An inverted Otsu threshold and Canny edges are each dilated, OR-ed
        together and dilated again; the mask is the inverse of that union.
        Everything from one third of the image height downwards is set to 0.

        Args:
            image: RGB image convertible with ``np.asarray``.

        Returns:
            Single-channel array where 255 marks the selected region.
        """
        dilate_args = dict(anchor=(-1,-1), iterations=8)

        bgr = cv2.cvtColor(np.asarray(image),cv2.COLOR_RGB2BGR)
        gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3,3))

        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
        thresh_dilation = cv2.dilate(thresh, kernel, **dilate_args)

        edges_dilation = cv2.dilate(cv2.Canny(gray, 1, 100), kernel, **dilate_args)

        occupied = cv2.dilate(thresh_dilation | edges_dilation, kernel, **dilate_args)
        segImg = 255 - occupied
        # Only the top third of the frame may stay selected.
        cutoff_row = bgr.shape[0]//3
        segImg[cutoff_row:, :] = 0

        return segImg
                
    def evaluate_pose(self):
        pass            
    
    def inference_pose(self):
        pass
    
    def quick_show(self, result, column=2):
        """Draw every entry of ``result`` as one panel of a matplotlib grid.

        Keys whose first ``"_"``-separated token is ``Input`` are shown as
        plain images (axes left on), ``Mask`` keys in grayscale, and all
        other entries with ``self.CMAP`` clipped at their 95th percentile.

        Args:
            result: ordered mapping of panel title -> image/array.
            column: number of grid columns.
        """
        # Ceil-divide the panel count by the column count.
        row, leftover = divmod(len(result), column)
        if leftover > 0:
            row += 1

        # Titles of these two models are highlighted.
        title_colors = {"resnet18_simplify2my3": "red", "resnet18_skip2Conv": "blue"}

        plt.figure(figsize=(column*3*3, row*1*3+1))
        for idx, key in enumerate(result):
            panel = result[key]
            kind = key.split("_")[0]
            plt.subplot(row, column, idx+1)

            if kind == "Input":
                plt.imshow(panel)
                plt.title(key, fontsize=22)
                continue

            if kind == "Mask":
                plt.imshow(panel, cmap="gray")
            else:
                plt.imshow(panel, cmap=self.CMAP, vmax=np.percentile(panel, 95))

            if key in title_colors:
                plt.title(key, fontsize=22, color=title_colors[key])
            else:
                plt.title(key, fontsize=22)
            plt.axis("off")
        plt.tight_layout(pad=0.5, w_pad=0.1, h_pad=0.1)
        
    def make_grid(self, result, column=2):
        pass
    
    def make_vedio(self, file_name, video_output_folder, column=2):
        """Write an XVID ``.avi`` tiling each input frame with per-model disparity maps.

        Frames are read from
        ``/work/garin0115/datasets/kitti_data/<file_name>/image_02/*/*.jpg``
        in sorted order. Every video frame is a grid whose first tile is the
        input image, followed by one colour-mapped disparity tile per net;
        unused grid cells are black.

        Args:
            file_name: sequence path relative to the KITTI data root; its last
                component names the output file ``disp_<last>.avi``.
            video_output_folder: existing folder receiving the video.
            column: number of tiles per grid row.

        NOTE(review): per-frame disparities and FPS strings are read from a
        notebook-level mapping ``show`` (``show[name][idx]`` and
        ``show["<name>_FPS"][idx]``) that is not defined in this class; it
        must exist before this method is called.
        """
        # Collect all image file paths of the sequence, in sorted order.
        kitti_depth_folder = '/work/garin0115/datasets/kitti_data/'+file_name+'/image_02'
        filenames = sorted(glob.glob(kitti_depth_folder+'/*/*.jpg'))
        num_images = len(filenames)
        print("Total images: {}".format(num_images))
        if num_images == 0:
            # Nothing to encode; avoid creating an empty video file.
            return

        if not self.is_set_Net:
            # Raise the flag only after loading succeeded.
            self.set_Net()
            self.is_set_Net = True

        # The tile size comes from the first frame (these names were undefined).
        with pil.open(filenames[0]) as first_frame:
            original_width, original_height = first_frame.size
        tile_size = (original_width, original_height)

        num_column = column
        # The input image occupies a tile too, so count it when sizing the grid.
        num_tiles = len(self.nets) + 1
        num_row = num_tiles // num_column if num_tiles % num_column == 0 else num_tiles // num_column + 1
        blank_tile = np.zeros((original_height, original_width, 3), dtype=np.uint8)

        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        out = cv2.VideoWriter(video_output_folder+'/disp_{}.avi'.format(file_name.split('/')[-1]),
                              fourcc,
                              15.0,
                              (original_width*num_column, original_height*num_row))

        try:
            for idx, frame_path in enumerate(filenames):
                # Read the frame that belongs to this index (previously an
                # undefined `img` was opened for every frame).
                input_image = np.array(pil.open(frame_path).convert('RGB'))
                if (input_image.shape[1], input_image.shape[0]) != tile_size:
                    input_image = cv2.resize(input_image, tile_size)
                cv2.putText(input_image, "Input", (10, 40), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (255, 255, 255), 2, cv2.LINE_AA)
                # RGB -> BGR, the channel order OpenCV writes.
                res = [input_image[:, :, ::-1]]

                for net in self.nets:
                    name = net.get_name()
                    disp = show[name][idx]
                    vmax = np.percentile(disp, 95)
                    normalizer = mpl.colors.Normalize(vmin=disp.min(), vmax=vmax)
                    mapper = cm.ScalarMappable(norm=normalizer, cmap=self.CMAP)
                    colormapped_im = (mapper.to_rgba(disp)[:, :, :3] * 255).astype(np.uint8)
                    if (colormapped_im.shape[1], colormapped_im.shape[0]) != tile_size:
                        # All tiles must share one size for hstack/vstack.
                        colormapped_im = cv2.resize(colormapped_im, tile_size)
                    cv2.putText(colormapped_im, name, (10, 40), cv2.FONT_HERSHEY_TRIPLEX, 1.2, (255, 255, 255), 2, cv2.LINE_AA)
                    cv2.putText(colormapped_im, "FPS "+str(show["{}_FPS".format(name)][idx]), (10, 100), cv2.FONT_HERSHEY_TRIPLEX, 1.5, (0, 255, 0), 2, cv2.LINE_AA)
                    res.append(colormapped_im[:, :, ::-1])

                # Pad the last row so that every row has `num_column` tiles.
                res.extend([blank_tile] * (num_row * num_column - len(res)))

                # Slice by the row index (the original sliced with num_row and
                # therefore always produced an empty row).
                rows = [np.hstack(res[r * num_column: (r + 1) * num_column]) for r in range(num_row)]
                out.write(np.ascontiguousarray(np.vstack(rows)))
        finally:
            # Finalise the file even if a frame fails.
            out.release()
        
        
    
    def calc_param(self, net):
        net_params = filter(lambda p: p.requires_grad, net.parameters())
        weight_count = 0
        for param in net_params:
            weight_count += np.prod(param.size())
        return weight_count
    
    def compute_errors(self, gt, pred):
        """Computation of error metrics between predicted and ground truth depths
        """
        thresh = np.maximum((gt / pred), (pred / gt))
        a1 = (thresh < 1.25     ).mean()
        a2 = (thresh < 1.25 ** 2).mean()
        a3 = (thresh < 1.25 ** 3).mean()

        rmse = (gt - pred) ** 2
        rmse = np.sqrt(rmse.mean())

        rmse_log = (np.log(gt) - np.log(pred)) ** 2
        rmse_log = np.sqrt(rmse_log.mean())

        abs_rel = np.mean(np.abs(gt - pred) / gt)

        sq_rel = np.mean(((gt - pred) ** 2) / gt)

        return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3
    
    def batch_post_process_disparity(self, l_disp, r_disp):
        """Apply the disparity post-processing method as introduced in Monodepthv1
        """
        _, h, w = l_disp.shape
        m_disp = 0.5 * (l_disp + r_disp)
        l, _ = np.meshgrid(np.linspace(0, 1, w), np.linspace(0, 1, h))
        l_mask = (1.0 - np.clip(20 * (l - 0.05), 0, 1))[None, ...]
        r_mask = l_mask[:, :, ::-1]
        return r_mask * l_disp + l_mask * r_disp + (1.0 - l_mask - r_mask) * m_disp
    
    #save to ONNX model
    def save_ONNX(self):
        if self.is_set_Net == False:
            self.is_set_Net = True
            self.set_Net()
            
        for j in range(len(self.nets)):
            self.nets[j].to_device()
            self.nets[j].eval()
            x = torch.randn(1, 3, self.nets[j].get_height(), self.nets[j].get_width(), requires_grad=True).to(self.device)
            path = self.get_modelPath(self.nets[j].get_name())
            encoder = self.nets[j].get_encoder()
            decoder = self.nets[j].get_decoder()
            depth_model = Depth(encoder, decoder, output_list=True)
            
            # Export the model
            torch.onnx.export(depth_model,               # model being run
                              x,                         # model input (or a tuple for multiple inputs)
                              path+"/"+self.nets[j].get_name()+".onnx",   # where to save the model (can be a file or file-like object)
                              export_params=True,        # store the trained parameter weights inside the model file
                              opset_version=10,          # the ONNX version to export the model to
                              verbose=True,
                              do_constant_folding=True,  # whether to execute constant folding for optimization
                              input_names = ['input'],   # the model's input names
                              output_names = ['output']) # the model's output names


# ONNX → TensorRT: build engines and benchmark FPS

In [None]:
# Build (and save) a TensorRT engine for every decoder entry and report its
# throughput on one test image.
# NOTE: `decoder_dict` is defined in the "Dict" cell further down; that cell
# has to be run before this one.
fp16_mode = True
num_runs = 100

models_root = os.path.join("/work", "garin0115", "models")
engine_dir = "trt16_models" if fp16_mode else "trt_models"

# The test image is the same for every model, so preprocess it once:
# resize to 832x256, convert to CHW float32 in [0, 1] and add a batch axis.
image_path = "assets/test_image.jpg"
input_image = pil.open(image_path).convert('RGB').resize((832, 256), pil.LANCZOS)
input_image = np.array(input_image).transpose((2, 0, 1)).astype(np.float32) / 255.
input_image = np.expand_dims(input_image, axis=0)

print("Model Name           FPS")
for name in decoder_dict:
    # resnet18_oneLayer keeps its ONNX file in weights_18, all others in weights_19.
    weights_dir = "weights_18" if name == "resnet18_oneLayer" else "weights_19"
    onnx_path = os.path.join(models_root, name+"_256x832", "models", weights_dir, name+".onnx")
    engine_path = os.path.join(models_root, engine_dir, name+".trt")

    engine = get_engine(fp16_mode=fp16_mode, onnx_file_path=onnx_path, engine_file_path=engine_path, save_engine=True)

    # Create the context for this engine
    context = engine.create_execution_context()

    # Allocate buffers for input and output
    inputs, outputs, bindings, stream = allocate_buffers(engine) # input, output: host # bindings

    # Load data to the buffer
    inputs[0].host = input_image.reshape(-1)

    # inputs[1].host = ... for multiple input
    t1 = time.time()
    for i in range(num_runs):
        trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data
    t2 = time.time()

    print("{}       {}".format(name, num_runs/(t2-t1)))

# Encoder and decoder dictionaries

In [None]:
# Encoders keyed by backbone name.
encoder_dict = {"resnet18": networks.ResnetEncoder(18, False)}

# Every decoder below is constructed from the same encoder channel description.
_resnet18_ch = encoder_dict["resnet18"].num_ch_enc

# Decoders keyed by model name. Only the uncommented entries are active;
# the commented ones are kept as a catalogue of variants that can be re-enabled.
decoder_dict = {
    # --- MYDecoder variants (all disabled) ---
    # "resnet18_my3": networks.MYDecoder(_resnet18_ch),
    # "resnet18_my3_smooth": networks.MYDecoder(_resnet18_ch),
    # "resnet18_my3_concatDepth": networks.MYDecoder(_resnet18_ch, concatDepth=True),
    # "resnet18_my3_skipSky": networks.MYDecoder(_resnet18_ch),
    # "resnet18_my3_firstConv": networks.MYDecoder(_resnet18_ch, firstConv=True),
    # "resnet18_my3_firstConv_skipSky": networks.MYDecoder(_resnet18_ch, firstConv=True),
    # "resnet18_my3_firstConv_skipSky_conv11": networks.MYDecoder(_resnet18_ch, firstConv=True, conv11=True),
    # "resnet18_my3_skyLoss": networks.MYDecoder(_resnet18_ch),
    # "resnet18_my3_skipSky_skyLoss": networks.MYDecoder(_resnet18_ch),
    # "resnet18_my3_nomask": networks.MYDecoder(_resnet18_ch, kernel_size=35),
    # "resnet18_my3_skipSky_finetune": networks.MYDecoder(_resnet18_ch),
    # "resnet18_my3_finetune": networks.MYDecoder(_resnet18_ch),

    # --- DepthDecoder, default configuration ---
    "resnet18": networks.DepthDecoder(_resnet18_ch),
    # "resnet18_skyLoss": networks.DepthDecoder(_resnet18_ch),
    # "resnet18_skipSky_skyLoss": networks.DepthDecoder(_resnet18_ch),

    # --- DepthDecoder, pw=True ---
    # "resnet18_pw": networks.DepthDecoder(_resnet18_ch, pw=True),

    # --- DepthDecoder, skipFirstConv=True ---
    # "resnet18_skipFirstConv": networks.DepthDecoder(_resnet18_ch, skipFirstConv=True),
    # "resnet18_skipFirstConv_skipSky": networks.DepthDecoder(_resnet18_ch, skipFirstConv=True),

    # --- DepthDecoder, skip2Conv=True ---
    "resnet18_skip2Conv": networks.DepthDecoder(_resnet18_ch, skip2Conv=True),
    # "resnet18_skip2Conv_skipSky": networks.DepthDecoder(_resnet18_ch, skip2Conv=True),
    # "resnet18_skip2Conv_skyLoss": networks.DepthDecoder(_resnet18_ch, skip2Conv=True),

    # --- DepthDecoder, oneLayer=True ---
    # "resnet18_oneLayer": networks.DepthDecoder(_resnet18_ch, oneLayer=True),

    # --- DepthDecoder, pw=True and oneLayer=True ---
    "resnet18_simplify2my3": networks.DepthDecoder(_resnet18_ch, pw=True, oneLayer=True),
    # "resnet18_simplify2my3_skyLoss": networks.DepthDecoder(_resnet18_ch, pw=True, oneLayer=True),
    # "resnet18_simplify2my3_skipSky": networks.DepthDecoder(_resnet18_ch, pw=True, oneLayer=True),
    # "resnet18_simplify2my3_skipSky_skyLoss": networks.DepthDecoder(_resnet18_ch, pw=True, oneLayer=True),
}

# Ensemble to one ONNX model

In [None]:
class Depth(nn.Module):
    """Run an encoder and a decoder back to back as a single module.

    The encoder result is unpacked into positional arguments for the decoder.
    When ``output_list`` is true, the decoder's mapping result is returned as a
    plain list of its values (in the mapping's iteration order); this is the
    mode the ONNX export code in this notebook uses. Otherwise the decoder
    result is returned unchanged.
    """

    def __init__(self, encoder, decoder, output_list=False):
        super().__init__()
        self.output_list = output_list
        # Submodules are registered encoder first, then decoder.
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inputs):
        """Return the decoder output for ``inputs``, optionally flattened to a list."""
        features = self.encoder(inputs)
        decoded = self.decoder(*features)
        if not self.output_list:
            return decoded
        # Drop the keys and keep the values in iteration order.
        return [value for _, value in decoded.items()]




In [None]:
# Build the model registry from the encoder/decoder dictionaries defined above.
Models = Model()
Models.get_model_dict(encoder_dict, decoder_dict)

# Turn off OpenCV's internal threading; the original author measured a ~5x
# evaluation speed-up from this on their unix systems (OpenCV 3.3.1).
cv2.setNumThreads(0)

# Manual switch: set to True to run Models.save_ONNX() when this cell executes.
EXPORT_ONNX = False
if EXPORT_ONNX:
    Models.save_ONNX()

In [None]:
# Run Models.batch_evaluate_depth (defined outside this cell) with CSV saving
# requested and the torch2trt path switched off.
Models.batch_evaluate_depth(save_CSV=True, is_torch2trt=False)

In [None]:
# Run Models.inference_depth with column=2, torch2trt off and the ONNX path on.
# NOTE(review): the exact meaning of `column` is defined by Model.inference_depth,
# which is not visible here — confirm against the class.
Models.inference_depth(column=2, is_torch2trt=False, is_onnx=True)

In [None]:
# Run Models.inference_segment_sky with its default arguments
# (presumably sky-segmentation inference — confirm against the Model class).
Models.inference_segment_sky()

In [None]:
import onnx

# Load the ONNX model
model = onnx.load("alexnet.proto")

# Check that the IR is well formed
onnx.checker.check_model(model)

# Print a human readable representation of the graph.
# printable_graph returns a string; without print() the notebook would show its
# escaped repr (literal "\n" sequences) instead of a readable multi-line listing.
print(onnx.helper.printable_graph(model.graph))

In [None]:
# Display the version string of the installed TensorRT Python package.
import tensorrt
tensorrt.__version__

In [None]:
# Display the version string of the installed onnx package.
import onnx
onnx.__version__

# Save video

In [None]:
# Choose the KITTI drive to build the video from [TODO].
# Exactly one `file_name` assignment should be active; the trailing number on
# each line is presumably the frame count of that drive.
#   '2011_10_03/2011_10_03_drive_0047_sync'   # 837
#   '2011_09_30/2011_09_30_drive_0016_sync'   # 279
#   '2011_09_29/2011_09_29_drive_0026_sync'   # 158
#   '2011_09_28/2011_09_28_drive_0037_sync'   # 89
#   '2011_09_26/2011_09_26_drive_0023_sync'   # 474
#   '2011_09_26/2011_09_26_drive_0020_sync'   # 86
#   '2011_09_26/2011_09_26_drive_0013_sync'   # 144
#   '2011_09_26/2011_09_26_drive_0002_sync'   # 77
file_name = '2011_09_26/2011_09_26_drive_0036_sync'  # 803

# Choose the folder the video is written to [TODO].
video_output_folder = os.path.join(
    os.path.expanduser("~"), "depth", "monodepth2", "video_result"
)

# Collect the paths of all image files of the selected drive, in sorted order.
kitti_depth_folder = '/work/garin0115/datasets/kitti_data/'+file_name+'/image_02'
filenames = sorted(glob.glob(kitti_depth_folder+'/*/*.jpg'))

num_images = len(filenames)
print("Total images: {}".format(num_images))

# Temp: backbone network summaries

In [1]:
import networks
import torch
from torchsummary import summary
import numpy as np

In [2]:
# Instantiate the ResNet-18 encoder on the GPU and print a torchsummary report
# for a 3x256x832 input, with layer shapes reported for batch_size=16.
# NOTE(review): the second positional argument (False) is presumably the
# "pretrained" flag — confirm against networks.ResnetEncoder.
resnet18 = networks.ResnetEncoder(18, False).cuda()
summary(resnet18, (3, 256, 832), batch_size=16)

torch.Size([2, 64, 128, 416])
torch.Size([2, 64, 64, 208])
torch.Size([2, 128, 32, 104])
torch.Size([2, 256, 16, 52])
torch.Size([2, 512, 8, 26])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [16, 64, 128, 416]           9,408
       BatchNorm2d-2         [16, 64, 128, 416]             128
              ReLU-3         [16, 64, 128, 416]               0
         MaxPool2d-4          [16, 64, 64, 208]               0
            Conv2d-5          [16, 64, 64, 208]          36,864
       BatchNorm2d-6          [16, 64, 64, 208]             128
              ReLU-7          [16, 64, 64, 208]               0
            Conv2d-8          [16, 64, 64, 208]          36,864
       BatchNorm2d-9          [16, 64, 64, 208]             128
             ReLU-10          [16, 64, 64, 208]               0
       BasicBlock-11          [16, 64, 64, 208]               0
           Conv2d-12 

In [3]:
# Instantiate the ResNet-50 encoder on the GPU and print a torchsummary report
# for a 3x256x832 input, with layer shapes reported for batch_size=16.
# NOTE(review): the second positional argument (False) is presumably the
# "pretrained" flag — confirm against networks.ResnetEncoder.
resnet50 = networks.ResnetEncoder(50, False).cuda()
summary(resnet50, (3, 256, 832), batch_size=16)

torch.Size([2, 64, 128, 416])
torch.Size([2, 256, 64, 208])
torch.Size([2, 512, 32, 104])
torch.Size([2, 1024, 16, 52])
torch.Size([2, 2048, 8, 26])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [16, 64, 128, 416]           9,408
       BatchNorm2d-2         [16, 64, 128, 416]             128
              ReLU-3         [16, 64, 128, 416]               0
         MaxPool2d-4          [16, 64, 64, 208]               0
            Conv2d-5          [16, 64, 64, 208]           4,096
       BatchNorm2d-6          [16, 64, 64, 208]             128
              ReLU-7          [16, 64, 64, 208]               0
            Conv2d-8          [16, 64, 64, 208]          36,864
       BatchNorm2d-9          [16, 64, 64, 208]             128
             ReLU-10          [16, 64, 64, 208]               0
           Conv2d-11         [16, 256, 64, 208]          16,384
      BatchNorm2d-

In [4]:
# Instantiate networks.MobileNet with default arguments on the GPU and print a
# torchsummary report for a 3x256x832 input, with shapes reported for batch_size=16.
mobilenet = networks.MobileNet().cuda()
summary(mobilenet, (3, 256, 832), batch_size=16)

torch.Size([2, 64, 128, 416])
torch.Size([2, 128, 64, 208])
torch.Size([2, 256, 32, 104])
torch.Size([2, 512, 16, 52])
torch.Size([2, 1024, 8, 26])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [16, 32, 128, 416]             864
       BatchNorm2d-2         [16, 32, 128, 416]              64
              ReLU-3         [16, 32, 128, 416]               0
         ConvBlock-4         [16, 32, 128, 416]               0
            Conv2d-5         [16, 32, 128, 416]             288
       BatchNorm2d-6         [16, 32, 128, 416]              64
              ReLU-7         [16, 32, 128, 416]               0
         ConvBlock-8         [16, 32, 128, 416]               0
            Conv2d-9         [16, 64, 128, 416]           2,048
      BatchNorm2d-10         [16, 64, 128, 416]             128
             ReLU-11         [16, 64, 128, 416]               0
        ConvBlock-1

In [5]:
# Instantiate networks.MobileNetV2 with default arguments on the GPU and print a
# torchsummary report for a 3x256x832 input, with shapes reported for batch_size=16.
mobilenetv2 = networks.MobileNetV2().cuda()
summary(mobilenetv2, (3, 256, 832), batch_size=16)

torch.Size([2, 16, 128, 416])
torch.Size([2, 24, 64, 208])
torch.Size([2, 32, 32, 104])
torch.Size([2, 96, 16, 52])
torch.Size([2, 1280, 8, 26])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [16, 32, 128, 416]             864
       BatchNorm2d-2         [16, 32, 128, 416]              64
             ReLU6-3         [16, 32, 128, 416]               0
         ConvBlock-4         [16, 32, 128, 416]               0
            Conv2d-5         [16, 32, 128, 416]           1,024
       BatchNorm2d-6         [16, 32, 128, 416]              64
             ReLU6-7         [16, 32, 128, 416]               0
         ConvBlock-8         [16, 32, 128, 416]               0
            Conv2d-9         [16, 32, 128, 416]             288
      BatchNorm2d-10         [16, 32, 128, 416]              64
            ReLU6-11         [16, 32, 128, 416]               0
        ConvBlock-12  

In [6]:
# Instantiate networks.MobileNetV3 with default arguments on the GPU and print a
# torchsummary report for a 3x256x832 input, with shapes reported for batch_size=16.
mobilenetv3 = networks.MobileNetV3().cuda()
summary(mobilenetv3, (3, 256, 832), batch_size=16)

torch.Size([2, 16, 128, 416])
torch.Size([2, 24, 64, 208])
torch.Size([2, 40, 32, 104])
torch.Size([2, 112, 16, 52])
torch.Size([2, 960, 8, 26])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [16, 16, 128, 416]             432
       BatchNorm2d-2         [16, 16, 128, 416]              32
            HSwish-3         [16, 16, 128, 416]               0
         ConvBlock-4         [16, 16, 128, 416]               0
            Conv2d-5         [16, 16, 128, 416]             144
       BatchNorm2d-6         [16, 16, 128, 416]              32
              ReLU-7         [16, 16, 128, 416]               0
         ConvBlock-8         [16, 16, 128, 416]               0
            Conv2d-9         [16, 16, 128, 416]             256
      BatchNorm2d-10         [16, 16, 128, 416]              32
        ConvBlock-11         [16, 16, 128, 416]               0
  MobileNetV3Unit-12  