In [2]:
import pandas as pd
import numpy as np
import torch.profiler
import logging

#%pip install (package)

In [3]:
#Chakra pytorch ET to Chakra ET code (not used for now)

import json
import logging
from typing import Any, Dict

from chakra.third_party.utils.protolib import encodeMessage as encode_message
from chakra.et_def.et_def_pb2 import (
    GlobalMetadata,
    Node as ChakraNode,
    AttributeProto as ChakraAttr,
    INVALID_NODE,
    COMP_NODE,
    COMM_COLL_NODE,
    #BOOL,
    #FLOAT,
    #UINT,
    #INT,
    #STRING,
    #BOOLS,
    #FLOATS,
    #UINTS,
    #INTS,
    #STRINGS,
    ALL_REDUCE,
    ALL_TO_ALL,
    ALL_GATHER,
    REDUCE_SCATTER,
    BROADCAST,
)

BOOL = bool
FLOAT = float
UINT = 4
INT = int
STRING = str
BOOLS = 6
FLOATS = 7
UINTS = 8
INTS = 9
STRINGS = 10

class PyTorch2ChakraConverter:
    def __init__(
            self,
            input_filename: str,
            output_filename: str,
            num_dims: int,
            logger: logging.Logger
    ) -> None:
        self.input_filename = input_filename
        self.output_filename = output_filename
        self.num_dims = num_dims
        self.logger = logger

    @staticmethod
    def get_node_type(node: Dict[str, Any]) -> int:
        if "c10d::" in node["name"]:
            return COMM_COLL_NODE
        if node["op_schema"] != "" or node["outputs"]:
            return COMP_NODE
        return INVALID_NODE

    @staticmethod
    def get_attr(
            pt_node: Dict[str, Any],
            attr_name: str,
            attr_type: int
    ) -> ChakraAttr:
        attr = ChakraAttr(name=attr_name, type=attr_type)

        if attr_name in pt_node.keys():
            if attr_type == BOOL:
                attr.b = pt_node[attr_name]
            elif attr_type == FLOAT:
                attr.f = pt_node[attr_name]
            elif attr_type == UINT:
                attr.u = pt_node[attr_name]
            elif attr_type == INT:
                attr.i = pt_node[attr_name]
            elif attr_type == STRING:
                attr.s = pt_node[attr_name]
            elif attr_type == BOOLS:
                attr.bools = pt_node[attr_name]
            elif attr_type == FLOATS:
                attr.floats = pt_node[attr_name]
            elif attr_type == UINTS:
                attr.uints = pt_node[attr_name]
            elif attr_type == INTS:
                attr.ints = pt_node[attr_name]
            elif attr_type == STRINGS:
                attr.strings = pt_node[attr_name]

        return attr

    def detect_type(self, node: Dict[str, Any]) -> str:
        if node["op_schema"] or node["outputs"]:
            return 'operator'
        else:
            return 'label'

    def get_comm_type(self, node: Dict[str, Any]) -> int:
        if node["name"] == "nccl:all_reduce":
            return ALL_REDUCE
        elif node["name"] == "nccl:all_to_all":
            return ALL_TO_ALL
        elif (node["name"] == "nccl:all_gather")\
            or (node["name"] == "nccl:_all_gather_base"):
            return ALL_GATHER
        elif (node["name"] == "nccl:reduce_scatter")\
            or (node["name"] == "nccl:_reduce_scatter_base"):
            return REDUCE_SCATTER
        elif node["name"] == "nccl:broadcast":
            return BROADCAST
        else:
            node_name = node["name"]
            raise ValueError(f"{node_name} is not supported")
        return INVALID_COMM

    # https://pytorch.org/docs/stable/tensors.html
    # https://github.com/pytorch/pytorch/blob/master/c10/util/Half.h
    def get_data_type_size(self, data_type: str) -> int:
        data_type_size_dict = {
                "Tensor(float32)": 4,
                "Tensor(float)": 4,
                "Tensor(float64)": 8,
                "Tensor(double)": 8,
                "Tensor(float16)": 2,
                "Tensor(half)": 2,
                "Tensor(bfloat16)": 2,
                "Tensor(complex64)": 8,
                "Tensor(complex128)": 16,
                "Tensor(uint8)": 1,
                "Tensor(int8)": 1,
                "Tensor(int16)": 2,
                "Tensor(short)": 2,
                "Tensor(int32)": 4,
                "Tensor(int)": 4,
                "Tensor(int64)": 8,
                "Tensor(long)": 8,
                "Tensor(c10::Half)": 2,
                "Tensor(unsigned char)": 1,
                "Tensor(long int)": 8,
        }
        try:
            data_type_size = data_type_size_dict[data_type]
            return data_type_size
        except:
            raise ValueError(f"{data_type} is unsupported")

    def get_comm_size(self, node: Dict[str, Any]) -> int:
        comm_size = 1
        for input_types in node["input_types"]:
            comm_size *= self.get_data_type_size(input_types)
        for input_shape_outer in node["input_shapes"]:
            for input_shape_inner in input_shape_outer:
                comm_size = comm_size * input_shape_inner
        return comm_size

    def dfs(
            self,
            node: Dict[str, Any],
            pytorch_et_data: Dict[str, Any],
            pt_node_dict: Dict[int, Dict[str, Any]]
    ) -> None:
        if self.detect_type(node) == 'operator':
            pt_node_dict[node['id']] = node
        else:
            for pt_node in pytorch_et_data["nodes"]:
                if pt_node['parent'] == node['id']:
                    self.dfs(pt_node, pytorch_et_data, pt_node_dict)

    def convert(self) -> None:
        pt_node_dict = {}
        ck_node_dict = {}
        record_param_comms_pt_node_dict = {}
        nccl_pt_node_dict = {}
        input_storage_id_node_id_dict = {}
        input_tensor_id_node_id_dict = {}
        output_storage_id_node_id_dict = {}
        output_tensor_id_node_id_dict = {}

        with open(self.input_filename, "r") as pytorch_et, \
                open(self.output_filename, "wb") as chakra_et:
            pytorch_et_data = json.load(pytorch_et)

            md = GlobalMetadata(
              attribute=[
                ChakraAttr(name="schema", type=STRING, s=pytorch_et_data["schema"]),
                ChakraAttr(name="pid", type=UINT, u=pytorch_et_data["pid"]),
                ChakraAttr(name="time", type=STRING, s=pytorch_et_data["time"]),
                ChakraAttr(name="start_ts", type=UINT, u=pytorch_et_data["start_ts"]),
                ChakraAttr(name="finish_ts", type=UINT, u=pytorch_et_data["finish_ts"])
              ]
            )
            encode_message(chakra_et, md)

            self.dfs(pytorch_et_data["nodes"][0], pytorch_et_data, pt_node_dict)

            self.logger.info("Identify communication nodes")
            for pt_node in pytorch_et_data["nodes"]:
                if "record_param_comms" in pt_node["name"]:
                    record_param_comms_pt_node_dict.update({pt_node["parent"]: pt_node})
                if "nccl:" in pt_node["name"]:
                    nccl_pt_node_dict.update({pt_node["parent"]: pt_node})

            self.logger.info("Convert PyTorch nodes to Chakra nodes")
            for pt_node_id, pt_node in pt_node_dict.items():
                for i in pt_node["inputs"]:
                    if isinstance(i, list) and len(i) == 6:
                        tensor_id = i[0]
                        storage_id = i[1]
                        if storage_id > 0:
                            input_storage_id_node_id_dict.setdefault(storage_id, []).append(pt_node["id"])
                        else:
                            input_tensor_id_node_id_dict.setdefault(tensor_id, []).append(pt_node["id"])
                for o in pt_node["outputs"]:
                    if isinstance(o, list) and len(o) == 6:
                        tensor_id = o[0]
                        storage_id = o[1]
                        if storage_id > 0:
                            output_storage_id_node_id_dict.setdefault(storage_id, []).append(pt_node["id"])
                        else:
                            output_tensor_id_node_id_dict.setdefault(tensor_id, []).append(pt_node["id"])

                ck_node = ChakraNode()
                ck_node.id = pt_node["id"]
                ck_node.name = pt_node["name"]
                ck_node.type = self.get_node_type(pt_node)
                ck_node.inputs = str(pt_node["inputs"])
                ck_node.input_shapes = str(pt_node["input_shapes"])
                ck_node.input_types = str(pt_node["input_types"])
                ck_node.outputs = str(pt_node["outputs"])
                ck_node.output_shapes = str(pt_node["output_shapes"])
                ck_node.output_types = str(pt_node["output_types"])

                attrs = [("fw_parent", UINT), ("fw_tid", UINT), ("op_schema", STRING),
                        ("parent", UINT), ("seq_id", INT), ("rf_id", UINT), ("scope", UINT), ("tid", UINT)]
                for attr_name, attr_type in attrs:
                    attr = self.get_attr(pt_node, attr_name, attr_type)
                    ck_node.attribute.append(attr)

                # Convert compute nodes
                if ck_node.type == COMP_NODE:
                    attr = ChakraAttr(name="runtime", type=INT)
                    if "dur" in pt_node.keys():
                        attr.i = pt_node["dur"]
                    else:
                        attr.i = 0
                    ck_node.attribute.append(attr)

                # Convert collective communication nodes
                elif ck_node.type == COMM_COLL_NODE:
                    if ck_node.id in record_param_comms_pt_node_dict.keys():
                        record_param_comms_pt_node = record_param_comms_pt_node_dict[ck_node.id]
                        nccl_pt_node = nccl_pt_node_dict[record_param_comms_pt_node["id"]]
                    else:
                        nccl_pt_node = nccl_pt_node_dict[ck_node.id]

                    attr = ChakraAttr(name="comm_type", type=INT)
                    attr.i = self.get_comm_type(nccl_pt_node)
                    ck_node.attribute.append(attr)

                    attr = ChakraAttr(name="comm_size", type=INT)
                    attr.i = self.get_comm_size(nccl_pt_node)
                    ck_node.attribute.append(attr)

                    attr = ChakraAttr(name="involved_dim", type=BOOLS)
                    for _ in range(self.num_dims):
                        attr.bools.append(True)
                    ck_node.attribute.append(attr)

                ck_node_dict[ck_node.id] = ck_node

            self.logger.info("Encode data dependency with storage IDs")
            for input_storage_id, child_node_ids in input_storage_id_node_id_dict.items():
                if input_storage_id in output_storage_id_node_id_dict:
                    parent_node_ids = output_storage_id_node_id_dict[input_storage_id]
                    for child_node_id in child_node_ids:
                        for parent_node_id in parent_node_ids:
                            child_node = ck_node_dict[child_node_id]
                            if (parent_node_id not in child_node.parent)\
                            and child_node.id != parent_node_id:
                                child_node.parent.append(parent_node_id)

                                # remove cycles
                                parent_node = ck_node_dict[parent_node_id]
                                if (parent_node_id in child_node.parent) and\
                                   (child_node_id in parent_node.parent):
                                   if child_node_id < parent_node_id:
                                       child_node.parent.remove(parent_node_id)
                                   else:
                                       parent_node.parent.remove(child_node_id)

            self.logger.info("Encode data dependency with tensor IDs")
            for input_tensor_id, child_node_ids in input_tensor_id_node_id_dict.items():
                if input_tensor_id in output_tensor_id_node_id_dict:
                    parent_node_ids = output_tensor_id_node_id_dict[input_tensor_id]
                    for child_node_id in child_node_ids:
                        for parent_node_id in parent_node_ids:
                            child_node = ck_node_dict[child_node_id]
                            if (parent_node_id not in child_node.parent)\
                            and child_node.id != parent_node_id:
                                child_node.parent.append(parent_node_id)

                                # remove cycles
                                parent_node = ck_node_dict[parent_node_id]
                                if (parent_node_id in child_node.parent) and\
                                   (child_node_id in parent_node.parent):
                                   if child_node_id < parent_node_id:
                                       child_node.parent.remove(parent_node_id)
                                   else:
                                       parent_node.parent.remove(child_node_id)

            self.logger.info("Write Chakra traces")
            for ck_node_id in sorted(ck_node_dict.keys()):
                ck_node = ck_node_dict[ck_node_id]
                encode_message(chakra_et, ck_node)

        self.logger.info("All Chakra nodes are written to the output file")


In [4]:
#HAR Model Code
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import torchvision
from typing import Optional, Union

class CNN(nn.Module):

    def __init__(self,
                 d_input: int,
                 d_model: int,
                 d_output: int,
                 d_text: int,
                 seq_len: int):
        super().__init__()

        self.layer1 = nn.Conv1d(in_channels=d_input, out_channels=d_model, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(d_model)
        self.act1 = nn.ReLU()

        self.layer2 = nn.Conv1d(in_channels=d_model, out_channels=d_model, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(d_model)
        self.act2 = nn.ReLU()

        self.fc = nn.Linear(d_model*seq_len, d_output)
        self.text = nn.Linear(d_model*seq_len, d_text)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b,t,c = x.size()

        out = self.layer1(x.permute(0,2,1))
        out = self.act1(self.bn1(out))

        out = self.layer2(out)
        out = self.act2(self.bn2(out))

        logits = self.fc(out.reshape(b,-1))
        text = self.text(out.reshape(b,-1))

        return logits, out.reshape(b,-1), text

class CNN_two_heads(nn.Module):

    def __init__(self,
                 d_input: int,
                 d_model: int,
                 d_class: int,
                 d_token: int,
                 d_text: int,
                 seq_len: int):
        super().__init__()

        self.layer1 = nn.Conv1d(in_channels=d_input, out_channels=d_model, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(d_model)
        self.act1 = nn.ReLU()

        self.layer2 = nn.Conv1d(in_channels=d_model, out_channels=d_model, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(d_model)
        self.act2 = nn.ReLU()

        self.fc1 = nn.Linear(d_model*seq_len, d_class)
        self.fc2 = nn.Linear(d_model*seq_len, d_token)
        self.text = nn.Linear(d_model*seq_len, d_text)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b,t,c = x.size()

        out = self.layer1(x.permute(0,2,1))
        out = self.act1(self.bn1(out))

        out = self.layer2(out)
        out = self.act2(self.bn2(out))

        logits = self.fc1(out.reshape(b,-1))
        tokens = self.fc2(out.reshape(b,-1))

        text = self.text(out.reshape(b,-1))

        return logits, tokens, out.reshape(b,-1), text

class DNN(nn.Module):

    def __init__(self, input_dim, output_dim):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.act1 = nn.ReLU()
        self.dropout = nn.Dropout()

        self.fc2 = nn.Linear(128, 64)
        self.act2 = nn.ReLU()

        self.fc3 = nn.Linear(64, output_dim)
        self.act3 = nn.Softmax(dim=-1)

    def forward(self, x):
        b, t, c = x.size()
        x = x.reshape(b, -1)
        out = self.dropout(self.act1(self.fc1(x)))
        out = self.act2(self.fc2(out))
        out = self.act3(self.fc3(out))

        return out


class DeepConvLSTM(nn.Module):
    
    def __init__(self, n_hidden=128, n_layers=1, n_filters=64, 
                 n_classes=17, filter_size=5, in_channel=45, drop_prob=0.5):
        super(DeepConvLSTM, self).__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_filters = n_filters
        self.n_classes = n_classes
        self.filter_size = filter_size
             
        self.conv1 = nn.Conv1d(in_channel, n_filters, filter_size)
        self.conv2 = nn.Conv1d(n_filters, n_filters, filter_size)
        self.conv3 = nn.Conv1d(n_filters, n_filters, filter_size)
        self.conv4 = nn.Conv1d(n_filters, n_filters, filter_size)
        
        self.lstm1  = nn.LSTM(n_filters, n_hidden, n_layers)
        self.lstm2  = nn.LSTM(n_hidden, n_hidden, n_layers)
        
        self.fc = nn.Linear(n_hidden, n_classes)

        self.dropout = nn.Dropout(drop_prob)
    
    def forward(self, x, hidden, batch_size):
        
        b, t, c = x.size()
        #x = x.view(-1, c, t)
        x = x.permute(0,2,1)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x)) #b,c,t
        
        #x = x.view(x.size(-1), -1, self.n_filters) #t,b,c
        x = x.permute(2,0,1)
        x, hidden = self.lstm1(x, hidden)
        x, hidden = self.lstm2(x, hidden) #t,b,hidden_size
        
        #x = x.contiguous().view(-1, self.n_hidden) #t*b, hidden_size
        x = x.reshape(-1, self.n_hidden)
        x = self.dropout(x)
        x = self.fc(x) #t*b, class_num
        #out = x.view(batch_size, -1, self.n_classes)[:,-1,:] #b,t,class_num -> b,class_num
        out = x.reshape(-1, batch_size, self.n_classes).permute(1,0,2)[:,-1,:]
          
        return out, hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        
        return hidden

class FCN(nn.Module):
    def __init__(self,d_input,n_classes):
        super(FCN, self).__init__()
        self.n_classes = n_classes

        self.conv1 = nn.Conv1d(in_channels=d_input, out_channels=128, kernel_size=8, padding=3)
        self.bn1   = nn.BatchNorm1d(128)

        self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=5, padding=2)
        self.bn2   = nn.BatchNorm1d(256)

        self.conv3 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(128)

        self.fc4   = nn.Linear(128,self.n_classes)


    def forward(self, x: torch.Tensor):
        b,t,c = x.size()

        x = F.relu(self.bn1(self.conv1(x.permute(0,2,1))))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x))) #b,128,t
    
        x = F.avg_pool1d(x,2) #b,128,t/2
        x = torch.mean(x,dim=2) #b,128
        x = x.view(-1,128)
        x = self.fc4(x)

        return F.log_softmax(x,1)

def correct_sizes(sizes):
	corrected_sizes = [s if s % 2 != 0 else s - 1 for s in sizes]
	return corrected_sizes

def pass_through(X):
	return X

class Inception(nn.Module):
	def __init__(self, in_channels, n_filters, kernel_sizes=[9, 19, 39], bottleneck_channels=32, activation=nn.ReLU(), return_indices=False):
		"""
		: param in_channels				Number of input channels (input features)
		: param n_filters				Number of filters per convolution layer => out_channels = 4*n_filters
		: param kernel_sizes			List of kernel sizes for each convolution.
										Each kernel size must be odd number that meets -> "kernel_size % 2 !=0".
										This is nessesery because of padding size.
										For correction of kernel_sizes use function "correct_sizes". 
		: param bottleneck_channels		Number of output channels in bottleneck. 
										Bottleneck wont be used if nuber of in_channels is equal to 1.
		: param activation				Activation function for output tensor (nn.ReLU()). 
		: param return_indices			Indices are needed only if we want to create decoder with InceptionTranspose with MaxUnpool1d. 
		"""
		super(Inception, self).__init__()
		self.return_indices=return_indices
		if in_channels > 1:
			self.bottleneck = nn.Conv1d(
								in_channels=in_channels, 
								out_channels=bottleneck_channels, 
								kernel_size=1, 
								stride=1, 
								bias=False
								)
		else:
			self.bottleneck = pass_through
			bottleneck_channels = 1

		self.conv_from_bottleneck_1 = nn.Conv1d(
										in_channels=bottleneck_channels, 
										out_channels=n_filters, 
										kernel_size=kernel_sizes[0], 
										stride=1, 
										padding=kernel_sizes[0]//2, 
										bias=False
										)
		self.conv_from_bottleneck_2 = nn.Conv1d(
										in_channels=bottleneck_channels, 
										out_channels=n_filters, 
										kernel_size=kernel_sizes[1], 
										stride=1, 
										padding=kernel_sizes[1]//2, 
										bias=False
										)
		self.conv_from_bottleneck_3 = nn.Conv1d(
										in_channels=bottleneck_channels, 
										out_channels=n_filters, 
										kernel_size=kernel_sizes[2], 
										stride=1, 
										padding=kernel_sizes[2]//2, 
										bias=False
										)
		self.max_pool = nn.MaxPool1d(kernel_size=3, stride=1, padding=1, return_indices=return_indices)
		self.conv_from_maxpool = nn.Conv1d(
									in_channels=in_channels, 
									out_channels=n_filters, 
									kernel_size=1, 
									stride=1,
									padding=0, 
									bias=False
									)
		self.batch_norm = nn.BatchNorm1d(num_features=4*n_filters)
		self.activation = activation

	def forward(self, X):
		# step 1
		Z_bottleneck = self.bottleneck(X)
		if self.return_indices:
			Z_maxpool, indices = self.max_pool(X)
		else:
			Z_maxpool = self.max_pool(X)
		# step 2
		Z1 = self.conv_from_bottleneck_1(Z_bottleneck)
		Z2 = self.conv_from_bottleneck_2(Z_bottleneck)
		Z3 = self.conv_from_bottleneck_3(Z_bottleneck)
		Z4 = self.conv_from_maxpool(Z_maxpool)
		# step 3 
		Z = torch.cat([Z1, Z2, Z3, Z4], axis=1)
		Z = self.activation(self.batch_norm(Z))
		if self.return_indices:
			return Z, indices
		else:
			return Z

class InceptionBlock(nn.Module):
	def __init__(self, in_channels, n_filters=32, kernel_sizes=[9,19,39], bottleneck_channels=32, use_residual=True, activation=nn.ReLU(), return_indices=False):
		super(InceptionBlock, self).__init__()
		self.use_residual = use_residual
		self.return_indices = return_indices
		self.activation = activation
		self.inception_1 = Inception(
							in_channels=in_channels,
							n_filters=n_filters,
							kernel_sizes=kernel_sizes,
							bottleneck_channels=bottleneck_channels,
							activation=activation,
							return_indices=return_indices
							)
		self.inception_2 = Inception(
							in_channels=4*n_filters,
							n_filters=n_filters,
							kernel_sizes=kernel_sizes,
							bottleneck_channels=bottleneck_channels,
							activation=activation,
							return_indices=return_indices
							)
		self.inception_3 = Inception(
							in_channels=4*n_filters,
							n_filters=n_filters,
							kernel_sizes=kernel_sizes,
							bottleneck_channels=bottleneck_channels,
							activation=activation,
							return_indices=return_indices
							)	
		if self.use_residual:
			self.residual = nn.Sequential(
								nn.Conv1d(
									in_channels=in_channels, 
									out_channels=4*n_filters, 
									kernel_size=1,
									stride=1,
									padding=0
									),
								nn.BatchNorm1d(
									num_features=4*n_filters
									)
								)

	def forward(self, X):
		if self.return_indices:
			Z, i1 = self.inception_1(X)
			Z, i2 = self.inception_2(Z)
			Z, i3 = self.inception_3(Z)
		else:
			Z = self.inception_1(X)
			Z = self.inception_2(Z)
			Z = self.inception_3(Z)
		if self.use_residual:
			Z = Z + self.residual(X)
			Z = self.activation(Z)
		if self.return_indices:
			return Z,[i1, i2, i3]
		else:
			return Z

class SELayer(nn.Module):
    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1)
        return x * y.expand_as(x)

class MLSTMfcn(nn.Module):
    def __init__(self, *, num_classes, max_seq_len, num_features,
                 num_lstm_out=128, num_lstm_layers=1, 
                 conv1_nf=128, conv2_nf=256, conv3_nf=128,
                 lstm_drop_p=0.8, fc_drop_p=0.3):
        super(MLSTMfcn, self).__init__()

        self.num_classes = num_classes
        self.max_seq_len = max_seq_len
        self.num_features = num_features

        self.num_lstm_out = num_lstm_out
        self.num_lstm_layers = num_lstm_layers

        self.conv1_nf = conv1_nf
        self.conv2_nf = conv2_nf
        self.conv3_nf = conv3_nf

        self.lstm_drop_p = lstm_drop_p
        self.fc_drop_p = fc_drop_p

        self.lstm = nn.LSTM(input_size=self.num_features, 
                            hidden_size=self.num_lstm_out,
                            num_layers=self.num_lstm_layers,
                            batch_first=True)
        
        self.conv1 = nn.Conv1d(self.num_features, self.conv1_nf, 8)
        self.conv2 = nn.Conv1d(self.conv1_nf, self.conv2_nf, 5)
        self.conv3 = nn.Conv1d(self.conv2_nf, self.conv3_nf, 3)

        self.bn1 = nn.BatchNorm1d(self.conv1_nf)
        self.bn2 = nn.BatchNorm1d(self.conv2_nf)
        self.bn3 = nn.BatchNorm1d(self.conv3_nf)

        self.se1 = SELayer(self.conv1_nf)  # ex 128
        self.se2 = SELayer(self.conv2_nf)  # ex 256

        self.relu = nn.ReLU()
        self.lstmDrop = nn.Dropout(self.lstm_drop_p)
        self.convDrop = nn.Dropout(self.fc_drop_p)

        self.fc = nn.Linear(self.conv3_nf+self.num_lstm_out, self.num_classes)
    
    def forward(self, x):
        ''' input x should be in size [B,T,F], where 
            B = Batch size
            T = Time samples
            F = features
        '''
        #seq_lens = torch.tensor(x.shape[1]).unsqueeze(0).expand(x.shape[0],-1).cuda()
        seq_lens = [x.shape[1]] * x.shape[0]
        x1 = nn.utils.rnn.pack_padded_sequence(x, seq_lens, 
                                               batch_first=True, 
                                               enforce_sorted=False)
        x1, (ht,ct) = self.lstm(x1)
        x1, _ = nn.utils.rnn.pad_packed_sequence(x1, batch_first=True, 
                                                 padding_value=0.0)
        x1 = x1[:,-1,:]
        
        x2 = x.transpose(2,1)
        x2 = self.convDrop(self.relu(self.bn1(self.conv1(x2))))
        x2 = self.se1(x2)
        x2 = self.convDrop(self.relu(self.bn2(self.conv2(x2))))
        x2 = self.se2(x2)
        x2 = self.convDrop(self.relu(self.bn3(self.conv3(x2))))
        x2 = torch.mean(x2,2)
        
        x_all = torch.cat((x1,x2),dim=1)
        x_out = self.fc(x_all)
        x_out = F.log_softmax(x_out, dim=1)

        return x_out

class resConv1dBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, layer_num):
        super(resConv1dBlock, self).__init__()
        self.layer_num = layer_num
        self.conv1 = nn.ModuleList([
            nn.Conv1d(in_channels = in_channels, out_channels = 2 * in_channels, kernel_size = kernel_size, stride = stride, padding = int((kernel_size - 1) / 2) )
            for i in range(layer_num)])

        self.bn1 = nn.ModuleList([
            nn.BatchNorm1d(2 * in_channels)
            for i in range(layer_num)])

        self.conv2 = nn.ModuleList([ 
            nn.Conv1d(in_channels = 2 * in_channels, out_channels = out_channels, kernel_size = kernel_size, stride = stride, padding = int((kernel_size - 1) / 2) )
            for i in range(layer_num)])

        self.bn2 = nn.ModuleList([
            nn.BatchNorm1d(out_channels)
            for i in range(layer_num)])

    def forward(self, x):
        for i in range(self.layer_num):
            tmp = F.relu(self.bn1[i](self.conv1[i](x)))
            x = F.relu(self.bn2[i](self.conv2[i](tmp)) + x)
        return x

class ResNet(nn.Module):
    def __init__(self, input_size, input_channel, num_label):
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv1d(input_channel, 64, kernel_size = 1, stride = 1)
        self.res1 = resConv1dBlock(64, 64, kernel_size = 3, stride = 1, layer_num = 3)
        self.pool1 = nn.AvgPool1d(kernel_size = 2)

        self.conv2 = nn.Conv1d(64, 128, kernel_size = 1, stride = 1)
        self.res2 = resConv1dBlock(128, 128, kernel_size = 3, stride = 1, layer_num = 4)
        self.pool2 = nn.AvgPool1d(kernel_size = 2)

        self.conv3 = nn.Conv1d(128, 256, kernel_size = 1, stride = 1)
        self.res3 = resConv1dBlock(256, 256,  kernel_size = 3, stride = 1, layer_num = 7)
        self.pool3 = nn.AvgPool1d(kernel_size = 2)

        self.conv4 = nn.Conv1d(256, 128, kernel_size = 1, stride = 1)
        self.res4 = resConv1dBlock(128, 128, kernel_size = 3, stride = 1, layer_num = 4)
        self.pool = nn.AvgPool1d(kernel_size = int(input_size / 8))

        self.fc = nn.Linear(128, num_label)

    def forward(self, x):
        x = x.transpose(1, 2)
        x = F.relu(self.conv1(x))
        x = self.pool1(self.res1(x))
        x = F.relu(self.conv2(x))
        x = self.pool2(self.res2(x))
        x = F.relu(self.conv3(x))
        x = self.pool3(self.res3(x))
        x = F.relu(self.conv4(x))
        x = self.pool(self.res4(x))

        x = x.view(x.size(0), -1)
        return self.fc(x)

class MaCNN(nn.Module):
    def __init__(self, input_size, input_channel, num_label, sensor_num):
        super(MaCNN, self).__init__()
        self.in_channel = int(input_channel / sensor_num)
        self.start_conv = nn.ModuleList([nn.Conv1d(self.in_channel, 128, kernel_size = 3, stride = 1, padding = 1) for _ in range(sensor_num)])
        self.conv1 = nn.ModuleList([nn.Conv1d(128, 128, kernel_size = 3, stride = 1, padding = 1) for _ in range(sensor_num)])
        self.pool1 = nn.ModuleList([nn.AvgPool1d(kernel_size = 2) for _ in range(sensor_num)])
        self.conv2 = nn.ModuleList([nn.Conv1d(128, 128, kernel_size = 3, stride = 1, padding = 1) for _ in range(sensor_num)])
        self.pool2 = nn.ModuleList([nn.AvgPool1d(kernel_size = 2)for _ in range(sensor_num)])
        self.conv3 = nn.ModuleList([nn.Conv1d(128, 128, kernel_size = 3, stride = 1, padding = 1) for _ in range(sensor_num)])
        self.pool3 = nn.ModuleList([nn.AvgPool1d(kernel_size = 2)for _ in range(sensor_num)])
        self.conv4 = nn.ModuleList([nn.Conv1d(128, 128, kernel_size = 3, stride = 1, padding = 1) for _ in range(sensor_num)])
        self.pool4 = nn.ModuleList([nn.AvgPool1d(kernel_size = 2)for _ in range(sensor_num)])
        self.conv5 = nn.ModuleList([nn.Conv1d(128, 128, kernel_size = 3, stride = 1, padding = 1) for _ in range(sensor_num)])
        self.pool5 = nn.ModuleList([nn.AvgPool1d(kernel_size = 2)for _ in range(sensor_num)])


        self.end_conv = nn.ModuleList([nn.Conv1d(128, 1, kernel_size = 1, stride = 1) for _ in range(sensor_num)])
        self.Linear = nn.Linear(int(input_size / 32) * sensor_num, num_label)


    def forward(self, x):
        x = x.reshape(x.size(0), x.size(1), -1, self.in_channel).transpose(1, 3)
        sensor_num = x.size(2)
        x = list(x.split(1, 2))
        for i in range(sensor_num):
            x[i] = F.relu(self.start_conv[i](x[i].squeeze(2)))

            x[i] = F.relu(self.pool1[i](self.conv1[i](x[i])))
            x[i] = F.relu(self.pool2[i](self.conv2[i](x[i])))
            x[i] = F.relu(self.pool3[i](self.conv3[i](x[i])))
            x[i] = F.relu(self.pool4[i](self.conv4[i](x[i])))
            x[i] = F.relu(self.pool5[i](self.conv5[i](x[i])))
            x[i] = F.relu(self.end_conv[i](x[i])).squeeze(1)

            x[i] = x[i].view(x[i].size(0), -1)
        x = torch.cat(x, dim = -1)
        return self.Linear(x)

class Encoder2D(nn.Module):

    def __init__(self, num_classes):
        super(Encoder2D, self).__init__()

        self.resnet = torchvision.models.resnet101(pretrained=True)  # pretrained ImageNet ResNet-101

        num_ftrs = self.resnet.fc.in_features
        self.resnet.fc = nn.Linear(num_ftrs, num_classes)

        self.fine_tune()

    def forward(self, images):
        """
        Forward propagation.
        :param images: images, a tensor of dimensions (batch_size, 3, image_size, image_size)
        :return: encoded images
        """
        out = self.resnet(images)  
        return out

    def fine_tune(self, fine_tune=True):
        """
        Allow or prevent the computation of gradients for convolutional blocks 2 through 4 of the encoder.
        :param fine_tune: Allow?
        """
        for p in self.resnet.parameters():
            p.requires_grad = fine_tune
        # If fine-tuning, only fine-tune convolutional blocks 2 through 4
        #for c in list(self.resnet.children())[5:]:
        #    for p in c.parameters():
        #        p.requires_grad = fine_tune

def generate_original_PE(length: int, d_model: int) -> torch.Tensor:
    """Generate positional encoding as described in original paper.  :class:`torch.Tensor`
    Parameters
    ----------
    length:
        Time window length, i.e. K.
    d_model:
        Dimension of the model vector.
    Returns
    -------
        Tensor of shape (K, d_model).
    """
    PE = torch.zeros((length, d_model))

    pos = torch.arange(length).unsqueeze(1)

    PE[:, 0::2] = torch.sin(
        pos / torch.pow(1000, torch.arange(0, d_model, 2, dtype=torch.float32)/d_model))
    PE[:, 1::2] = torch.cos(
        pos / torch.pow(1000, torch.arange(1, d_model, 2, dtype=torch.float32)/d_model))

    return PE

def generate_regular_PE(length: int, d_model: int, period: Optional[int] = 96) -> torch.Tensor:
    """Generate positional encoding with a given period.
    Parameters
    ----------
    length:
        Time window length, i.e. K.
    d_model:
        Dimension of the model vector.
    period:
        Size of the pattern to repeat.
        Default is 24.
    Returns
    -------
        Tensor of shape (K, d_model).
    """
    PE = torch.zeros((length, d_model))

    pos = torch.arange(length, dtype=torch.float32).unsqueeze(1)
    PE = torch.sin(pos * 2 * np.pi / period)
    PE = PE.repeat((1, d_model))

    return PE

def generate_time_PE(length: int, d_model: int) -> torch.Tensor:
    """Generate positional encoding using time directly.  :class:`torch.Tensor`
    Parameters
    ----------
    length:
        Time window length, i.e. K.
    d_model:
        Dimension of the model vector.
    Returns
    -------
        Tensor of shape (K, d_model).
    """
    PE = torch.zeros((length, d_model))

    pos = torch.arange(length).unsqueeze(1)
    PE[:, :] = pos
    return PE

def generate_local_map_mask(chunk_size: int,
                            attention_size: int,
                            mask_future=False,
                            device: torch.device = 'cpu') -> torch.BoolTensor:
    """Compute attention mask as attention_size wide diagonal.
    Parameters
    ----------
    chunk_size:
        Time dimension size.
    attention_size:
        Number of backward elements to apply attention.
    device:
        torch device. Default is ``'cpu'``.
    Returns
    -------
        Mask as a boolean tensor.
    """
    local_map = np.empty((chunk_size, chunk_size))
    i, j = np.indices(local_map.shape)

    if mask_future:
        local_map[i, j] = (i - j > attention_size) ^ (j - i > 0)
    else:
        #local_map[i, j] = (np.abs(i - j) > attention_size) & ((np.abs(j - i) < 90) | (np.abs(j - i) > 102))
        local_map[i , j] = np.abs(i - j) > attention_size
        #local_map[i , j] = (np.abs(j - i) < 90) | (np.abs(j - i) > 102)

    return torch.BoolTensor(local_map).to(device)

class PositionwiseFeedForward(nn.Module):
    """Position-wise Feed Forward Network block from Attention is All You Need.
    Apply two linear transformations to each input, separately but indetically. We
    implement them as 1D convolutions. Input and output have a shape (batch_size, d_model).
    Parameters
    ----------
    d_model:
        Dimension of input tensor.
    d_ff:
        Dimension of hidden layer, default is 2048.
    """

    def __init__(self,
                 d_model: int,
                 d_ff: Optional[int] = 2048):
        """Initialize the PFF block."""
        super().__init__()

        self._linear1 = nn.Linear(d_model, d_ff)
        self._linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Propagate forward the input through the PFF block.
        Apply the first linear transformation, then a relu actvation,
        and the second linear transformation.
        Parameters
        ----------
        x:
            Input tensor with shape (batch_size, K, d_model).
        Returns
        -------
            Output tensor with shape (batch_size, K, d_model).
        """
        return self._linear2(F.relu(self._linear1(x)))

class MultiHeadAttention(nn.Module):
    """Multi Head Attention block from Attention is All You Need.
    Given 3 inputs of shape (batch_size, K, d_model), that will be used
    to compute query, keys and values, we output a self attention
    tensor of shape (batch_size, K, d_model).
    Parameters
    ----------
    d_model:
        Dimension of the input vector.
    q:
        Dimension of all query matrix.
    v:
        Dimension of all value matrix.
    h:
        Number of heads.
    attention_size:
        Number of backward elements to apply attention.
        Deactivated if ``None``. Default is ``None``.
    """

    def __init__(self,
                 d_model: int,
                 q: int,
                 v: int,
                 h: int,
                 attention_size: int = None):
        """Initialize the Multi Head Block."""
        super().__init__()

        self._h = h
        self._attention_size = attention_size

        # Query, keys and value matrices
        self._W_q = nn.Linear(d_model, q*self._h)
        self._W_k = nn.Linear(d_model, q*self._h)
        self._W_v = nn.Linear(d_model, v*self._h)

        # Output linear function
        self._W_o = nn.Linear(self._h*v, d_model)

        # Score placeholder
        self._scores = None

    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                mask: Optional[str] = None) -> torch.Tensor:
        """Propagate forward the input through the MHB.
        We compute for each head the queries, keys and values matrices,
        followed by the Scaled Dot-Product. The result is concatenated
        and returned with shape (batch_size, K, d_model).
        Parameters
        ----------
        query:
            Input tensor with shape (batch_size, K, d_model) used to compute queries.
        key:
            Input tensor with shape (batch_size, K, d_model) used to compute keys.
        value:
            Input tensor with shape (batch_size, K, d_model) used to compute values.
        mask:
            Mask to apply on scores before computing attention.
            One of ``'subsequent'``, None. Default is None.
        Returns
        -------
            Self attention tensor with shape (batch_size, K, d_model).
        """
        K = query.shape[1]

        # Compute Q, K and V, concatenate heads on batch dimension
        queries = torch.cat(self._W_q(query).chunk(self._h, dim=-1), dim=0)
        keys = torch.cat(self._W_k(key).chunk(self._h, dim=-1), dim=0)
        values = torch.cat(self._W_v(value).chunk(self._h, dim=-1), dim=0)

        # Scaled Dot Product
        self._scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(K)

        # Compute local map mask
        if self._attention_size is not None:
            attention_mask = generate_local_map_mask(K, self._attention_size, mask_future=False, device=self._scores.device)
            self._scores = self._scores.masked_fill(attention_mask, float('-inf'))

        # Compute future mask
        if mask == "subsequent":
            future_mask = torch.triu(torch.ones((K, K)), diagonal=1).bool()
            future_mask = future_mask.to(self._scores.device)
            self._scores = self._scores.masked_fill(future_mask, float('-inf'))

        # Apply sotfmax
        self._scores = F.softmax(self._scores, dim=-1)

        attention = torch.bmm(self._scores, values)

        # Concatenat the heads
        attention_heads = torch.cat(attention.chunk(self._h, dim=0), dim=-1)

        # Apply linear transformation W^O
        self_attention = self._W_o(attention_heads)

        return self_attention

    @property
    def attention_map(self) -> torch.Tensor:
        """Attention map after a forward propagation,
        variable `score` in the original paper.
        """
        if self._scores is None:
            raise RuntimeError(
                "Evaluate the model once to generate attention map")
        return self._scores

class Encoder(nn.Module):
    """Encoder block from Attention is All You Need.
    Apply Multi Head Attention block followed by a Point-wise Feed Forward block.
    Residual sum and normalization are applied at each step.
    Parameters
    ----------
    d_model:
        Dimension of the input vector.
    q:
        Dimension of all query matrix.
    v:
        Dimension of all value matrix.
    h:
        Number of heads.
    attention_size:
        Number of backward elements to apply attention.
        Deactivated if ``None``. Default is ``None``.
    dropout:
        Dropout probability after each MHA or PFF block.
        Default is ``0.3``.
    chunk_mode:
        Swict between different MultiHeadAttention blocks.
        One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
    """

    def __init__(self,
                 d_model: int,
                 q: int,
                 v: int,
                 h: int,
                 attention_size: int = None,
                 dropout: float = 0.3,
                 chunk_mode: str = 'chunk'):
        """Initialize the Encoder block"""
        super().__init__()

        MHA = MultiHeadAttention

        self._selfAttention = MHA(d_model, q, v, h, attention_size=attention_size)
        self._feedForward = PositionwiseFeedForward(d_model)

        self._layerNorm1 = nn.LayerNorm(d_model)
        self._layerNorm2 = nn.LayerNorm(d_model)

        self._batchNorm1 = nn.BatchNorm1d(d_model)
        self._batchNorm2 = nn.BatchNorm1d(d_model)

        self._dopout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Propagate the input through the Encoder block.
        Apply the Multi Head Attention block, add residual and normalize.
        Apply the Point-wise Feed Forward block, add residual and normalize.
        Parameters
        ----------
        x:
            Input tensor with shape (batch_size, K, d_model).
        Returns
        -------
            Output tensor with shape (batch_size, K, d_model).
        """
        # Self attention
        residual = x
        x = self._selfAttention(query=x, key=x, value=x)
        x = self._dopout(x)
        x = self._layerNorm1(x + residual)
        #x = self._batchNorm1((x+residual).transpose(1,2)).transpose(1,2)

        # Feed forward
        residual = x
        x = self._feedForward(x)
        x = self._dopout(x)
        x = self._layerNorm2(x + residual)
        #x = self._batchNorm2((x+residual).transpose(1,2)).transpose(1,2)

        return x

    @property
    def attention_map(self) -> torch.Tensor:
        """Attention map after a forward propagation,
        variable `score` in the original paper.
        """
        return self._selfAttention.attention_map

class Transformer(nn.Module):
    """Transformer model from Attention is All You Need.
    A classic transformer model adapted for sequential data.
    Embedding has been replaced with a fully connected layer,
    the last layer softmax is now a sigmoid.
    Attributes
    ----------
    layers_encoding: :py:class:`list` of :class:`Encoder.Encoder`
        stack of Encoder layers.
    layers_decoding: :py:class:`list` of :class:`Decoder.Decoder`
        stack of Decoder layers.
    Parameters
    ----------
    d_input:
        Model input dimension.
    d_model:
        Dimension of the input vector.
    d_output:
        Model output dimension.
    q:
        Dimension of queries and keys.
    v:
        Dimension of values.
    h:
        Number of heads.
    N:
        Number of encoder and decoder layers to stack.
    attention_size:
        Number of backward elements to apply attention.
        Deactivated if ``None``. Default is ``None``.
    dropout:
        Dropout probability after each MHA or PFF block.
        Default is ``0.3``.
    chunk_mode:
        Swict between different MultiHeadAttention blocks.
        One of ``'chunk'``, ``'window'`` or ``None``. Default is ``'chunk'``.
    pe:
        Type of positional encoding to add.
        Must be one of ``'original'``, ``'regular'`` or ``None``. Default is ``None``.
    """

    def __init__(self,
                 d_input: int,
                 d_model: int,
                 d_output: int,
                 q: int,
                 v: int,
                 h: int,
                 N: int,
                 attention_size: int = None,
                 dropout: float = 0.3,
                 chunk_mode: bool = True,
                 pe: str = None):
        """Create transformer structure from Encoder and Decoder blocks."""
        super().__init__()

        self._d_model = d_model

        self.layers_encoding = nn.ModuleList([Encoder(d_model,
                                                      q,
                                                      v,
                                                      h,
                                                      attention_size=attention_size,
                                                      dropout=dropout,
                                                      chunk_mode=chunk_mode) for _ in range(N)])
        #self.layers_decoding = nn.ModuleList([Decoder(d_model,
        #                                              q,
        #                                              v,
        #                                              h,
        #                                              attention_size=attention_size,
        #                                              dropout=dropout,
        #                                              chunk_mode=chunk_mode) for _ in range(N)])

        self._embedding = nn.Linear(d_input, d_model)
        self._linear = nn.Linear(d_model, d_output)
        self._embed_linear = nn.Linear(d_model, d_model)

        pe_functions = {
            'original': generate_original_PE,
            'regular': generate_regular_PE,
            'time': generate_time_PE,
        }

        if pe in pe_functions.keys():
            self._generate_PE = pe_functions[pe]
        elif pe is None:
            self._generate_PE = None
        else:
            raise NameError(
                f'PE "{pe}" not understood. Must be one of {", ".join(pe_functions.keys())} or None.')

        self.name = 'transformer'

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Propagate input through transformer
        Forward input through an embedding module,
        the encoder then decoder stacks, and an output module.
        Parameters
        ----------
        x:
            :class:`torch.Tensor` of shape (batch_size, K, d_input).
        Returns
        -------
            Output tensor with shape (batch_size, K, d_output).
        """
        K = x.shape[1]

        # Embeddin module
        encoding = self._embedding(x)

        # Add position encoding
        if self._generate_PE is not None:
            positional_encoding = self._generate_PE(K, self._d_model)
            #positional_encoding = self._embed_linear(positional_encoding.cuda())
            positional_encoding = positional_encoding.to(encoding.device)
            encoding.add_(positional_encoding)

        # Encoding stack
        for layer in self.layers_encoding:
            encoding = layer(encoding)

        # Decoding stack
        decoding = encoding
        '''
        # Add position encoding
        if self._generate_PE is not None:
            positional_encoding = self._generate_PE(K, self._d_model)
            positional_encoding = positional_encoding.to(decoding.device)
            decoding.add_(positional_encoding)

        for layer in self.layers_decoding:
            decoding = layer(decoding, encoding)
        '''
        # Output module
        output = self._linear(decoding)
        #output = torch.sigmoid(output)
        return output

class TransformerClassifier(nn.Module):

    def __init__(self,
                 d_input: int,
                 d_model: int,
                 d_output: int,
                 q: int,
                 v: int,
                 h: int,
                 N: int,
                 seq_len: int, 
                 d_text: int,
                 attention_size: int = None,
                 dropout: float = 0.3,
                 chunk_mode: bool = True,
                 pe: str = None):
        """Create transformer structure from Encoder and Decoder blocks."""
        super().__init__()
        self.transformer = Transformer(d_input, d_model, d_input, q, v, h, N, attention_size, dropout, chunk_mode, pe)
        self.classifier = nn.Linear(seq_len * d_input, d_output)
        self.text_classifier = nn.Linear(seq_len * d_input, d_text)

    def forward(self, x):
        out = self.transformer(x)
        b, t, c = out.size()
        pred = self.classifier(out.reshape(b, t * c))
        text = self.text_classifier(out.reshape(b, t * c))
        return pred, out.reshape(b,-1), text



In [5]:
#Reading the example data 
#Read README and download data 
path = 'data/subject101.dat' # set this to path of dat file, in this case we use subject101.dat
data = []
with open(path, 'r') as f:
    #transform dat into python list
    d = f.readlines()
    for i in d:
        k = i.rstrip().split(" ")
        data.append([float(i) for i in k]) 

a = torch.tensor([data])

In [6]:
#Setup CNN Model
features = 54 #number of features in data
seq_len = 376417 #number of sequences in data
number_of_class = 25
model_cnn = CNN(features, 64, number_of_class, 1, seq_len)

In [7]:
#Run model
model_cnn.forward(a)

(tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan]], grad_fn=<AddmmBackward0>),
 tensor([[nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<ViewBackward0>),
 tensor([[nan]], grad_fn=<AddmmBackward0>))

In [8]:
#Setup DNN Model and run it
model_dnn = DNN(20326518, 25)
model_dnn.forward(a)

tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
         nan]], grad_fn=<SoftmaxBackward0>)

In [9]:
#Get Execution Trace of CNN model and export pytorch ET as json
with torch.profiler.profile(with_stack=True, profile_memory=True) as prof:
    output = model_cnn(a)
print(prof.key_averages().table(sort_by="self_cpu_time_total"))
prof.export_chrome_trace("cnn_trace.json")

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::mkldnn_convolution        46.86%     187.642ms        46.90%     187.783ms      93.891ms     183.80 Mb           0 b             2  
                     aten::addmm        22.45%      89.898ms        22.46%      89.912ms      44.956ms         104 b         104 b             2  
                     aten::copy_        11.74%      47.002ms        11.74%      47.002ms      15.667ms           0 b           0 b             3  
         aten::native_batch_norm        10.76%      43.062ms        10.96%      43.873ms      21.936ms     183.80 Mb  

In [12]:
#Working on it ...
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
converter = PyTorch2ChakraConverter("cnn_trace.json", '1', 1, logger)
converter.convert()

KeyError: 'schema'