# ECOの実装と推論

## KineticsのDataLoaderを作成
前のnotebookで説明したDataLoaderの作成と動作確認

In [1]:
import os
import torch
from torch import nn
from dataloader import make_datapath_list, get_label_id_dictionary
from dataloader import VideoTransform, VideoDataset
from eco import ECO_2D, ECO_3D

In [2]:
# Directory handed to make_datapath_list as the root of the video dataset.
# NOTE(review): relative path — confirm it matches where the Kinetics clips live locally.
output_dir = "../../datasets/ptca_datasets/chapter9/kinetics_videos"

In [14]:
# Build the list of video paths from the dataset root.
video_list = make_datapath_list(output_dir)

# Preprocessing settings handed to VideoTransform.
resize = 224
crop_size = 224
mean = [104, 117, 123]
std = [1, 1, 1]
video_transform = VideoTransform(resize, crop_size, mean, std)

# Load the label-name -> ID and ID -> label-name dictionaries from the CSV.
label_dictionary_path = "./kinetics_400_label_dictionary.csv"
label_id_dict, id_label_dict = get_label_id_dictionary(label_dictionary_path)

# Validation dataset: 16 segments per video, frames named image_00001.jpg etc.
dataset_options = dict(
    num_segments=16,
    phase="val",
    transform=video_transform,
    img_tmpl="image_{:05d}.jpg",
)
val_dataset = VideoDataset(video_list, label_id_dict, **dataset_options)

# Sanity check: fetch the first sample and show each of its fields.
index = 0
item = val_dataset[index]
print("Size:", item[0].shape)
print("Label:", item[1])
print("Label ID:", item[2])
print("Movie Path:", item[3])

Size: torch.Size([16, 3, 224, 224])
Label: bungee jumping
Label ID: 40
Movie Path: ../datasets/chapter9/kinetics_videos/bungee jumping/dAeUFSdYG1I_000010_000020


DatasetをDataLoaderにする

In [4]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches of 8 videos.
batch_size = 8
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Sanity check: draw one batch and show the shape of the stacked frame tensor.
batch_iterator = iter(val_dataloader)
imgs_transformeds, labels, label_ids, dir_path = next(batch_iterator)
print(imgs_transformeds.size())

torch.Size([8, 16, 3, 224, 224])


# ECOモデルの実装
2DNetと3DNetを利用してモデルを組み立てる  
(8, 16, 3, 224, 224)→(128, 3, 224, 224)と変形し，2DNetで各フレーム画像を取り扱う  
その出力を(128, 96, 28, 28)→(8, 16, 96, 28, 28)と，元の次元数に戻し，3DNetへ渡す  
3DNetの出力(8, 512)を全結合層に渡してクラス分類を行う．  
  
ECOにはFull ECOとECO Liteがあるが，ここではECO Liteを実装

In [5]:
class ECO_Lite(nn.Module):
    """ECO Lite video classifier: per-frame 2D net -> 3D net -> linear head.

    The sub-networks come from the project's ``eco`` module. The attribute
    registration order (eco_2d, eco_3d, fc_final) fixes the order of
    ``state_dict()``, which order-based weight loading depends on, so it must
    not be changed.
    """

    def __init__(self):
        super().__init__()

        self.eco_2d = ECO_2D()
        self.eco_3d = ECO_3D()
        # Classification head: 512 features in, 400 class scores out.
        self.fc_final = nn.Linear(in_features=512, out_features=400, bias=True)

    def forward(self, x):
        """Classify a batch of videos.

        Args:
            x: Tensor of shape (batch, segments, channels, height, width),
                e.g. (M, 16, 3, 224, 224).

        Returns:
            Tensor of class scores with one row per video, e.g. (M, 400).
        """
        bs, ns, c, h, w = x.shape

        # The 2D net ignores the time axis, so frames are folded into the
        # batch axis. reshape (unlike view) also accepts non-contiguous input.
        out = x.reshape(bs * ns, c, h, w)
        out = self.eco_2d(out)

        # Regroup the frames per video before the 3D net. The feature shape is
        # read from the 2D output instead of being hard-coded as (96, 28, 28).
        out = out.reshape(bs, ns, *out.shape[1:])
        out = self.eco_3d(out)

        return self.fc_final(out)

In [6]:
# Instantiate the model; the bare `net` expression on the last line makes the
# notebook display the module structure.
net = ECO_Lite()
net

ECO_Lite(
  (eco_2d): ECO_2D(
    (basic_conv): BasicConv(
      (conv1_7x7_s2): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
      (conv1_7x7_s2_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv1_relu_7x7): ReLU(inplace)
      (pool1_3x3_s2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
      (conv2_3x3_reduce): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
      (conv2_3x3_reduce_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2_relu_3x3_reduce): ReLU(inplace)
      (conv2_3x3): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2_3x3_bn): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2_relu_3x3): ReLU(inplace)
      (pool2_3x3_s2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    )
    (inceptionA): InceptionA(
      (inception_3a_1x1): 

## 学習済みモデルをロード
このモデルをゼロから訓練すると非常に時間がかかるため，Kineticsで学習済みの重みをロードして利用する  
[ECO_Lite_rgb_model_Kinetics.pth.tar](https://drive.google.com/file/d/1XNIq7byciKgrn011jLBggd2g79jKX4uD/view)をload_weights.shで取得．  
参考：[curlやwgetで公開済みGoogle Driveデータをダウンロードする](https://qiita.com/namakemono/items/c963e75e0af3f7eed732)  
  
学習済みモデルと本モデルではパラメータ名が異なるため，state_dictの並び順に従って先頭から順に重みを対応付ける．両者の順番が対応していれば，正しい重みデータで上書きされる．

In [9]:
def load_pretrained_ECO(model_dict, pretrained_model_dict):
    """Build a state dict by copying pretrained values in positional order.

    The i-th entry of ``pretrained_model_dict`` overwrites the i-th key of
    ``model_dict``; key names are not compared, only their order. Each pairing
    is printed as ``<pretrained key> → <model key>``.

    Args:
        model_dict: Mapping of the model's parameter names to values. It is
            copied, not modified.
        pretrained_model_dict: Mapping of pretrained parameter names to values,
            in the same order as ``model_dict``.

    Returns:
        A copy of ``model_dict`` whose leading entries hold the pretrained
        values. Entries beyond the length of ``pretrained_model_dict`` keep
        their original values.

    Raises:
        IndexError: If ``pretrained_model_dict`` has more entries than
            ``model_dict``, so some pretrained values have no target.
    """
    param_names = list(model_dict)

    # Fail before printing or copying anything, with a message that names the
    # actual problem, instead of running out of names midway.
    if len(pretrained_model_dict) > len(param_names):
        raise IndexError(
            f"pretrained state dict has {len(pretrained_model_dict)} entries "
            f"but the model only has {len(param_names)}"
        )

    new_state_dict = model_dict.copy()

    for name, (key_name, value) in zip(param_names, pretrained_model_dict.items()):
        new_state_dict[name] = value
        print(f"{key_name} → {name}")

    return new_state_dict

In [10]:
# Path to the pretrained ECO Lite checkpoint.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
net_model_ECO = "../datasets/chapter9/ECO_Lite_rgb_model_Kinetics.pth.tar"
pretrained_model = torch.load(net_model_ECO, map_location='cpu')  # map tensors onto the CPU
pretrained_model_dict = pretrained_model['state_dict']  # the weights sit under the 'state_dict' key

# Current parameters of the model; their names are the targets of the mapping.
model_dict = net.state_dict()

# Pair the pretrained values with this model's parameter names by position.
new_state_dict = load_pretrained_ECO(model_dict, pretrained_model_dict)

# Switch to evaluation mode, then load the remapped weights. The last
# expression's return value is what the notebook shows as the cell output.
net.eval()
net.load_state_dict(new_state_dict)

module.base_model.conv1_7x7_s2.weight → eco_2d.basic_conv.conv1_7x7_s2.weight
module.base_model.conv1_7x7_s2.bias → eco_2d.basic_conv.conv1_7x7_s2.bias
module.base_model.conv1_7x7_s2_bn.weight → eco_2d.basic_conv.conv1_7x7_s2_bn.weight
module.base_model.conv1_7x7_s2_bn.bias → eco_2d.basic_conv.conv1_7x7_s2_bn.bias
module.base_model.conv1_7x7_s2_bn.running_mean → eco_2d.basic_conv.conv1_7x7_s2_bn.running_mean
module.base_model.conv1_7x7_s2_bn.running_var → eco_2d.basic_conv.conv1_7x7_s2_bn.running_var
module.base_model.conv1_7x7_s2_bn.num_batches_tracked → eco_2d.basic_conv.conv1_7x7_s2_bn.num_batches_tracked
module.base_model.conv2_3x3_reduce.weight → eco_2d.basic_conv.conv2_3x3_reduce.weight
module.base_model.conv2_3x3_reduce.bias → eco_2d.basic_conv.conv2_3x3_reduce.bias
module.base_model.conv2_3x3_reduce_bn.weight → eco_2d.basic_conv.conv2_3x3_reduce_bn.weight
module.base_model.conv2_3x3_reduce_bn.bias → eco_2d.basic_conv.conv2_3x3_reduce_bn.bias
module.base_model.conv2_3x3_reduce_b

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

## 推論
DataLoaderから8つの動画を取り出し，それぞれECOモデルで推論

In [11]:
# Draw a fresh batch from the validation loader.
batch_iterator = iter(val_dataloader)
imgs_transformeds, labels, label_ids, dir_path = next(batch_iterator)

# Forward pass only: gradient tracking is switched off for inference.
with torch.no_grad():
    outputs = net(imgs_transformeds)

print(outputs.size())

torch.Size([8, 400])


推論結果の上位を出力する処理

In [12]:
def inference(dir_path, outputs_input, id_label_dict, idx=0, top_k=5):
    """Print the highest-scoring class labels for one sample of a batch.

    Args:
        dir_path: Sequence of per-sample paths; ``dir_path[idx]`` is printed.
        outputs_input: Tensor of class scores indexed as ``[sample, class]``.
            It is only read, never modified.
        id_label_dict: Mapping from integer class index to label name.
        idx: Position of the sample within the batch.
        top_k: Number of ranks to print. It is capped at the number of
            classes available.

    Returns:
        None. The path and the ranked labels are written to stdout.
    """
    print("Path:", dir_path[idx])

    scores = outputs_input[idx]
    # topk ranks the scores directly, so no sentinel value has to be written
    # over the previous maximum, and it works for tensors on any device.
    k = min(top_k, scores.numel())
    _, top_ids = torch.topk(scores, k)

    for rank, class_idx in enumerate(top_ids.tolist(), start=1):
        print(f"Rank {rank}: {id_label_dict[class_idx]}")

In [15]:
# Show the top-ranked predictions for the first video in the batch.
inference(dir_path, outputs, id_label_dict, 0)

Path: ../datasets/chapter9/kinetics_videos/bungee jumping/TUvSX0pYu4o_000002_000012
Rank 1: bungee jumping
Rank 2: trapezing
Rank 3: abseiling
Rank 4: swinging on something
Rank 5: climbing a rope


バンジージャンプと推定成功  
2位は空中ブランコ，3位は懸垂下降で，確かに雰囲気がバンジージャンプに似ているかもしれない．

In [18]:
# Show the top-ranked predictions for the second video in the batch.
inference(dir_path, outputs, id_label_dict, 1)

Path: ../datasets/chapter9/kinetics_videos/arm wrestling/BdMiTo_OtnU_000024_000034
Rank 1: arm wrestling
Rank 2: drinking beer
Rank 3: getting a tattoo
Rank 4: waxing legs
Rank 5: stretching leg


アームレスリングも成功  
ビールを飲むときにも確かに腕を豪快に使うので似た雰囲気かもしれない．

さらに自前のデータセットでファインチューニングしていくことで自前の動画でも分類が可能になる．