# pt파일로부터 fine-tune 된 ResNet-50 모델 불러오기

In [102]:
import torch
from torchvision.models.resnet import resnet50


# Build a plain torchvision ResNet-50 and restore the fine-tuned weights into it.
finetuned_resnet = resnet50()
# map_location="cpu" keeps the load working on machines without CUDA even if the
# tensors in the file were saved from a GPU; the freshly built model is on the
# CPU anyway, so the restored values are the same.
finetuned_resnet.load_state_dict(
    torch.load("../finetuned_resnet50/Res_Sim_RoI.pt", map_location="cpu")
)

<All keys matched successfully>

# ckpt파일로부터 fine-tune 된 DeTR 모델 불러오기

In [103]:
# DeTR 모델 정의
from transformers import DetrForObjectDetection, DetrImageProcessor
import os
import torchvision
from torch.utils.data import DataLoader
from transformers import DetrForObjectDetection
import pytorch_lightning as pl


# Prefer the first CUDA GPU when one is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Identifier of the pretrained DETR checkpoint on Hugging Face.
CHECKPOINT = "facebook/detr-resnet-50"

# Load the pretrained image processor and detection model named by CHECKPOINT.
image_processor = DetrImageProcessor.from_pretrained(CHECKPOINT)
model = DetrForObjectDetection.from_pretrained(CHECKPOINT)

# Dataset definition: root folder, annotation file name, and one sub-folder per split.
dataset_location = "/mnt/d/Data/811_dataset"
ANNOTATION_FILE_NAME = "labels.json"
TRAIN_DIRECTORY = os.path.join(dataset_location, "train")
VAL_DIRECTORY = os.path.join(dataset_location, "val")
TEST_DIRECTORY = os.path.join(dataset_location, "test")


class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO-format detection dataset whose samples are run through an image processor.

    The annotation file is expected inside the image directory under the name
    given by the module-level ``ANNOTATION_FILE_NAME``.
    """

    def __init__(
        self,
        dataset_directory_path: str,
        image_directory_path: str,
        image_processor,
        train: bool = True,
    ):
        """Open the annotation file and remember the image processor.

        Args:
            dataset_directory_path: Accepted for call-site compatibility; this
                class does not read it.
            image_directory_path: Directory holding the images and the
                annotation file.
            image_processor: Callable invoked as
                ``image_processor(images=..., annotations=..., return_tensors="pt")``.
            train: Accepted for call-site compatibility; this class does not
                read it.
        """
        annotation_file_path = os.path.join(image_directory_path, ANNOTATION_FILE_NAME)
        super().__init__(image_directory_path, annotation_file_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for the sample at position ``idx``."""
        image, annotations = super().__getitem__(idx)
        image_id = self.ids[idx]
        # The processor is called with the annotations wrapped together with their image id.
        target = {'image_id': image_id, 'annotations': annotations}
        encoding = self.image_processor(images=image, annotations=target, return_tensors="pt")
        # Remove only the leading axis (the processor was given a single image).
        # A bare squeeze() would also drop any other axis that happens to have size 1.
        pixel_values = encoding["pixel_values"].squeeze(0)
        labels = encoding["labels"][0]

        return pixel_values, labels


def _make_split_dataset(split_directory, train):
    """Build the CocoDetection dataset for one split; images live in its ``data`` sub-folder."""
    return CocoDetection(
        dataset_directory_path=split_directory,
        image_directory_path=split_directory + "//data",
        image_processor=image_processor,
        train=train,
    )


# One dataset per split, created in train / val / test order.
TRAIN_DATASET = _make_split_dataset(TRAIN_DIRECTORY, True)
VAL_DATASET = _make_split_dataset(VAL_DIRECTORY, False)
TEST_DATASET = _make_split_dataset(TEST_DIRECTORY, False)

# Report how many samples each split contains.
for _caption, _split in (
    ("Number of training examples:", TRAIN_DATASET),
    ("Number of validation examples:", VAL_DATASET),
    ("Number of test examples:", TEST_DATASET),
):
    print(_caption, len(_split))

# Data loader definitions
def collate_fn(batch):
    """Combine ``(pixel_values, target)`` samples into one batch dictionary.

    Images in a batch can differ in size, so they cannot be stacked directly.
    The image processor pads them to the largest resolution in the batch and
    produces a binary ``pixel_mask`` telling real pixels from padding.

    Returns:
        A dict with ``pixel_values`` and ``pixel_mask`` taken from the padded
        encoding, and ``labels`` as a list with one target per sample.
    """
    images = []
    for sample in batch:
        images.append(sample[0])
    padded = image_processor.pad(images, return_tensors="pt")
    targets = [sample[1] for sample in batch]
    return dict(
        pixel_values=padded['pixel_values'],
        pixel_mask=padded['pixel_mask'],
        labels=targets,
    )

# Loaders for the three splits. Only the training loader shuffles; the test
# loader uses a smaller batch and the default number of workers.
TRAIN_DATALOADER = DataLoader(
    TRAIN_DATASET,
    batch_size=4,
    shuffle=True,
    num_workers=8,
    collate_fn=collate_fn,
    pin_memory=True,
)
VAL_DATALOADER = DataLoader(
    VAL_DATASET,
    batch_size=4,
    num_workers=8,
    collate_fn=collate_fn,
    pin_memory=True,
)
TEST_DATALOADER = DataLoader(
    TEST_DATASET,
    batch_size=2,
    collate_fn=collate_fn,
    pin_memory=True,
)

# DETR LightningModule definition
class Detr(pl.LightningModule):
    """Lightning wrapper around a pretrained ``DetrForObjectDetection`` model.

    The wrapped model is available as ``self.model``. The data loader hooks
    return the module-level loaders.
    """

    def __init__(self, lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4, num_labels=10):
        """Create the wrapped DETR model and store the optimisation settings.

        Args:
            lr: Stored as ``self.lr``.
            lr_backbone: Stored as ``self.lr_backbone``.
            weight_decay: Stored as ``self.weight_decay``.
            num_labels: Number of labels passed to ``from_pretrained``
                (default 10, the value that used to be hard-coded).
        """
        super().__init__()
        self.model = DetrForObjectDetection.from_pretrained(
            pretrained_model_name_or_path=CHECKPOINT,
            revision='no_timm',
            num_labels=num_labels,
            # Lets loading proceed when checkpoint weights differ in shape
            # from the model being built instead of raising.
            ignore_mismatched_sizes=True,
        )

        self.lr = lr
        self.lr_backbone = lr_backbone
        self.weight_decay = weight_decay

    # NOTE(review): the forward/step/optimizer methods below are commented out.
    # In this file the class is only used through Detr.load_from_checkpoint(...);
    # confirm before re-enabling them.

    # def forward(self, pixel_values, pixel_mask):
    #     return self.model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    # def common_step(self, batch, batch_idx):
    #     pixel_values = batch["pixel_values"]
    #     pixel_mask = batch["pixel_mask"]
    #     labels = [{k: v.to(self.device) for k, v in t.items()} for t in batch["labels"]]

    #     outputs = self.model(pixel_values=pixel_values, pixel_mask=pixel_mask, labels=labels)

    #     loss = outputs.loss
    #     loss_dict = outputs.loss_dict

    #     return loss, loss_dict

    # def training_step(self, batch, batch_idx):
    #     loss, loss_dict = self.common_step(batch, batch_idx)
    #     # logs metrics for each training_step, and the average across the epoch
    #     self.log("training_loss", loss, on_step=True, on_epoch=True, logger=True)
    #     for k,v in loss_dict.items():
    #         self.log("train_" + k, v.item())

    #     return loss

    # def validation_step(self, batch, batch_idx):    
    #     loss, loss_dict = self.common_step(batch, batch_idx)
    #     self.log("validation/loss", loss,on_step=True, on_epoch=True, logger=True)
    #     for k, v in loss_dict.items():
    #         self.log("validation_" + k, v.item())

    #     return loss

    # def test_step(self, batch, batch_idx):
    #     loss, loss_dict = self.common_step(batch, batch_idx)
    #     # logs metrics for each training_step, and the average across the epoch
    #     self.log("test_loss", loss, on_step=True, on_epoch=True, logger=True)
    #     for k,v in loss_dict.items():
    #         self.log("test_" + k, v.item())

    #     return loss

    # def configure_optimizers(self):
    #     # DETR authors decided to use different learning rate for backbone
    #     # you can learn more about it here:
    #     # - https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/main.py#L22-L23
    #     # - https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/main.py#L131-L139
    #     param_dicts = [
    #         {
    #             "params": [p for n, p in self.named_parameters() if "backbone" not in n and p.requires_grad]},
    #         {
    #             "params": [p for n, p in self.named_parameters() if "backbone" in n and p.requires_grad],
    #             "lr": self.lr_backbone,
    #         },
    #     ]
    #     return torch.optim.AdamW(param_dicts, lr=self.lr, weight_decay=self.weight_decay)

    def train_dataloader(self):
        """Return the module-level training data loader."""
        return TRAIN_DATALOADER

    def val_dataloader(self):
        """Return the module-level validation data loader."""
        return VAL_DATALOADER

    def test_dataloader(self):
        """Return the module-level test data loader."""
        return TEST_DATALOADER

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


loading annotations into memory...
Done (t=0.88s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
Number of training examples: 19393
Number of validation examples: 2466
Number of test examples: 2443


In [104]:
ckpt_path = "../DeTR-Compare_frozen_layers/DeTR-Compare frozen layers/backbone+attention/checkpoints/epoch=4-step=24245.ckpt"

# Restore the fine-tuned DETR onto the CPU explicitly. The ResNet whose tensors
# are spliced into this model is built on the CPU, so pinning the device here
# keeps every tensor of the merged model on one device whatever the checkpoint
# was saved from.
model = Detr.load_from_checkpoint(ckpt_path, map_location="cpu")

Some weights of DetrForObjectDetection were not initialized from the model checkpoint at facebook/detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.weight: found shape torch.Size([92, 256]) in the checkpoint and torch.Size([11, 256]) in the model instantiated
- class_labels_classifier.bias: found shape torch.Size([92]) in the checkpoint and torch.Size([11]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# ResNet에서 DeTR로 Weights and Biases 값 복사

In [105]:
# Print both architectures so the torchvision ResNet layers can be matched
# against the modules inside the DETR model.
print(finetuned_resnet)
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [106]:
# Grab layer4 of the fine-tuned ResNet
finetuned_layer4 = finetuned_resnet.layer4
print(finetuned_layer4)

Sequential(
  (0): Bottleneck(
    (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (downsample): Sequential(
      (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
      (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (1): Bottleneck(
    (conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): 

In [107]:
# Grab the DETR backbone stage matching ResNet layer4 (encoder.stages[3])
resnetstage3 = model.model.model.backbone.conv_encoder.model.encoder.stages[3]
print(resnetstage3)
# Size of the bias of the first normalization layer in the stage's first block
print(resnetstage3.layers[0].layer[0].normalization.bias.data.size())

ResNetStage(
  (layers): Sequential(
    (0): ResNetBottleNeckLayer(
      (shortcut): ResNetShortCut(
        (convolution): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (normalization): DetrFrozenBatchNorm2d()
      )
      (layer): Sequential(
        (0): ResNetConvLayer(
          (convolution): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (normalization): DetrFrozenBatchNorm2d()
          (activation): ReLU()
        )
        (1): ResNetConvLayer(
          (convolution): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (normalization): DetrFrozenBatchNorm2d()
          (activation): ReLU()
        )
        (2): ResNetConvLayer(
          (convolution): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (normalization): DetrFrozenBatchNorm2d()
          (activation): Identity()
        )
      )
      (activation): ReLU()
    )
    (1): ResNetBottleNeckLay

In [108]:
# (0) Bottleneck from finetuned_layer4
# Collect the direct children of layer4 into a list so they can be indexed.
finetuned_layer4_bottleneck = list(finetuned_layer4.children())
#print(finetuned_layer4_bottleneck)
#print(finetuned_layer4_bottleneck[0].bn1.weight.data.size())
print(finetuned_layer4_bottleneck[0])

Bottleneck(
  (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
  (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (downsample): Sequential(
    (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
    (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)


In [57]:
# Copy the three convolution weights of the first fine-tuned layer4 Bottleneck
# into the first block of the DETR stage, one convolution at a time.
# ResNetStage3.layers0.layer0.convolution size = [512,1024,1,1]
resnetstage3.layers[0].layer[0].convolution.weight.data = finetuned_layer4_bottleneck[0].conv1.weight.data.clone()
#print(resnetstage3.layers[0].layer[0].convolution.weight.data)
#print(model.model.model.backbone.conv_encoder.model.encoder.stages[3].layers[0].layer[0].convolution.weight.data)

# ResNetStage3.layers0.layer1.convolution size = [512,512,3,3]
resnetstage3.layers[0].layer[1].convolution.weight.data = finetuned_layer4_bottleneck[0].conv2.weight.data.clone()

# ResNetStage3.layers0.layer2.convolution size = [2048,512,1,1]
resnetstage3.layers[0].layer[2].convolution.weight.data = finetuned_layer4_bottleneck[0].conv3.weight.data.clone()

# ResNetStage3.layers1.layer0.convolution size = [512,2048,1,1]
resnetstage3.layers[1].layer[0].convolution.weight.data = 

In [110]:
def _copy_into(target, source):
    """Overwrite ``target`` in place with the values of ``source``.

    ``copy_`` keeps the target's device and dtype and raises when the shapes
    are incompatible, instead of silently rebinding ``.data`` to a tensor of a
    different shape or device.
    """
    with torch.no_grad():
        target.copy_(source)


def _copy_batchnorm(detr_norm, resnet_bn):
    """Copy affine parameters AND running statistics of a BatchNorm layer.

    The frozen batch norm in the DETR backbone normalises with its stored
    running mean and variance, so copying only ``weight``/``bias`` would pair
    the fine-tuned affine parameters with the old statistics.
    """
    for name in ("weight", "bias", "running_mean", "running_var"):
        _copy_into(getattr(detr_norm, name), getattr(resnet_bn, name))


detr_blocks = list(resnetstage3.layers)
if len(detr_blocks) != len(finetuned_layer4_bottleneck):
    raise ValueError(
        f"Block count mismatch: DETR stage has {len(detr_blocks)} blocks, "
        f"fine-tuned layer4 has {len(finetuned_layer4_bottleneck)}"
    )

for finetuned_block, detr_block in zip(finetuned_layer4_bottleneck, detr_blocks):
    # (conv, bn) pairs of the Bottleneck, in the order of detr_block.layer[0..2].
    resnet_pairs = (
        (finetuned_block.conv1, finetuned_block.bn1),
        (finetuned_block.conv2, finetuned_block.bn2),
        (finetuned_block.conv3, finetuned_block.bn3),
    )
    for position, (resnet_conv, resnet_bn) in enumerate(resnet_pairs):
        detr_conv_layer = detr_block.layer[position]
        # Weights for the convolution layer
        _copy_into(detr_conv_layer.convolution.weight, resnet_conv.weight)
        # Weights, biases and running statistics for the normalization layer
        _copy_batchnorm(detr_conv_layer.normalization, resnet_bn)

    # Blocks with a downsample branch have a matching shortcut on the DETR side
    # (the first block of the stage in this notebook).
    if finetuned_block.downsample is not None:
        _copy_into(detr_block.shortcut.convolution.weight, finetuned_block.downsample[0].weight)
        _copy_batchnorm(detr_block.shortcut.normalization, finetuned_block.downsample[1])
    

In [111]:
# check: after the copy, the DETR shortcut normalization weight should print
# the same values as the fine-tuned ResNet's downsample BatchNorm weight.
print(model.model.model.backbone.conv_encoder.model.encoder.stages[3].layers[0].shortcut.normalization.weight.data)
print(finetuned_layer4_bottleneck[0].downsample[1].weight.data)

tensor([0.7504, 0.8020, 0.9884,  ..., 0.8470, 0.9360, 0.8504])
tensor([0.7504, 0.8020, 0.9884,  ..., 0.8470, 0.9360, 0.8504])


# Backbone 변경된 모델 Save

In [101]:
# Write the state_dict of `model` to disk.
torch.save(model.state_dict(), "../Models/DeTR_with_ResNet_RoI.pt")