# 1. Import libraries

In [1]:
import os
import time
from matplotlib import pyplot as plt
import numpy as np
from numpy import printoptions
import pandas as pd
from PIL import Image
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
from torchvision import transforms
from torchvision import models
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
#import requests
#import tarfile
import random
#import json
import shutil
from datetime import datetime

In [2]:
import warnings
warnings.filterwarnings('always')

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True #Enable processing cutted images(prevent OSError: image file is truncated)

# 2. Set paths
 - To load dataset
 - To save checkpoints & best checkpoints

In [3]:
#Set directories as you want.
path = "/home/ubuntu/Desktop/Project"
dataset_path = os.path.join(path, "datasets/circlin_feeds_dataset/image_dataset")

date = datetime.today().strftime("%Y%m%d")
print(f"Date today: {date}")
checkpoint_path = os.path.join(path, f"autolabeler_classifier/resnext50_model/{date}")
model_path = os.path.join(path, f"autolabeler_classifier/resnext50_model/{date}")
metric_path = os.path.join(path, f"autolabeler_classifier/resnext50_model/{date}")

# Save path for logs
# logdir = os.path.join(path, f"autolabeler_classifier/resnext50_model/{date}/logs")

Date today: 20211201


# 3. Training settings

## 3-1. Set seed number.

In [4]:
# Fix all seeds to make experiments reproducible
torch.manual_seed(2020)
torch.cuda.manual_seed(2020)
np.random.seed(2020)
random.seed(2020)
torch.backends.cudnn.deterministic = True

## 3-2. Hyperparameters
 - __Adjust: <u>mean</u>, <u>std</u>__

In [5]:
# Initialize the training parameters.
NUM_WORKERS = 8 # Number of CPU processes for data preprocessing
LEARNING_RATE = 1e-4 # Learning rate
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
save_freq = 1 # Save checkpoint frequency (epochs)
test_freq = 200 # Test model frequency (iterations)
EPOCHS = 36 # Number of epochs for training 
# Note: on the small subset of data overfitting happens after 30-35 epochs


#For normalization
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]


# Run tensorboard
# %load_ext tensorboard
# %tensorboard --logdir {logdir}

## 3-3. Loss function

In [6]:
# Loss function
#criterion = nn.BCELoss() #BCEWithLogitsLoss
def loss_fn(outputs, targets):
    return torch.nn.BCELoss()(outputs, targets) #BCEWithLogitsLoss()

## 3-4. Check GPU status & Enable distributed processing
 - __Should be improved!__ 
   - As is : Using DatParallel
   - To be: Use DistributedDataParallel

In [7]:
#Device check(for GPU computing)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [8]:
#For multiple GPU utilization: This should be improved...

# dist.init_process_group(
#     backend='nccl',
#     init_method='tcp://localhost:9999', #FREEPORT
#     world_size=2,
#     rank=0,
# )

# dist.init_process_group(
#     backend="nccl",
#     init_method='tcp://127.0.0.1:9999',
#     rank=0,
#     world_size=2)

## 3-5. Optimizer

In [9]:
#Optimizer
def make_optimizer(model, lr):
    optimizer = torch.optim.Adam(
        params =  model.parameters(), 
        lr=lr)

    return optimizer

# 4. Prepare dataset

## 4-1. Define target labels(46)

In [10]:
#Define taret labels
labels = ['간편식', '건강간식', '건강식', '건강음료', '걷기/산책', '격투기', '골프', 
          '기타식단', '기타운동', '농구', '달리기/조깅', '당구', '등산/등반', '루틴기록', '맨몸', '무술', 
          '배구', '배드민턴', '보조제', '보충제', '볼링', '수상스포츠', '스키/스노보드', '승마', '신체기록', 
          '야구', '온라인클래스', '요가', '운동기구', '운동용품', '웨이트', '유산소기록', '의류', '일반간식', 
          '일반식', '일반음료', '일상생활', '자전거', '종합운동', '줄넘기', '축구/풋살', '탁구', '테니스', 
          '폴댄스', '필라테스', '홈트'] #46

## 4-2. Create custom dataset

In [11]:
# class CustomDataset(torch.utils.data.Dataset):
#     def __init__(self, df, transforms):
#         # !!!!!!!!!!Broken image files cannot be opened by both PIL.Image, cv2... So remove them from list by try~except.
#         broken_urls = []
#         broken_index = []
#         for url in df['url']:
#             if df[df['url']==url].index[0] % 500 == 0:
#                 print(f"Now Doing: {df[df['url']==url].index[0]}, and {len(broken_urls)} urls({len(broken_index)} indexes) seem to be broken...")
#             try:
#                 image = Image.open(urlopen(url))
#             except:
#                 drop_index = df[df['url'] == url].index
#                 broken_urls.append(url)
#                 broken_index.append(drop_index)

#         print(f"{len(broken_urls)} files are broken... Cannot open below files: \n {broken_urls}")
#         df = df.drop(broken_index)
#         print(f"Removed broken file rows. Now you can use {len(df)} files. Data is as below: \n {df}")

#         self.transforms = transforms
#         self.df = df
#         self.feed_image = df['url'] #Series of file name
#         self.labels = self.df[labels].values #df.values: np.array #one-hot encoded: [0, 1, 0, ...., 1, 1]
        
#         #self.image_list = self.feed_image.tolist()
#         #self.label_list = self.labels.tolist()
        
#     def __len__(self):
#         return len(self.feed_image)

#     def __getitem__(self, index):
#         label = torch.FloatTensor(self.labels[index])
#         image_url = self.feed_image[index]

#         image = Image.open(urlopen(image_url)) #Input(image_url) is URL address. 
#         if self.transforms is not None:
#             image = self.transforms(image)
            
#         return image, label

    
# # train_annotations = os.path.join(img_folder, 'small_train.json')
# # train_dataset = CustomDataset(img_folder, train_annotations, train_transform)

- At Image.open in __ __getitem__ __  needs .convert('RGB') because Image.open returns grayscale.
    - https://stackoverflow.com/questions/59218671/runtimeerror-output-with-shape-1-224-224-doesnt-match-the-broadcast-shape

In [12]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, transforms):
        self.transforms = transforms
        self.df = df
        self.feed_image = df['url'] #Series of file name
        self.labels = self.df[labels].values #df.values: np.array #one-hot encoded: [0, 1, 0, ...., 1, 1]
        
        #self.image_list = self.feed_image.tolist()
        #self.label_list = self.labels.tolist()
        
    def __len__(self):
        return len(self.feed_image)

    def __getitem__(self, index):
        label = torch.FloatTensor(self.labels[index])
        image_url = self.feed_image[index]
        
        #Needs .convert('RGB') because Image.open returns grayscale.
        image = Image.open(image_url).convert('RGB')
        if self.transforms is not None:
            image = self.transforms(image)
            
        return image, label

    
# train_annotations = os.path.join(img_folder, 'small_train.json')
# train_dataset = CustomDataset(img_folder, train_annotations, train_transform)

In [13]:
# # We use the .tar.gz archive from this(https://github.com/thuml/HashNet/tree/master/pytorch#datasets) 
# # github repository to speed up image loading(instead of loading it from Flickr).
# # Let's download and extract it.
# img_folder = '/home/ubuntu/Desktop/ml_dl_tutorials/dataset_for_tutorial/nus_wide_images'
# if not os.path.exists(img_folder):
#     def download_file_from_google_drive(id, destination):
#         def get_confirm_token(response):
#             for key, value in response.cookies.items():
#                 if key.startswith('download_warning'):
#                     return value
#             return None

#         def save_response_content(response, destination):
#             CHUNK_SIZE = 32768
#             with open(destination, "wb") as f:
#                 for chunk in tqdm(response.iter_content(CHUNK_SIZE), desc='Downloading'):
#                     if chunk:  # filter out keep-alive new chunks
#                         f.write(chunk)

#         URL = "https://docs.google.com/uc?export=download"
#         session = requests.Session()
#         response = session.get(URL, params={'id': id}, stream=True)
#         token = get_confirm_token(response)

#         if token:
#             params = {'id': id, 'confirm': token}
#             response = session.get(URL, params=params, stream=True)
#         save_response_content(response, destination)

#     file_id = '0B7IzDz-4yH_HMFdiSE44R1lselE'
#     path_to_tar_file = str(time.time()) + '.tar.gz'
#     download_file_from_google_drive(file_id, path_to_tar_file)
#     print('Extraction')
#     with tarfile.open(path_to_tar_file) as tar_ref:
#         tar_ref.extractall(os.path.dirname(img_folder))
#     os.remove(path_to_tar_file)
# # Also, copy our pre-processed annotations to the dataset folder. 
# # Note: you can find script for generating such annotations in attachments
# copyfile('nus_wide/small_test.json', os.path.join(img_folder, 'small_test.json'))
# copyfile('nus_wide/small_train.json', os.path.join(img_folder, 'small_train.json'))

In [14]:
# # Let's take a look at the data we have. To do it we need to load the dataset without augmentations.
# dataset_val = CustomDataset(img_folder, os.path.join(img_folder, 'small_test.json'), None)
# dataset_train = CustomDataset(img_folder, os.path.join(img_folder, 'small_train.json'), None)

# # A simple function for visualization.
# def show_sample(img, binary_img_labels):
#     # Convert the binary labels back to the text representation.    
#     img_labels = np.array(dataset_val.classes)[np.argwhere(binary_img_labels > 0)[:, 0]]
#     plt.imshow(img)
#     plt.title("{}".format(', '.join(img_labels)))
#     plt.axis('off')
#     plt.show()

# for sample_id in range(5):
#     show_sample(*dataset_val[sample_id])

In [15]:
# # Calculate label distribution for the entire dataset (train + test)
# samples = dataset_val.annos + dataset_train.annos
# samples = np.array(samples)
# with printoptions(precision=3, suppress=True):
#     class_counts = np.sum(samples, axis=0)
#     # Sort labels according to their frequency in the dataset.
#     sorted_ids = np.array([i[0] for i in sorted(enumerate(class_counts), key=lambda x: x[1])], dtype=int)
#     print('Label distribution (count, class name):', list(zip(class_counts[sorted_ids].astype(int), np.array(dataset_val.classes)[sorted_ids])))
#     plt.barh(range(len(dataset_val.classes)), width=class_counts[sorted_ids])
#     plt.yticks(range(len(dataset_val.classes)), np.array(dataset_val.classes)[sorted_ids])
#     plt.gca().margins(y=0)
#     plt.grid()
#     plt.title('Label distribution')
#     plt.show()

In [16]:
#Get image dataset
dataset = os.path.join(dataset_path, "20211201_image_dataset(change_url).csv")
whole_df = pd.read_csv(dataset)
print(whole_df.columns)
print(whole_df['deidentification_x'].unique())
print(len(whole_df))

Index(['index', 'seq', 'url', 'deidentification_x', '간편식', '건강간식', '건강식',
       '건강음료', '걷기/산책', '격투기', '골프', '기타식단', '기타운동', '농구', '달리기/조깅', '당구',
       '등산/등반', '루틴기록', '맨몸', '무술', '배구', '배드민턴', '보조제', '보충제', '볼링', '수상스포츠',
       '스키/스노보드', '승마', '신체기록', '야구', '온라인클래스', '요가', '운동기구', '운동용품', '웨이트',
       '유산소기록', '의류', '일반간식', '일반식', '일반음료', '일상생활', '자전거', '종합운동', '줄넘기',
       '축구/풋살', '탁구', '테니스', '폴댄스', '필라테스', '홈트'],
      dtype='object')
['n']
215145


In [17]:
#Drop useless features/columns
copy_df = whole_df.copy()
copy_df.drop(labels=['index', 'seq', 'deidentification_x'], axis=1, inplace=True)
print(copy_df.columns)
copy_df.head(10)

Index(['url', '간편식', '건강간식', '건강식', '건강음료', '걷기/산책', '격투기', '골프', '기타식단',
       '기타운동', '농구', '달리기/조깅', '당구', '등산/등반', '루틴기록', '맨몸', '무술', '배구', '배드민턴',
       '보조제', '보충제', '볼링', '수상스포츠', '스키/스노보드', '승마', '신체기록', '야구', '온라인클래스',
       '요가', '운동기구', '운동용품', '웨이트', '유산소기록', '의류', '일반간식', '일반식', '일반음료',
       '일상생활', '자전거', '종합운동', '줄넘기', '축구/풋살', '탁구', '테니스', '폴댄스', '필라테스',
       '홈트'],
      dtype='object')


Unnamed: 0,url,간편식,건강간식,건강식,건강음료,걷기/산책,격투기,골프,기타식단,기타운동,...,일상생활,자전거,종합운동,줄넘기,축구/풋살,탁구,테니스,폴댄스,필라테스,홈트
0,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,/home/ubuntu/Desktop/Project/datasets/circlin_...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
#Train - validation split
train_size = 0.8
train_df = copy_df.copy().sample(frac=train_size, random_state=200).reset_index(drop=True)
val_df = copy_df.drop(train_df.index).reset_index(drop=True)
# train_dataset = CustomDataset(splitted_train_df, tokenizer, MAX_LEN)
# valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)


# Train preprocessing
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(),
    transforms.RandomAffine(degrees=20, 
                            translate=(0.2, 0.2),
                            scale=(0.5, 1.5),
                            shear=None,
                            resample=False, 
                            fillcolor=tuple(np.array(np.array(mean)*255).astype(int).tolist())),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
# Test preprocessing
val_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])
print(tuple(np.array(np.array(mean)*255).tolist()))


# valid_annotations = os.path.join(img_folder, 'small_test.json')
# train_annotations = os.path.join(img_folder, 'small_train.json')
# train_dataset = CustomDataset(img_folder, train_annotations, train_transform)
# valid_dataset = CustomDataset(img_folder, valid_annotations, val_transform)

train_dataset = CustomDataset(train_df, train_transform)
valid_dataset = CustomDataset(val_df, val_transform)

(123.675, 116.28, 103.53)


  "Argument resample is deprecated and will be removed since v0.10.0. Please, use interpolation instead"
  "Argument fillcolor is deprecated and will be removed since v0.10.0. Please, use fill instead"


In [19]:
train_data_loader =  torch.utils.data.DataLoader(train_dataset, 
                              batch_size=TRAIN_BATCH_SIZE, 
                              num_workers=NUM_WORKERS,  #0?
                              shuffle=True,
                              drop_last=True)
val_data_loader =  torch.utils.data.DataLoader(valid_dataset, 
                             batch_size=VALID_BATCH_SIZE, 
                             num_workers=NUM_WORKERS) #0?

#num_train_batches = int(np.ceil(len(train_dataset) / batch_size))

In [20]:
# #To explore file shape.
# batchlist = []
# datalist = []
# for batch_idx, data in enumerate(train_data_loader):
#     #print(batch_idx, data)
#     batchlist.append(batch_idx)
#     datalist.append(data)

# 5. Make feed image classification model

## 5-1. Define functions that save checkpoint of model

In [21]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
    optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
    valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model, optimizer, checkpoint['epoch'], valid_loss_min #valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

## 5-2. Define Resnext50 model as a class
 - Use pytorch implemented pretrained model.

In [22]:
# Use the torchvision's implementation of ResNeXt, but add FC layer for a different number of classes (27) and a Sigmoid instead of a default Softmax.
class ResNeXt50Class(nn.Module):
    def __init__(self, n_classes):
#         super().__init__()
#         resnet = models.resnext50_32x4d(pretrained=True)
#         resnet.fc = nn.Sequential(
#             nn.Dropout(p=0.2),
#             nn.Linear(in_features=resnet.fc.in_features, out_features=n_classes)
#         )
#         self.base_model = resnet
        super(ResNeXt50Class, self).__init__()
        self.resnext_model = models.resnext50_32x4d(pretrained=True)
        self.resnext_model.fc = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_features=self.resnext_model.fc.in_features, 
                      out_features=n_classes)
        )
        self.sigm = nn.Sigmoid()

    def forward(self, x):
        output = self.sigm(self.resnext_model(x))
        
        return output
        #return self.sigm(self.base_model(x))

In [23]:
model = ResNeXt50Class(len(labels)) #or len(labels) #train_dataset.classes
model = model.cuda()
model = nn.DataParallel(model) #Distributed
#model = nn.parallel.DistributedDataParallel(model, device_ids=[0, 1]) #Distributed DataParallel  ===> Should use this!!!!!!!!!!!!!!!!!
model.to(device)

DataParallel(
  (module): ResNeXt50Class(
    (resnext_model): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=Tru

In [24]:
# optimizer
optimizer = make_optimizer(model, LEARNING_RATE)

In [25]:
val_targets = []
val_outputs = []

## 5-3. Training
 - __<u>Add Train Loss!!!!!!!!!!</u>__

In [26]:
learning_rate = [0.000001, 0.00001, 0.0001, 0.001]
train_losses_lr = {}
avg_train_losses_lr = {}
val_losses_lr = {}
avg_val_losses_lr = {}
epoch_list = [int(x) for x in np.linspace(1, EPOCHS, EPOCHS).tolist()]
print(epoch_list)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]


In [27]:
def train_model(n_epochs,
                       training_loader,
                       validation_loader,
                       model,
                       optimizer,
                       checkpoint_path,
                       best_model_path,
                       metric_path,
                       date):
    # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf
    train_loss_epoch = []
    avg_train_loss_epoch = []    
    val_loss_epoch = [] #append to val_loss_list
    avg_val_loss_epoch = [] #append to avg_val_list
    
    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print(f'############# Epoch {epoch}: Training Start   #############')
#         for images, targets in enumerate(training_loader):
        for batch_idx, data in enumerate(training_loader):
            images, targets = data[0], data[1]
            images, targets = images.to(device), targets.to(device)
            
            outputs = model(images)

            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            if batch_idx%5000==0:
                print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        print('############# Epoch {}: Training End     #############'.format(epoch))
        train_loss_epoch.append(train_loss)
        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################    
        # validate the model #
        ######################

        model.eval()
   
        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                images, targets = data[0], data[1]
                images, targets = images.to(device), targets.to(device)
                outputs = model(images)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

            print('############# Epoch {}: Validation End     #############'.format(epoch))
           # calculate average losses
#             print('before calculate avg train loss', train_loss)
            val_loss_epoch.append(valid_loss) 
            avg_train_loss = train_loss/len(training_loader)
            avg_valid_loss = valid_loss/len(validation_loader)
            #Print training/validation statistics
            print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
                epoch, 
                avg_train_loss,
                avg_valid_loss
            ))
            avg_train_loss_epoch.append(avg_train_loss)
            avg_val_loss_epoch.append(avg_valid_loss) 
            

            # create checkpoint variable and add important data
            checkpoint = {
                    'epoch': epoch + 1,
                    'valid_loss_min': avg_valid_loss,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict()
              }

            # save checkpoint
            #save_ckp(checkpoint, False, checkpoint_path, best_model_path)
            save_ckp(checkpoint, False,  f"{checkpoint_path}_{epoch}", best_model_path)
            

            ## TODO: save the model if validation loss has decreased
            if avg_valid_loss <= valid_loss_min:
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,avg_valid_loss))
                # save checkpoint as best model
                #save_ckp(checkpoint, True, checkpoint_path, best_model_path)
                save_ckp(checkpoint, True,  f"{checkpoint_path}_{epoch}", best_model_path)
                valid_loss_min = avg_valid_loss

        now = datetime.today().strftime("%Y-%m-%d %H:%M:%S")
        log_text = f"[{now}]: [Learning Rate {lr}, Epoch {epoch}] - train_loss = {train_loss}, avg_train_loss = {avg_train_loss}, validation_loss = {valid_loss}, avg_validation_loss = {avg_valid_loss}\n"
        if os.path.isfile(os.path.join(metric_path, f"metric_logs_{date}.txt")):
            with open(os.path.join(metric_path, f"metric_logs_{date}.txt"), 'a', encoding='utf-8') as f:
                f.write(log_text)
        else:
            with open(os.path.join(metric_path, f"metric_logs_{date}.txt"), 'w', encoding='utf-8') as f:
                f.write(log_text)       
        print('############# Epoch {}  Done   #############\n'.format(epoch))
#     train_loss_lr.append(train_loss_epoch)
#     avg_train_losse_lr.append(avg_train_loss_epoch)        
#     val_losses_lr.append(val_loss_epoch)
#     avg_val_losses_lr.append(avg_val_loss_epoch)

    train_losses_lr[lr] = train_loss_epoch
    avg_train_losses_lr[lr] = avg_train_loss_epoch
    val_losses_lr[lr] = val_loss_epoch
    avg_val_losses_lr[lr] = avg_val_loss_epoch
    print(f"train_losses_lr for LR {lr}: \n {train_losses_lr}")
    print(f"avg_train_losses_lr for LR {lr}: \n {avg_train_losses_lr}")
    print(f"val_losses_lr for LR {lr}: \n {val_losses_lr}")
    print(f"avg_val_losses_lr {lr}: \n {avg_val_losses_lr}")

    return model

In [28]:
# model.train()
# model = model.to(device)

# # If more than one GPU is available we can use both to speed up the training.
# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model)

# os.makedirs(checkpoint_path, exist_ok=True)

# # # Loss function
# # criterion = nn.BCELoss() #BCEWithLogitsLoss
# # Tensoboard logger
# logger = SummaryWriter(logdir)

In [29]:
# # Use threshold to define predicted labels and invoke sklearn's metrics with different averaging strategies.
# def calculate_metrics(pred, target, threshold=0.5):
#     pred = np.array(pred > threshold, dtype=float)
#     return {'micro/precision': precision_score(y_true=target, y_pred=pred, average='micro'),
#             'micro/recall': recall_score(y_true=target, y_pred=pred, average='micro'),
#             'micro/f1': f1_score(y_true=target, y_pred=pred, average='micro'),
#             'macro/precision': precision_score(y_true=target, y_pred=pred, average='macro'),
#             'macro/recall': recall_score(y_true=target, y_pred=pred, average='macro'),
#             'macro/f1': f1_score(y_true=target, y_pred=pred, average='macro'),
#             'samples/precision': precision_score(y_true=target, y_pred=pred, average='samples'),
#             'samples/recall': recall_score(y_true=target, y_pred=pred, average='samples'),
#             'samples/f1': f1_score(y_true=target, y_pred=pred, average='samples'),
#             }

### Set checkpoint path, best model's path.

In [30]:
ckpt_path = os.path.join(checkpoint_path, "curr_ckpt")
best_model_path = os.path.join(checkpoint_path, "best_model.pt")

### Training start!

In [None]:
#For hyperparameter tuning
for lr in learning_rate:
    print('\n')
    print(f'##########################################################')
    print(f'##########################################################')    
    print(f'############### Training for learning rate {lr} START! ###############')
    print(f'##########################################################')
    print(f'##########################################################')
    print('\n')
    optimizer = make_optimizer(model, lr)
    train_model(EPOCHS,
               train_data_loader,
               val_data_loader,
               model,
               optimizer,
               os.path.join(checkpoint_path, f"curr_ckpt_{lr}"),
               best_model_path,
               metric_path,
               date)



##########################################################
##########################################################
############### Training for learning rate 1e-06 START! ###############
##########################################################
##########################################################


############# Epoch 1: Training Start   #############
Epoch: 1, Training Loss:  0.6826050877571106


  if not isinstance(inputs, collections.Container) or isinstance(inputs, torch.Tensor):


############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000126 	Average Validation Loss: 0.000217
Validation loss decreased (inf --> 0.000217).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
Epoch: 2, Training Loss:  0.16546469926834106
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000045 	Average Validation Loss: 0.000122
Validation loss decreased (0.000217 --> 0.000122).  Saving model ...
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #############
Epoch: 3, Training Loss:  0.10442551225423813
############# Epoch 3: Training End     #############
############# Epoch 3: Validation S

############# Epoch 19: Training End     #############
############# Epoch 19: Validation Start   #############
############# Epoch 19: Validation End     #############
Epoch: 19 	Avgerage Training Loss: 0.000019 	Average Validation Loss: 0.000068
Validation loss decreased (0.000068 --> 0.000068).  Saving model ...
############# Epoch 19  Done   #############

############# Epoch 20: Training Start   #############
Epoch: 20, Training Loss:  0.040961649268865585
############# Epoch 20: Training End     #############
############# Epoch 20: Validation Start   #############
############# Epoch 20: Validation End     #############
Epoch: 20 	Avgerage Training Loss: 0.000019 	Average Validation Loss: 0.000067
Validation loss decreased (0.000068 --> 0.000067).  Saving model ...
############# Epoch 20  Done   #############

############# Epoch 21: Training Start   #############
Epoch: 21, Training Loss:  0.05180855467915535
############# Epoch 21: Training End     #############
############# 

Epoch: 1, Training Loss:  0.04195743799209595
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000018 	Average Validation Loss: 0.000061
Validation loss decreased (inf --> 0.000061).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
Epoch: 2, Training Loss:  0.047795865684747696
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000017 	Average Validation Loss: 0.000058
Validation loss decreased (0.000061 --> 0.000058).  Saving model ...
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #############
Epoch: 3, Training Loss:  0.0471530556678772
############# Epoch 3: Training End     ###

Epoch: 19, Training Loss:  0.025881508365273476
############# Epoch 19: Training End     #############
############# Epoch 19: Validation Start   #############
############# Epoch 19: Validation End     #############
Epoch: 19 	Avgerage Training Loss: 0.000011 	Average Validation Loss: 0.000040
Validation loss decreased (0.000041 --> 0.000040).  Saving model ...
############# Epoch 19  Done   #############

############# Epoch 20: Training Start   #############
Epoch: 20, Training Loss:  0.031863532960414886
############# Epoch 20: Training End     #############
############# Epoch 20: Validation Start   #############
############# Epoch 20: Validation End     #############
Epoch: 20 	Avgerage Training Loss: 0.000011 	Average Validation Loss: 0.000039
Validation loss decreased (0.000040 --> 0.000039).  Saving model ...
############# Epoch 20  Done   #############

############# Epoch 21: Training Start   #############
Epoch: 21, Training Loss:  0.02346322126686573
############# Epoch 2

Epoch: 1, Training Loss:  0.017691606655716896
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000016 	Average Validation Loss: 0.000053
Validation loss decreased (inf --> 0.000053).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
Epoch: 2, Training Loss:  0.041801951825618744
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000015 	Average Validation Loss: 0.000054
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #############
Epoch: 3, Training Loss:  0.0452173575758934
############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############
#

In [None]:
# # Initialize the training parameters.
# num_workers = 8 # Number of CPU processes for data preprocessing
# lr = 1e-4 # Learning rate
# batch_size = 32
# save_freq = 1 # Save checkpoint frequency (epochs)
# test_freq = 200 # Test model frequency (iterations)
# max_epoch_number = 35 # Number of epochs for training 
# # Note: on the small subset of data overfitting happens after 30-35 epochs

# mean = [0.485, 0.456, 0.406]
# std = [0.229, 0.224, 0.225]

# device = torch.device('cuda')

# # Run tensorboard
# %load_ext tensorboard
# %tensorboard --logdir {logdir}

In [None]:
# # Initialize the dataloaders for training.
# test_annotations = os.path.join(img_folder, 'small_test.json')
# train_annotations = os.path.join(img_folder, 'small_train.json')

# test_dataset = CustomDataset(img_folder, test_annotations, val_transform)
# train_dataset = CustomDataset(img_folder, train_annotations, train_transform)

In [None]:
# # Here is an auxiliary function for checkpoint saving.
# def checkpoint_save(model, save_path, epoch):
#     f = os.path.join(save_path, 'checkpoint-{:06d}.pt'.format(epoch))
#     if 'module' in dir(model):
#         torch.save(model.module.state_dict(), f)
#     else:
#         torch.save(model.state_dict(), f)
#     print('saved checkpoint:', f)

In [None]:
# # Run training
# epoch = 0
# iteration = 0
# while True:
#     batch_losses = []
#     for imgs, targets in train_dataloader:
#         imgs, targets = imgs.to(device), targets.to(device)

#         optimizer.zero_grad()

#         model_result = model(imgs)
#         loss = criterion(model_result, targets.type(torch.float))

#         batch_loss_value = loss.item()
#         loss.backward()
#         optimizer.step()

#         logger.add_scalar('train_loss', batch_loss_value, iteration)
#         batch_losses.append(batch_loss_value)
#         with torch.no_grad():
#             result = calculate_metrics(model_result.cpu().numpy(), targets.cpu().numpy())
#             for metric in result:
#                 logger.add_scalar('train/' + metric, result[metric], iteration)

#         if iteration % test_freq == 0:
#             model.eval()
#             with torch.no_grad():
#                 model_result = []
#                 targets = []
#                 for imgs, batch_targets in test_dataloader:
#                     imgs = imgs.to(device)
#                     model_batch_result = model(imgs)
#                     model_result.extend(model_batch_result.cpu().numpy())
#                     targets.extend(batch_targets.cpu().numpy())

#             result = calculate_metrics(np.array(model_result), np.array(targets))
#             for metric in result:
#                 logger.add_scalar('test/' + metric, result[metric], iteration)
#             print("epoch:{:2d} iter:{:3d} test: "
#                   "micro f1: {:.3f} "
#                   "macro f1: {:.3f} "
#                   "samples f1: {:.3f}".format(epoch, iteration,
#                                               result['micro/f1'],
#                                               result['macro/f1'],
#                                               result['samples/f1']))

#             model.train()
#         iteration += 1

#     loss_value = np.mean(batch_losses)
#     print("epoch:{:2d} iter:{:3d} train: loss:{:.3f}".format(epoch, iteration, loss_value))
#     if epoch % save_freq == 0:
#         checkpoint_save(model, checkpoint_path, epoch)
#     epoch += 1
#     if EPOCHS < epoch:
#         break

### Check & Visualize validation loss

In [None]:
print(f"Minimums of average validation loss per learning rate: {[min(x) for x in avg_val_losses_lr]}")
print(f"Minimum average validation loss: {min([min(x) for x in avg_val_losses_lr])}")
avg_val_losses_lr

In [None]:
print(f"Minimums of average validation loss per learning rate: {[min(x) for x in val_losses_lr]}")
print(f"Minimum of minums: {min([min(x) for x in val_losses_lr])}")
val_losses_lr

In [None]:
for i in range(len(val_losses_lr)):
    plt.plot(epoch_list, val_losses_lr[i], '-o', label=learning_rate[i])
    plt.xlabel('Epochs') #1 ~ 36
    plt.ylabel('val_loss')
    plt.legend(loc="upper right")

plt.show()

In [None]:
for i in range(len(avg_val_losses_lr)):
    plt.plot(epoch_list, avg_val_losses_lr[i], '-o', label=learning_rate[i])
    plt.xlabel('Epochs')
    plt.ylabel('avg_val_loss')
    plt.legend(loc="upper right")

plt.show()

__Check the losses and train again to get optimal model.__

__~~Loss is too big when learning rate == 0.1. So remove and redraw graphs.~~__

__Minimum validation loss:  Learning rate = ????, epoch = ??__

## 5-5. Evaluate model
 - Test with validation dataset
 - 1st important index: Precision
 - 2nd importand index: Recall
   - __First, get high & stable <u>Precision</u>, then improve <u>Recall</u>.__
 - And other indexes: F1 score, confusion matrix

### 5-5-1. Check loss, precision(recall) according to learning rate, batch size, optimizer (& epoch)
- x: learning rate / y: validation loss
- x: learning rate / y: precision or recall
- ~~x: epoch / y: validation loss~~
- ~~x: epoch / y: precision or recall~~
- __epoch = 16__

# 6. Inference

## 6-1. Define preprocessing function for input image.

In [None]:
#Resizing

## 6-2. Load saved model for inference

In [None]:
#Load model
load_model = ResNeXt50Class()
load_model = load_model.cuda() #for GPU computation
load_model = nn.DataParallel(load_model) # Distributed
best_model_path = os.path.join(checkpoint_path, "20211126/curr_ckpt_1e-05_16") #currently best model state.

best_optimizer = make_optimizer("loaded_model or best_model_path", "LEARNING_RATE") # <-- Parameter shoud be changed!

predicton_model = load_ckp(best_model_path,  #Path to the saved checkpoint
                        load_model,
                        best_optimizer)[0] #load_ckp: [model, optimizer, checkpoint['epoch'], valid_loss_min.item()]

## 6-3. Define inference function
 - Return dictionaries of predicted labels: {label1: score1, label2: score2, ....}
 - __Labeles which has lower score than threshold will be ignored.__

In [None]:
def inference(image, model, device):
    preprocessed_image = preprocessing(image)

    #for gpu computation
    if device.type == 'cuda':
        model.cuda()

    model.to(device)
    model.eval()

    with torch.no_grad():
        output = model(preprocessed_image)
        final_output = torch.sigmoid(output).cpu().detach().numpy().tolist() #1*46 list in a list
    #     print(final_output)
    #     print(train_df.columns[1:].to_list()[int(np.argmax(final_output, axis=1))])
        result_pair = zip(train_df.columns[1:].to_list(), final_output[0])
        result_dict = {}
        for label, score in result_pair:
            if score > 0.1: #Set prediction threshold
                result_dict[label] = score
    return sorted(result_dict.items(), key=(lambda x: x[1]), reverse=True)

- Reference code for inference!

In [None]:
# model.eval()
# for sample_id in [1,2,3,4,6]:
#     test_img, test_labels = test_dataset[sample_id]
#     test_img_path = os.path.join(img_folder, test_dataset.imgs[sample_id])
#     with torch.no_grad():
#         raw_pred = model(test_img.unsqueeze(0)).cpu().numpy()[0]
#         raw_pred = np.array(raw_pred > 0.5, dtype=float)

#     predicted_labels = np.array(dataset_val.classes)[np.argwhere(raw_pred > 0)[:, 0]]
#     if not len(predicted_labels):
#         predicted_labels = ['no predictions']
#     img_labels = np.array(dataset_val.classes)[np.argwhere(test_labels > 0)[:, 0]]
#     plt.imshow(Image.open(test_img_path))
#     plt.title("Predicted labels: {} \nGT labels: {}".format(', '.join(predicted_labels), ', '.join(img_labels)))
#     plt.axis('off')
#     plt.show()

## 6-4. Demo test

In [None]:
test_image = ""

In [None]:
inference(test_image, #Input sentence
         predicton_model,
          device) #CPU or GPU

# 7. Deploying model
 - Make a pipeline:
   - Process 1~6 to created a new model.
   - New model should be sent to the deploying server(automatically is best, but manually is also OK).
   - After that, deploy server should process feed text datas by the latest model.

In [None]:
#Test image open
from urllib.request import urlopen
import cv2 as cv
#image_test = Image.open('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/raw_data/raw_image/103.60.126.35/var_www_html_Image/SNS/3793/3793_1578721099.png')
#image_test = Image.open(urlopen('http://103.60.126.35/Image/SNS/4000/4000_1583499888.png'))
#image_test = Image.open(urlopen('https://cyld20183.speedgabia.com/Image/SNS/46272/46272_1624977386.jpg'))
image_test = Image.open('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/raw_data/raw_image/cyld20184.speedgabia.com/Image/SNS/23370/23370_1616239622_1.jpg')
#image_test = cv.imread('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/raw_data/raw_image/cyld20184.speedgabia.com/Image/SNS/23370/23370_1616239622_1.jpg', cv.IMREAD_COLOR)
#image_test_RGB = cv.cvtColor(image_test, cv.COLOR_BGR2RGB)
resize_test = transforms.Compose([transforms.Resize((256, 256))])
image_resized = resize_test(image_test)

plt.imshow(image_test_RGB)
plt.title(image_test.size)
plt.axis('off')
plt.show()