<a href="https://colab.research.google.com/github/goflvhxj/CIFAR100-with-NoisyLabel/blob/main/cleanlab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **1. Import Package**

In [None]:
# pip install cleanlab pandas matplotlib torch torchvision skorch

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import torch
import warnings

SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.cuda.manual_seed_all(SEED)
warnings.filterwarnings("ignore")

## **2. Prepare the dataset**

In [None]:
import glob
from tqdm import tqdm
import cv2
import pandas as pd

columns = ['file_path', 'label']
all = pd.read_csv('./Yonsei-vnl-coding-assignment-vision-48hrs/dataset/data/cifar100_nl.csv', names=columns)
test = pd.read_csv('./Yonsei-vnl-coding-assignment-vision-48hrs/dataset/data/cifar100_nl_test.csv', names=columns)

# train label만 남김
dropna_train = all.dropna(axis=0)
dropna_train.head()

# 파일명 순서대로 정렬
train = dropna_train.sort_values(by=['file_path'])
test = test.sort_values(by=['file_path'])

# train 출력
print(train.head())

                                  file_path        label
26291  cifar100_nl/img/train/AAAMhtuLlg.png    telephone
16926  cifar100_nl/img/train/AAEKhSzYjb.png         crab
26854  cifar100_nl/img/train/AAFmznqStg.png  willow_tree
24289  cifar100_nl/img/train/AAHSUlwrdk.png          sea
38649  cifar100_nl/img/train/AAHorghNrg.png    streetcar


In [None]:
import pandas as pd

# class별 갯수 확인
class_counts = train['label'].value_counts().to_dict()
print(class_counts)

# 클래스에 대한 숫자형 라벨을 생성
label_map = {name: i for i, (name, count) in enumerate(class_counts.items())}

# 매핑된 라벨을 확인
print('\n',label_map)

{'pine_tree': 535, 'cockroach': 533, 'tiger': 532, 'shark': 528, 'hamster': 524, 'lizard': 522, 'beaver': 519, 'streetcar': 518, 'porcupine': 517, 'train': 516, 'mountain': 516, 'snail': 516, 'kangaroo': 515, 'elephant': 515, 'boy': 515, 'pear': 512, 'poppy': 512, 'cattle': 511, 'plate': 511, 'oak_tree': 510, 'television': 510, 'fox': 510, 'lobster': 510, 'woman': 509, 'squirrel': 509, 'camel': 509, 'sunflower': 508, 'bee': 507, 'bridge': 507, 'pickup_truck': 506, 'wardrobe': 506, 'couch': 505, 'shrew': 505, 'road': 504, 'tractor': 504, 'keyboard': 504, 'willow_tree': 504, 'clock': 503, 'can': 502, 'seal': 502, 'apple': 502, 'leopard': 501, 'wolf': 501, 'castle': 501, 'raccoon': 501, 'ray': 501, 'bus': 501, 'rabbit': 500, 'maple_tree': 500, 'bottle': 500, 'spider': 499, 'motorcycle': 499, 'sea': 499, 'chair': 498, 'bear': 497, 'lion': 497, 'rocket': 496, 'crocodile': 496, 'dinosaur': 496, 'bed': 495, 'cup': 495, 'sweet_pepper': 495, 'otter': 495, 'skunk': 494, 'aquarium_fish': 494, 'bu

In [None]:
import os
from glob import glob

def get_train_data(data_dir):
    img_path_list = []
    label_list = []
    
    # get image path
    img_path_list.extend(glob(os.path.join(data_dir, '*.png')))
    img_path_list.sort(key=lambda x:(x.split('/')[-1].split('.')[0]))
        
    # get label
    label_list.extend(train['label'])
                
    return img_path_list, label_list

def get_test_data(data_dir):
    img_path_list = []
    label_list = []
    
    # get image path
    img_path_list.extend(glob(os.path.join(data_dir, '*.png')))
    img_path_list.sort(key=lambda x:(x.split('/')[-1].split('.')[0]))
    #print(img_path_list)

    # get label
    label_list.extend(test['label'])
    
    return img_path_list, label_list

In [None]:
# train image path와 class 이름으로 된 label 받아옴
train_img_path, train_label = get_train_data('./Yonsei-vnl-coding-assignment-vision-48hrs/dataset/cifar100_nl/img/train')

print(train_img_path[:5])
print(train_label[:5])

['./Yonsei-vnl-coding-assignment-vision-48hrs/dataset/cifar100_nl/img/train/AAAMhtuLlg.png', './Yonsei-vnl-coding-assignment-vision-48hrs/dataset/cifar100_nl/img/train/AAEKhSzYjb.png', './Yonsei-vnl-coding-assignment-vision-48hrs/dataset/cifar100_nl/img/train/AAFmznqStg.png', './Yonsei-vnl-coding-assignment-vision-48hrs/dataset/cifar100_nl/img/train/AAHSUlwrdk.png', './Yonsei-vnl-coding-assignment-vision-48hrs/dataset/cifar100_nl/img/train/AAHorghNrg.png']
['telephone', 'crab', 'willow_tree', 'sea', 'streetcar']


In [None]:
# object형태의 label을 int형으로 변환
labels = train_label
map_train_labels = [label_map[label] for label in labels]
print(map_train_labels)

[83, 86, 36, 52, 7, 47, 32, 52, 65, 53, 30, 69, 80, 75, 22, 71, 26, 75, 66, 83, 6, 94, 36, 42, 38, 91, 96, 51, 19, 71, 27, 98, 46, 9, 24, 30, 65, 98, 51, 36, 3, 56, 25, 59, 68, 63, 83, 58, 13, 43, 30, 57, 12, 8, 84, 36, 1, 86, 23, 46, 5, 17, 76, 26, 40, 29, 80, 76, 33, 45, 37, 40, 64, 7, 11, 60, 10, 81, 52, 12, 99, 14, 8, 0, 91, 10, 64, 90, 96, 15, 54, 63, 90, 50, 98, 71, 5, 1, 51, 98, 45, 41, 98, 37, 71, 48, 0, 6, 55, 49, 49, 74, 8, 64, 82, 80, 60, 64, 16, 7, 21, 75, 94, 28, 20, 5, 89, 9, 68, 7, 44, 61, 84, 98, 53, 46, 38, 25, 68, 73, 45, 95, 86, 84, 45, 46, 19, 48, 30, 57, 93, 40, 6, 82, 31, 60, 99, 82, 12, 96, 85, 32, 41, 55, 53, 29, 21, 32, 12, 33, 2, 61, 32, 52, 24, 93, 91, 27, 43, 48, 95, 34, 95, 53, 40, 6, 86, 0, 52, 31, 84, 45, 93, 57, 89, 99, 16, 22, 93, 31, 75, 25, 50, 88, 73, 30, 74, 58, 75, 9, 41, 41, 51, 45, 12, 20, 19, 45, 6, 99, 61, 21, 42, 30, 63, 1, 8, 95, 48, 18, 37, 14, 97, 89, 99, 70, 32, 72, 51, 84, 3, 11, 54, 25, 31, 69, 35, 19, 30, 88, 23, 33, 95, 78, 19, 88, 80,

In [None]:
import torchvision.datasets as datasets # 이미지 데이터셋 집합체
import torchvision.transforms as transforms # 이미지 변환 툴

from torch.utils.data import DataLoader # 학습 및 배치로 모델에 넣어주기 위한 툴
from torch.utils.data import DataLoader, Dataset

import cv2

class C100Dataset(Dataset):
    def __init__(self, img_path_list, label_list, train_mode=True, transforms=None): #필요한 변수들을 선언
        self.transforms = transforms
        self.train_mode = train_mode
        self.img_path_list = img_path_list
        self.label_list = label_list

    def __getitem__(self, index): #index번째 data를 return
        img_path = self.img_path_list[index]
        # Get image data
        #print(img_path)
        image = cv2.imread(img_path)
        if self.transforms is not None:
            image = self.transforms(image)

        if self.train_mode:
            label = self.label_list[index]
            return image, label
        else:
            return image
    
    def __len__(self): #길이 return
        return len(self.img_path_list)

In [None]:
train_transform = transforms.Compose([
                    transforms.ToPILImage(), # Numpy배열에서 PIL이미지로
                    transforms.Resize([224, 224]), # 이미지 사이즈 변형
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(), # 이미지 데이터를 tensor
                    transforms.Normalize(mean=(0.4409, 0.4865, 0.5070), std=(0.2761, 0.2564, 0.2673)) # 이미지 정규화
                    ])

In [None]:
train_dataset = C100Dataset(train_img_path, map_train_labels, train_mode=True, transforms=train_transform) 
train_loader = DataLoader(train_dataset, batch_size = 49999, shuffle=True, num_workers=0)

X_images, labels = next(iter(train_loader))

X_images.shape

torch.Size([49999, 3, 224, 224])

In [None]:
# import torchvision.transforms as transforms # 이미지 변환 툴

# cifar100 = []
# for i in tqdm(range(len(train_img_path))):
#     cifar100.append(cv2.imread(train_img_path[i]))

In [None]:
import numpy as np
from PIL import Image,ImageFilter
# X : 이미지를 numpy 배열로 펼쳐놓은 것

# X = np.array(cifar100).reshape(len(cifar100), 3, 32, 32).astype('float32')
X = np.array(X_images).reshape(len(X_images), 3, 224, 224).astype('float32')

X /= 255.0

print(X.shape)
print(X[0])

(49999, 3, 224, 224)
[[[-0.00241901 -0.00241901 -0.00241901 ... -0.00091512 -0.00091512
   -0.00091512]
  [-0.00241901 -0.00241901 -0.00241901 ... -0.00091512 -0.00091512
   -0.00091512]
  [-0.00241901 -0.00241901 -0.00241901 ... -0.00091512 -0.00091512
   -0.00091512]
  ...
  [-0.0045356  -0.0045356  -0.0045356  ... -0.00353301 -0.00353301
   -0.00353301]
  [-0.0045356  -0.0045356  -0.0045356  ... -0.00353301 -0.00353301
   -0.00353301]
  [-0.0045356  -0.0045356  -0.0045356  ... -0.00353301 -0.00353301
   -0.00353301]]

 [[-0.00312237 -0.00312237 -0.00312237 ... -0.00066322 -0.00066322
   -0.00066322]
  [-0.00312237 -0.00312237 -0.00312237 ... -0.00066322 -0.00066322
   -0.00066322]
  [-0.00312237 -0.00312237 -0.00312237 ... -0.00066322 -0.00066322
   -0.00066322]
  ...
  [-0.00480179 -0.00480179 -0.00480179 ... -0.00336229 -0.00336229
   -0.00336229]
  [-0.00480179 -0.00480179 -0.00480179 ... -0.00336229 -0.00336229
   -0.00336229]
  [-0.00480179 -0.00480179 -0.00480179 ... -0.003362

In [None]:
# y : X의 라벨 값

y = map_train_labels

print(X.shape)
print(len(y))

(49999, 3, 224, 224)
49999


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train_data = list(zip(X_train, y_train))
# test_data = list(zip(X_test, y_test))

## **3. Define an image classification model**

In [None]:
from torch import nn

class ClassifierModule(nn.Module):
    def __init__(self):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(3, 6, 3),
            nn.ReLU(),
            nn.BatchNorm2d(6),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, 3),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.out = nn.Sequential(
            nn.Flatten(),
            nn.LazyLinear(128),
            nn.ReLU(),
            nn.Linear(128, 100),
            nn.Softmax(dim=-1),
        )

    def forward(self, X):
        X = self.cnn(X)
        X = self.out(X)
        return X

In [None]:
import torch
from torchsummary import summary

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = ClassifierModule().to(device)

summary(model, (3, 224, 224), device=device.type)



----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 6, 222, 222]             168
              ReLU-2          [-1, 6, 222, 222]               0
       BatchNorm2d-3          [-1, 6, 222, 222]              12
         MaxPool2d-4          [-1, 6, 111, 111]               0
            Conv2d-5         [-1, 16, 109, 109]             880
              ReLU-6         [-1, 16, 109, 109]               0
       BatchNorm2d-7         [-1, 16, 109, 109]              32
         MaxPool2d-8           [-1, 16, 54, 54]               0
           Flatten-9                [-1, 46656]               0
           Linear-10                  [-1, 128]       5,972,096
             ReLU-11                  [-1, 128]               0
           Linear-12                  [-1, 100]          12,900
          Softmax-13                  [-1, 100]               0
Total params: 5,986,088
Trainable param

In [None]:
from torch import nn
# VGG16 Model 
class VGG16(nn.Module):
    def __init__(self):
        super(VGG16, self).__init__()

        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels = 3, out_channels = 64, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(num_features = 64),
            nn.ReLU()
        )

        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels = 64, out_channels = 64, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(num_features = 64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2)
        )

        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )

        self.layer4 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2)
        )

        self.layer5 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        self.layer6 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        self.layer7 = nn.Sequential(
            nn.Conv2d(256, 256, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2)
        ) 

        self.layer8 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )

        self.layer9 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )

        self.layer10 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2)
        )            
        
        self.layer11 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )

        self.layer12 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )

        self.layer13 = nn.Sequential(
            nn.Conv2d(512, 512, kernel_size = 3, stride = 1, padding = 1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2, stride = 2)
        )            
        
        self.layer14 = nn.Sequential(
            nn.AdaptiveAvgPool2d(output_size = (7,7))
        )

        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(512*7*7, 4096),
            nn.ReLU()
        )

        self.fc2 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU()
        )

        self.fc3 = nn.Sequential(
            nn.Linear(4096, 100)
        )

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.layer6(out)
        out = self.layer7(out)
        out = self.layer8(out)
        out = self.layer9(out)
        out = self.layer10(out)
        out = self.layer11(out)    
        out = self.layer12(out)
        out = self.layer13(out)
#         out = self.layer14(out)
        out = out.view(out.size(0), -1) # 배치사이즈 dimension은 유지하고 나머지 부분은 쭉 펴진 것
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc3(out)
        return out

In [None]:
from torchsummary import summary

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = VGG16().to(device)

summary(model, (3, 224, 224), device=device.type)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           1,792
       BatchNorm2d-2         [-1, 64, 224, 224]             128
              ReLU-3         [-1, 64, 224, 224]               0
            Conv2d-4         [-1, 64, 224, 224]          36,928
       BatchNorm2d-5         [-1, 64, 224, 224]             128
              ReLU-6         [-1, 64, 224, 224]               0
         MaxPool2d-7         [-1, 64, 112, 112]               0
            Conv2d-8        [-1, 128, 112, 112]          73,856
       BatchNorm2d-9        [-1, 128, 112, 112]             256
             ReLU-10        [-1, 128, 112, 112]               0
           Conv2d-11        [-1, 128, 112, 112]         147,584
      BatchNorm2d-12        [-1, 128, 112, 112]             256
             ReLU-13        [-1, 128, 112, 112]               0
        MaxPool2d-14          [-1, 128,

In [None]:
from torchvision import models
import torch
import torch.nn as nn
from torchsummary import summary

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

resnet18_pretrained = models.resnet18(pretrained=False).to(device) # true 옵션으로 사전 학습된 모델을 로드

# fc layer 출력 노드 개수를 num_ftrs에 저장
num_ftrs = resnet18_pretrained.fc.in_features
# # FC layer를 class = 100으로 설정
resnet18_pretrained.fc = nn.Linear(num_ftrs, 100)

model = resnet18_pretrained.to(device)

summary(model, (3,224,224), device = device.type)
# resnet18_pretrained.to(device)

# print(resnet18_pretrained)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]          36,864
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
       BasicBlock-11           [-1, 64, 56, 56]               0
           Conv2d-12           [-1, 64, 56, 56]          36,864
      BatchNorm2d-13           [-1, 64, 56, 56]             128
             ReLU-14           [-1, 64,

In [None]:
from skorch import NeuralNetClassifier

model_skorch = NeuralNetClassifier(model, max_epochs = 20)

## **4. K-fold 교차 검정 이용하여 예측확률 계산**

In [None]:
from sklearn.model_selection import cross_val_predict
import torch


num_crossval_folds = 5
pred_probs = cross_val_predict(model_skorch, X, y, cv=num_crossval_folds, method = 'predict_proba')

  epoch    train_loss    valid_acc    valid_loss       dur
-------  ------------  -----------  ------------  --------
      1           nan       [32m0.0104[0m           nan  202.7369
      2           nan       0.0104           nan  202.1931
      3           nan       0.0104           nan  198.5056
  epoch    train_loss    valid_acc    valid_loss       dur
-------  ------------  -----------  ------------  --------
      1           nan       [32m0.0103[0m           nan  196.8019


In [None]:
from sklearn.metrics import accuracy_score

predicted_labels = pred_probs.argmax(axis=1)
acc = accuracy_score(y, predicted_labels)

print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

## **5. Use cleanlab to find label issues**

In [None]:
from cleanlab.filter import find_label_issues

ranked_label_issues = find_label_issues(y, pred_probs, return_indices_ranked_by="self_confidence")

print(f"Cleanlab found {len(ranked_label_issues)} label issues.")
print("Here are the indices of the top 15 most likely label errors:\n"
      f"{ranked_label_issues[:15]}")

In [None]:
import matplotlib.pyplot as plt

def plot_examples(id_iter, nrows=1, ncols=1):
    for count, id in enumerate(id_iter):
        plt.subplot(nrows, ncols, count + 1)
        plt.imshow(X[id].reshape(32, 32), cmap="gray")
        plt.title(f"id: {id} \n label: {y[id]}")
        plt.axis("off")

    plt.tight_layout(h_pad=2.0)

In [None]:
plot_examples(ranked_label_issues[range(15)], 3, 5)