In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import

In [37]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [38]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from sklearn.ensemble import IsolationForest
from tqdm import tqdm
import random

---

In [39]:
# GPU 사용 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

---

### Pretrained CNN을 한번 학습을 한 후 feature를 뽑아서 IsolationForest Model을 학습하는 코드

In [40]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42) # Seed 고정

---

In [41]:
# train데이터 로딩 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (string): csv 파일의 경로.
            transform (callable, optional): 샘플에 적용될 Optional transform.
        """
        self.df = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
      img_path = '/content/drive/MyDrive/제로베이스/프로젝트/딥러닝 프로젝트/반도체 소자 이상탐지/open/train/' + self.df['img_path'].iloc[idx].split('/')[-1] # 폴더의 경로가 필요해서
      image = Image.open(img_path)
      if self.transform:
        image = self.transform(image)
      target = torch.tensor([0.]).float()
      return image, target

# 이미지 전처리 및 임베딩
transform = transforms.Compose([
    transforms.Resize((512, 512)), # 244 -> 512로 변경
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_data = CustomDataset(csv_file='/content/drive/MyDrive/제로베이스/프로젝트/딥러닝 프로젝트/반도체 소자 이상탐지/open/dataset/train.csv', transform=transform)
train_loader = DataLoader(train_data, batch_size=32, shuffle=False)

In [42]:
len(train_data)

426

---

## resnet18 모델 사용

In [43]:
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(in_features=512, out_features=1, bias=True)
model = model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)
scheduler = CosineAnnealingLR(optimizer, T_max=100, eta_min=0.00001)



In [44]:
def train(model, train_loader, criterion, optimizer, scheduler, num_epochs=4):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        running_corrects = 0
        total = 0

        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels.view(-1, 1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            predictions = (torch.sigmoid(outputs) > 0.5).float()
            running_corrects += torch.sum(predictions == labels.view(-1, 1)).item()
            total += labels.size(0)

        scheduler.step()

        epoch_loss = running_loss / len(train_loader)
        epoch_acc = running_corrects / total

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

---

In [45]:
# 모델 학습 실행
train(model, train_loader, criterion, optimizer, scheduler, num_epochs=4)

Epoch 1/4, Loss: 0.0972, Accuracy: 0.9624
Epoch 2/4, Loss: 0.0011, Accuracy: 1.0000
Epoch 3/4, Loss: 0.0005, Accuracy: 1.0000
Epoch 4/4, Loss: 0.0004, Accuracy: 1.0000


## Export Embedding Vector

In [46]:
model.eval()  # 추론 모드로 설정

# 특성 추출을 위한 모델의 마지막 레이어 수정
model = torch.nn.Sequential(*(list(model.children())[:-1]))

model.to(device)

# 이미지를 임베딩 벡터로 변환
def get_embeddings(dataloader, model):
    embeddings = []
    with torch.no_grad():
        for images, _ in tqdm(dataloader):
            images = images.to(device)
            emb = model(images)
            embeddings.append(emb.cpu().numpy().squeeze())
    return np.concatenate(embeddings, axis=0)

train_embeddings = get_embeddings(train_loader, model)

100%|██████████| 14/14 [02:24<00:00, 10.32s/it]


## Anomaly Detection Model Fitting

In [47]:
 # 코드된 추가
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

---

In [48]:
# 차원 변경(4치원에서 2차원으로 차원 변경)
train_embeddings = train_embeddings.reshape(train_embeddings.shape[0], -1)

---

In [49]:
# Isolation Forest 모델 학습
clf = IsolationForest(random_state=42)
clf.fit(train_embeddings)

## Inference

In [50]:
class CustomDataset_test(Dataset):
    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (string): csv 파일의 경로.
            transform (callable, optional): 샘플에 적용될 Optional transform.
        """
        self.df = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = '/content/drive/MyDrive/제로베이스/프로젝트/딥러닝 프로젝트/반도체 소자 이상탐지/open/test/' + self.df['img_path'].iloc[idx].split('/')[-1]
        image = Image.open(img_path)
        if self.transform:
          image = self.transform(image)
        target = torch.tensor([0.]).float()
        return image, target

test_data = CustomDataset_test(csv_file='/content/drive/MyDrive/제로베이스/프로젝트/딥러닝 프로젝트/반도체 소자 이상탐지/open/dataset/test.csv', transform=transform)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

---

In [51]:
# # 테스트 데이터에 대해 이상 탐지 수행
test_embeddings = get_embeddings(test_loader, model)

# 차원 안맞아서 4차원에서 2차원으로 차원 변경
test_embeddings = test_embeddings.reshape(test_embeddings.shape[0], -1)

test_pred = clf.predict(test_embeddings)

# Isolation Forest의 예측 결과(이상 = -1, 정상 = 1)를 이상 = 1, 정상 = 0으로 변환
test_pred = np.where(test_pred == -1, 1, 0)

100%|██████████| 4/4 [01:47<00:00, 26.81s/it]


---

In [53]:
test_pred

array([1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1])

## Submission

In [55]:
submit = pd.read_csv('/content/drive/MyDrive/제로베이스/프로젝트/딥러닝 프로젝트/반도체 소자 이상탐지/open/dataset/sample_submission.csv')
submit['label'] = test_pred
submit.tail(10)

Unnamed: 0,id,label
90,TEST_090,1
91,TEST_091,1
92,TEST_092,1
93,TEST_093,1
94,TEST_094,1
95,TEST_095,1
96,TEST_096,0
97,TEST_097,0
98,TEST_098,1
99,TEST_099,1


In [56]:
# 갯수가 27~28개 뜨면 높음!
sum(submit['label'] == 1)

39

In [60]:
submit.to_csv('/content/drive/MyDrive/제로베이스/프로젝트/딥러닝 프로젝트/반도체 소자 이상탐지/open/dataset/final_submit.csv', index=False)